In [2]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [3]:
# Setup notebook
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-10-05 
Git hash: d9f50945fa864956cc17f22a30aafc5244874783


In [4]:
# Connect to data store
store = pd.HDFStore('../output/sra.h5', mode='r')

In [53]:
complete_srx = store['aln/complete'].srx.unique().tolist()

In [51]:
from pymongo import MongoClient
try:
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    host = 'localhost'

mongoClient = MongoClient(host=host, port=27017)
db = mongoClient['sramongo']
ncbi = db['ncbi']

In [54]:
libstrat = pd.DataFrame(list(ncbi.aggregate([
    {
        '$match': {
            '_id': {'$in': complete_srx},
        }
    },
    {
        '$unwind': {
            'path': "$runs"
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$srx',
            'srr': '$runs.srr',
            'library_strategy': '$sra.experiment.library_strategy'
        }
    }
]))).set_index(['srx', 'srr'])

In [90]:
srxs = libstrat.query('library_strategy == "RNA-Seq"').index.get_level_values('srx').unique()
srrs = libstrat.query('library_strategy == "RNA-Seq"').index.get_level_values('srr').unique()

In [186]:
flag_well_stranded = store.select('prealn/workflow/collectrnaseqmetrics/second', where='srx == srxs', columns=['PCT_CORRECT_STRAND_READS']) >= .99

In [187]:
stranded_srx = flag_well_stranded[flag_well_stranded.iloc[:, 0]].index.get_level_values('srx').unique()
stranded_srr = flag_well_stranded[flag_well_stranded.iloc[:, 0]].index.get_level_values('srr').unique()

In [188]:
len(stranded_srr)

2429

In [189]:
len(stranded_srx)

2293

In [190]:
with open('../data/312_sample_golden_set_2016-06-14.txt') as fh:
    golden = fh.read().split('\n')

In [191]:
cnt = 0
for srr in golden:
    if srr in srrs:
        cnt += 1

print(f'Number of golden in srrs: {cnt} of {len(golden)}')

Number of golden in srrs: 311 of 313


In [192]:
cnt = 0
for srr in golden:
    if srr in stranded_srr:
        cnt += 1
        
print(f'Number of golden in super strand: {cnt} of {len(golden)}')

Number of golden in super strand: 236 of 313


In [193]:
metadata = pd.read_csv('../output/geo-wf/rnaseq_metadata.tsv', sep='\t')

In [194]:
stranded_meta = metadata.query(f'sample_name == {stranded_srx.tolist()}')
stranded_meta.set_index('sample_name', inplace=True)

In [195]:
stranded_meta.contact.value_counts()

Brian Oliver <briano@helix.nih.gov>                          1075
Fiona Ingleby <f.ingleby@sussex.ac.uk>                        178
Jun Chen <cjnankai@gmail.com>                                  96
Stein Aerts <Stein.Aerts@med.kuleuven.be>                      45
Yuheng Huang <yuheng.huang.sysu@gmail.com>                     45
Michael Antosh <Michael_Antosh@brown.edu>                      32
Michael Elgart <elgart@gmail.com>                              28
Jean-Yves Roignant <j.roignant@imb-mainz.de>                   27
Chris Seidel <seidel@phageT4.org>                              25
Corbin Jones <cdjones@email.unc.edu>                           24
Tadeusz Kawecki <tadeusz.kawecki@unil.ch>                      24
Jin Li <jin.billy.li@stanford.edu>                             24
Carl Thummel <cthummel@genetics.utah.edu>                      20
Benoit Biteau <benoit_biteau@urmc.rochester.edu>               19
Maria Spletter <maria.spletter@gmail.com>                      18
John Tower

In [185]:
stranded_meta.tissue.value_counts()

whole body                                          998
head                                                154
ovary                                               149
embryo                                               81
wing disc                                            54
gut                                                  32
indirect flight muscle                               23
testis                                               21
midgut                                               18
antenna                                              18
eye disc                                             18
fat body                                             17
brain                                                16
abdomen without digestive or reproductive system     16
gonad                                                16
thorax without digestive system                      16
digestive system                                     16
imaginal disc                                   

In [180]:
stranded_meta.columns

Index(['title', 'organism', 'study', 'runs', 'GEO Experiment', 'GEO Sample',
       'BioSample ID', 'BioProject', 'pubmed', 'pubmed_title',
       'pubmed_citation', 'pubmed_authors', 'contact', 'sex',
       'developmental stage', 'tissue', 'cell type', 'molecule', 'description',
       'raw file'],
      dtype='object')

In [181]:
stranded_meta['cell type'].value_counts()

S2R+               2013
S2                   79
Kc167                21
OSC                  17
neuroblast           15
gut progenitor       13
OSS                   8
embryonic             4
head of fly           3
blastema              3
wing disc pouch       3
D17-c3                2
Name: cell type, dtype: int64

In [182]:
stranded_meta['developmental stage'].value_counts()

adult                               909
adult stage 0-10 days               217
third instar larval stage           143
larval stage                         89
pupal stage                          59
first instar larval stage            51
embryonic stage 2 h (ael)            28
embryonic stage 10-14 h (ael)        16
embryonic stage                      12
embryonic stage 2-3 h (ael)           9
pupal stage 30 h (apf)                8
pupal stage 72 h (apf)                7
embryonic stage 4-10 h (ael)          7
pupal stage 24 h (apf)                7
pupal stage 8 h (apf)                 6
pupal stage 96 h (apf)                6
embryonic stage 2-4 h (ael)           6
pupal stage 40 h (apf)                6
embryonic stage 10-11 h (ael)         5
embryonic stage 3-3.5 h (ael)         4
embryonic stage 1.5-3 h (ael)         4
embryonic stage 10-12 h (ael)         4
embryonic stage 15-21 h (ael)         4
embryonic stage 6-8 h (ael)           4
pupal stage 44 h (apf)                3


In [183]:
stranded_meta.study.value_counts().head(3)

SRP074593    1770
SRP045429     249
SRP064744     242
Name: study, dtype: int64

In [184]:
stranded_meta.query('study == "SRP069203"')

Unnamed: 0_level_0,title,organism,study,runs,GEO Experiment,GEO Sample,BioSample ID,BioProject,pubmed,pubmed_title,pubmed_citation,pubmed_authors,contact,sex,developmental stage,tissue,cell type,molecule,description,raw file
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SRX1557614,Sample_f_0_DNifmDNcff_rep1,Drosophila melanogaster,SRP069203,SRR3139779,GSE77492,GSM2053163,SAMN04452925,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,cff1|ifm1|ll7|gonad|without,SRX1557614
SRX1557615,Sample_f_0_DNifmDNcff_rep2,Drosophila melanogaster,SRP069203,SRR3139780,GSE77492,GSM2053164,SAMN04452926,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,cff1|ifm1|ll7|gonad|without,SRX1557615
SRX1557616,Sample_f_0_DNifmDNcff_rep3,Drosophila melanogaster,SRP069203,SRR3139781,GSE77492,GSM2053165,SAMN04452927,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,cff1|ifm1|ll7|gonad|without,SRX1557616
SRX1557617,Sample_f_0_DNifmDNcff_rep4,Drosophila melanogaster,SRP069203,SRR3139782,GSE77492,GSM2053166,SAMN04452928,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,cff1|ifm1|ll7|gonad|without,SRX1557617
SRX1557618,Sample_f_0_DNifm_rep1,Drosophila melanogaster,SRP069203,SRR3139783,GSE77492,GSM2053167,SAMN04452929,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,ifm1|ll7|gonad|without,SRX1557618
SRX1557619,Sample_f_0_DNifm_rep2,Drosophila melanogaster,SRP069203,SRR3139784,GSE77492,GSM2053168,SAMN04452930,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,ifm1|ll7|gonad|without,SRX1557619
SRX1557620,Sample_f_0_DNifm_rep3,Drosophila melanogaster,SRP069203,SRR3139785,GSE77492,GSM2053169,SAMN04452931,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,ifm1|ll7|gonad|without,SRX1557620
SRX1557621,Sample_f_0_DNifm_rep4,Drosophila melanogaster,SRP069203,SRR3139786,GSE77492,GSM2053170,SAMN04452932,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,ifm1|ll7|gonad|without,SRX1557621
SRX1557622,Sample_f_0_gpp_rep1,Drosophila melanogaster,SRP069203,SRR3139787,GSE77492,GSM2053171,SAMN04452933,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,hms00160|trip|attp2|ll7|gonad|without,SRX1557622
SRX1557623,Sample_f_0_gpp_rep2,Drosophila melanogaster,SRP069203,SRR3139788,GSE77492,GSM2053172,SAMN04452934,PRJNA310631,,,,,Brian Oliver <briano@helix.nih.gov>,female,,,,cDNA,hms00160|trip|attp2|ll7|gonad|without,SRX1557623
