In [2]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [3]:
# Setup notebook: run the project's notebook bootstrap helper and keep the
# object it returns for later cells.
# NOTE(review): what setup_notebook() configures lives in ncbi_remap.notebook
# and is not visible here; its output below reports a "last updated" date and
# a git hash.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-10-07 
Git hash: d9f50945fa864956cc17f22a30aafc5244874783


In [4]:
# Connect to data store
# Opened with mode='r' (read-only); later cells read tables from it by key.
store = pd.HDFStore('../output/sra.h5', mode='r')

In [54]:
# Table stored under 'aln/complete', plus the distinct values of its
# `srx` and `srr` columns as plain Python lists.
complete = store['aln/complete']
complete_srx = complete['srx'].unique().tolist()
complete_srr = complete['srr'].unique().tolist()

In [6]:
from pymongo import MongoClient

# MongoDB host: start from the local machine and override it with the
# contents of ../output/.mongodb_host when that file exists.
host = 'localhost'
try:
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    pass

# Handles for the 'sra' database and its 'ncbi' collection.
mongoClient = MongoClient(host=host, port=27017)
db = mongoClient.sra
ncbi = db.ncbi

In [58]:
# Aggregation stages, applied in order:
#   1. keep documents whose _id is in complete_srx and whose library
#      strategy is 'RNA-Seq';
#   2. unwind the `runs` array so each run becomes its own document;
#   3. keep only the srx and the run's srr.
rnaseq_pipeline = [
    {'$match': {'_id': {'$in': complete_srx},
                'sra.experiment.library_strategy': 'RNA-Seq'}},
    {'$unwind': {'path': '$runs'}},
    {'$project': {'srx': '$srx', 'srr': '$runs.srr', '_id': 0}},
]
rnaseq = pd.DataFrame(list(ncbi.aggregate(rnaseq_pipeline)))

# Distinct accessions among the matched documents.
rnaseq_srx = rnaseq.srx.unique().tolist()
rnaseq_srr = rnaseq.srr.unique().tolist()

In [59]:
# Columns read from the hisat2 table.
cols = ['num_reads', 'num_unaligned', 'num_uniquely_aligned',
        'num_multimappers', 'per_alignment']

# Only rows whose srx is in rnaseq_srx are read; the `where` string refers to
# that notebook variable by name, so it must be defined before this cell runs.
hisat2 = store.select(
    'aln/workflow/hisat2',
    where='srx == rnaseq_srx',
    columns=cols,
)

In [112]:
# Columns read from the unstranded collectrnaseqmetrics table.
# No `where` filter is applied here, so every stored row is read.
cols = ['PCT_CODING_BASES', 'PCT_MRNA_BASES', 'MEDIAN_CV_COVERAGE']

metrics = store.select(
    'prealn/workflow/collectrnaseqmetrics/unstranded',
    columns=cols,
)

In [137]:
# Single column read from the markduplicates table (all rows, no filter).
cols = ['PERCENT_DUPLICATION']
dups = store.select(
    'prealn/workflow/markduplicates',
    columns=cols,
)

In [138]:
# Index-aligned left joins: every hisat2 row is kept, and rows with no match
# in `metrics` or `dups` get NaN in the corresponding columns.
features = (
    hisat2
    .join(metrics, how='left')
    .join(dups, how='left')
)

In [247]:
# Per-feature cutoff: the median (0.5 quantile) of each column, returned as a
# Series indexed by column name.
#
# DataFrame.quantile skips missing values. np.percentile instead returns NaN
# for any column that contains a NaN (possible after the left joins that
# build `features`), and a NaN cutoff makes every >= / <= comparison against
# it False. Both use linear interpolation, so the result is unchanged for
# columns without missing values.
CUTOFF_QUANTILE = 0.5
cutoffs = features.quantile(CUTOFF_QUANTILE)

In [272]:
# Features used for filtering. One feature per list is currently active; the
# remaining candidates are kept in comments so they can be re-enabled.

# Compared with >= against the cutoff.
# Inactive candidates: num_reads, num_uniquely_aligned,
#                      PCT_CODING_BASES, PCT_MRNA_BASES
pos_cols = ['per_alignment']

# Compared with <= against the cutoff.
# Inactive candidates: num_unaligned, num_multimappers, MEDIAN_CV_COVERAGE
neg_cols = ['PERCENT_DUPLICATION']

In [273]:
# Split the feature table into the two column groups (all rows kept).
features_pos = features.loc[:, pos_cols]
features_neg = features.loc[:, neg_cols]

In [274]:
# Boolean mask per row: True when every column in the group passes its cutoff.
# Cutoffs are matched to the columns by label.
pos_oklist = features_pos.ge(cutoffs.loc[pos_cols], axis='columns').all(axis='columns')
neg_oklist = features_neg.le(cutoffs.loc[neg_cols], axis='columns').all(axis='columns')

In [275]:
# Number of rows passing the >= cutoff test on the pos_cols features.
pos_oklist.sum()

9243

In [276]:
# Number of rows passing the <= cutoff test on the neg_cols features.
neg_oklist.sum()

9242

In [277]:
# Number of rows passing both tests.
(pos_oklist & neg_oklist).sum()

5200

In [278]:
# Keep only the rows that pass both the pos_cols and neg_cols tests.
okData = features.loc[pos_oklist & neg_oklist]

In [279]:
# Show the filtered table (indexed by srx, srr).
okData

Unnamed: 0_level_0,Unnamed: 1_level_0,num_reads,num_unaligned,num_uniquely_aligned,num_multimappers,per_alignment,PCT_CODING_BASES,PCT_MRNA_BASES,MEDIAN_CV_COVERAGE,PERCENT_DUPLICATION
srx,srr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
SRX2255260,SRR4436095,39628219.0,4995767.0,1488788.0,212465.0,93.70,0.726232,0.967897,0.431259,0.303477
SRX2255259,SRR4436094,35490377.0,4994868.0,1365590.0,152636.0,92.96,0.712084,0.953768,0.441185,0.269800
SRX2255258,SRR4436093,37704341.0,5708692.0,1127642.0,485116.0,92.43,0.652323,0.982388,0.433661,0.532842
SRX2255257,SRR4436092,37202571.0,4685662.0,1393743.0,175857.0,93.70,0.739976,0.987320,0.394226,0.367646
SRX2255256,SRR4436091,32174313.0,3785644.0,1229435.0,134241.0,94.12,0.753427,0.985226,0.427315,0.270137
SRX2255255,SRR4436090,30327565.0,3483269.0,1145703.0,110044.0,94.26,0.750054,0.986000,0.425631,0.243095
SRX2070038,SRR4105221,11523941.0,628652.0,235532.0,9180.0,97.27,0.755582,0.975699,1.079621,0.490246
SRX2070035,SRR4105218,11902658.0,683555.0,236372.0,6535.0,97.13,0.780214,0.985172,1.124970,0.592483
SRX2070032,SRR4105215,9895870.0,641193.0,199266.0,7209.0,96.76,0.754862,0.979406,1.168195,0.565394
SRX2070031,SRR4105214,14811487.0,635219.0,284564.0,10163.0,97.86,0.808578,0.982244,0.915832,0.368158


In [291]:
# Distinct srx values (first index level) among the rows that passed filtering.
okSRX = okData.index.get_level_values('srx').unique().tolist()

In [298]:
# Rows of the 'second' collectrnaseqmetrics table whose
# PCT_CORRECT_STRAND_READS is at least 0.99; only that column is read.
strand = store.select(
    'prealn/workflow/collectrnaseqmetrics/second',
    where="PCT_CORRECT_STRAND_READS >= .99",
    columns=['PCT_CORRECT_STRAND_READS'],
)

In [300]:
# Distinct srx values among the rows selected into `strand`.
strand_srx = strand.index.get_level_values('srx').unique().tolist()

In [304]:
# "Golden" experiments: srx values present in both strand_srx and okSRX.
# Sorted so the list has the same order on every run. Iterating a set of
# strings follows hash order, which changes between Python processes, and
# that order would carry into anything built from this list.
golden = sorted(set(strand_srx).intersection(okSRX))

In [310]:
# Tab-separated metadata table; its first column becomes the index.
metadata = pd.read_csv('../output/geo-wf/rnaseq_metadata.tsv', sep='\t', index_col=0)

In [312]:
# One metadata row per entry of `golden`, in that order. reindex does not
# raise on labels missing from metadata's index: those rows come back all-NaN
# (value_counts drops NaN by default).
# NOTE(review): assumes the metadata index holds srx accessions — confirm.
golden_meta = metadata.reindex(golden)

In [314]:
# Frequency of each non-missing `tissue` value among the golden experiments.
golden_meta.tissue.value_counts()

whole body         336
embryo              43
ovary               41
head                22
antenna             18
imaginal disc       15
gut                  7
head and thorax      5
brain                3
leg                  2
wing disc            1
testis               1
Name: tissue, dtype: int64

In [315]:
# Frequency of each non-missing `developmental stage` value.
golden_meta['developmental stage'].value_counts()

adult                               172
adult stage 0-10 days               102
larval stage                         59
pupal stage                          59
third instar larval stage            37
embryonic stage 2 h (ael)            28
pupal stage 40 h (apf)                6
pupal stage 8 h (apf)                 6
embryonic stage 10-12 h (ael)         4
embryonic stage 2-4 h (ael)           4
embryonic stage                       4
embryonic stage 6-8 h (ael)           4
embryonic stage 0-4 h (ael)           3
embryonic stage 0.75-1.5 h (ael)      1
embryonic stage 18-24 h (ael)         1
embryonic stage 0.75 h (ael)          1
embryonic stage 1.5-6 h (ael)         1
first instar larval stage             1
embryonic stage 6-12 h (ael)          1
embryonic stage 12-18 h (ael)         1
Name: developmental stage, dtype: int64

In [316]:
# Frequency of each non-missing `cell type` value.
golden_meta['cell type'].value_counts()

neuroblast        10
S2                 2
gut progenitor     1
Name: cell type, dtype: int64