In [2]:
import os
import sys
from pathlib import Path

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

In [3]:
# Setup notebook
# Initialise the project-level notebook helper and keep the returned
# configuration object around as `nbconfig`.
# NOTE(review): on this run the call printed a reminder about
# output/fbgn2chrom.tsv plus a date and git hash -- presumably a
# provenance stamp emitted by the helper; confirm in ncbi_remap.notebook.
nbconfig = Nb.setup_notebook()

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-10-05 
Git hash: d9f50945fa864956cc17f22a30aafc5244874783


In [4]:
# Connect to data store
# Opened read-only (mode='r') so nothing in this notebook can modify
# the HDF5 file by accident.
store = pd.HDFStore('../output/sra.h5', mode='r')

In [5]:
# De-duplicated SRX accessions from the 'aln/complete' table, as a plain
# Python list so it can be handed to the MongoDB `$in` filter below.
complete_srx = store['aln/complete']['srx'].unique().tolist()

In [6]:
from pymongo import MongoClient

# The MongoDB host name is read from an optional one-line dot-file; when
# that file is absent we fall back to a server on the local machine.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Handles used by later cells: client -> 'sra' database -> 'ncbi' collection.
mongoClient = MongoClient(host, 27017)
db = mongoClient['sra']
ncbi = db['ncbi']

In [7]:
# Get list of RNA-Seq that are completed
#
# Aggregation stages, in order:
#   1. keep only documents whose _id is one of the completed SRXs,
#   2. unwind `sra.run` so every run becomes its own document,
#   3. project down to (srx, srr, library_strategy).
library_strategy_pipeline = [
    {'$match': {'_id': {'$in': complete_srx}}},
    {'$unwind': {'path': "$sra.run"}},
    {
        '$project': {
            '_id': 0,
            'srx': '$_id',
            'srr': '$sra.run.run_id',
            'library_strategy': '$sra.experiment.library_strategy',
        }
    },
]

libstrat_records = list(ncbi.aggregate(library_strategy_pipeline))
libstrat = pd.DataFrame(libstrat_records).set_index(['srx', 'srr'])

# Filter to RNA-Seq once, then read both index levels off the same result.
rnaseq_index = libstrat[libstrat['library_strategy'] == 'RNA-Seq'].index
srxs = rnaseq_index.get_level_values('srx').unique()
srrs = rnaseq_index.get_level_values('srr').unique()

In [9]:
# Pull out super stranded tracks
#
# A run is flagged when its PCT_CORRECT_STRAND_READS value is at least 0.99.
# The `where` clause refers to the notebook-level variable `srxs` by name,
# so that variable must exist (and keep that name) when this cell runs.
pct_correct_strand = store.select(
    'prealn/workflow/collectrnaseqmetrics/second',
    where='srx == srxs',
    columns=['PCT_CORRECT_STRAND_READS'],
)
flag_well_stranded = pct_correct_strand >= .99

# Apply the boolean flag once and reuse the surviving (srx, srr) index.
well_stranded_index = flag_well_stranded.loc[flag_well_stranded.iloc[:, 0]].index
stranded_srx = well_stranded_index.get_level_values('srx').unique()
stranded_srr = well_stranded_index.get_level_values('srr').unique()

print(f'There are {len(stranded_srx):,} super stranded SRXs or {len(stranded_srr):,} SRRs.')

There are 2,293 super stranded SRXs or 2,429 SRRs.


In [10]:
# Get super stranded metadata
metadata = pd.read_csv('../output/geo-wf/rnaseq_metadata.tsv', sep='\t')

# Keep only the rows whose sample_name is one of the well-stranded SRXs and
# index the result by sample_name.
# `isin` is used instead of formatting the whole ID list into a `query`
# string: same rows, but no giant expression to parse and no breakage if an
# ID ever contains a quote character. `set_index` is chained rather than
# applied in place so re-running the cell always rebuilds `stranded_meta`
# from `metadata`.
stranded_meta = (
    metadata[metadata['sample_name'].isin(stranded_srx)]
    .set_index('sample_name')
)

In [11]:
# Tissue
# Number of well-stranded samples per tissue annotation, most common first.
stranded_meta['tissue'].value_counts()

whole body                                          564
ovary                                                71
embryo                                               64
head                                                 50
wing disc                                            21
gut                                                  20
antenna                                              18
imaginal disc                                        15
gonad                                                15
digestive system                                     13
thorax without digestive system                      11
head and thorax                                      11
fat body                                             10
eye disc                                             10
abdomen without digestive or reproductive system     10
reproductive system without gonad and genitalia      10
indirect flight muscle                               10
genitalia                                       

In [12]:
# Cell Type
# Number of well-stranded samples per 'cell type' annotation, most common
# first; rows with no cell type are left out by value_counts' default.
stranded_meta['cell type'].value_counts()

S2R+              782
S2                 23
neuroblast         13
Kc167              10
OSS                 8
gut progenitor      2
OSC                 2
Name: cell type, dtype: int64

In [13]:
# Developmental stage
# Number of well-stranded samples per 'developmental stage' annotation,
# most common first; rows with no stage are left out by value_counts'
# default.
stranded_meta['developmental stage'].value_counts()

adult                               393
third instar larval stage           117
adult stage 0-10 days               115
larval stage                         59
pupal stage                          59
embryonic stage 2 h (ael)            28
embryonic stage 10-14 h (ael)        15
embryonic stage                      11
embryonic stage 2-3 h (ael)           9
pupal stage 8 h (apf)                 6
pupal stage 72 h (apf)                6
embryonic stage 2-4 h (ael)           6
pupal stage 40 h (apf)                6
embryonic stage 10-12 h (ael)         4
embryonic stage 6-8 h (ael)           4
embryonic stage 0-4 h (ael)           3
pupal stage 48 h (apf)                3
pupal stage 30 h (apf)                2
pupal stage 90 h (apf)                2
embryonic stage 18-24 h (ael)         1
embryonic stage 0.75-1.5 h (ael)      1
first instar larval stage             1
pupal stage 24 h (apf)                1
embryonic stage 0.75 h (ael)          1
embryonic stage 12-18 h (ael)         1
