# Developmental Time Course

In [1]:
import os
import sys
from pathlib import Path
from textwrap import dedent

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Setup notebook
nbconfig = Nb.setup_notebook()

# Connect to data store
store = pd.HDFStore('../output/sra.h5', mode='r')

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-05-01 
Git hash: 69e627d87b83358b44de6959a32570f5edba2b8c


In [2]:
from pymongo import MongoClient
try:
    with open('../output/.mongodb_host', 'r') as fh:
        host = fh.read().strip()
except FileNotFoundError:
    host = 'localhost'

mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sra']
ncbi = db['ncbi']
biometa = db['biometa']

In [3]:
# Get list of completed srx
complete = store['aln/complete'].srx.unique().tolist()

In [4]:
len(complete)

24028

In [5]:
passed = [x for x in store['prealn/qc_passed'].srx.unique().tolist() if x in complete]

In [6]:
len(passed)

17178

In [7]:
rnaseq = [x['_id'] for x in ncbi.aggregate([
    {
        '$match': {
            '_id': {'$in': passed},
            'sra.experiment.library_strategy': 'RNA-Seq'
        }
    },
    {
        '$project': {
            '_id': 1
        }
    }
])]

In [8]:
len(rnaseq)

10965

In [13]:
names = set([x['name'] for x in biometa.aggregate([
    {
        '$unwind': {
            'path': '$sample_attributes'
        }
    },
    {
        '$match': {
            'sample_attributes.name': {'$exists': 1}
        }
    },
    {
        '$project': {
            '_id': 0,
            'name': '$sample_attributes.name'
        }
    }
])])

In [13]:
# List some possible names to use
"age"
"age_of_animals"
"age_of_flies"
"age_of_fly_in_days_post_eclosion"
"ages"
"biosamplemodel"
"biosourceprovider"
"biosourcetype"
"days_after_eclosion"
"developemntal_stage"
"developmentalstage"
"developmental_stage"
"developmental_temperature"
"developmental_time"
"developmental_time_point"
"development_point"
"development_stage"
"develpomental_stage"
"dev_stage"
"dev-stage"
"embryonic_stage"
"embryonic_stages"
"embryonic_temperature"
"embryos_hand-sorted_or_not"
"embryos_per_sample"
"gender/age"
"gestational_age"
"mating_status"
"mating_system"
"sex/age"
"tissue_age"
"tissue/development_stage"
"tissue_source"

In [9]:
dev_stage = [
    "developemntal_stage",
    "developmentalstage",
    "developmental_stage",
    "development_point",
    "development_stage",
    "develpomental_stage",
    "dev_stage",
    "dev-stage"
]

In [10]:
staged = list(set([x['_id'] for x in biometa.aggregate([
    {
        '$unwind': {
            'path': '$experiments'
        }
    },
    {
        '$match': {
            'experiments.srx': {'$in': rnaseq}
        }
    },
    {
        '$unwind': {
            'path': '$sample_attributes'
        }
    },
    {
        '$match': {
            'sample_attributes.name': {'$in': dev_stage}
        }
    },
    {
        '$group': {
            '_id': '$experiments.srx'
        }
    },
    {
        '$project': {
            '_id': 1,
        }
    }
    
])]))

In [11]:
len(staged)

4229

In [12]:
biosamples = list(set([x['_id'] for x in biometa.aggregate([
    {
        '$unwind': {
            'path': '$experiments'
        }
    },
    {
        '$match': {
            'experiments.srx': {'$in': staged}
        }
    },
    {
        '$group': {
            '_id': '$_id'
        }
    },
])]))

In [13]:
len(biosamples)

3873

In [14]:
keepers = list(biometa.find({'_id': {'$in': biosamples}}))

In [15]:
len(keepers)

3873

In [16]:
keepers[0]

{'_id': 'SAMD00025834',
 'bioproject': 'PRJDB3522',
 'contacts': [],
 'description': 'light condition: constant light',
 'experiments': [{'runs': ['DRR030335'], 'srx': 'DRX027351'}],
 'papers': [],
 'sample_attributes': [{'name': 'collection_date', 'value': '2014-10-01'},
  {'name': 'dev_stage', 'value': 'adult'},
  {'name': 'strain', 'value': 'Dark-fly'},
  {'name': 'tissue_type', 'value': 'whole'},
  {'name': 'sample_name', 'value': 'D1'},
  {'name': 'sex', 'value': 'female'},
  {'name': 'tissue', 'value': 'whole'}],
 'sample_title': 'Dark-fly_LL_1',
 'srp': 'DRP003494',
 'srs': 'DRS041155',
 'study_abstract': 'Flies (Drosophila melanogater) of 4 strains (M: Oregon-R, D: Dark-fly, U: Urbana-S, R: RAL-774) were reared in three light conditions (1, 4, 7: constant light; 2, 5, 8: light/dark cycling; 3, 6, 9: constant dark). Total RNAs were extracted from whole bodies of adult females. RNA expression was compared between strains and between light conditions to reveal effects of genome x 

In [17]:
import json

In [18]:
with open('../data/dev_stage.json') as fh:
    res = json.loads(fh.read())

In [19]:
clean = [x for x in res if 'study_title' not in x]

In [21]:
clean[0]

{'_id': 'SAMD00025834',
 'sample_attributes': [{'name': 'dev_stage', 'value': 'adult stage'},
  {'name': 'sample_name', 'value': 'D1'},
  {'name': 'sex', 'value': 'female'},
  {'name': 'tissue', 'value': 'whole body'}]}

In [22]:
data = []
for k in keepers:
    dat = {
        'biosample': k['_id'],
        'description': k.get('description', ''),
        'sample_title': k.get('study_title', ''),
        'study_abstract': k.get('study_abstract', ''),
    }
    
    if len(k['papers']) > 0:
        dat['pmid'] = k['papers'][0]['pubmed_id']
    else:
        dat['pmid'] = ''
        
    for attr in k['sample_attributes']:
        dat['author_' + attr['name']] = attr['value']
    
    data.append(dat)
    
df = pd.DataFrame(data)
df.set_index('biosample', inplace=True)

In [23]:
data = []
for k in clean:
    dat = {'biosample': k['_id']}
    for attr in k['sample_attributes']:
        dat['nih_' + attr['name']] = attr['value']
        
    data.append(dat)
        
df2 = pd.DataFrame(data)
df2.set_index('biosample', inplace=True)

In [24]:
biosample2srx = [(x['_id'], x['biosample'][0]['biosample_accn']) for x in ncbi.find({'_id': {'$in': staged}}, {'_id': 1, 'biosample.biosample_accn': 1})]

In [25]:
mapper = pd.DataFrame(biosample2srx, columns=['srx', 'biosample'])
mapper.set_index('biosample', inplace=True)

In [26]:
metadata = mapper.join(df2).join(df).reset_index().set_index('srx')

In [27]:
metadata.shape

(4229, 149)

In [36]:
cnts = []
missing = []
for srx in staged:
    try:
        dd = pd.read_parquet(f'../output/aln-wf/gene_counts/{srx}.parquet')
        cnts.append(dd.reset_index().pivot('srx', 'FBgn'))
    except:
        missing.append(srx)
dfCnts = pd.concat(cnts).T
dfCnts.index = dfCnts.index.droplevel(0)

In [37]:
dfCnts.head()

srx,SRX193500,SRX146431,SRX1329269,SRX3145626,SRX1006095,SRX3526138,SRX674969,SRX1339396,SRX645263,SRX675335,...,SRX1000484,SRX1342081,SRX2878010,SRX1743455,SRX1006454,SRX1411408,SRX3327320,SRX2549141,SRX1006213,SRX1006364
FBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000003,3,510,0,0,76,0,0,1,1,5,...,4,4,0,0,2,1,490,0,2,1
FBgn0000008,621,919,17,365,182,8,107,9461,2077,81,...,9,280,158,64,15,59,76,325,0,97
FBgn0000014,2190,8,19,222,0,47,154,29,2,92,...,466,388,257,35,0,0,451,752,0,0
FBgn0000015,169,8,27,145,0,42,211,68,0,160,...,290,245,196,40,0,19,183,276,0,6
FBgn0000017,244,3321,76,8292,928,51,467,24434,4994,511,...,324,441,671,238,81,881,324,452,21,713


In [38]:
dfCnts.to_csv('../output/notebook/20180501_agg_gene_level_cnts.tsv', sep='\t')

In [40]:
metadata.loc[dfCnts.columns].to_csv('../output/notebook/20180501_metadata.tsv', sep='\t')

In [41]:
metadata.shape

(4229, 149)

In [34]:
missing = []
for srx in complete:
    if not Path(f"../output/aln-wf/gene_counts/{srx}.parquet").exists():
        missing.append(srx)

In [35]:
len(missing)

4543