# Developmental Time Course

In [1]:
import os
import sys
from pathlib import Path
from textwrap import dedent

from IPython.display import display, HTML, Markdown
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Set up the notebook through the project helper; the returned object is kept as `nbconfig`.
nbconfig = Nb.setup_notebook()

# Open the project's HDF5 data store read-only (mode='r').
# NOTE(review): the store is never closed in the cells shown here — consider
# calling store.close() once the last read is done.
store = pd.HDFStore('../output/sra.h5', mode='r')

Please check output/fbgn2chrom.tsv. If it does not exist, run bin/fbgn2chrom.py
last updated: 2018-05-02 
Git hash: 69e627d87b83358b44de6959a32570f5edba2b8c


In [2]:
from pymongo import MongoClient
# Resolve the MongoDB host: use the name stored in ../output/.mongodb_host when
# that file exists, otherwise fall back to a server on this machine.
try:
    host = Path('../output/.mongodb_host').read_text().strip()
except FileNotFoundError:
    host = 'localhost'

# Client plus handles to the 'sra' database and its 'ncbi' / 'biometa' collections.
mongoClient = MongoClient(host=host, port=27022)
db = mongoClient['sra']
ncbi, biometa = db['ncbi'], db['biometa']

In [3]:
# Unique SRX accessions found in the 'aln/complete' table of the data store,
# converted to a plain Python list.
complete = store['aln/complete'].srx.unique().tolist()

In [4]:
# Number of unique SRX accessions in `complete`.
len(complete)

24028

In [5]:
# SRX accessions from the 'prealn/qc_passed' table that also appear in `complete`.
# Membership is tested against a set: `x in complete` on the list rescans up to
# len(complete) items for every candidate (quadratic overall), while the set
# lookup is constant time. The order of `passed` still follows the QC table.
complete_set = set(complete)
passed = [
    srx
    for srx in store['prealn/qc_passed'].srx.unique().tolist()
    if srx in complete_set
]

In [6]:
# Number of SRX accessions present in both the QC-passed and completed lists.
len(passed)

17178

In [7]:
# Narrow `passed` down to RNA-Seq libraries. Stage 1 keeps `ncbi` documents whose
# `_id` is in `passed` and whose library strategy is 'RNA-Seq'; stage 2 strips
# every field except `_id`, which is all that is collected.
match_rnaseq = {
    '$match': {
        '_id': {'$in': passed},
        'sra.experiment.library_strategy': 'RNA-Seq',
    }
}
keep_id_only = {'$project': {'_id': 1}}

rnaseq = [doc['_id'] for doc in ncbi.aggregate([match_rnaseq, keep_id_only])]

In [8]:
len(rnaseq)

10965

In [9]:
# Collect the distinct group keys of `biometa` records that have at least one
# experiment whose SRX is in `rnaseq`: unwind the `experiments` array, keep the
# matching entries, then group.
# NOTE(review): the group key is the top-level field `$srx`, yet the result is
# later used as a list of biometa `_id` values — confirm that `$srx` (rather than
# `$_id`) is the intended key.
biosample_pipeline = [
    {'$unwind': {'path': '$experiments'}},
    {'$match': {'experiments.srx': {'$in': rnaseq}}},
    {'$group': {'_id': '$srx'}},
]
biosamples = list({doc['_id'] for doc in biometa.aggregate(biosample_pipeline)})

In [10]:
# Number of distinct keys returned by the biometa aggregation.
len(biosamples)

10473

In [11]:
# Fetch the full biometa documents whose `_id` is in `biosamples`, materialised
# as a list so they can be iterated more than once.
keepers = list(biometa.find({'_id': {'$in': biosamples}}))

In [12]:
# Number of biometa documents retrieved.
len(keepers)

10473

In [13]:
import json

In [14]:
# Save the retrieved biometa documents as pretty-printed JSON. Values the JSON
# encoder cannot serialise natively are written through str().
dump_path = '../output/notebook/20180502_rnaseq_db_dump.json'
with open(dump_path, 'w') as out:
    json.dump(keepers, out, default=str, indent=4, separators=(',', ': '))

In [18]:
# Flatten each biometa document into one record (one row per biosample):
#   * fixed text fields, defaulting to '' when the key is absent;
#   * `pmid` taken from the first entry of `papers`, or '' when there is none;
#   * one `author_<name>` column per entry of `sample_attributes`.
# NOTE(review): `sample_title` is filled from the `study_title` key — confirm
# this is intended.
data = []
for doc in keepers:
    record = {
        'biosample': doc['_id'],
        'description': doc.get('description', ''),
        'sample_title': doc.get('study_title', ''),
        'study_abstract': doc.get('study_abstract', ''),
    }

    papers = doc['papers']
    record['pmid'] = papers[0]['pubmed_id'] if len(papers) > 0 else ''

    # Later attributes with a repeated name overwrite earlier ones.
    record.update(
        ('author_' + attr['name'], attr['value'])
        for attr in doc['sample_attributes']
    )

    data.append(record)

df = pd.DataFrame(data).set_index('biosample')

In [19]:
# For every RNA-Seq experiment, pair its `_id` with the accession of the first
# entry in its `biosample` list. Only those two fields are requested from Mongo.
# Despite the variable name, each tuple is ordered (srx, biosample).
biosample2srx = [
    (doc['_id'], doc['biosample'][0]['biosample_accn'])
    for doc in ncbi.find(
        {'_id': {'$in': rnaseq}},
        {'_id': 1, 'biosample.biosample_accn': 1},
    )
]

In [20]:
# Lookup table from biosample accession (index) to SRX accession (column `srx`).
mapper = (
    pd.DataFrame(biosample2srx, columns=['srx', 'biosample'])
    .set_index('biosample')
)

In [21]:
# Attach the per-biosample columns of `df` to every SRX in `mapper` (joined on the
# shared `biosample` index), then re-index the result by `srx`, keeping
# `biosample` as an ordinary column.
metadata = mapper.join(df).reset_index().set_index('srx')

In [22]:
# (rows, columns) of the joined metadata table.
metadata.shape

(10965, 311)

In [23]:
# Create the output directory for the per-SRX gene-level count tables.
# Path.mkdir(exist_ok=True) replaces the shell `!mkdir`, which reports an error
# whenever the directory already exists (i.e. on every re-run) and relies on a
# POSIX shell being available.
Path('../output/notebook/20180502_genelvl_cnts').mkdir(parents=True, exist_ok=True)

In [26]:
# Convert each RNA-Seq experiment's gene-level counts from parquet to TSV.
# Experiments whose table cannot be read or written are collected in `missing`.
missing = []
for srx in rnaseq:
    try:
        dd = pd.read_parquet(f'../output/aln-wf/gene_counts/{srx}.parquet')
        dd.to_csv(f'../output/notebook/20180502_genelvl_cnts/{srx}.tsv', sep='\t')
    except Exception as err:
        # `except Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) or SystemExit is not silently recorded as a
        # "missing" sample. The cause is reported on stderr so a real failure
        # (e.g. no parquet engine installed) is distinguishable from an absent file.
        missing.append(srx)
        print(f'{srx}: {type(err).__name__}: {err}', file=sys.stderr)

print(len(missing))

0


In [27]:
# Write the metadata rows selected by the `rnaseq` SRX list to a tab-separated file.
metadata.loc[rnaseq].to_csv('../output/notebook/20180502_rnaseq_metadata.tsv', sep='\t')