# Add metadata to modENCODE sample table

For the larval gonad and the S2 cell RNAi projects I will need RNA-seq data from the SRA. Here I build a sample table to allow me to easily use these data in other projects.

In [6]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from pymongo import MongoClient

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs

# Configure the notebook through the project helper; the returned config
# object supplies the cache directory used below.
nbconfig = Nb.setup_notebook()

# Function-level caching backed by the notebook's cache directory
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

# Read-only handle on the HDF5 data store
store = pd.HDFStore('../output/sra.h5', mode='r')

# The MongoDB host name is kept in a small file rather than hardcoded here
host = Path('../output/.mongodb_host').read_text().strip()
client = MongoClient(host=host, port=27022)
db = client['sra2']
ncbi = db['ncbi']

last updated: 2018-02-07 
Git hash: ee50699617862434948897e5bd8fff65735b002f


In [7]:
# Load every modENCODE submission from the modMine dump and give the columns
# short, code-friendly names.
modenc = pd.read_csv('../data/modMine_results.tsv', sep='\t')
modenc.columns = [
    'modENCODE_id',
    'assay_factor',
    'experiment_type',
    'title',
    'srr',
    'factor',
    'factor_type',
]
# Key each row by (srr, modENCODE_id); rebinding instead of inplace=True keeps
# the cell easy to re-run.
modenc = modenc.set_index(['srr', 'modENCODE_id'])

In [8]:
# Keep only rows that are RNA-seq experiments assayed on total RNA
is_rnaseq = modenc['experiment_type'] == 'RNA-seq'
is_total_rna = modenc['assay_factor'] == 'total-RNA'
rnaseq = modenc.loc[is_rnaseq & is_total_rna].copy()

In [9]:
# Collapse factors if there are multiples.
# A simple pivot is not possible because a single factor_type can carry
# several factor values for the same accession, so loop over accessions and
# join the values of each factor_type into one '|'-separated string.
res = []
for accession, acc_rows in rnaseq.groupby(level=0):
    newdf = {'srr': accession}
    for factor_type, type_rows in acc_rows.groupby("factor_type"):
        # Sort the unique values: joining a bare set yields an arbitrary order
        # that can change between kernel sessions (string hash randomisation),
        # which would make the saved table differ from run to run.
        # Missing values are dropped because str.join cannot handle NaN.
        factors = '|'.join(sorted(set(type_rows.factor.dropna())))
        newdf[factor_type.replace(' ', '_')] = factors
    res.append(newdf)

metadata = (
    pd.DataFrame(res)
    .set_index('srr')
    .drop(['compartment', 'compound'], axis=1)
    .fillna('')
)

# Drop samples whose developmental stage is a concatenation of several values,
# because it is unclear what they really are. The '|' is matched literally
# (regex=False) instead of through the invalid escape sequence '\|'.
metadata = metadata[~metadata.developmental_stage.str.contains('|', regex=False)].copy()

In [34]:
# Look up corresponding srx given an SRR or vice versa.
# The metadata index holds both run (SRR) and experiment (SRX) accessions, so
# split them and query each kind with the appropriate match condition.
accessions = metadata.index.unique().tolist()
srrs = [x for x in accessions if x.startswith('SRR')]
srxs = [x for x in accessions if x.startswith('SRX')]


def _srx_srr_pairs(match):
    """Return a DataFrame of (srx, srr) pairs from the ``ncbi`` collection.

    Each document is unwound on its ``runs`` array, filtered with ``match``
    and projected down to the document's ``srx`` and the run's ``srr``.

    Parameters
    ----------
    match : dict
        MongoDB ``$match`` expression applied after the ``$unwind`` stage.

    Returns
    -------
    pandas.DataFrame
        Columns ``srx`` and ``srr``; empty (but with both columns) when
        nothing matches.
    """
    records = ncbi.aggregate([
        {'$unwind': {'path': '$runs'}},
        {'$match': match},
        {'$project': {'_id': 0, 'srx': '$srx', 'srr': '$runs.srr'}},
    ])
    # Passing ``columns`` keeps both columns even for an empty result. A bare
    # ``pd.DataFrame([])`` has no columns at all, so selecting
    # ``[['srx', 'srr']]`` from it raised KeyError.
    return pd.DataFrame(list(records), columns=['srx', 'srr'])


# Get the SRXs for my srrs
srr2srx = _srx_srr_pairs({'runs.srr': {'$in': srrs}})

# Get the SRRs for my srxs
# NOTE(review): matches on ``_id``, so this presumably relies on ``_id`` being
# the SRX accession -- confirm against the collection schema.
srx2srr = _srx_srr_pairs({'_id': {'$in': srxs}})

needed_modENCODE = pd.concat([srr2srx, srx2srr])

In [35]:
# Exclude every experiment listed in the store's 'prealn/abi_solid' table so
# that no SOLiD samples remain.
abi = store['prealn/abi_solid']
abi_srx = abi['srx'].unique()
is_solid = needed_modENCODE['srx'].isin(abi_srx)
needed_modENCODE = needed_modENCODE.loc[~is_solid].copy()

In [36]:
# Write the (srx, srr) table so aln-wf can use it to fill in the gaps.
samples_tsv = '../output/modencode_samples.tsv'
needed_modENCODE.to_csv(samples_tsv, sep='\t', index=False)

In [37]:
# Merge with the modMine metadata.
# ``metadata`` is indexed by whichever accession modMine reported, which can
# be a run (SRR) or an experiment (SRX). Joining on ``srr`` alone leaves every
# SRX-keyed sample with all-NaN metadata, so for each (srx, srr) pair use the
# srr when it is present in the metadata index and fall back to the srx.
has_srr_key = needed_modENCODE.srr.isin(metadata.index).values
meta_key = np.where(has_srr_key, needed_modENCODE.srr.values, needed_modENCODE.srx.values)

# reindex keeps the row order of needed_modENCODE and yields NaN rows for keys
# without metadata. Runs of the same experiment carry the same metadata and
# collapse to one row per srx in drop_duplicates.
cleanMeta = metadata.reindex(meta_key)\
    .assign(srx=needed_modENCODE.srx.values)\
    .drop_duplicates()\
    .set_index('srx')

# Save table for use with larval gonad project.
# The provenance comment line is written first, then the table is appended.
fname = Path('~/Projects/larval_gonad/data/external/sra_modENCODE_metadata.tsv').expanduser()
with fname.open('w') as fh:
    fh.write('# File generated by ncbi_remap:2018-02-06_modencode_rnaseq_table.ipynb\n')

cleanMeta.to_csv(str(fname), sep='\t', mode='a')

In [38]:
# aggregate coverage counts
def _read_feature_counts(path, name):
    """Parse one ``*.bam.counts`` table into a Series of integer counts.

    Lines starting with ``#``, the ``Geneid`` header row and blank lines are
    skipped. For every remaining line the first tab-separated column becomes
    the index label and the last column is converted to ``int``.

    Parameters
    ----------
    path : pathlib.Path
        Counts file to read.
    name : str
        Name given to the returned Series (the sample's srx).

    Returns
    -------
    pandas.Series
    """
    idx = []
    values = []
    with path.open() as fh:
        for row in fh:
            if row.startswith('#') or row.startswith('Geneid'):
                continue
            cols = row.strip().split('\t')
            if cols == ['']:
                # A blank (e.g. trailing) line would otherwise crash in int('').
                continue
            idx.append(cols[0])
            values.append(int(cols[-1]))
    return pd.Series(data=values, index=idx, name=name)


dfs = []
for srx in needed_modENCODE.srx.unique().tolist():
    # Separate name for the path so ``cnts`` only ever means the final table.
    counts_file = Path(f'../output/aln-wf/samples/{srx}/{srx}.bam.counts')
    if not counts_file.exists():
        # Report samples that have no counts file and move on.
        print(srx)
        continue
    dfs.append(_read_feature_counts(counts_file, srx))

if not dfs:
    # pd.concat would raise a ValueError anyway; say why.
    raise ValueError('No .bam.counts file was found for any modENCODE sample')

cnts = pd.concat(dfs, axis=1)
cnts.index.name = 'FBgn'

SRX054462
SRX054461
SRX054460
SRX054459
SRX016573
SRX145284
SRX145283
SRX099772
SRX099771
SRX099770
SRX099769
SRX099768
SRX099767
SRX099766
SRX099765
SRX099764
SRX099763
SRX099762
SRX099761
SRX099760
SRX099757
SRX099756
SRX099755
SRX099754
SRX099753
SRX099752
SRX099751
SRX099750
SRX099749
SRX099748
SRX099747
SRX099746
SRX099745
SRX099744
SRX099743
SRX099742
SRX099741
SRX099740
SRX099739
SRX099738
SRX099737
SRX099736
SRX099735
SRX099733
SRX099732
SRX099731
SRX099730
SRX099729
SRX099728
SRX099727
SRX099726
SRX099725
SRX099723
SRX099722
SRX099721
SRX099720
SRX099719
SRX099717
SRX099716
SRX099715
SRX099713
SRX087428
SRX087426
SRX085096
SRX085095
SRX085098
SRX085097
SRX033334
SRX033335
SRX033333
SRX033332
SRX043517
SRX043516
SRX043514
SRX043508
SRX029445
SRX029374
SRX029214
SRX029209
SRX029115
SRX029114
SRX016332
SRX016331
SRX015869
SRX012271
SRX012270
SRX012269
SRX010758
SRX008542
SRX008278
SRX008277
SRX008276
SRX008275
SRX008274
SRX008273
SRX008272
SRX008271
SRX008270
SRX008269
SRX008268


In [39]:
# Write the count matrix for the larval gonad projects: a provenance comment
# line first, then the table appended below it.
fname = Path('~/Projects/larval_gonad/data/external/sra_modENCODE.tsv').expanduser()
provenance = '# File generated by ncbi_remap:2018-02-06_modencode_rnaseq_table.ipynb\n'
fname.resolve().write_text(provenance)

cnts.to_csv(str(fname), sep='\t', mode='a')

In [42]:
# Fetch one document from the ncbi collection whose `bioproject` field is
# PRJNA317989; as the last expression of the cell, the result is displayed.
# NOTE(review): looks like a leftover exploratory lookup -- nothing else in
# this notebook uses the result; confirm whether it should stay.
ncbi.find_one({'bioproject': 'PRJNA317989'})