# Build Metadata Table

I need to build a metadata table for submitting to GEO. We are going to do these submissions in batches, so here I am focusing on the ~5k samples I want to send to FlyBase. The metadata table should include all links to other databases and any other information that I find relevant.

In [8]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient

# Project level imports
sys.path.insert(0, '../lib')
from ncbi_remap.notebook import Nb
from ncbi_remap.plotting import make_figs
from ncbi_remap.parser import parse_hisat2

# Setup notebook
nbconfig = Nb.setup_notebook()

# Turn on cache
from joblib import Memory
memory = Memory(cachedir=nbconfig.cache, verbose=0)

# Connect to data store (opened read-only)
store = pd.HDFStore('../sra.h5', mode='r')

# Connect to DB
# The MongoDB host name is read from a one-line file when that file exists;
# otherwise fall back to a server running on this machine.
host_pth = Path('../output/.mongodb_host')
if host_pth.exists():
    # NOTE: this used to reference the misspelled name `host_pht`, which
    # raised NameError whenever the host file was present.
    with host_pth.open() as fh:
        host = fh.read().strip()
else:
    host = 'localhost'

client = MongoClient(host=host, port=27022)
db = client['sra2']
ncbi = db['ncbi']
biometa = db['biometa']
remap = db['remap']

last updated: 2017-12-22 
Git hash: 1b967a52a83923ec9a93b57db9d3dee8b476e63f


In [2]:
# Read the FlyBase sample table and pull out its SRX accessions as a plain list
good = pd.read_csv('../output/flybase_samples.tsv', sep='\t')
srx = good['srx'].tolist()

In [3]:
records = list(ncbi.aggregate([
    # Stage 1: keep only the documents whose _id is one of the selected SRX.
    {'$match': {'_id': {'$in': srx}}},

    # Stage 2: rename _id to srx and lift the nested accession fields to the
    # top level. `biosample` takes the first biosample accession and
    # `contact` takes element [0][0] of the biosample contacts.
    {'$project': {
        '_id': 0,
        'srx': '$_id',
        'bioproject': '$bioproject.bioproject_accn',
        'biosample': {'$arrayElemAt': ['$biosample.biosample_accn', 0]},
        'srs': '$sra.sample.sample_id',
        'gsm': '$sra.sample.GEO',
        'srp': '$sra.study.study_id',
        'sra': '$sra.submission.submission_id',
        'pubmed': '$pubmed.pubmed_id',
        'contact': {'$arrayElemAt': [{'$arrayElemAt': ['$biosample.contacts', 0]}, 0]},
    }},

    # Stage 3: pass the accession fields through unchanged and split the
    # contact sub-document into separate name/email fields.
    {'$project': {
        **{
            field: '$' + field
            for field in ('srx', 'bioproject', 'biosample', 'srs',
                          'gsm', 'srp', 'sra', 'pubmed')
        },
        'first_name': '$contact.first_name',
        'last_name': '$contact.last_name',
        'email': '$contact.email',
    }},
]))

In [4]:
# Show the first aggregated record as a quick check of the projected fields.
records[0]

{'bioproject': 'PRJDB2557',
 'biosample': 'SAMD00012584',
 'pubmed': ['25003736', '25003736'],
 'sra': 'DRA001188',
 'srp': 'DRP001250',
 'srs': 'DRS012553',
 'srx': 'DRX012753'}

In [43]:
def cleanup_pubmed(l):
    """Collapse a list of PubMed IDs into a single ``|``-delimited string.

    Parameters
    ----------
    l : list or any other value
        One cell of the ``pubmed`` column. Lists are de-duplicated and
        joined; anything that is not a list (e.g. a missing value) is
        passed through untouched.

    Returns
    -------
    str, float, or the input value
        * non-list input: returned unchanged,
        * empty list: ``np.nan``,
        * otherwise: the unique IDs, sorted, joined with ``'|'``.
    """
    if not isinstance(l, list):
        return l
    if not l:
        return np.nan
    # Sort the unique IDs: iteration order of a set of strings can differ
    # between interpreter runs, which made the joined value non-reproducible.
    return '|'.join(sorted(set(l)))

# Columns kept in the metadata table, in display order.
header = ['srs', 'srp', 'gsm', 'bioproject', 'biosample', 'pubmed', 'first_name', 'last_name', 'email',]

# Records -> frame, tidy the pubmed lists, attach the run-level rows from
# `good` (outer merge on srx), index by (srx, srr) and keep `header` columns.
df = (
    pd.DataFrame(records)
    .assign(pubmed=lambda frame: frame['pubmed'].apply(cleanup_pubmed))
    .merge(good, on='srx', how='outer')
    .set_index(['srx', 'srr'])
    [header]
    .copy()
)

In [44]:
# Preview the first rows of the accession/contact table.
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,srs,srp,gsm,bioproject,biosample,pubmed,first_name,last_name,email
srx,srr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
DRX012753,DRR014222,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,
DRX012754,DRR014223,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,
DRX012755,DRR014224,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,
DRX073117,DRR079273,DRS039996,DRP003423,,PRJDB5381,SAMD00069299,,,,
DRX073118,DRR079274,DRS039997,DRP003423,,PRJDB5381,SAMD00069300,,,,


In [27]:
# Parse the HISAT2 log of every (srx, srr) pair listed in `good`.
#
# The accessions are read straight from the two columns instead of calling
# `row.to_dict()` twice per row, and the work is done in a comprehension so
# that no loop variable leaks into the notebook namespace. In particular the
# global `srx` (the accession list used by the MongoDB query) is no longer
# overwritten with a single accession string.
dfs = [
    parse_hisat2(
        sample_srx,
        run_srr,
        f'../aln-wf/output/samples/{sample_srx}/{run_srr}/{run_srr}.fq.bam.log',
    )
    for sample_srx, run_srr in zip(good['srx'], good['srr'])
]

# Stack the per-run results into one table.
hisat2 = pd.concat(dfs)

In [56]:
# Take the PCT_CORRECT_STRAND_READS column of the first-strand
# CollectRnaSeqMetrics table and give it its metadata-table column name.
first = (
    store['prealn/workflow/collectrnaseqmetrics/first']['PCT_CORRECT_STRAND_READS']
    .rename('prop_reads_aln_first_strand')
)

In [58]:
# Assemble the per-run QC columns side by side (column-wise concat, aligned
# on the index of each piece).
dat = pd.concat(
    [
        store['layout'],
        store['strand'],
        store['prealn/flags'],
        store['prealn/workflow/fastq'][['libsize_R1', 'avgLen_R1', 'libsize_R2', 'avgLen_R2']],
        hisat2.per_alignment,
        first,
    ],
    axis=1,
)

In [62]:
# Attach the QC columns to the accession table, keeping every row of `df`
# (left join on the index).
merged = df.join(dat, how='left')
merged.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,srs,srp,gsm,bioproject,biosample,pubmed,first_name,last_name,email,layout,...,flag_download_bad,flag_quality_scores_bad,flag_qc_passed,flag_merge,libsize_R1,avgLen_R1,libsize_R2,avgLen_R2,per_alignment,prop_reads_aln_first_strand
srx,srr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
DRX012753,DRR014222,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,,SE,...,False,False,True,True,164160619.0,101.0,,,89.24,0.997615
DRX012754,DRR014223,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,,keep_R2,...,False,False,True,True,151593644.0,6.0,151593644.0,95.0,91.91,0.998734
DRX012755,DRR014224,DRS012553,DRP001250,,PRJDB2557,SAMD00012584,25003736.0,,,,SE,...,False,False,True,True,150847202.0,126.0,,,93.61,0.998618
DRX073117,DRR079273,DRS039996,DRP003423,,PRJDB5381,SAMD00069299,,,,,PE,...,False,False,True,True,22884479.0,100.0,22884479.0,100.0,98.56,0.005493
DRX073118,DRR079274,DRS039997,DRP003423,,PRJDB5381,SAMD00069300,,,,,PE,...,False,False,True,True,27009319.0,100.0,27009319.0,100.0,98.51,0.013322
DRX073119,DRR079275,DRS039998,DRP003423,,PRJDB5381,SAMD00069301,,,,,PE,...,False,False,True,True,39188793.0,100.0,39188793.0,100.0,98.65,0.002504
DRX073120,DRR079276,DRS039999,DRP003423,,PRJDB5381,SAMD00069302,,,,,PE,...,False,False,True,True,34601889.0,100.0,34601889.0,100.0,98.31,0.003888
DRX073121,DRR079277,DRS040000,DRP003423,,PRJDB5381,SAMD00069303,,,,,PE,...,False,False,True,True,25450968.0,100.0,25450968.0,100.0,98.28,0.007763
DRX073122,DRR079278,DRS040001,DRP003423,,PRJDB5381,SAMD00069304,,,,,PE,...,False,False,True,True,24563756.0,100.0,24563756.0,100.0,98.56,0.007232
DRX073123,DRR079279,DRS040002,DRP003423,,PRJDB5381,SAMD00069305,,,,,PE,...,False,False,True,True,34575327.0,100.0,34575327.0,100.0,98.66,0.00422
