In [22]:
# %load ../start.py
# Notebook start-up: load IPython extensions, extend the import path, import
# the common analysis/plotting stack and set display options.

# Activate the autoreload extension for easy reloading of external packages
# (mode 2 = reload every module before running each cell)
%reload_ext autoreload
%autoreload 2

# Turn on the watermark; the flags below print the "last updated" date and the
# current git hash (see this cell's output)
%reload_ext watermark
%watermark -u -d -g

# Load the ipycache extension and point its cache at ../cachedir
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add the project library to the front of the import path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import re
import numpy as np
import pandas as pd

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render figures inline and use seaborn's 'poster' plotting context
%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation and print numpy values with 5 digits of precision
np.set_printoptions(precision=5, suppress=True)

last updated: 2017-11-03 
Git hash: 658176c5bd52d20a209ce689557b33c7f562447e


In [15]:
# %load ../../bin/load.py
from pymongo import MongoClient

# The MongoDB host name is stored in a dot-file; read it and drop any
# surrounding whitespace (e.g. a trailing newline).
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as host_file:
    host = host_file.read().strip()

# Connect to the server and take handles on the 'sra2' database and the three
# collections used by the analysis.
client = MongoClient(host=host, port=27022)
db = client['sra2']
remap, ncbi, biometa = (db[name] for name in ('remap', 'ncbi', 'biometa'))

In [16]:
# Display a single document from the `biometa` collection (no filter applied).
biometa.find_one()

In [87]:
# Fetch one document that has a 'pubmed' field, projecting only the nested
# pubmed_id values (plus _id). The output shows that `pubmed` is a list of
# sub-documents and that the same pubmed_id can appear more than once.
ncbi.find_one({'pubmed': {'$exists': 1}}, {'pubmed.pubmed_id': 1})

{'_id': 'SRX2551010',
 'pubmed': [{'pubmed_id': '28007888'}, {'pubmed_id': '28007888'}]}

In [88]:
# Case-insensitive pattern for attribute values that name S2 / Schneider cells:
# the token has to sit at the start of the value (after optional whitespace) or
# directly after a whitespace character.
# NOTE(review): the last alternative of the trailing group (`\s*`) can match the
# empty string, so any text may follow the token (e.g. 'S2xyz' matches) --
# confirm that this is intended.
regex = re.compile(r'^(\s*|.*\s)(s2|Schneider)(R\+\s|,\s|\s|\s*).*$', re.IGNORECASE)

# Aggregation stages:
#   1. emit one document per entry of sra.sample.attributes,
#   2. keep entries whose value matches the pattern and whose experiment is ChIP-Seq,
#   3. flatten the fields of interest into a record keyed by 'srx'.
s2_pipeline = [
    {'$unwind': '$sra.sample.attributes'},
    {'$match': {
        'sra.sample.attributes.value': {'$regex': regex},
        'sra.experiment.library_strategy': 'ChIP-Seq',
    }},
    {'$project': {
        '_id': 0,
        'srx': "$_id",
        'cell_line': '$sra.sample.attributes.value',
        'geo': '$sra.sample.GEO',
        'strategy': '$sra.experiment.library_strategy',
        'paper': '$pubmed.pubmed_id'
    }},
]

# Materialise the cursor and index the resulting table by experiment accession.
s2_records = list(ncbi.aggregate(s2_pipeline))
s2 = pd.DataFrame(s2_records).set_index('srx')

In [89]:
# Case-insensitive pattern for attribute *names* that contain 'antibody'.
regex2 = re.compile(r'.*antibody.*', re.IGNORECASE)

# Aggregation stages:
#   1. emit one document per entry of sra.sample.attributes,
#   2. keep entries whose attribute name matches the pattern,
#   3. return the experiment id ('srx') together with the attribute value.
antibody_pipeline = [
    {'$unwind': '$sra.sample.attributes'},
    {'$match': {
        'sra.sample.attributes.name': {'$regex': regex2},
    }},
    {'$project': {
        '_id': 0,
        'srx': "$_id",
        'attribute': '$sra.sample.attributes.value',
    }},
]

# Long-format table: one row per (srx, matching attribute) pair.
antibody_records = list(ncbi.aggregate(antibody_pipeline))
ab = pd.DataFrame(antibody_records)

In [90]:
# (rows, columns) of the long-format antibody attribute table.
ab.shape

(2891, 2)

In [91]:
# Collapse the long-format table to one row per srx: all antibody attribute
# values of an experiment are concatenated with '|' into a single 'antibody'
# column, indexed by srx.
ab2 = (
    ab.groupby('srx')['attribute']
    .agg('|'.join)
    .to_frame('antibody')
)

In [92]:
# Left-join the per-experiment antibody strings onto the S2 table by index
# (srx); every S2 row is kept and rows without antibody info get a missing value.
df = s2.join(ab2, how='left')

In [93]:
# Display the joined S2 / antibody table.
df

Unnamed: 0_level_0,cell_line,geo,paper,strategy,antibody
srx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ERX012710,Schneider S2,,,ChIP-Seq,
ERX012711,Schneider S2,,,ChIP-Seq,
ERX032305,S2,,,ChIP-Seq,
ERX088857,S2,,,ChIP-Seq,
ERX088858,S2,,,ChIP-Seq,
ERX088859,S2,,,ChIP-Seq,
ERX088868,S2,,,ChIP-Seq,
ERX088869,S2,,,ChIP-Seq,
ERX088870,S2,,,ChIP-Seq,
ERX088871,S2,,,ChIP-Seq,


In [70]:
# Open the project HDF5 store read-only. The visible cells only read from it,
# whereas the default mode ('a') keeps the file open for writing and silently
# creates an empty file when the path is wrong; with mode='r' a missing file
# fails immediately at open time.
store = pd.HDFStore('../../sra.h5', mode='r')

In [78]:
# Read the 'prealn/qc_passed' table from the HDF5 store and keep its 'srx'
# column (presumably the experiments that passed pre-alignment QC -- confirm).
qc = store['prealn/qc_passed'].srx

In [94]:
# Keep only the rows whose index (srx) is present in `qc`.
df_clean = df[df.index.isin(qc)]

In [95]:
# (rows, columns) remaining after the QC filter.
df_clean.shape

(618, 5)

In [98]:
# Write the filtered table as tab-separated text; the srx index is written as
# the first column.
# NOTE(review): 'paper' is projected from the list-valued `pubmed` field, so it
# may hold Python lists that are written as their repr -- confirm this is
# acceptable for downstream consumers of the file.
df_clean.to_csv('../../output/20171103_s2cell_chip-seq.tsv', sep='\t')