For the s2RNAi project I need a list of ChIP-seq samples. I am doing it here because this is related to SRA project.

In [1]:
# %load ../start.py
# Load useful extensions

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 2

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
import sys
sys.path.insert(0, '../../lib/python')

# The usual suspects
import os
import numpy as np
import pandas as pd

# plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
sns.set_context('poster')

# Turn off scientific notation
np.set_printoptions(precision=5, suppress=True)


last updated: 2017-09-20 
Git hash: 4bc9218af8c49c09524cdcb0c0e81f7e74f4da46


In [2]:
# %load ../../bin/load.py
from pymongo import MongoClient
with open('/home/fearjm/Projects/ncbi_remap/output/.mongodb_host', 'r') as fh:
    host = fh.read().strip()
client = MongoClient(host=host, port=27022)
db = client['sra2']
ncbi = db['ncbi']
remap = db['remap']

In [53]:
df = pd.DataFrame(list(ncbi.aggregate([
    {'$unwind': '$biosample'},
    {'$unwind': '$biosample.attributes'},
    {
        '$match': {
            'biosample.attributes.value': {'$regex': '(S2[\sr\+\-]+|.*Schneider.*)', '$options': 'i'},
        }
    },
    {
        '$project': {
            '_id': 0,
            'srx': '$_id',
            'biosample': '$biosample.biosample_accn',
            'name': '$biosample.attributes.name',
            'value': '$biosample.attributes.value',
            'sra_strategy': '$sra.experiment.library_strategy',
            'sra_source': '$sra.experiment.library_source',
            'sra_selection': '$sra.experiment.library_selection',
        }
    },
])))

In [105]:
print(df.shape)

(3877, 7)
(3877, 7)


In [109]:
df[df.srx.duplicated()]

Unnamed: 0,biosample,name,sra_selection,sra_source,sra_strategy,srx,value
16,SAMN05710807,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055967,S2-DRSC
18,SAMN05710808,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055966,S2-DRSC
20,SAMN05710809,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055965,S2-DRSC
22,SAMN05710810,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055964,S2-DRSC
24,SAMN05710811,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055963,S2-DRSC
26,SAMN05710812,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055962,S2-DRSC
28,SAMN05710813,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055961,S2-DRSC
30,SAMN05710814,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055960,S2-DRSC
32,SAMN05710815,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055959,S2-DRSC
34,SAMN05710816,cell_line,ChIP,GENOMIC,ChIP-Seq,SRX2055958,S2-DRSC


In [104]:
df.srx.shape

(3877,)

In [55]:
flip = df.pivot(values='value', columns='name')

In [56]:
flip.head()

name,breed,cell_line,cell_type,dev_stage,genotype,growth_protocol,isolate,library source,sample group,sample_name,sample_type,source,source_name,strain,tissue,treatment
0,,,,,,,,,,,,,"Drosophila S2 cells, dnaj1 KD",,,
1,,,,,,,,,,,,,"Drosophila S2 cells, dnaj1 KD",,,
2,,,,,,,,,,,,,"Drosophila S2 cells, mlf KD",,,
3,,,,,,,,,,,,,"Drosophila S2 cells, mlf KD",,,
4,,,,,,,,,,,,,"Drosophila S2 cells, lacZ KD",,,


In [57]:
merged = df.loc[:, ['srx', 'biosample', 'sra_selection', 'sra_source', 'sra_strategy']].join(flip)

In [58]:
merged.head()

Unnamed: 0,srx,biosample,sra_selection,sra_source,sra_strategy,breed,cell_line,cell_type,dev_stage,genotype,...,isolate,library source,sample group,sample_name,sample_type,source,source_name,strain,tissue,treatment
0,SRX2172438,SAMN05784497,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,,,,,,,"Drosophila S2 cells, dnaj1 KD",,,
1,SRX2172437,SAMN05784495,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,,,,,,,"Drosophila S2 cells, dnaj1 KD",,,
2,SRX2172436,SAMN05784496,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,,,,,,,"Drosophila S2 cells, mlf KD",,,
3,SRX2172435,SAMN05784491,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,,,,,,,"Drosophila S2 cells, mlf KD",,,
4,SRX2172434,SAMN05784492,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,,,,,,,"Drosophila S2 cells, lacZ KD",,,


In [98]:
isa = ['modENCODE_2626', 'modENCODE_2627', 'modENCODE_2629', 'modENCODE_2630',
'modENCODE_2633', 'modENCODE_2634', 'modENCODE_2636', 'modENCODE_2638',
'modENCODE_2639', 'modENCODE_2641', 'modENCODE_2642', 'modENCODE_3229',
'modENCODE_3230', 'modENCODE_3232', 'modENCODE_3234', 'modENCODE_3237',
'modENCODE_3238', 'modENCODE_3239', 'modENCODE_3240', 'modENCODE_3241',
'modENCODE_3242', 'modENCODE_3243', 'modENCODE_3245', 'modENCODE_3390',
'modENCODE_3391', 'modENCODE_3392', 'modENCODE_3393', 'modENCODE_3394',
'modENCODE_3395', 'modENCODE_3396', 'modENCODE_3398', 'modENCODE_3399',
'modENCODE_3400', 'modENCODE_3401', 'modENCODE_3402', 'modENCODE_3403',
'modENCODE_3830', 'modENCODE_3956', 'modENCODE_3957', 'modENCODE_3958',
'modENCODE_3959', 'modENCODE_3960', 'modENCODE_4069', 'modENCODE_4070',
'modENCODE_4071', 'modENCODE_4074', 'modENCODE_4078', 'modENCODE_4080',
'modENCODE_4081', 'modENCODE_4082', 'modENCODE_4089', 'modENCODE_4091',
'modENCODE_4094', 'modENCODE_4095', 'modENCODE_4096', 'modENCODE_4098',
'modENCODE_4099', 'modENCODE_4103', 'modENCODE_4104', 'modENCODE_4105',
'modENCODE_4107', 'modENCODE_4113', 'modENCODE_4114', 'modENCODE_4119',
'modENCODE_4352', 'modENCODE_4936', 'modENCODE_4943', 'modENCODE_4944',
'modENCODE_4974', 'modENCODE_4976', 'modENCODE_4981', 'modENCODE_4982',
'modENCODE_4998', 'modENCODE_5004', 'modENCODE_5005', 'modENCODE_5008',
'modENCODE_5014', 'modENCODE_5017', 'modENCODE_5023', 'modENCODE_5024',
'modENCODE_5025', 'modENCODE_5028', 'modENCODE_5029', 'modENCODE_5068',
'modENCODE_5069', 'modENCODE_5070', 'modENCODE_5071', 'modENCODE_5072',
'modENCODE_5110', 'modENCODE_5111', 'modENCODE_5112', 'modENCODE_5113',
'modENCODE_5114', 'modENCODE_5115', 'modENCODE_5116', 'modENCODE_5117',
'modENCODE_5118', 'modENCODE_5119', 'modENCODE_5120', 'modENCODE_5121',
'modENCODE_5122', 'modENCODE_5123', 'modENCODE_5124', 'modENCODE_5125',
'modENCODE_5126', 'modENCODE_5127', 'modENCODE_5128', 'modENCODE_5129',
'modENCODE_5257', 'modENCODE_5264', 'modENCODE_5568', 'modENCODE_5569',
'modENCODE_5570', 'modENCODE_5571', 'modENCODE_5574', 'modENCODE_5575',
'modENCODE_5576', 'modENCODE_5577', 'modENCODE_5579', 'modENCODE_5580',
'modENCODE_5587', 'modENCODE_5590', 'modENCODE_5591', 'modENCODE_5592',
'modENCODE_5593', 'modENCODE_5594', 'modENCODE_5597', 'modENCODE_5598',
'modENCODE_5599', 'modENCODE_5606', 'modENCODE_846', 'modENCODE_847',
'modENCODE_848', 'modENCODE_849', 'modENCODE_850', 'modENCODE_851',
'modENCODE_852', 'modENCODE_853', 'modENCODE_854', 'modENCODE_855',
'modENCODE_856', 'modENCODE_857', 'modENCODE_858', 'modENCODE_859',
'modENCODE_860', 'modENCODE_861', 'modENCODE_862', 'modENCODE_863',
'modENCODE_984', 'modENCODE_985']
print(len(isa))

150


In [95]:
modencode = pd.read_table('../../output/modENCODE_sampletable.tsv')
modencode.drop('sra_strategy', axis=1, inplace=True)
modencode['modencode'] = True

In [97]:
print(len(modencode.modENCODE_id.unique()))
modencode.head()

805


Unnamed: 0,srx,srr,modENCODE_id,modENCODE_assay,modENCODE_type,modencode
0,SRX002599,SRR013488,modENCODE_1040,UTR,RACE,True
1,SRX002600,SRR013489,modENCODE_1040,UTR,RACE,True
2,SRX002600,SRR013490,modENCODE_1040,UTR,RACE,True
3,SRX002601,SRR013491,modENCODE_1040,UTR,RACE,True
4,SRX002602,SRR013492,modENCODE_1040,UTR,RACE,True


In [91]:
merged2 = merged.merge(modencode, left_on='srx', right_on='srx', how='left')
merged2.modencode = merged2.modencode.fillna(False)

In [99]:
print(merged2.shape)
merged2.head()

(4102, 27)


Unnamed: 0,srx,biosample,sra_selection,sra_source,sra_strategy,breed,cell_line,cell_type,dev_stage,genotype,...,source_name,strain,tissue,treatment,srr,modENCODE_id,modENCODE_assay,modENCODE_type,modencode,isa
0,SRX2172438,SAMN05784497,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,"Drosophila S2 cells, dnaj1 KD",,,,,,,,False,
1,SRX2172437,SAMN05784495,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,"Drosophila S2 cells, dnaj1 KD",,,,,,,,False,
2,SRX2172436,SAMN05784496,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,"Drosophila S2 cells, mlf KD",,,,,,,,False,
3,SRX2172435,SAMN05784491,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,"Drosophila S2 cells, mlf KD",,,,,,,,False,
4,SRX2172434,SAMN05784492,cDNA,TRANSCRIPTOMIC,RNA-Seq,,,,,,...,"Drosophila S2 cells, lacZ KD",,,,,,,,False,


In [100]:
len(merged2.srx.unique())

3216

In [93]:
pd.crosstab(merged2.sra_strategy, merged2.modencode, margins=True)

modencode,False,True,All
sra_strategy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ChIP-Seq,568,62,630
DNase-Hypersensitivity,14,0,14
EST,0,11,11
MNase-Seq,104,0,104
OTHER,298,149,447
POOLCLONE,12,0,12
RIP-Seq,82,0,82
RNA-Seq,2384,349,2733
WGS,4,2,6
miRNA-Seq,58,0,58


In [9]:
ncbi.find_one()

{'_cls': 'Ncbi',
 '_id': 'SRX2581987',
 'bioproject': {'bioproject_accn': 'PRJNA357269',
  'bioproject_id': '357269',
  'description': "RNA molecules can attach to chromatin and thus provide a type of epigenomic information. It remains difficult to know what RNAs are associated with chromatin and where are the genomic target loci of these RNAs. Here, we present MARGI (Mapping RNA-genome interactions), a technology to massively reveal native RNA-chromatin interactions from unperturbed cells. The gist of this technology is to ligate chromatin-associated RNA (caRNA) with their target genomic sequence by proximity ligation, forming RNA-DNA chimeric sequences, which are converted to sequencing library for paired-end sequencing. Using MARGI, we produced RNA-genome interaction maps for human embryonic stem (ES) cells and HEK cells. MARGI revealed hundreds of chromatin-associated RNAs (caRNA), including previously known XIST, SNHG1, NEAT1, and MALAT1, as well as each caRNA's genomic interactio