Today I am checking Isabelle's modEncode results table. I want to see what kinds of datasets we have and figure out why we are missing some datasets from the website. 

In [2]:
# %load ../start.py
# Load useful extensions
import os
import sys

# Activate the autoreload extension for easy reloading of external packages
%reload_ext autoreload
%autoreload 1

# Set up cashdir
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Trun on the water mark
%reload_ext watermark
%watermark -u -d -v -g

# Load ipycache extension
%reload_ext ipycache
from ipycache import CacheMagics
CacheMagics.cachedir = '../cachedir'

# Add project library to path
sys.path.insert(0, '../../lcdb-wf/lib')
sys.path.insert(0, '../../lib/python')

# Set up references
import yaml
with open('../../config/config.yml') as fh:
    config = yaml.load(fh)

assembly = config['assembly']
tag = config['aligner']['tag']
REF = os.path.join(os.environ['REFERENCES_DIR'], assembly, tag)


last updated: 2017-09-12 

CPython 3.5.2
IPython 6.1.0
Git hash: 30a5c2596ed0b2a6df8b58fe4ad3d7ee3e26839d


In [15]:
# Imports
import pandas as pd
from intermine.webservice import Service

In [87]:
# Query modMine for ChIP-seq submissions from species "melanogaster"
## open a connection to the release-33 web service
service = Service("http://intermine.modencode.org/release-33/service")

## start a new query rooted at the Submission class
query = service.new_query("Submission")

## output columns requested from the service
view_columns = [
    "DCCid",
    "assayFactor",
    "title",
    "lab.name",
    "experimentalFactors.type",
    "experimentalFactors.name",
]
query.add_view(*view_columns)

## constraints (edit the values here to change what is fetched)
query.add_constraint("experimentType", "=", "ChIP-seq", code="A")
query.add_constraint("organism.species", "=", "melanogaster", code="B")

## collect each result row as a dict and build the data frame in one go
result_rows = [row.to_d() for row in query.rows()]
df = pd.DataFrame(result_rows)

In [88]:
# Load Isabelle's final table of modENCODE peaks.
# The file is tab-separated and is read without a header row, so the column
# names are attached after loading.
isa_columns = [
    'name', 'score', 'chrom', 'start', 'end', 'strand',
    'modENCODE_id',
    'peak_fbgn', 'peak_symbol',
    'antibody', 'cell_line', 'dev_stage', 'strain', 'target_gene', 'tissue',
    'TF_name', 'TF_fbgn',
]
isaDf = pd.read_csv(
    '../../output/modENCODE_finaltable',
    sep='\t',
    header=None,
    low_memory=False,
)
isaDf.columns = isa_columns

In [89]:
# Check overlap between Isabelle's table and the modMine query results
## Convert IDs to sets
isa_ids = set(isaDf.modENCODE_id.unique().tolist())
modEncode_ids = set(df['Submission.DCCid'].unique().tolist())

## look at intersection and difference
## `difference` holds the IDs returned by modMine that are absent from
## Isabelle's table (it is one-directional: modEncode minus Isabelle).
intersect = modEncode_ids.intersection(isa_ids)
difference = modEncode_ids.difference(isa_ids)
print(
    "Isabelle IDs: {}\n"
    "modEncode IDs: {}\n"
    "Overlapping IDs: {}\n"
    "Different IDs: {}".format(
        len(isa_ids),
        len(modEncode_ids),
        len(intersect),
        len(difference)
    )
 )

Isabelle IDs: 107
modEncode IDs: 351
Overlapping IDs: 107
Differeng IDs: 244


In [90]:
# Get more details about missing cells and dev stage
## Boolean row masks over df. A submission can appear on several rows (one
## per experimental factor), so the counts below are per factor row.
missing = df['Submission.DCCid'].isin(difference)
there = df['Submission.DCCid'].isin(intersect)
dev_stage = df['Submission.experimentalFactors.type'] == 'developmental stage'
cell_line = df['Submission.experimentalFactors.type'] == 'cell line'

## The masks are already boolean Series aligned to df, so they can be stored
## directly as the flag columns.
df['Isabelle'] = there
df['Missing'] = missing

# Make summary Table: number of flagged rows per experimental factor name.
# astype(int) casts the summed flags in one vectorized step instead of
# calling int() on every cell through applymap.
(
    df.loc[
        (dev_stage | cell_line),
        ['Isabelle', 'Missing', 'Submission.experimentalFactors.name']
    ]
    .groupby('Submission.experimentalFactors.name')
    .sum()
    .astype(int)
)

Unnamed: 0_level_0,Isabelle,Missing
Submission.experimentalFactors.name,Unnamed: 1_level_1,Unnamed: 2_level_1
3rd Instar Larvae,13,25
Adult Female,2,5
Adult Male,0,6
CME W1 Cl.8+,3,1
Embryo 0-12 h,6,8
Embryo 0-4 h,0,15
Embryo 0-8 h,9,5
Embryo 1-6 h,1,1
Embryo 12-16 h,0,8
Embryo 12-24 h,2,1


In [91]:
# Get more details about missing tissues
tissue = df['Submission.experimentalFactors.type'] == 'tissue'

## Re-apply the flag columns from the boolean masks `there` and `missing`
## so this cell does not rely on the columns already existing in df.
df['Isabelle'] = there
df['Missing'] = missing

# Make summary Table: number of flagged rows per tissue name.
# astype(int) casts the summed flags in one vectorized step instead of
# calling int() on every cell through applymap.
(
    df.loc[
        tissue,
        ['Isabelle', 'Missing', 'Submission.experimentalFactors.name']
    ]
    .groupby('Submission.experimentalFactors.name')
    .sum()
    .astype(int)
)

Unnamed: 0_level_0,Isabelle,Missing
Submission.experimentalFactors.name,Unnamed: 1_level_1,Unnamed: 2_level_1
Heads OR,12,27
salivary glands,0,1


In [92]:
# List every column currently in df, including the 'Isabelle' and 'Missing'
# flag columns added to it.
df.columns

Index(['Submission.DCCid', 'Submission.RNAsize', 'Submission.assayFactor',
       'Submission.description', 'Submission.design', 'Submission.embargoDate',
       'Submission.experimentDate', 'Submission.experimentType',
       'Submission.experimentalFactors.name',
       'Submission.experimentalFactors.type', 'Submission.id',
       'Submission.lab.name', 'Submission.multiplyMappedReadCount',
       'Submission.name', 'Submission.notice', 'Submission.publicReleaseDate',
       'Submission.qualityControl', 'Submission.replacesSubmission',
       'Submission.replicate', 'Submission.title',
       'Submission.totalMappedReadCount', 'Submission.totalReadCount',
       'Submission.uniquelyMappedReadCount', 'Submission.url',
       'Submission.version', 'Isabelle', 'Missing'],
      dtype='object')

In [97]:
# Distinct experimental factor names among the rows whose factor type is 'tissue'
tissue_names = df['Submission.experimentalFactors.name'][tissue]
tissue_names.unique()

array(['salivary glands', 'Heads OR'], dtype=object)

In [103]:
# Inspect every row recorded for a single submission
df.loc[df['Submission.DCCid'].eq('modENCODE_6389')]

Unnamed: 0,Submission.DCCid,Submission.RNAsize,Submission.assayFactor,Submission.description,Submission.design,Submission.embargoDate,Submission.experimentDate,Submission.experimentType,Submission.experimentalFactors.name,Submission.experimentalFactors.type,...,Submission.replacesSubmission,Submission.replicate,Submission.title,Submission.totalMappedReadCount,Submission.totalReadCount,Submission.uniquelyMappedReadCount,Submission.url,Submission.version,Isabelle,Missing
842,modENCODE_6389,,H3K36me2,We aim to determine the locations of the major...,binding_site_identification_design,Wed Mar 06 00:00:00 UTC 2013,Sat Apr 14 00:00:00 UTC 2012,ChIP-seq,H3K36me2_W,antibody,...,,biological_replicate,H3K36me2_W.BG3.Solexa,52409495.0,73107572.0,52409495.0,,,False,True
843,modENCODE_6389,,H3K36me2,We aim to determine the locations of the major...,binding_site_identification_design,Wed Mar 06 00:00:00 UTC 2013,Sat Apr 14 00:00:00 UTC 2012,ChIP-seq,ML-DmBG3-c2,cell line,...,,biological_replicate,H3K36me2_W.BG3.Solexa,52409495.0,73107572.0,52409495.0,,,False,True


In [104]:
# Inspect every row whose experimental factor name is S2-DRSC
df.loc[df['Submission.experimentalFactors.name'].eq('S2-DRSC')]

Unnamed: 0,Submission.DCCid,Submission.RNAsize,Submission.assayFactor,Submission.description,Submission.design,Submission.embargoDate,Submission.experimentDate,Submission.experimentType,Submission.experimentalFactors.name,Submission.experimentalFactors.type,...,Submission.replacesSubmission,Submission.replicate,Submission.title,Submission.totalMappedReadCount,Submission.totalReadCount,Submission.uniquelyMappedReadCount,Submission.url,Submission.version,Isabelle,Missing
30,modENCODE_2638,,CTCF,The White Lab is aiming to map the association...,binding_site_identification_design,Mon Oct 25 00:00:00 UTC 2010,Tue Jan 26 00:00:00 UTC 2010,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,S2_CTCF200_ChIP-seq,,,,,,False,True
33,modENCODE_2639,,CTCF,The White Lab is aiming to map the association...,binding_site_identification_design,Mon Oct 25 00:00:00 UTC 2010,Tue Jan 26 00:00:00 UTC 2010,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,S2_CTCF500_ChIP-seq,,,,,,False,True
47,modENCODE_2979,,MCM2-7,We will precisely identify sequence elements t...,binding_site_identification_design,Tue Apr 05 00:00:00 UTC 2011,Fri Feb 13 00:00:00 UTC 2009,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,MCM S2 July 2010,,54687237.0,38822184.0,,,False,True
75,modENCODE_3189,,H3K36me3,The White Lab is aiming to map the association...,binding_site_identification_design,Thu Jun 09 00:00:00 UTC 2011,Mon May 13 00:00:00 UTC 2013,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,H3K36me3_S2_cells_ChIP-chip,,,,,,False,True
212,modENCODE_3953,,H3K9me2,We aim to determine the locations of the major...,binding_site_identification_design,Sat Jun 30 00:00:00 UTC 2012,Thu Mar 29 00:00:00 UTC 2012,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,Solexa.H3K9me2_antibody2.S2,28960378.0,58418187.0,28960378.0,,,False,True
344,modENCODE_4715,,H3K36me3,Chromatin immunoprecipitation (ChIP) followed ...,binding_site_identification_design,Fri Dec 07 00:00:00 UTC 2012,Mon Mar 26 00:00:00 UTC 2012,ChIP-seq,S2-DRSC,cell line,...,,technical_replicate,Ultra-deep ChIP-seq of H3K36me3 in D. melanoga...,,,,,,False,True
346,modENCODE_4716,,Su(Hw),Chromatin immunoprecipitation (ChIP) followed ...,binding_site_identification_design,Wed Dec 26 00:00:00 UTC 2012,Tue Mar 27 00:00:00 UTC 2012,ChIP-seq,S2-DRSC,cell line,...,,technical_replicate,Ultra-deep ChIP-seq of Su(Hw) in D. melanogast...,,,,,,False,True
425,modENCODE_4966,,CG8478,The White Lab is aiming to map the association...,binding_site_identification_design,Fri Jan 11 00:00:00 UTC 2013,Thu Aug 15 00:00:00 UTC 2013,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,CG8478_S2_cells_ChIP-seq,57186237.0,64502447.0,57186237.0,,,False,True
445,modENCODE_4985,,H3K4me1,The White Lab is aiming to map the association...,binding_site_identification_design,Sat Jan 12 00:00:00 UTC 2013,Thu Aug 15 00:00:00 UTC 2013,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,H3K4me1_S2_cells_ChIP-seq,68445159.0,76134721.0,68445159.0,,,False,True
453,modENCODE_4988,,H3K4me3,The White Lab is aiming to map the association...,binding_site_identification_design,Sat Jan 12 00:00:00 UTC 2013,Fri Aug 16 00:00:00 UTC 2013,ChIP-seq,S2-DRSC,cell line,...,,biological_replicate,H3K4me3_S2_cells_ChIP-seq,76388085.0,82001205.0,76388085.0,,,False,True
