# Assembly of ChIP-seq modENCODE data

In [5]:
import tempfile
import pandas as pd
import urllib
import os
from urllib.request import urlopen

This is the White lab's modENCODE ChIP-seq dataset: 

In [19]:
url = 'http://intermine.modencode.org/release-33/features.do?type=experiment&action=export&experiment=Chromatin%20Binding%20Site%20Mapping%20of%20Transcription%20Factors%20in%20D.%20melanogaster%20by%20ChIP-seq&feature=BindingSite&format=csv'    
response = urlopen(url)
    
with tempfile.NamedTemporaryFile() as temp:     
    temp.write(response.read())
    temp.seek(0)
    df = pd.read_csv(temp)

In [21]:
df.columns = ['DB_identifier','score','chrom','start','end','strand','modENCODE_id','pivot','name']

In [22]:
df.head()

Unnamed: 0,DB_identifier,score,chrom,start,end,strand,modENCODE_id,pivot,name
0,Dll_WPP_ChIP_seq.gff_ID000002,28.67,2L,101,402,0,modENCODE_4974,strain,yellow cinnabar brown speck
1,Dll_WPP_ChIP_seq.gff_ID000002,28.67,2L,101,402,0,modENCODE_4974,antibody,dll
2,Dll_WPP_ChIP_seq.gff_ID000002,28.67,2L,101,402,0,modENCODE_4974,developmental stage,White prepupae (WPP)
3,yki_8_16h_embryonic_ChIP_seq.gff_ID000002,26.68,2L,133,3342,0,modENCODE_5029,strain,yellow cinnabar brown speck
4,yki_8_16h_embryonic_ChIP_seq.gff_ID000002,26.68,2L,133,3342,0,modENCODE_5029,developmental stage,Embryo 8-16 h


In [24]:
happy = df.set_index(
    ['modENCODE_id','DB_identifier','score','chrom','start','end','strand','pivot']).pivot_table(
        columns='pivot', index=['DB_identifier','score','chrom','start','end','strand','modENCODE_id'], 
        values='name',aggfunc='first')

In [25]:
happy.reset_index().head()

pivot,DB_identifier,score,chrom,start,end,strand,modENCODE_id,antibody,cell line,developmental stage,strain,target gene
0,15T_8_16_NW_GFP.1,50.62,2L,470602,471025,0,modENCODE_3402,No Antibody Control,,Embryo 8-16 h,hairy-GFP-P[acman]-15T,
1,15T_8_16_NW_GFP.10,51.05,2L,3153151,3153697,0,modENCODE_3402,No Antibody Control,,Embryo 8-16 h,hairy-GFP-P[acman]-15T,
2,15T_8_16_NW_GFP.100,70.02,2R,4416316,4416839,0,modENCODE_3402,No Antibody Control,,Embryo 8-16 h,hairy-GFP-P[acman]-15T,
3,15T_8_16_NW_GFP.101,106.48,2R,4453214,4453566,0,modENCODE_3402,No Antibody Control,,Embryo 8-16 h,hairy-GFP-P[acman]-15T,
4,15T_8_16_NW_GFP.102,109.55,2R,5009424,5010118,0,modENCODE_3402,No Antibody Control,,Embryo 8-16 h,hairy-GFP-P[acman]-15T,


In [397]:
happy[(happy['cell line'] == 'S2-DRSC')].modENCODE_id.unique()


array(['modENCODE_4966', 'modENCODE_5007'], dtype=object)

In [405]:
happy[happy['cell line'] == 'None'].modENCODE_id.unique()

array([], dtype=object)

In [401]:
happy['cell line'].unique()

array([None, 'S2-DRSC', 'Kc167'], dtype=object)

In [26]:
happy.describe()

pivot,antibody,cell line,developmental stage,strain,target gene
count,269462,55513,213949,213949,42117
unique,24,2,11,15,7
top,GFP_ab290,Kc167,Embryo 0-8 h,yellow cinnabar brown speck,lola
freq,42117,50874,61198,146053,11515


In [27]:
happy=happy.reset_index()

In [379]:
happy.modENCODE_id.unique()

array(['modENCODE_3402', 'modENCODE_3401', 'modENCODE_3403',
       'modENCODE_3399', 'modENCODE_4962', 'modENCODE_4961',
       'modENCODE_4078', 'modENCODE_4966', 'modENCODE_4080',
       'modENCODE_4081', 'modENCODE_4082', 'modENCODE_4974',
       'modENCODE_3396', 'modENCODE_3395', 'modENCODE_3391',
       'modENCODE_3392', 'modENCODE_3390', 'modENCODE_3394',
       'modENCODE_3393', 'modENCODE_3398', 'modENCODE_4976',
       'modENCODE_4089', 'modENCODE_4982', 'modENCODE_4983',
       'modENCODE_4998', 'modENCODE_5000', 'modENCODE_5011',
       'modENCODE_4095', 'modENCODE_4096', 'modENCODE_3400',
       'modENCODE_5014', 'modENCODE_4103', 'modENCODE_5017',
       'modENCODE_4069', 'modENCODE_4068', 'modENCODE_5018',
       'modENCODE_4074', 'modENCODE_4107', 'modENCODE_5024',
       'modENCODE_5025', 'modENCODE_5023', 'modENCODE_4114',
       'modENCODE_4113', 'modENCODE_4109', 'modENCODE_4077',
       'modENCODE_4967', 'modENCODE_5006', 'modENCODE_5007',
       'modENCODE_5008',

In [29]:
happy.modENCODE_id.describe()

count             269462
unique                60
top       modENCODE_4974
freq               16117
Name: modENCODE_id, dtype: object

In [234]:
cnt = happy.groupby(['modENCODE_id']).agg({'start':'count'})
cnt.start.describe()

count       60.000000
mean      4491.033333
std       3982.430554
min        353.000000
25%       1481.750000
50%       3458.000000
75%       5292.750000
max      16117.000000
Name: start, dtype: float64

In [30]:
phantompeaks = pd.read_excel('/Users/bergeric/Downloads/gkv637_Supplementary_Data/Supplementary_table_3__List_of_Phantom_Peaks.xlsx')

In [31]:
phantom_overlap = pd.read_excel('/Users/bergeric/Downloads/gkv637_Supplementary_Data/Supplementary_table_5__Overlap_of_the_Phantom_Peaks_with_non-histone_modENCODE_ChIPSeq_profiles.xlsx', header=1)

In [32]:
phantom_overlap.head()

Unnamed: 0,Profile_file_name,Total peaks,Overlap,Non.overlapping,Percent,P.value <0.01,P.value <0.05,Description,Organism,Method,Type,Factor,Stage,Lab,modE_ID
0,3232_E0-12h_Sin3A_peaks.bed,4046,2519,1527,62.26,14.236283,54.770143,Sin3A;Embryos 0-12 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Sin3A,Embryos 0-12 hr,"White, K.",3232
1,3826_E16-24h_Fer3_peaks.bed.gff,2284,1416,868,62.0,18.345009,52.451839,Fer3;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Fer3,Embryos 16-24 hr,"White, K.",3826
2,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,trem,Adult Female,"White, K.",3399
3,3394_E16-24h_Kr-D2_peaks.bed.gff,353,228,125,64.59,28.895184,42.776204,Kruppel;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Kruppel,Embryos 16-24 hr,"White, K.",3394
4,3827_E8-16h_h_peaks.bed.gff,431,262,169,60.79,13.225058,39.675174,hairy;Embryos 8-16 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,hairy,Embryos 8-16 hr,"White, K.",3827


In [33]:
new_ids = []
for val in list(phantom_overlap.modE_ID):
    newval = 'modENCODE_'+str(val)
    new_ids.append(newval)

In [34]:
len(new_ids)

153

In [35]:
len(list(phantom_overlap.modE_ID))

153

In [36]:
phantom_overlap['modENCODE_id'] = new_ids

In [37]:
phantom_overlap.head()

Unnamed: 0,Profile_file_name,Total peaks,Overlap,Non.overlapping,Percent,P.value <0.01,P.value <0.05,Description,Organism,Method,Type,Factor,Stage,Lab,modE_ID,modENCODE_id
0,3232_E0-12h_Sin3A_peaks.bed,4046,2519,1527,62.26,14.236283,54.770143,Sin3A;Embryos 0-12 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Sin3A,Embryos 0-12 hr,"White, K.",3232,modENCODE_3232
1,3826_E16-24h_Fer3_peaks.bed.gff,2284,1416,868,62.0,18.345009,52.451839,Fer3;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Fer3,Embryos 16-24 hr,"White, K.",3826,modENCODE_3826
2,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,trem,Adult Female,"White, K.",3399,modENCODE_3399
3,3394_E16-24h_Kr-D2_peaks.bed.gff,353,228,125,64.59,28.895184,42.776204,Kruppel;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Kruppel,Embryos 16-24 hr,"White, K.",3394,modENCODE_3394
4,3827_E8-16h_h_peaks.bed.gff,431,262,169,60.79,13.225058,39.675174,hairy;Embryos 8-16 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,hairy,Embryos 8-16 hr,"White, K.",3827,modENCODE_3827


In [38]:
overlapping = happy.merge(phantom_overlap, on='modENCODE_id', how='inner')

In [39]:
phantom_overlap.shape

(153, 16)

In [40]:
overlapping.modENCODE_id.describe()

count              52460
unique                21
top       modENCODE_4982
freq                8958
Name: modENCODE_id, dtype: object

In [41]:
phantompeaks.head()

Unnamed: 0,Name,chr,start,end,strand
0,Phantom Peak-1,chr2L,18637,19089,+
1,Phantom Peak-2,chr2L,102118,102570,+
2,Phantom Peak-3,chr2L,107669,108121,+
3,Phantom Peak-4,chr2L,143051,143503,+
4,Phantom Peak-5,chr2L,155891,156343,+


In [42]:
phantom_overlap.modENCODE_id.describe()

count               153
unique              153
top       modENCODE_847
freq                  1
Name: modENCODE_id, dtype: object

In [43]:
phantom_overlap.merge(happy, on='modENCODE_id', how='left').head()

Unnamed: 0,Profile_file_name,Total peaks,Overlap,Non.overlapping,Percent,P.value <0.01,P.value <0.05,Description,Organism,Method,...,score,chrom,start,end,strand,antibody,cell line,developmental stage,strain,target gene
0,3232_E0-12h_Sin3A_peaks.bed,4046,2519,1527,62.26,14.236283,54.770143,Sin3A;Embryos 0-12 hr;ChIP-seq,D. melanogaster,ChIP-seq,...,,,,,,,,,,
1,3826_E16-24h_Fer3_peaks.bed.gff,2284,1416,868,62.0,18.345009,52.451839,Fer3;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,...,,,,,,,,,,
2,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,...,212.41,2L,72446.0,74074.0,0.0,No Antibody Control,,Adult Female,yellow cinnabar brown speck,
3,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,...,3100.0,2L,868493.0,874595.0,0.0,No Antibody Control,,Adult Female,yellow cinnabar brown speck,
4,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,...,54.01,2L,10333061.0,10335019.0,0.0,No Antibody Control,,Adult Female,yellow cinnabar brown speck,


In [44]:
count=0
for val in list(happy.modENCODE_id.unique()): 
    if val in list(phantom_overlap.modENCODE_id):
        count += 1 
print(count)

21


What fraction of peaks fall within promoter region? 

How many DamID modENCODE datasets are there?

Histone dataset

In [45]:
missing = []
for val in list(phantom_overlap.modENCODE_id):
    if val not in list(happy.modENCODE_id.unique()):
        missing.append(val)

In [46]:
len(missing)

132

In [47]:
blob = pd.Series(missing)

In [90]:
url = 'http://data.modencode.org/cgi-bin/cloud_list.pl?accessions=3954,3393,3806,3825,3231,2625,2626,2637,3403,4078,3240,4080,5068,4082,4081,3959,5069,2638,2639,3395,3235,4974,5008,5070,5071,5072,5577,3229,3230,3402,3401,2640,2641,2642,3234,3236,3239,3241,3400,3398,4976,3824,3826,4089,3809,3238,3397,5028,3814,3245,3830,4119,4981,5257,5073,5074,5595,5596,5075,4953,5258,5076,5077,5078,5079,5080,5081,5082,4946,5083,4951,4960,4120,5084,5578,844,845,834,837,838,839,835,836,840,841,895,842,843,5085,5086,5087,5088,5582,3807,3810,3811,3815,3955,5089,5583,3829,820,810&urls=1'
page = urlopen(url).read().decode('utf-8')
page = page.split('\n')

table = []
for line in page:
    if not line.startswith('#'):
        modid = 'modENCODE_'+str(line.split()[0]) #gives an error but still works? 
        link = line.split()[1]
        table.append([modid, link])

IndexError: list index out of range

In [91]:
download_urls = pd.DataFrame(table, columns=['modENCODE_id', 'link'])

In [92]:
download_urls.head()

Unnamed: 0,modENCODE_id,link
0,modENCODE_3954,ftp://data.modencode.org/all_files/dmel-interp...
1,modENCODE_3954,ftp://data.modencode.org/all_files/dmel-interp...
2,modENCODE_3954,ftp://data.modencode.org/all_files/dmel-interp...
3,modENCODE_3954,ftp://data.modencode.org/all_files/dmel-raw-6/...
4,modENCODE_3954,ftp://data.modencode.org/all_files/dmel-raw-6/...


In [93]:
len(download_urls.modENCODE_id.unique())

103

In [94]:
download_these = []
for val in list(download_urls.modENCODE_id.unique()):
    if val not in list(happy.modENCODE_id.unique()):
        download_these.append(val)        

In [95]:
len(download_these)

87

In [279]:
PATH = '../../data/modENCODE_downloads/'
os.makedirs(PATH, exist_ok=True)

for index, row in download_urls.iterrows(): 
    if row['modENCODE_id'] in list(download_these): 
        if 'gff' in row['link']: 
            urllib.request.urlretrieve(row['link'], PATH+row['modENCODE_id']+'.gff.gz')       

* Unzip all files using gunzip 
* Iterate to make a column with file name: 

In [None]:
#import zipfile
#with zipfile.ZipFile("file.zip","r") as zip_ref:
    #zip_ref.extractall("targetdir")

In [282]:
import glob
import os
for fname in glob.glob('../../data/modENCODE_downloads/modENCODE_*.gff'): 
    name = os.path.splitext(os.path.basename(fname))[0]
    df = pd.read_table(fname, header=None, comment='#')
    df[8] = 'ID='+name
    new = []
    for val in df[0]:
        newval = 'chr'+val
        new.append(newval)
    df[0] = new
    df.to_csv(fname, sep='\t', header=None, index=False)

In [283]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,chr2L,CisGenome,protein_binding_site,318353,319615,,,.,ID=modENCODE_895
1,chr2L,CisGenome,protein_binding_site,320068,323222,,,.,ID=modENCODE_895
2,chr2L,CisGenome,protein_binding_site,328033,329598,,,.,ID=modENCODE_895
3,chr2L,CisGenome,protein_binding_site,336128,337547,,,.,ID=modENCODE_895
4,chr2L,CisGenome,protein_binding_site,365844,384321,,,.,ID=modENCODE_895


### Bedtools intersect to get gene information: 
First I need to get my files as beds

In [284]:
import pybedtools
from pybedtools import BedTool
from pybedtools.featurefuncs import gff2bed

In [285]:
genes = BedTool('../../data/dmel-all-r6.12.gene_only.chr.gff')

In [286]:
genes_bed = genes.each(gff2bed, name_field='ID').saveas()

Fix gff files with issues: (thanks modENCODE) 

In [287]:
moden895=pd.read_table('../../data/modENCODE_downloads/modENCODE_895.gff', header=None)
moden895[5] = '.'
moden895[6] = '.'
moden895.to_csv('../../data/modENCODE_downloads/modENCODE_895.gff', sep='\t', header=None, index=False)

In [288]:
#problem file 5070: 
peak = pd.read_table('../../data/modENCODE_downloads/modENCODE_5070.gff', header=None)
peak[4] = peak[4].astype(int)
peak[3] = peak[3].astype(int) 
peak.to_csv('../../data/modENCODE_downloads/modENCODE_5070.gff', sep='\t', header=None, index=False)

In [289]:
#problem file 5084: 
peak = pd.read_table('../../data/modENCODE_downloads/modENCODE_5084.gff', header=None)
peak[4] = peak[4].astype(int)
peak[3] = peak[3].astype(int) 
peak.to_csv('../../data/modENCODE_downloads/modENCODE_5084.gff', sep='\t', header=None, index=False)

In [290]:
#problem file 5583: 
peak = pd.read_table('../../data/modENCODE_downloads/modENCODE_5583.gff', header=None)
peak[4] = peak[4].astype(int)
peak[3] = peak[3].astype(int) 
peak.to_csv('../../data/modENCODE_downloads/modENCODE_5583.gff', sep='\t', header=None, index=False)

**** NOTE!! ****

This file is truncated... I manually deleted the last line to get this to work: 

In [291]:
#problem file 844: 
peak = pd.read_table('../../data/modENCODE_downloads/modENCODE_844.gff', header=None)
peak[4] = peak[4].astype(int)
peak[3] = peak[3].astype(int) 
peak.to_csv('../../data/modENCODE_downloads/modENCODE_844.gff', sep='\t', header=None, index=False)

Then I can iterate over the gffs to get bed files: 

In [293]:
#iterate over modENCODE_downloads files to get beds for liftover
PATH = '../../data/modENCODE_downloads/'
concat = []
for fname in glob.glob('../../data/modENCODE_downloads/modENCODE_*.gff'): 
    peaks = BedTool(fname)
    peaks_bed = peaks.remove_invalid().each(gff2bed).saveas(PATH+os.path.splitext(os.path.basename(fname))[0]+'.bed')   

After I do gff2bed I do a liftover using script my_liftover

In [247]:
#need to figure out intersect step now that I have liftover files 
#think it might be easier to save giant bed file and do the intersect on that
#instead of in a loop? 

Here I read in all my bed files as dataframes and concat them all so that I can figure out some stats about the dataset: 

In [310]:
concat = []
for fname in glob.glob('../../data/modENCODE_downloads/modENCODE_*.liftover'):
    if os.path.getsize(fname) > 0 :
        df = pd.read_table(fname, header=None)
        concat.append(df)
    else:
        print(fname)

../../data/modENCODE_downloads/modENCODE_3807.liftover
../../data/modENCODE_downloads/modENCODE_3809.liftover
../../data/modENCODE_downloads/modENCODE_3810.liftover


In [309]:
bigdf = pd.concat(concat)
#save this matrix of bed files
bigdf.to_csv('../../data/modENCODE_downloads/modENCODE_allliftovers', sep='\t', header=None, index=False)
bigdf.head()

Unnamed: 0,0,1,2,3,4,5
0,chr2L,5596,5988,modENCODE_2625,.,.
1,chr2L,94554,95483,modENCODE_2625,.,.
2,chr2L,131539,132445,modENCODE_2625,.,.
3,chr2L,158233,159613,modENCODE_2625,.,.
4,chr2L,160302,162520,modENCODE_2625,.,.


In [381]:
bigdf.columns = ['chrom','start','end','id','blank1','blank2']

In [382]:
len(bigdf['id'].unique())

83

In [298]:
cnts = bigdf.groupby(['id']).agg({'start': 'count'})

In [299]:
cnts.start.describe()

count       86.000000
mean      7212.476744
std       5451.545026
min          1.000000
25%       3232.750000
50%       6102.500000
75%       9329.750000
max      23332.000000
Name: start, dtype: float64

In [300]:
intergenicdf = pd.read_table('../../data/dmel-all-r6.12.chr.intergenic.bed', header=None, names=['chrom','start','end'])

In [301]:
intergenicdf['length'] = (intergenicdf['end'] - intergenicdf['start'])

In [302]:
intergenicdf.length.describe()

count     12891.000000
mean       3534.168722
std       15765.843317
min           0.000000
25%         201.000000
50%         521.000000
75%        2024.000000
max      710734.000000
Name: length, dtype: float64

Now the intersect: 

In [312]:
#one giant bed intersect using big file "modENCODE_allliftovers" 
with open('../../data/modENCODE_downloads/modENCODE_allliftovers') as f: 
    peaks_bed = BedTool(f)
    intersect = genes_bed.intersect(peaks_bed, u=True).saveas()
    intdf = intersect.to_dataframe()

In [315]:
len(intdf.name.unique())

14431

In [412]:
intdf.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr2L,7528,9484,FBgn0031208,.,+
1,chr2L,9838,21376,FBgn0002121,.,-
2,chr2L,21822,25155,FBgn0031209,.,-
3,chr2L,21951,24237,FBgn0263584,.,+
4,chr2L,25401,65404,FBgn0051973,.,-


Also need an intersect  with the happy dataframe (which is the white lab data):

In [407]:
happy.to_csv('../../data/modENCODE_downloads/modENCODE_whitelab', sep='\t', header=None, index=False)

In [409]:
happypeaks = BedTool('../../data/modENCODE_downloads/modENCODE_whitelab')
#lose a lot of information during gff2bed step... 
happypeaks_bed = peaks.remove_invalid().each(gff2bed).saveas('../../data/modENCODE_downloads/modENCODE_whitelab.bed')

In [418]:
with open('../../data/modENCODE_downloads/modENCODE_whitelab.liftover') as f: 
    peaks_bed = BedTool(f)
    intersect = genes_bed.intersect(peaks_bed, u=True, wb=True).saveas()
    white = intersect.to_dataframe()

In [419]:
white.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,chr2L,305934,355566,FBgn0004611,.,+
1,chr2L,318361,319259,FBgn0031248,.,-
2,chr2L,320278,321248,FBgn0031249,.,-
3,chr2L,322100,323091,FBgn0053127,.,+
4,chr2L,325521,329761,FBgn0264086,.,-


In [416]:
allintersects = pd.concat([white, intdf])

In [422]:
allintersects.drop_duplicates().shape

(14431, 6)

In [421]:
intdf.shape

(14431, 6)

In [None]:
#Can merge back on start/end to get information lost... should probably do that before
#concat. Although maybe not if super redundant? 