## PHANTOM PEAKS

Here I begin to think about the conservative ChIP-seq matrix, which we're thinking will not have phantom peaks.  

In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools

Import table of phantom peak locations (from Jain et al 2015 Supplementary Data): 

In [2]:
phantompeaks = pd.read_excel('../../output/chip/gkv637_Supplementary_Data/Supplementary_table_3__List_of_Phantom_Peaks.xlsx')

In [3]:
phantompeaks.head()

Unnamed: 0,Name,chr,start,end,strand
0,Phantom Peak-1,chr2L,18637,19089,+
1,Phantom Peak-2,chr2L,102118,102570,+
2,Phantom Peak-3,chr2L,107669,108121,+
3,Phantom Peak-4,chr2L,143051,143503,+
4,Phantom Peak-5,chr2L,155891,156343,+


In [4]:
columns = ['DB_id', 'score', 'chrom', 'start', 'end', 'strand', 'modENCODE_id',
       'peak_location_fbgn',
       'peak_location_symbol', 'antibody', 'cell line', 'developmental stage',
       'strain', 'target gene','tissue', 'TF', 'TF_fbgn']

Import my modENCODE data table: 

In [5]:
modENCODE = pd.read_table('../../output/chip/modENCODE_downloads_2ndtry/modENCODE_finaltable', header=None, names= columns)

  interactivity=interactivity, compiler=compiler, result=result)


In [51]:
modENCODE.head()

Unnamed: 0,DB_id,score,chrom,start,end,strand,modENCODE_id,peak_location_fbgn,peak_location_symbol,antibody,cell line,developmental stage,strain,target gene,tissue,TF,TF_fbgn
0,10T_E16_24h_GFP.1,.,chr2L,516190,516870,+,modENCODE_3229,FBgn0003963,ush,No Antibody Control,,Embryo 16-24 h,FlyStrain:10T-DFD-GFP:KW:1&oldid=39548,,,eGFP,
1,10T_E16_24h_GFP.10,.,chr2L,3124123,3125019,-,modENCODE_3229,FBgn0015600,toc,No Antibody Control,,Embryo 16-24 h,FlyStrain:10T-DFD-GFP:KW:1&oldid=39548,,,eGFP,
2,10T_E16_24h_GFP.100,.,chr2L,21306821,21307495,-,modENCODE_3229,FBgn0032940,Mio,No Antibody Control,,Embryo 16-24 h,FlyStrain:10T-DFD-GFP:KW:1&oldid=39548,,,eGFP,
3,10T_E16_24h_GFP.11,.,chr2L,3301741,3302706,-,modENCODE_3229,FBgn0031516,CG9663,No Antibody Control,,Embryo 16-24 h,FlyStrain:10T-DFD-GFP:KW:1&oldid=39548,,,eGFP,
4,10T_E16_24h_GFP.113,.,chr2R,1734511,1735276,+,modENCODE_3229,FBgn0087011,CG41520,No Antibody Control,,Embryo 16-24 h,FlyStrain:10T-DFD-GFP:KW:1&oldid=39548,,,eGFP,


Perform intersect to obtain phantom peak overlap: 

In [8]:
mod_bed = pybedtools.BedTool.from_dataframe(modENCODE[['chrom','start','end','modENCODE_id']])

In [9]:
intersect = mod_bed.intersect(pybedtools.BedTool.from_dataframe(phantompeaks[['chr ','start','end','Name']]), wo=True).to_dataframe()

In [10]:
intersect.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,thickEnd,itemRgb
0,chr2R,4998091,4999786,modENCODE_3229,chr2R,4998039,4998491,Phantom Peak-647,400
1,chr2R,13477753,13478853,modENCODE_3229,chr2R,13478116,13478599,Phantom Peak-913,483
2,chr3L,6736743,6737742,modENCODE_3229,chr3L,6736750,6737038,Phantom Peak-1287,288
3,chr3L,16858482,16858712,modENCODE_3229,chr3L,16858127,16858579,Phantom Peak-1532,97
4,chr3L,16858482,16859111,modENCODE_3229,chr3L,16858127,16858579,Phantom Peak-1532,97


In [11]:
len(intersect.name.unique())

150

In [12]:
intersect.shape

(190567, 9)

In [13]:
len(intersect.thickEnd.unique())

2254

In Jain et al 2014, an overlap of at least 50bp is requested. Filter for this:  

In [14]:
#intersect with overlap more than 50: 
filtered = intersect[intersect.itemRgb >= 50]

In [16]:
filtered.shape

(178125, 9)

Import table of phantom peak overlap with modENCODE from paper: 

In [34]:
phantom_overlap = pd.read_excel('../../data/chip/gkv637_Supplementary_Data/Supplementary_table_5__Overlap_of_the_Phantom_Peaks_with_non-histone_modENCODE_ChIPSeq_profiles.xlsx', header=1)
phantom_overlap.head()

Unnamed: 0,Profile_file_name,Total peaks,Overlap,Non.overlapping,Percent,P.value <0.01,P.value <0.05,Description,Organism,Method,Type,Factor,Stage,Lab,modE_ID
0,3232_E0-12h_Sin3A_peaks.bed,4046,2519,1527,62.26,14.236283,54.770143,Sin3A;Embryos 0-12 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Sin3A,Embryos 0-12 hr,"White, K.",3232
1,3826_E16-24h_Fer3_peaks.bed.gff,2284,1416,868,62.0,18.345009,52.451839,Fer3;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Fer3,Embryos 16-24 hr,"White, K.",3826
2,3399_A.Female_Trem_peaks.bed.gff,1213,886,327,73.04,14.674361,49.95878,trem;Adult Female;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,trem,Adult Female,"White, K.",3399
3,3394_E16-24h_Kr-D2_peaks.bed.gff,353,228,125,64.59,28.895184,42.776204,Kruppel;Embryos 16-24 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,Kruppel,Embryos 16-24 hr,"White, K.",3394
4,3827_E8-16h_h_peaks.bed.gff,431,262,169,60.79,13.225058,39.675174,hairy;Embryos 8-16 hr;ChIP-seq,D. melanogaster,ChIP-seq,Transcriptional Factor,hairy,Embryos 8-16 hr,"White, K.",3827


In [19]:
number_overlapping = phantom_overlap.Overlap.values

In [20]:
sum(number_overlapping)

189036

There is a difference of about 10,000 in the amount of phantom peaks the paper found overlapping with modENCODE and the amount I have overlapping with my dataset. Look into this a little further: 

In [17]:
modENCODE_sets_fromme = filtered.name.unique()

In [22]:
modE_ID_from_paper = phantom_overlap.modE_ID.unique()

In [23]:
modENCODE_sets_frompaper = ['modENCODE_'+str(x) for x in modE_ID_from_paper]

In [24]:
for x in list(modENCODE_sets_frompaper):
    if x not in list(modENCODE_sets_fromme):
        print(x)

modENCODE_3826
modENCODE_3827
modENCODE_3954
modENCODE_4351
modENCODE_2755
modENCODE_3253
modENCODE_2753
modENCODE_2979
modENCODE_2754
modENCODE_2635
modENCODE_3625
modENCODE_2632
modENCODE_2783
modENCODE_3233
modENCODE_3247
modENCODE_3825
modENCODE_3806
modENCODE_3231
modENCODE_3251
modENCODE_2631
modENCODE_3236
modENCODE_4193
modENCODE_2628
modENCODE_2625
modENCODE_3824
modENCODE_3397
modENCODE_2640
modENCODE_3235
modENCODE_3823
modENCODE_2637


In [25]:
no_data = ['2635', '2625', '2637', '2631', '3806', '2632', '3827', '3826', '3235', '3236', '3823', '3824', '3825', '2640', '3231', '3397', '3233', '2628'] 

In [26]:
modENCODE_no_data = ['modENCODE_'+str(x) for x in no_data]

In [28]:
downloads=[3393,3806,3825,3231,2625,2626,2637,3403,4078,3240,4080,5068,4082,4081,3959,5069,2638,2639,3395,3235,4974,5008,5070,5071,5072,5577,3229,3230,3402,3401,2640,2641,2642,3234,3236,3239,3241,3400,3398,4976,3824,3826,4089,3809,3238,3397,5028,3814,3245,3830,4119,4981,5257,3815,4982,3827,2629,2630,2635,2636,2633,2634,2627,2628,2631,2632,3391,3392,4091,3956,4936,5592,5111,5110,5590,5587,5112,5591,5264,5113,5593,4944,3390,5023,4998,4070,3242,5114,5575,5115,5594,3243,3394,3812,5004,5005,5116,5597,5574,5117,4094,5118,854,858,3233,862,863,856,849,857,855,985,859,860,861,5119,4074,4107,3957,3237,3813,3816,5120,5121,5598,4095,4096,3960,3808,4098,5576,5570,5579,5571,984,850,848,4352,5580,5569,5122,5123,5124,846,847,851,852,853,5125,5126,5599,4099,4071,3232,5568,5014,4103,5017,4069,4104,3958,4716,4105,4943,5127,5606,3396,5128,5024,5025,3399,4114,4113,3823,5029,5129]

In [29]:
modENCODE_downloads = ['modENCODE_'+str(x) for x in downloads]

In [30]:
try_adding = []
for x in list(modENCODE_sets_frompaper):
    if x not in list(modENCODE_sets_fromme):
        if x not in list(modENCODE_no_data):
            if x not in list(modENCODE_downloads):
                try_adding.append(x)

In [31]:
for x in list(modENCODE_no_data):
    if x in list(modENCODE_sets_frompaper):
        print(x)

modENCODE_2635
modENCODE_2625
modENCODE_2637
modENCODE_2631
modENCODE_3806
modENCODE_2632
modENCODE_3827
modENCODE_3826
modENCODE_3235
modENCODE_3236
modENCODE_3823
modENCODE_3824
modENCODE_3825
modENCODE_2640
modENCODE_3231
modENCODE_3397
modENCODE_3233
modENCODE_2628


In [33]:
#Try adding to download pipeline and see if theres data?
try_adding 
#Nevermind! 
#3954 doesn't exist? 
#4351, 2755, 3253, 2753, 2979, 2754, 3625, 2783, 3247, 3251, 4193 protein binding

['modENCODE_3954',
 'modENCODE_4351',
 'modENCODE_2755',
 'modENCODE_3253',
 'modENCODE_2753',
 'modENCODE_2979',
 'modENCODE_2754',
 'modENCODE_3625',
 'modENCODE_2783',
 'modENCODE_3247',
 'modENCODE_3251',
 'modENCODE_4193']

### Remove phantom peaks: 

In [49]:
#Next step = get rid of phantom peaks to make stricter data set ?
filtered.shape

(178125, 9)

In [47]:
#worried I lose too much if I do a reverse intersect? Maybe ask Justin about this but trying to figure out if
#pandas has a better way

outermerge = modENCODE.merge(filtered, how='outer', on=['start','end'], indicator=True)
no_phantom = outermerge[outermerge._merge == 'left_only']

In [48]:
no_phantom.shape

(1552415, 25)

In [None]:
#losing around 200,000 == good !! 