# Build Histone ChIP Binary Matrix

Goal: binary matrix TF/gene for Histones

Need to do: 
- get antibody information 
- collapse to gene level: if histone is in a gene


In [2]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

In [3]:
hist = pd.read_table('../output/chip/ALL_HIST_CHIP_filtered.bed', header=None,
                     names=['chrom','start','end','srx','score','caller'])

In [4]:
spreadsheet = pd.read_csv('../output/chip/20171103_s2cell_chip-seq.csv')
#For now we are excluding datasets with no input: 
spreadsheet = spreadsheet[spreadsheet.input != 'no input?']
antibody_table = spreadsheet[['srx','target']]

In [5]:
hist2 = hist.merge(antibody_table, on='srx', how='left')

In [6]:
hist2.head()

Unnamed: 0,chrom,start,end,srx,score,caller,target
0,chr2L,16617,16893,SRX191913,35.59567,macs2,CTCF
1,chr2L,21242,21484,SRX191913,9.77751,macs2,CTCF
2,chr2L,34118,34304,SRX191913,5.98254,macs2,CTCF
3,chr2L,35499,35830,SRX191913,41.82674,macs2,CTCF
4,chr2L,43248,43463,SRX191913,5.26053,macs2,CTCF


In [7]:
len(hist2.srx.unique()), len(hist2.target.unique())

(37, 8)

### Restrict region (?)

In [8]:
diff = pybedtools.BedTool('../output/dm6_diff.bed')
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [9]:
#make bed file for gene body and 1 kb upstream
geneslop = diff.cat(gene_info).saveas('../output/dm6_gene_sloptranscript.bed')

In [10]:
intersect = pybedtools.BedTool.from_dataframe(hist2).intersect(geneslop).to_dataframe()

In [11]:
intersect.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart
0,chr2L,16617,16893,SRX191913,35.59567,macs2,CTCF
1,chr2L,21242,21484,SRX191913,9.77751,macs2,CTCF
2,chr2L,34118,34304,SRX191913,5.98254,macs2,CTCF
3,chr2L,35499,35830,SRX191913,41.82674,macs2,CTCF
4,chr2L,43248,43463,SRX191913,5.26053,macs2,CTCF


## Target gene intersect: 

In [12]:
targene_intersect = gene_info.intersect(pybedtools.BedTool.from_dataframe(intersect), 
                                        wb=True).saveas().to_dataframe()[[3,6,7,8,9,10,11,12]]

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 13 fields; you can supply custom names with the `names` kwarg
  % (self.file_type, _names, self.field_count()))


In [13]:
targene_intersect.columns = ['target_gene','chrom','start','end','srx','log10qval','caller','antibody']

In [14]:
targene_intersect.head()

Unnamed: 0,target_gene,chrom,start,end,srx,log10qval,caller,antibody
0,FBgn0031208,chr2L,6528,11179,SRX193335,43.21521,macs2,H3K4me1
1,FBgn0031208,chr2L,6528,10460,SRX193336,39.33482,macs2,H3K4me1
2,FBgn0031208,chr2L,6528,10035,SRX193321,20.15121,macs2,H3K4me1
3,FBgn0031208,chr2L,7440,9409,SRX193334,5.01516,macs2,H3K27me3
4,FBgn0031208,chr2L,7525,9406,SRX193320,4.28135,macs2,H3K27me3


In [15]:
targene_intersect.to_csv('../output/chip/hist_chip_targeneintersect', sep='\t', index=False)

## Collapse to binary: 
- New matrix w/no duplicates

In [14]:
#If peak in gene region count it as a 1 
#only need gene,TF_fbgn
binary_collapse = targene_intersect[['target_gene','antibody']]

In [15]:
binary_collapse['binary'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
binary_collapse.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
#index can't have duplicates
binary_collapse.set_index(['target_gene','antibody'], inplace=True)
matrix = binary_collapse.unstack()

In [18]:
matrix.fillna(value=0).to_csv('../output/chip/histone_matrix', sep='\t')

In [24]:
matrix.head()

Unnamed: 0_level_0,binary,binary,binary,binary,binary,binary,binary,binary
antibody,CTCF,H3K27ac,H3K27me3,H3K36me3,H3K4me1,H3K4me3,H3K9me2,His2Av
target_gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FBgn0000003,1.0,,,,,,,
FBgn0000008,1.0,,1.0,1.0,1.0,,,1.0
FBgn0000014,1.0,,1.0,,1.0,,,
FBgn0000015,1.0,,1.0,1.0,1.0,,,
FBgn0000017,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [25]:
#split into activating or repressing 
# H3K27ac, H3K36me3, H3K4me1, H3K4me3 = activating
# H3K27me3, H3K9me2 = repressing
# CTCF, His2Av = both

activating = matrix['binary'][['H3K27ac', 'H3K36me3','H3K4me1','H3K4me3']]
repressing = matrix['binary'][['H3K27me3','H3K9me2']]
both = matrix['binary'][['CTCF','His2Av']]

In [27]:
activating.fillna(value=0).to_csv('../output/chip/activating_histone_matrix', sep='\t')

In [29]:
repressing.fillna(value=0).to_csv('../output/chip/repressing_histone_matrix', sep='\t')

In [31]:
both.fillna(value=0).to_csv('../output/chip/both_histone_matrix', sep='\t')