# Build Histone ChIP Binary Matrix

Goal: binary matrix TF/gene for Histones

Need to do: 
- get antibody information 
- collapse to gene level: if histone is in a gene


In [3]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

In [4]:
hist = pd.read_table('../output/chip/ALL_HIST_CHIP_filtered.bed', header=None,
                     names=['chrom','start','end','srx','score','caller'])

In [5]:
spreadsheet = pd.read_csv('../output/chip/20171103_s2cell_chip-seq.csv')
#For now we are excluding datasets with no input: 
spreadsheet = spreadsheet[spreadsheet.input != 'no input?']
antibody_table = spreadsheet[['srx','target']]

In [6]:
hist2 = hist.merge(antibody_table, on='srx', how='left')

In [7]:
hist2.head()

Unnamed: 0,chrom,start,end,srx,score,caller,target
0,chr2L,16617,16893,SRX191913,33.40816,macs2,CTCF
1,chr2L,21242,21484,SRX191913,7.92056,macs2,CTCF
2,chr2L,34118,34304,SRX191913,4.27362,macs2,CTCF
3,chr2L,35499,35830,SRX191913,39.60189,macs2,CTCF
4,chr2L,43248,43463,SRX191913,3.57781,macs2,CTCF


In [8]:
len(hist2.srx.unique()), len(hist2.target.unique())

(33, 8)

### Restrict region (?)

In [9]:
#bed file containing introns and 1 kb upstream
intslop = pybedtools.BedTool('../output/dm6_intron_sloptranscript.bed')

In [10]:
intersect = pybedtools.BedTool.from_dataframe(hist2).intersect(intslop).to_dataframe()

In [11]:
intersect.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart
0,chr2L,16617,16893,SRX191913,33.40816,macs2,CTCF
1,chr2L,21376,21484,SRX191913,7.92056,macs2,CTCF
2,chr2L,34288,34304,SRX191913,4.27362,macs2,CTCF
3,chr2L,35499,35745,SRX191913,39.60189,macs2,CTCF
4,chr2L,43248,43463,SRX191913,3.57781,macs2,CTCF


## Target gene intersect: 

In [12]:
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [14]:
targene_intersect = gene_info.intersect(pybedtools.BedTool.from_dataframe(intersect), 
                                        wb=True).saveas().to_dataframe()[[3,6,7,8,9,10,11,12]]

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 13 fields; you can supply custom names with the `names` kwarg
  % (self.file_type, _names, self.field_count()))


In [15]:
targene_intersect.columns = ['target_gene','chrom','start','end','srx','log10qval','caller','antibody']

In [16]:
targene_intersect.head()

Unnamed: 0,target_gene,chrom,start,end,srx,log10qval,caller,antibody
0,FBgn0031208,chr2L,8116,8192,SRX193335,40.9803,macs2,H3K4me1
1,FBgn0031208,chr2L,8116,8192,SRX193336,37.41729,macs2,H3K4me1
2,FBgn0031208,chr2L,8116,8192,SRX193321,18.49746,macs2,H3K4me1
3,FBgn0031208,chr2L,8116,8192,SRX193334,3.58543,macs2,H3K27me3
4,FBgn0031208,chr2L,8116,8192,SRX193320,2.79066,macs2,H3K27me3


## Collapse to binary: 
- New matrix w/no duplicates

In [23]:
#If peak in gene region count it as a 1 
#only need gene,TF_fbgn
binary_collapse = targene_intersect[['target_gene','antibody']]

In [24]:
binary_collapse['binary'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [25]:
binary_collapse.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [26]:
#index can't have duplicates
binary_collapse.set_index(['target_gene','antibody'], inplace=True)
matrix = binary_collapse.unstack()

In [29]:
matrix.fillna(value=0).to_csv('../output/chip/histone_matrix', sep='\t')