# Build TF ChIP Binary Matrix

Goal: 2 binary matrices, 1 for TFs and 1 for histone modifications. Going to work on TF here and then repeat same process for histones

Need to do: 
- get TF information: 
    - see how well the symbol map works 
- collapse to gene level (if peak overlaps the region count it)
    - need bed of regions I want/which gene they are in


In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

In [2]:
tf = pd.read_table('../output/chip/ALL_TF_CHIP_filtered.bed', header=None, 
                   names=['chrom','start','end','srx','score','caller'])

In [3]:
spreadsheet = pd.read_csv('../output/chip/20171103_s2cell_chip-seq.csv')
#For now we are excluding datasets with no input: 
spreadsheet = spreadsheet[spreadsheet.input != 'no input?']
antibody_table = spreadsheet[['srx','target']]

In [4]:
tf2 = tf.merge(antibody_table, on='srx',how='left')

In [5]:
tf2.head()

Unnamed: 0,chrom,start,end,srx,score,caller,target
0,chr2L,66743,67064,SRX017462,3.74271,macs2,Orc2
1,chr2L,72379,72633,SRX017462,2.55253,macs2,Orc2
2,chr2L,73097,73294,SRX017462,5.32918,macs2,Orc2
3,chr2L,73334,73505,SRX017462,3.6456,macs2,Orc2
4,chr2L,120707,120958,SRX017462,8.2708,macs2,Orc2


In [6]:
len(tf2.srx.unique()), len(tf2.target.unique())

(121, 58)

### Obtain antibody fbgn: 

In [7]:
#bed file containing introns and 1 kb upstream
intslop = pybedtools.BedTool('../output/dm6_intron_sloptranscript.bed')

In [8]:
tf_intersect = pybedtools.BedTool.from_dataframe(tf2).intersect(intslop).to_dataframe()

In [9]:
tf_intersect.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart
0,chr2L,67003,67042,SRX017462,3.74271,macs2,Orc2
1,chr2L,129245,129351,SRX017462,5.97573,macs2,Orc2
2,chr2L,132021,132076,SRX017462,5.55793,macs2,Orc2
3,chr2L,166651,166846,SRX017462,9.40102,macs2,Orc2
4,chr2L,247625,247858,SRX017462,5.32918,macs2,Orc2


In [10]:
#symbol maps map gene symbol to FBgn: 
symbolmap = pd.read_table('/data/LCDB/lcdb-references/dmel/r6-11/gtf/dmel_r6-11.SYMBOL.csv', sep=',') 
symbolmap2 = pd.read_table('../data/fb_synonym.tsv', sep=' ', header=None)
symbolmap2.columns = ['gene','a','b']

In [11]:
#make copy df
tf_intersect_copy = tf_intersect.copy()

In [12]:
tf_intersect_copy.loc[tf_intersect.thickStart == 'HP1a','thickStart'] = 'Su(var)205'
tf_intersect_copy.loc[tf_intersect.thickStart == 'Hp1a','thickStart'] = 'Su(var)205'
tf_intersect_copy.loc[tf_intersect.thickStart == 'CP190','thickStart'] = 'Cp190'
tf_intersect_copy.loc[tf_intersect.thickStart == 'CG8436','thickStart'] = 'Ibf1'
tf_intersect_copy.loc[tf_intersect.thickStart == 'CG9740','thickStart'] = 'Ibf2'
tf_intersect_copy.loc[tf_intersect.thickStart == 'NSL3','thickStart'] = 'Rcd1'
tf_intersect_copy.loc[tf_intersect.thickStart == 'UTX','thickStart'] = 'Utx'
tf_intersect_copy.loc[tf_intersect.thickStart == 'LPT','thickStart'] = 'Lpt'
tf_intersect_copy.loc[tf_intersect.thickStart == 'Trr','thickStart'] = 'trr'
tf_intersect_copy.loc[tf_intersect.thickStart == 'dCAP-D3','thickStart'] = 'Cap-D3'
tf_intersect_copy.loc[tf_intersect.thickStart == 'DnaJ1','thickStart'] = 'DnaJ-1'
tf_intersect_copy.loc[tf_intersect.thickStart == 'MYST5','thickStart'] = 'CG1894'
tf_intersect_copy.loc[tf_intersect.thickStart == 'ZIPIC','thickStart'] = 'CG7928'

In [13]:
merge1 = tf_intersect_copy.merge(symbolmap, left_on='thickStart', right_on='SYMBOL', how='left')
merge2 = merge1.merge(symbolmap2, left_on='thickStart', right_on='a', how='left')[['chrom','start','end','name',
                                                                'score','strand','thickStart','ENSEMBL','gene']]

In [14]:
merge2.fillna('')
merge2['anti_FBgn']= merge2.ENSEMBL.combine_first(merge2.gene)
trim = merge2[['chrom','start','end','name','score','strand','thickStart','anti_FBgn']]

In [15]:
#drop these because they aren't fly genes (except for Ph but it wasn't specified which ph)
trim[trim.anti_FBgn.isnull()].thickStart.unique()

array(['Rpb1', 'FLAG', 'GFP', 'Ph', 'control', 'Rpb3', 'control ', 'Rbp3'], dtype=object)

In [16]:
drop_bad_antibodies = trim[~trim.anti_FBgn.isnull()]

## Filter for RNAi TFs

In [17]:
drop_bad_antibodies.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,anti_FBgn
0,chr2L,67003,67042,SRX017462,3.74271,macs2,Orc2,FBgn0015270
1,chr2L,129245,129351,SRX017462,5.97573,macs2,Orc2,FBgn0015270
2,chr2L,132021,132076,SRX017462,5.55793,macs2,Orc2,FBgn0015270
3,chr2L,166651,166846,SRX017462,9.40102,macs2,Orc2,FBgn0015270
4,chr2L,247625,247858,SRX017462,5.32918,macs2,Orc2,FBgn0015270


In [18]:
#Remember filter for RNAi TFs 
TF_list = pd.read_table('../output/list_of_tfs.txt', header=None)
TF_list.columns=['TF']

In [35]:
TF_list.shape

(488, 1)

In [19]:
merge_on_our_TFs = drop_bad_antibodies.merge(TF_list, left_on='anti_FBgn', right_on='TF', how='inner')

In [20]:
merge_on_our_TFs.head()

Unnamed: 0,chrom,start,end,name,score,strand,thickStart,anti_FBgn,TF
0,chr2L,6528,7316,SRX885700,62.80834,macs2,Scm,FBgn0003334,FBgn0003334
1,chr2L,7432,7528,SRX885700,8.14595,macs2,Scm,FBgn0003334,FBgn0003334
2,chr2L,8116,8192,SRX885700,8.14595,macs2,Scm,FBgn0003334,FBgn0003334
3,chr2L,9484,9612,SRX885700,4.98406,macs2,Scm,FBgn0003334,FBgn0003334
4,chr2L,66242,66317,SRX885700,2.59312,macs2,Scm,FBgn0003334,FBgn0003334


In [21]:
len(merge_on_our_TFs.thickStart.unique())

21

## Target gene intersect: 

In [22]:
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [23]:
targene_intersect = gene_info.intersect(pybedtools.BedTool.from_dataframe(merge_on_our_TFs), 
                                        wb=True).saveas().to_dataframe()[[3,6,7,8,9,10,11,12,13]]

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 15 fields; you can supply custom names with the `names` kwarg
  % (self.file_type, _names, self.field_count()))


In [24]:
targene_intersect.columns = ['target_gene','chrom','start','end','srx','log10qval','caller','TF','TF_fbgn']

In [25]:
targene_intersect.head()

Unnamed: 0,target_gene,chrom,start,end,srx,log10qval,caller,TF,TF_fbgn
0,FBgn0031208,chr2L,8116,8192,SRX885700,8.14595,macs2,Scm,FBgn0003334
1,FBgn0031208,chr2L,8116,8192,SRX885698,58.31385,macs2,Scm,FBgn0003334
2,FBgn0031208,chr2L,8116,8192,SRX097617,136.58838,macs2,lilli,FBgn0041111
3,FBgn0031208,chr2L,8116,8192,SRX097617,3.405688,spp,lilli,FBgn0041111
4,FBgn0031208,chr2L,8116,8192,SRX097617,3.405688,spp,lilli,FBgn0041111


## Collapse to binary: 
- New matrix w/no duplicates

In [26]:
#If peak in gene region count it as a 1 
#only need gene,TF_fbgn
binary_collapse = targene_intersect[['target_gene','TF_fbgn']]

In [27]:
binary_collapse['binary'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [28]:
binary_collapse.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [29]:
#index can't have duplicate entries so I need to condense this information down
binary_collapse.set_index(['target_gene','TF_fbgn'], inplace=True)
matrix = binary_collapse.unstack()

In [30]:
matrix.fillna(value=0).to_csv('../output/chip/tf_matrix', sep='\t')

In [31]:
matrix.head()

Unnamed: 0_level_0,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary,binary
TF_fbgn,FBgn0000283,FBgn0001206,FBgn0002775,FBgn0003042,FBgn0003334,FBgn0003567,FBgn0003607,FBgn0010328,FBgn0015602,FBgn0020388,...,FBgn0033998,FBgn0034878,FBgn0037746,FBgn0038016,FBgn0039019,FBgn0039740,FBgn0041111,FBgn0259785,FBgn0262656,FBgn0263667
target_gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
FBgn0000008,,1.0,,,1.0,,,1.0,,1.0,...,1.0,,,1.0,,1.0,1.0,,,
FBgn0000014,1.0,,,1.0,1.0,1.0,,,,,...,,,,,,1.0,1.0,,,1.0
FBgn0000015,1.0,1.0,,1.0,1.0,1.0,,1.0,,1.0,...,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0
FBgn0000017,1.0,1.0,1.0,1.0,,1.0,,1.0,,1.0,...,1.0,,1.0,1.0,,1.0,1.0,1.0,,1.0
FBgn0000018,,,1.0,,,,,,,,...,,,,,,,,,,


In [42]:
matrix.sum(axis=0)

        TF_fbgn    
binary  FBgn0000283    5365.0
        FBgn0001206    1498.0
        FBgn0002775    5072.0
        FBgn0003042    1770.0
        FBgn0003334    2321.0
        FBgn0003567    4903.0
        FBgn0003607    1139.0
        FBgn0010328    3702.0
        FBgn0015602    1041.0
        FBgn0020388    3343.0
        FBgn0023518    1510.0
        FBgn0033998    6304.0
        FBgn0034878    1336.0
        FBgn0037746    1772.0
        FBgn0038016    5860.0
        FBgn0039019    1544.0
        FBgn0039740    3912.0
        FBgn0041111    2104.0
        FBgn0259785    5535.0
        FBgn0262656     534.0
        FBgn0263667    3459.0
dtype: float64

In [45]:
row = targene_intersect[targene_intersect.TF_fbgn == 'FBgn0033998']
row.head()

Unnamed: 0,target_gene,chrom,start,end,srx,log10qval,caller,TF,TF_fbgn
103,FBgn0002121,chr2L,17515,17676,SRX326966,2.69786,macs2,row,FBgn0033998
104,FBgn0002121,chr2L,17769,18025,SRX326966,50.72078,macs2,row,FBgn0033998
105,FBgn0002121,chr2L,18168,18260,SRX326966,50.72078,macs2,row,FBgn0033998
106,FBgn0002121,chr2L,20391,20830,SRX326966,31.25111,macs2,row,FBgn0033998
107,FBgn0002121,chr2L,20973,21065,SRX326966,31.25111,macs2,row,FBgn0033998


In [46]:
row.srx.unique()

array(['SRX326966'], dtype=object)

In [47]:
row.caller.unique()

array(['macs2', 'spp'], dtype=object)

In [48]:
row[['chrom','start','end']].to_csv('../output/chip/row.bed', header=None, index=False, sep='\t')