# Make BED files (TF motifs, TF ChIP peaks, histone ChIP peaks) for the Fly Meeting

In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

### MOTIFS:

In [2]:
# Load the merged TF motif hits. The file has no header row, so the six
# column names are supplied here.
df = pd.read_csv(
    '../output/motif/tf_merge.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'pval', 'motif_symbol', 'motif_FBgn'],
)

In [3]:
# Filter for RNAi TFs: read the headerless TF list, name its single column
# 'TF', and keep only the motif rows whose motif_FBgn appears in that list.
TF_list = pd.read_csv('../output/list_of_tfs.txt', sep='\t', header=None)
TF_list.columns = ['TF']

merge_on_our_TFs = pd.merge(
    df, TF_list, how='inner', left_on='motif_FBgn', right_on='TF'
)

In [4]:
# Keep only the columns that go into the BED file, in output order:
# coordinates, then motif FBgn, then pval.
bed = merge_on_our_TFs.loc[:, ['chrom', 'start', 'end', 'motif_FBgn', 'pval']]

In [5]:
# Preview the first five rows of the motif table.
bed.head(n=5)

Unnamed: 0,chrom,start,end,motif_FBgn,pval
0,chr2L,6782,6789,FBgn0283451,3.6e-05
1,chr2L,17019,17026,FBgn0283451,3.6e-05
2,chr2L,19846,19853,FBgn0283451,3.6e-05
3,chr2L,29403,29410,FBgn0283451,3.6e-05
4,chr2L,30047,30054,FBgn0283451,3.6e-05


In [6]:
# Number of distinct motif FBgn values in the table (a missing value, if
# present, counts as one).
bed['motif_FBgn'].nunique(dropna=False)

115

In [8]:
# Write the motif table as tab-separated text with no index column and no
# column-name row. `header=False` is the documented way to suppress the
# header; `header=None` only worked because None happens to be falsy.
# NOTE(review): the fifth column holds float p-values, while the UCSC BED
# description gives the score field as an integer 0-1000 -- confirm the
# viewer these files are meant for accepts this.
bed.to_csv('../../datashare/tf_motifs.bed', sep='\t', index=False, header=False)
# Note: the BED header line is not written here. It has to be formatted as
# "track name=" and is added to the file by hand afterwards.

### CHIP:

In [2]:
# Load the filtered TF ChIP peaks. The file has no header row, so the six
# column names are supplied here.
tf = pd.read_csv(
    '../output/chip/ALL_TF_CHIP_filtered.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'srx', 'score', 'caller'],
)

In [9]:
# The spreadsheet contains the antibody information for each dataset.
spreadsheet = pd.read_csv('../output/chip/20171103_s2cell_chip-seq.csv')
# For now, datasets whose input column reads 'no input?' are excluded.
spreadsheet = spreadsheet.loc[spreadsheet['input'] != 'no input?']
# Two-column lookup from SRX accession to antibody target.
antibody_table = spreadsheet.loc[:, ['srx', 'target']]

In [10]:
# Two lookup tables used to map a gene symbol to its FBgn.
# The first is a comma-separated file with a header row.
symbolmap = pd.read_csv('/data/LCDB/lcdb-references/dmel/r6-11/gtf/dmel_r6-11.SYMBOL.csv')
# The second has no header row; its three columns are named here.
# NOTE(review): the file is named .tsv but is parsed with a single space as
# the separator -- confirm this matches the file's actual layout.
symbolmap2 = pd.read_csv('../data/fb_synonym.tsv', sep=' ', header=None)
symbolmap2.columns = ['gene', 'a', 'b']

In [11]:
# Merge on the spreadsheet to get the antibody symbol ('target') for each peak.
tf2 = tf.merge(antibody_table, on='srx', how='left')

# BED file containing introns and 1 kb upstream.
intslop = pybedtools.BedTool('../output/dm6_intron_sloptranscript.bed')

# Intersect the peaks with those regions. The frame that comes back from
# pybedtools is addressed below by BED-style column names, with the antibody
# symbol (the 7th column of tf2) under 'thickStart'.
# NOTE(review): intersect() runs with bedtools defaults here -- confirm that
# the default reporting mode is the one wanted for these peaks.
tf_intersect = pybedtools.BedTool.from_dataframe(tf2).intersect(intslop).to_dataframe()

# Antibody names as written in the spreadsheet -> the symbol to look up in
# the symbol maps. One mapping replaces a run of copy-pasted .loc
# assignments; no replacement value is itself a key, so a fix is never
# applied twice.
symbol_fixes = {
    'HP1a': 'Su(var)205',
    'Hp1a': 'Su(var)205',
    'CP190': 'Cp190',
    'CG8436': 'Ibf1',
    'CG9740': 'Ibf2',
    'NSL3': 'Rcd1',
    'UTX': 'Utx',
    'LPT': 'Lpt',
    'Trr': 'trr',
    'dCAP-D3': 'Cap-D3',
    'DnaJ1': 'DnaJ-1',
    'MYST5': 'CG1894',
    'ZIPIC': 'CG7928',
}

# Work on a copy so tf_intersect keeps the original spreadsheet names.
tf_intersect_copy = tf_intersect.copy()
# Fix all bad symbol names.
tf_intersect_copy['thickStart'] = tf_intersect_copy['thickStart'].replace(symbol_fixes)

# Merge on both symbol maps. Left joins keep every peak even when a symbol
# is not found in a map.
# NOTE(review): if either map lists a symbol more than once, these joins
# duplicate the matching peaks -- confirm the maps are unique on their key.
merge1 = tf_intersect_copy.merge(symbolmap, left_on='thickStart', right_on='SYMBOL', how='left')
merge2 = merge1.merge(symbolmap2, left_on='thickStart', right_on='a', how='left')[
    ['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'ENSEMBL', 'gene']
]

# Take the ID from the first map and fall back to the synonym table.
# Unmatched values must stay NaN: both this fallback and the null filter
# below depend on it, which is why the old (discarded, hence no-op)
# fillna('') call is gone. assign() builds a new frame instead of setting a
# column on a column subset.
merge2 = merge2.assign(anti_FBgn=merge2['ENSEMBL'].combine_first(merge2['gene']))
trim = merge2[['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'anti_FBgn']]

# Drop peaks with no FBgn: these antibodies aren't fly genes (except for Ph,
# but it wasn't specified which ph).
drop_bad_antibodies = trim[trim['anti_FBgn'].notnull()]

# Filter for RNAi TFs again.
merge_on_our_TFs_chip = drop_bad_antibodies.merge(TF_list, left_on='anti_FBgn', right_on='TF', how='inner')

In [13]:
# Keep only the columns that go into the BED file, in output order:
# coordinates, then antibody FBgn, then score.
chip_bed = merge_on_our_TFs_chip.loc[:, ['chrom', 'start', 'end', 'anti_FBgn', 'score']]

In [14]:
# Preview the first five rows of the TF peak table.
chip_bed.head(n=5)

Unnamed: 0,chrom,start,end,anti_FBgn,score
0,chr2L,6528,7316,FBgn0003334,62.80834
1,chr2L,7432,7528,FBgn0003334,8.14595
2,chr2L,8116,8192,FBgn0003334,8.14595
3,chr2L,9484,9612,FBgn0003334,4.98406
4,chr2L,66242,66317,FBgn0003334,2.59312


In [15]:
# Write the TF peak table as tab-separated text with no index column and no
# column-name row. `header=False` is the documented way to suppress the
# header; `header=None` only worked because None happens to be falsy.
# NOTE(review): the fifth column holds float scores, while the UCSC BED
# description gives the score field as an integer 0-1000 -- confirm the
# viewer these files are meant for accepts this.
chip_bed.to_csv('../../datashare/tf_peaks.bed', sep='\t', index=False, header=False)
# Note: the BED header line is not written here. It has to be formatted as
# "track name=" and is added to the file by hand afterwards.

### HISTONE: 

In [16]:
# Load the filtered histone ChIP peaks. The file has no header row, so the
# six column names are supplied here.
hist = pd.read_csv(
    '../output/chip/ALL_HIST_CHIP_filtered.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start', 'end', 'srx', 'score', 'caller'],
)

In [17]:
# Attach the antibody target to each histone peak via its SRX accession.
# antibody_table only holds datasets that passed the 'no input?' filter, so
# an inner join (plus dropping blank targets) keeps the excluded datasets
# out. A left join would carry their peaks through with a missing target,
# which ends up as an empty name field in the BED output. Inner joins keep
# the row order of `hist`.
hist2 = (
    hist.merge(antibody_table, on='srx', how='inner')
        .dropna(subset=['target'])
)

# Choosing not to restrict the region here; that can always be added back
# in later.

In [19]:
# Keep only the columns that go into the BED file, in output order:
# coordinates, then antibody target, then score.
hist_bed = hist2.loc[:, ['chrom', 'start', 'end', 'target', 'score']]

In [20]:
# Preview the first five rows of the histone peak table.
hist_bed.head(n=5)

Unnamed: 0,chrom,start,end,target,score
0,chr2L,16617,16893,CTCF,33.40816
1,chr2L,21242,21484,CTCF,7.92056
2,chr2L,34118,34304,CTCF,4.27362
3,chr2L,35499,35830,CTCF,39.60189
4,chr2L,43248,43463,CTCF,3.57781


In [21]:
# Write the histone peak table as tab-separated text with no index column
# and no column-name row. `header=False` is the documented way to suppress
# the header; `header=None` only worked because None happens to be falsy.
# NOTE(review): the fifth column holds float scores, while the UCSC BED
# description gives the score field as an integer 0-1000 -- confirm the
# viewer these files are meant for accepts this.
hist_bed.to_csv('../../datashare/histone_peaks.bed', sep='\t', index=False, header=False)
# Note: the BED header line is not written here. It has to be formatted as
# "track name=" and is added to the file by hand afterwards.