# Working binary motif matrix

In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

In [2]:
df = pd.read_table('../output/motif/tf_merge.bed', header=None, 
              names=['chrom', 'start','end','pval','motif_symbol','motif_FBgn'])

In [3]:
len(df.motif_FBgn.unique())

327

## fix blank motif fbgns

In [3]:
df[df.motif_FBgn == '.'].motif_symbol.unique()

array(['br-Z1', 'br-Z3', 'br-Z4', 'BEAF-32B', 'br-Z2', 'Cf2-II', 'dsx-F',
       'dsx-M', 'Espl', 'His2B', 'HLHm5', 'p120', 'SuH', 'suHw'], dtype=object)

In [9]:
df_copy = df.copy()

In [5]:
df_copy.loc[df_copy.motif_symbol == 'br-Z1', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z2', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z3', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z4', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'BEAF-32B', 'motif_FBgn'] = 'FBgn0015602'
df_copy.loc[df_copy.motif_symbol == 'dsx-F', 'motif_FBgn'] = 'FBgn0000504'
df_copy.loc[df_copy.motif_symbol == 'dsx-M', 'motif_FBgn'] = 'FBgn0000504'
df_copy.loc[df_copy.motif_symbol == 'Espl', 'motif_FBgn'] = 'FBgn0000591'
df_copy.loc[df_copy.motif_symbol == 'His2B', 'motif_FBgn'] = 'FBgn0001198'
df_copy.loc[df_copy.motif_symbol == 'HLHm5', 'motif_FBgn'] = 'FBgn0002631'
df_copy.loc[df_copy.motif_symbol == 'p120', 'motif_FBgn'] = 'FBgn0260799'
df_copy.loc[df_copy.motif_symbol == 'suHw', 'motif_FBgn'] = 'FBgn0003567'
df_copy.loc[df_copy.motif_symbol == 'SuH', 'motif_FBgn'] = 'FBgn0004837'

In [6]:
#save updated tf_merge
df_copy.to_csv('../output/motif/tf_merge.bed', sep='\t', header=None, index=False)

# filter for RNAi TFs 

In [10]:
TF_list = pd.read_table('../output/list_of_tfs.txt', header=None)
TF_list.columns=['TF']

In [11]:
# make dictionary of alt fbgns

fbgn = {}
with open('/data/LCDB/lcdb-references/dmel/r6-16/fb_annotation/dmel_r6-16.fb_annotation') as f:
    next(f)
    for line in f:
        split = line.split('\t')
        first = split[1]
        seconds = split[2].split(',')
        fbgn[first] = first
        for x in seconds:
            if x:
                fbgn[x] = first

In [12]:
TF_list['update'] = TF_list.TF.map(lambda x: fbgn[x])

In [13]:
TF_list.drop('TF', axis=1, inplace=True)

In [14]:
merge_on_our_TFs = df_copy.merge(TF_list, left_on='motif_FBgn', right_on='update', how='inner')

In [15]:
len(merge_on_our_TFs.motif_FBgn.unique())

116

In [49]:
TF_list.update == 'FBgn0000283'

False

## Target gene intersect

In [18]:
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [None]:
targene_intersect = gene_info.intersect(pybedtools.BedTool.from_dataframe(merge_on_our_TFs), 
                                        wb=True).saveas().to_dataframe()[[3,6,7,8,9,10,11]]

In [None]:
targene_intersect.columns = ['target_gene','chrom','start','end','pval','motif_symbol','motif_fbgn']

## Make sure all FBgns are updated!!

In [None]:
targene_intersect['update_fbgn'] = targene_intersect.motif_fbgn.map(lambda x: fbgn[x])
targene_intersect.head()

## Binary collapse

In [None]:
#If peak in gene region count it as a 1 
#only need gene,TF_fbgn
binary_collapse = targene_intersect[['target_gene','update_fbgn']]

In [None]:
binary_collapse['binary'] = 1

In [None]:
binary_collapse.drop_duplicates(inplace=True)

In [18]:
#index can't have duplicate entries so I need to condense this information down
binary_collapse.set_index(['target_gene','update_fbgn'], inplace=True)
matrix = binary_collapse.unstack()

In [19]:
matrix.fillna(value=0).to_csv('../output/motif/motif_matrix', sep='\t')

In [44]:
motifs.head()

Unnamed: 0_level_0,FBgn0000097,FBgn0000286,FBgn0000370,FBgn0000546,FBgn0000567,FBgn0000568,FBgn0000611,FBgn0001078,FBgn0001168,FBgn0001185,...,FBgn0262582,FBgn0262656,FBgn0263108,FBgn0264075,FBgn0265276,FBgn0267033,FBgn0267821,FBgn0267978,FBgn0283451,FBgn0284249
update_fbgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FBgn0000008,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FBgn0000014,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FBgn0000015,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FBgn0000017,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
FBgn0000024,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
motifs = pd.read_table('../output/motif/motif_matrix', index_col=0, skiprows=(0,2))

In [45]:
motifs.sum(axis=0).sort_values(ascending=False)

FBgn0283451    7863.0
FBgn0267821    7197.0
FBgn0000286    7032.0
FBgn0035625    5943.0
FBgn0000611    5915.0
FBgn0027339    5369.0
FBgn0003396    5277.0
FBgn0010768    5213.0
FBgn0260632    5213.0
FBgn0011655    5080.0
FBgn0003870    5055.0
FBgn0003567    4981.0
FBgn0040318    4946.0
FBgn0003345    4720.0
FBgn0264075    4678.0
FBgn0003507    4607.0
FBgn0000567    4588.0
FBgn0002521    4487.0
FBgn0005694    4470.0
FBgn0038316    4460.0
FBgn0004652    4399.0
FBgn0038766    4392.0
FBgn0024321    4354.0
FBgn0014949    4311.0
FBgn0022935    4299.0
FBgn0038787    4272.0
FBgn0003053    4256.0
FBgn0035407    4230.0
FBgn0029822    4210.0
FBgn0037446    4146.0
                ...  
FBgn0037722    3009.0
FBgn0000370    2965.0
FBgn0003117    2960.0
FBgn0033749    2958.0
FBgn0035849    2953.0
FBgn0002609    2951.0
FBgn0033010    2941.0
FBgn0035160    2857.0
FBgn0000097    2830.0
FBgn0029711    2811.0
FBgn0013799    2762.0
FBgn0003118    2752.0
FBgn0284249    2744.0
FBgn0005660    2690.0
FBgn002309

In [34]:
#Check row
motifs[motifs.index == 'FBgn0033998'].sum(axis=1)

update_fbgn
FBgn0033998    16.0
dtype: float64

In [35]:
#Check MBD-R2
motifs[motifs.index == 'FBgn0038016'].sum(axis=1)

update_fbgn
FBgn0038016    18.0
dtype: float64

In [36]:
#Check putzig
motifs[motifs.index == 'FBgn0259785'].sum(axis=1)

update_fbgn
FBgn0259785    2.0
dtype: float64

In [47]:
motifs.FBgn0000097

update_fbgn
FBgn0000008    1.0
FBgn0000014    1.0
FBgn0000015    1.0
FBgn0000017    1.0
FBgn0000024    1.0
FBgn0000028    1.0
FBgn0000032    0.0
FBgn0000036    1.0
FBgn0000037    1.0
FBgn0000038    0.0
FBgn0000039    1.0
FBgn0000042    0.0
FBgn0000043    0.0
FBgn0000044    0.0
FBgn0000045    0.0
FBgn0000046    0.0
FBgn0000047    0.0
FBgn0000052    0.0
FBgn0000053    0.0
FBgn0000054    0.0
FBgn0000055    0.0
FBgn0000056    0.0
FBgn0000057    0.0
FBgn0000061    1.0
FBgn0000063    0.0
FBgn0000064    1.0
FBgn0000071    0.0
FBgn0000075    0.0
FBgn0000083    1.0
FBgn0000084    0.0
              ... 
FBgn0283510    0.0
FBgn0283521    1.0
FBgn0283531    1.0
FBgn0283535    1.0
FBgn0283536    0.0
FBgn0283545    0.0
FBgn0283546    0.0
FBgn0283547    0.0
FBgn0283548    0.0
FBgn0283550    0.0
FBgn0283551    0.0
FBgn0283557    0.0
FBgn0283559    0.0
FBgn0283626    0.0
FBgn0283627    0.0
FBgn0283638    0.0
FBgn0283639    0.0
FBgn0283649    0.0
FBgn0283651    1.0
FBgn0283652    0.0
FBgn0283657    1.0


In [42]:
#Check Cp190
motifs.FBgn0000283

AttributeError: 'DataFrame' object has no attribute 'FBgn0000283'

In [38]:
#Check msl3
motifs[motifs.index == 'FBgn0002775'].sum(axis=1)

update_fbgn
FBgn0002775    18.0
dtype: float64

In [39]:
#Check msl3
motifs[motifs.index == 'FBgn0000015'].sum(axis=1)

update_fbgn
FBgn0034878    41.0
dtype: float64