# Working binary motif matrix

In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed

In [2]:
df = pd.read_table('../output/motif/tf_merge.bed', header=None, 
              names=['chrom', 'start','end','pval','motif_symbol','motif_FBgn'])

## fix blank motif fbgns

In [3]:
df[df.motif_FBgn == '.'].motif_symbol.unique()

array(['br-Z1', 'br-Z3', 'br-Z4', 'BEAF-32B', 'br-Z2', 'Cf2-II', 'dsx-F',
       'dsx-M', 'Espl', 'His2B', 'HLHm5', 'p120', 'SuH', 'suHw'], dtype=object)

In [4]:
df_copy = df.copy()

In [5]:
df_copy.loc[df_copy.motif_symbol == 'br-Z1', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z2', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z3', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'br-Z4', 'motif_FBgn'] = 'FBgn0283451'
df_copy.loc[df_copy.motif_symbol == 'BEAF-32B', 'motif_FBgn'] = 'FBgn0015602'
df_copy.loc[df_copy.motif_symbol == 'dsx-F', 'motif_FBgn'] = 'FBgn0000504'
df_copy.loc[df_copy.motif_symbol == 'dsx-M', 'motif_FBgn'] = 'FBgn0000504'
df_copy.loc[df_copy.motif_symbol == 'Espl', 'motif_FBgn'] = 'FBgn0000591'
df_copy.loc[df_copy.motif_symbol == 'His2B', 'motif_FBgn'] = 'FBgn0001198'
df_copy.loc[df_copy.motif_symbol == 'HLHm5', 'motif_FBgn'] = 'FBgn0002631'
df_copy.loc[df_copy.motif_symbol == 'p120', 'motif_FBgn'] = 'FBgn0260799'
df_copy.loc[df_copy.motif_symbol == 'suHw', 'motif_FBgn'] = 'FBgn0003567'
df_copy.loc[df_copy.motif_symbol == 'SuH', 'motif_FBgn'] = 'FBgn0004837'

In [6]:
#save updated tf_merge
df_copy.to_csv('../output/motif/tf_merge.bed', sep='\t', header=None, index=False)

# filter for RNAi TFs 

In [18]:
TF_list = pd.read_table('../output/list_of_tfs.txt', header=None)
TF_list.columns=['TF']

In [19]:
merge_on_our_TFs = df_copy.merge(TF_list, left_on='motif_FBgn', right_on='TF', how='inner')

In [22]:
len(merge_on_our_TFs.motif_FBgn.unique())

115

## Target gene intersect

In [27]:
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [28]:
targene_intersect = gene_info.intersect(pybedtools.BedTool.from_dataframe(merge_on_our_TFs), 
                                        wb=True).saveas().to_dataframe()[[3,6,7,8,9,10,11]]

['chrom', 'start', 'end', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
but file has 13 fields; you can supply custom names with the `names` kwarg
  % (self.file_type, _names, self.field_count()))


In [29]:
targene_intersect.columns = ['target_gene','chrom','start','end','pval','motif_symbol','motif_fbgn']

## Binary collapse

In [30]:
#If peak in gene region count it as a 1 
#only need gene,TF_fbgn
binary_collapse = targene_intersect[['target_gene','motif_fbgn']]

In [31]:
binary_collapse['binary'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
binary_collapse.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
#index can't have duplicate entries so I need to condense this information down
binary_collapse.set_index(['target_gene','motif_fbgn'], inplace=True)
matrix = binary_collapse.unstack()

In [36]:
matrix.fillna(value=0).to_csv('../output/motif/motif_matrix', sep='\t')