# Big fimo output table cleanup

The "distribution_qvals_dmmpmm" notebook created a big table containing information from five different motif databases. This notebook cleans up and collapses that table, first over motifs and then over databases, and works with the result.

First, I'll import the table as a dataframe: 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline 

In [6]:
# Load the big tab-separated table; its first column becomes the row index.
# NOTE(review): the leading '/' makes this an absolute path, so the '../..'
# segments resolve from the filesystem root -- confirm this is intended rather
# than a relative '../../output/concat.txt'.
bigtable = pd.read_csv(
    '/../../output/concat.txt',
    sep='\t',
    index_col=0,
)

  mask |= (ar1 == a)


Collapsing over motif: 

In [21]:
# Helper column filled with a constant: it exists so that a per-group 'count'
# has a column with no missing values to count rows in.
bigtable['motif_hits'] = 0

# One group per (motif gene, target gene, source database, #hits) combination.
motif_keys = ['motif_FBgn', 'target_gene', 'source', '#hits']
grp = bigtable.groupby(motif_keys)

In [22]:
# Per group: the smallest min_pval and the number of rows (via motif_hits).
motif_agg_spec = {'min_pval': ['min'], 'motif_hits': 'count'}
agg = grp.agg(motif_agg_spec)

In [23]:
# Replace the column header left by the aggregation with plain, flat names.
flat_motif_names = ['min_pval', 'motif_hits']
agg.columns = flat_motif_names

In [28]:
# Move the group keys out of the index and back into ordinary columns.
agg2 = agg.reset_index(drop=False)

In [29]:
# Relabel all six columns by position; the fourth one (the '#hits' group key)
# is called 'pos_hits' from here on.
agg2.columns = [
    'motif_FBgn',
    'target_gene',
    'source',
    'pos_hits',
    'min_pval',
    'motif_hits',
]

In [31]:
# Preview the first five rows of the per-source table.
agg2.head(n=5)

Unnamed: 0,motif_FBgn,target_gene,source,pos_hits,min_pval,motif_hits
0,FBgn0000014,FBgn0000003,flyReg,1,2.7e-05,1
1,FBgn0000014,FBgn0000003,onTheFly,1,2.2e-05,2
2,FBgn0000014,FBgn0000008,flyFactor,26,0.000118,1
3,FBgn0000014,FBgn0000008,flyReg,21,2.7e-05,1
4,FBgn0000014,FBgn0000008,idmmpmm,26,2.7e-05,1


Collapsing over database: 

In [47]:
# Drop 'source' from the grouping keys so that rows coming from different
# databases fall into the same (motif gene, target gene) group.
database_keys = ['motif_FBgn', 'target_gene']
grp2 = agg2.groupby(database_keys)

In [48]:
# Per (motif gene, target gene) pair: smallest min_pval, and the sums of
# pos_hits and motif_hits over all rows in the group.
database_agg_spec = {
    'min_pval': ['min'],
    'pos_hits': ['sum'],
    'motif_hits': ['sum'],
}
agg3 = grp2.agg(database_agg_spec)

In [49]:
# Flatten the (column, function) header produced by the aggregation.
# Each label is derived from the name of the column it sits on instead of
# being assigned by position: a positional list depends on the order in which
# agg() emits the columns, and with the dict order used for the aggregation
# (min_pval, pos_hits, motif_hits) a positional
# ['min_pval', 'sum_motif_hits', 'sum_pos_hits'] would put the two sum labels
# on each other's data. Looking names up also keeps this cell safe to re-run,
# because already-renamed columns map to themselves.
sum_labels = {'pos_hits': 'sum_pos_hits', 'motif_hits': 'sum_motif_hits'}
agg3.columns = [
    sum_labels.get(name, name)
    for name in agg3.columns.get_level_values(0)
]

In [50]:
# Turn the (motif_FBgn, target_gene) index back into ordinary columns.
agg4 = agg3.reset_index(drop=False)

In [57]:
# Add a motif_symbol column by left-joining the SYMBOL lookup onto motif_FBgn.
# Only the literal string 'NA' is read as missing: keep_default_na=False turns
# off pandas' built-in NA markers, so other NA-looking text stays a string.
symbolmap = pd.read_csv(
    '/data/LCDB/lcdb-references/dmel/r6-11/gtf/dmel_r6-11.SYMBOL.csv',
    sep=',',
    na_values='NA',
    keep_default_na=False,
)
update = (
    agg4
    .merge(symbolmap, left_on='motif_FBgn', right_on='ENSEMBL', how='left')
    .rename(columns={'SYMBOL': 'motif_symbol'})
)

# Keep only the columns of interest, in display order.
motif_columns = [
    'motif_FBgn',
    'motif_symbol',
    'target_gene',
    'min_pval',
    'sum_motif_hits',
    'sum_pos_hits',
]
trim = update[motif_columns]
trim.head()

Unnamed: 0,motif_FBgn,motif_symbol,target_gene,min_pval,sum_motif_hits,sum_pos_hits
0,FBgn0000014,abd-A,FBgn0000003,2.2e-05,3,2
1,FBgn0000014,abd-A,FBgn0000008,2.2e-05,7,151
2,FBgn0000014,abd-A,FBgn0000014,2.2e-05,7,121
3,FBgn0000014,abd-A,FBgn0000015,2.2e-05,7,202
4,FBgn0000014,abd-A,FBgn0000017,2.2e-05,7,95


In [59]:
# Add a gene_symbol column: same lookup table as before, this time joined on
# target_gene (left join, so target genes without a match are kept).
update2 = (
    trim
    .merge(symbolmap, left_on='target_gene', right_on='ENSEMBL', how='left')
    .rename(columns={'SYMBOL': 'gene_symbol'})
)

# Keep only the columns of interest, in display order.
gene_columns = [
    'motif_FBgn',
    'motif_symbol',
    'target_gene',
    'gene_symbol',
    'min_pval',
    'sum_motif_hits',
    'sum_pos_hits',
]
trim2 = update2[gene_columns]

In [103]:
# Count the rows recorded for each target gene, then summarise how those
# per-gene counts are distributed.
(
    trim2
    .groupby(['target_gene'])
    .agg({'sum_motif_hits': ['count']})
    .describe()
)

Unnamed: 0_level_0,sum_motif_hits
Unnamed: 0_level_1,count
count,17659.0
mean,264.82281
std,265.228333
min,13.0
25%,216.0
50%,247.0
75%,283.0
max,5681.0


In [104]:
# Number of distinct motif_FBgn values in the collapsed table.
distinct_motifs = trim2['motif_FBgn'].unique()
len(distinct_motifs)

299

In [None]:
# Write the collapsed table as a tab-separated file.
# NOTE(review): this is the same path that `bigtable` is read from at the top
# of the notebook, so running this cell replaces the input table and a later
# Restart & Run All would load the collapsed output instead -- confirm whether
# a separate output file name is intended.
trim2.to_csv(
    '/../../output/concat.txt',
    sep='\t',
)