# FIMO output using D. virilis

First: 
- collapse motif hits to the gene level, then compare which TF/target-gene relationships exist in *D. virilis* versus *D. melanogaster*

In [1]:
import pandas as pd
import gffutils
from gffutils import pybedtools_integration
import pybedtools
from pybedtools.featurefuncs import gff2bed
import seaborn as sb

### Import GFFs from FIMO output and convert to BED format:  
Five motif databases: dmmpmm, flyFactor, flyReg, idmmpmm, and onTheFly

In [2]:
# dmmpmm2009 FIMO hits: read the GFF and convert it to a BED-style DataFrame.
# The score column is min(1000, -10*log10(pvalue)); the p-value itself is not
# recovered because everything is collapsed to the gene level later on.
dmm = pybedtools.BedTool('../motif-wf/data/fimo_dvir_02-15/motif_alignments_dmmpmm2009_dvir.gff')
dmmbed = (
    dmm.each(gff2bed, name_field='ID')
    .saveas()
    .to_dataframe()
)
# Keep only the text before the first '-' of each ID.
# NOTE(review): a symbol that itself contains '-' (e.g. 'abd-A', which appears
# in the flyReg hits) would be cut short here and then miss the SYMBOL lookup
# -- confirm the ID format of this database.
dmmbed['name'] = dmmbed.name.map(lambda motif_id: motif_id.split('-')[0])

In [3]:
# Attach the motif's FlyBase gene ID (FBgn) by looking its symbol up in the
# dmel r6-11 SYMBOL table (columns ENSEMBL and SYMBOL).
symbolmap = pd.read_csv('/data/LCDB/lcdb-references/dmel/r6-11/gtf/dmel_r6-11.SYMBOL.csv')
# Left join: every hit is kept; a symbol absent from the table gets NaN in
# ENSEMBL and SYMBOL.
dmm_w_motiffbgn = pd.merge(dmmbed, symbolmap, how='left', left_on='name', right_on='SYMBOL')

In [4]:
dmm_w_motiffbgn.head()

Unnamed: 0,chrom,start,end,name,score,strand,ENSEMBL,SYMBOL
0,scaffold_13049,340807,340815,bcd,48.3,-,FBgn0000166,bcd
1,scaffold_13049,630585,630593,bcd,48.3,-,FBgn0000166,bcd
2,scaffold_13049,666194,666202,bcd,48.3,-,FBgn0000166,bcd
3,scaffold_13049,915737,915745,bcd,48.3,-,FBgn0000166,bcd
4,scaffold_13049,1103900,1103908,bcd,48.3,-,FBgn0000166,bcd


In [5]:
# Reduce to BED4 with the motif's FBgn as the name column.
# Hits whose symbol had no match in symbolmap carry NaN in ENSEMBL after the
# left join; drop them so no nameless intervals reach the combined BED (the
# onTheFly hits without an FBgn are excluded in the same way).
dmm_merge = (
    dmm_w_motiffbgn
    .dropna(subset=['ENSEMBL'])
    .loc[:, ['chrom','start','end','ENSEMBL']]
    .rename(columns={'ENSEMBL':'name'})
)

In [6]:
# flyFactor FIMO hits: GFF -> BED-style DataFrame, named by the GFF 'Name' attribute.
flyfac = pybedtools.BedTool('../motif-wf/data/fimo_dvir_02-15/motif_alignments_flyFactor_dvir.gff')
ffbed = (
    flyfac.each(gff2bed, name_field='Name')
    .saveas()
    .to_dataframe()
)
# Keep the part before the first '_'; for this database that is already an FBgn ID.
ffbed['name'] = ffbed.name.map(lambda motif_name: motif_name.split('_')[0])

In [7]:
ffbed.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,scaffold_13049,53234,53242,FBgn0000014,45.8,+
1,scaffold_13049,145683,145691,FBgn0000014,45.8,+
2,scaffold_13049,149968,149976,FBgn0000014,45.8,+
3,scaffold_13049,155609,155617,FBgn0000014,45.8,+
4,scaffold_13049,181619,181627,FBgn0000014,45.8,+


In [8]:
# BED4 view of the flyFactor hits; 'name' already holds the motif's FBgn, so no symbol lookup is needed.
ffmerge = ffbed[['chrom','start','end','name']]

In [9]:
# flyReg FIMO hits: GFF -> BED-style DataFrame, named by the GFF 'Name' attribute.
flyreg = pybedtools.BedTool('../motif-wf/data/fimo_dvir_02-15/motif_alignments_flyReg_dvir.gff')
frbed = (
    flyreg.each(gff2bed, name_field='Name')
    .saveas()
    .to_dataframe()
)
# Keep the part before the first '_' (a gene symbol such as 'abd-A').
frbed['name'] = frbed.name.map(lambda motif_name: motif_name.split('_')[0])

In [10]:
# Add the motif's FBgn (ENSEMBL column) by matching the flyReg symbol against symbolmap.
# Left join: hits whose symbol is not in symbolmap are kept with NaN in ENSEMBL/SYMBOL.
fr_w_motiffbgn = frbed.merge(symbolmap, left_on='name', right_on='SYMBOL', how='left')

In [11]:
fr_w_motiffbgn.head()

Unnamed: 0,chrom,start,end,name,score,strand,ENSEMBL,SYMBOL
0,scaffold_13049,53234,53242,abd-A,45.8,+,FBgn0000014,abd-A
1,scaffold_13049,145683,145691,abd-A,45.8,+,FBgn0000014,abd-A
2,scaffold_13049,149968,149976,abd-A,45.8,+,FBgn0000014,abd-A
3,scaffold_13049,155609,155617,abd-A,45.8,+,FBgn0000014,abd-A
4,scaffold_13049,181619,181627,abd-A,45.8,+,FBgn0000014,abd-A


In [12]:
# Reduce to BED4 with the motif's FBgn as the name column.
# Hits whose symbol had no match in symbolmap carry NaN in ENSEMBL after the
# left join; drop them so no nameless intervals reach the combined BED.
fr_merge = (
    fr_w_motiffbgn
    .dropna(subset=['ENSEMBL'])
    .loc[:, ['chrom','start','end','ENSEMBL']]
    .rename(columns={'ENSEMBL':'name'})
)

In [13]:
# idmmpmm2009 FIMO hits: GFF -> BED-style DataFrame, named by the GFF 'Name' attribute.
idm = pybedtools.BedTool('../motif-wf/data/fimo_dvir_02-15/motif_alignments_idmmpmm2009_dvir.gff')
idmbed = (
    idm.each(gff2bed, name_field='Name')
    .saveas()
    .to_dataframe()
)
# Keep the part before the first '_' (a gene symbol such as 'abd-A').
idmbed['name'] = idmbed.name.map(lambda motif_name: motif_name.split('_')[0])

In [14]:
# Add the motif's FBgn (ENSEMBL column) via symbolmap; left join keeps unmatched hits with NaN.
idm_w_motiffbgn = idmbed.merge(symbolmap, left_on='name', right_on='SYMBOL', how='left')

In [15]:
idm_w_motiffbgn.head()

Unnamed: 0,chrom,start,end,name,score,strand,ENSEMBL,SYMBOL
0,scaffold_13049,15087,15095,abd-A,45.8,+,FBgn0000014,abd-A
1,scaffold_13049,80992,81000,abd-A,45.8,+,FBgn0000014,abd-A
2,scaffold_13049,209212,209220,abd-A,45.8,+,FBgn0000014,abd-A
3,scaffold_13049,270146,270154,abd-A,45.8,+,FBgn0000014,abd-A
4,scaffold_13049,270292,270300,abd-A,45.8,+,FBgn0000014,abd-A


In [16]:
# Reduce to BED4 with the motif's FBgn as the name column.
# Hits whose symbol had no match in symbolmap carry NaN in ENSEMBL after the
# left join; drop them so no nameless intervals reach the combined BED.
idm_merge = (
    idm_w_motiffbgn
    .dropna(subset=['ENSEMBL'])
    .loc[:, ['chrom','start','end','ENSEMBL']]
    .rename(columns={'ENSEMBL':'name'})
)

In [18]:
# onTheFly FIMO hits (GFF) for D. virilis.
onthefly = pybedtools.BedTool('../motif-wf/data/fimo_dvir_02-15/motif_alignments_onTheFly_dvir.gff')


In [20]:
# GFF -> BED-style DataFrame, named by the GFF 'Name' attribute.
alt_otf = (
    onthefly.each(gff2bed, name_field='Name')
    .saveas()
    .to_dataframe()
)
# Keep the part before the first '_' (an onTheFly motif ID such as 'OTF0001.1').
alt_otf['name'] = alt_otf.name.map(lambda motif_name: motif_name.split('_')[0])

In [21]:
alt_otf.head()

Unnamed: 0,chrom,start,end,name,score,strand
0,scaffold_13049,3366,3373,OTF0001.1,41.9,-
1,scaffold_13049,4465,4472,OTF0001.1,41.9,-
2,scaffold_13049,25387,25394,OTF0001.1,41.9,-
3,scaffold_13049,28133,28140,OTF0001.1,41.9,-
4,scaffold_13049,69626,69633,OTF0001.1,41.9,-


In [22]:
# Add the motif's FBgn: onTheFly IDs are translated through onTheFlyMap.tsv
# (columns used: name, FBgn).
otfmap = pd.read_csv('../motif-wf/data/onTheFlyMap.tsv', sep='\t')
y = otfmap.loc[:, ['name', 'FBgn']]
# Left join keeps every hit; IDs missing from the map end up with NaN in FBgn.
alt_otf_w_fbgn = pd.merge(alt_otf, y, how='left', on='name')


In [25]:
alt_otf_w_fbgn[alt_otf_w_fbgn.FBgn.isnull()].name.unique()

array(['OTF0415.1'], dtype=object)

In [26]:
# Drop onTheFly hits that have no FBgn in otfmap. When this was written the
# only such motif was the "mystery" 'OTF0415.1' (nothing is known about its
# FBgn). Filtering on the missing FBgn instead of that hard-coded ID gives the
# same rows today and also excludes any other unmapped motif.
otfbed = alt_otf_w_fbgn[alt_otf_w_fbgn.FBgn.notnull()]

In [27]:
# BED4 view of the onTheFly hits with the motif's FBgn as the name column.
otf_merge = otfbed[['chrom','start','end','FBgn']].rename(columns={'FBgn':'name'})

### Combine datasets
Intersect the motif hits with target genes, reduce to a binary (present/absent) TF–gene table, and drop duplicates

In [29]:
# Stack the BED4 hit tables of all five motif databases (chrom, start, end, name = motif FBgn).
# The original row indices are kept, so index values repeat across databases.
alldatasets = pd.concat([dmm_merge, ffmerge, fr_merge, idm_merge, otf_merge])

In [48]:
# Writes the combined hits to disk as a tab-separated file with a header row.
# Left commented out; a later cell reads this same path back with
# pd.read_table, so the file must already exist before that cell is run.
#alldatasets.to_csv('../output/motif/dvir_fimo_alldata.bed', sep='\t', index=False)

In [None]:
#start here next time and import alldatasets

In [2]:
# Reload the combined FIMO hits (chrom, start, end, name) saved earlier.
# The text columns are pinned to str: the stray warning fragment in this
# cell's output looks like pandas' mixed-type DtypeWarning, which appears when
# dtypes are inferred chunk by chunk on a large file.
alldatasets = pd.read_table('../output/motif/dvir_fimo_alldata.bed',
                            dtype={'chrom': str, 'name': str})

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# D. virilis v3 annotation (GTF) that the motif hits are intersected with.
genegtf = pybedtools.BedTool('/data/Oliverlab/data/annotation/dvir.v3.gtf')

In [4]:
# Convert each GTF record to BED, using its gene_id attribute as the name.
# NOTE(review): every record in the GTF is converted, not only gene-level
# ones. If the file also holds transcript/exon records, a motif hit is
# reported once per overlapping record, which would inflate the per-gene
# counts computed later -- confirm the GTF contents.
gene_info = genegtf.each(gff2bed, name_field='gene_id').saveas()

In [5]:
# Overlap the gene intervals with every motif hit; wb=True appends the
# overlapping motif record after the gene's own columns.
motif_hits = pybedtools.BedTool.from_dataframe(alldatasets)
gene_x_motif = gene_info.intersect(motif_hits, wb=True).saveas().to_dataframe()
# to_dataframe() labels the columns with default BED field names. 'name' is
# the gene; the other four are the appended motif columns, which the next
# cell renames to chrom/start/end/motif_fbgn.
targene_intersect = gene_x_motif[['name','thickStart','thickEnd','itemRgb','blockCount']]

In [6]:
# Replace the default BED field names with meaningful ones: the gene, then the motif hit's chrom/start/end and the motif's FBgn.
targene_intersect.columns = ['target_gene','chrom','start','end','motif_fbgn']

In [7]:
targene_intersect.head()

Unnamed: 0,target_gene,chrom,start,end,motif_fbgn
0,DvirG_000001,scaffold_10001,143,151,FBgn0003145
1,DvirG_000001,scaffold_10001,279,287,FBgn0000014
2,DvirG_000001,scaffold_10001,279,287,FBgn0000095
3,DvirG_000001,scaffold_10001,279,287,FBgn0000095
4,DvirG_000001,scaffold_10001,307,318,FBgn0000413


In [8]:
# Collapse to one row per (target gene, motif) pair: presence/absence only, positions discarded.
collapse = targene_intersect[['target_gene', 'motif_fbgn']].drop_duplicates()

In [9]:
# Switch from the DvirG_* gene IDs of the GTF to FlyBase IDs, using the
# refmap table written for dvir.v3.gtf.
refmap = pd.read_csv('../output/annotation/gffcmp.dvir.v3.gtf.refmap', sep='\t')
# First '|'-separated field of qry_id_list. Despite the column name, the values
# shown below are DvirG_* IDs; the FBgn IDs live in ref_gene_id.
refmap['gene_fbgn'] = refmap.qry_id_list.map(lambda ids: ids.split('|')[0])
genemap = refmap.loc[:, ['ref_gene_id','class_code','gene_fbgn']]

In [10]:
# Keep only rows with class_code '=' and reduce to the FlyBase-ID / DvirG-ID
# pair (ref_gene_id is the FBgn; gene_fbgn holds the DvirG_* ID).
# De-duplicate: if the refmap lists a gene pair more than once (presumably one
# row per reference transcript -- confirm), each repeat would multiply the
# rows of every later merge against this table.
completematch = (
    genemap.loc[genemap.class_code == '=', ['ref_gene_id','gene_fbgn']]
    .drop_duplicates()
)
completematch.head()

Unnamed: 0,ref_gene_id,gene_fbgn
0,FBgn0256559,DvirG_000001
1,FBgn0282270,DvirG_000002
2,FBgn0282538,DvirG_000003
3,FBgn0256560,DvirG_000004
4,FBgn0282609,DvirG_000005


In [11]:
# Translate the DvirG_* target genes to FlyBase IDs; the inner join discards
# genes that have no '=' match in completematch.
merge_1 = pd.merge(collapse, completematch, how='inner',
                   left_on='target_gene', right_on='gene_fbgn')
merge_1 = merge_1[['motif_fbgn', 'ref_gene_id', 'gene_fbgn']]

In [12]:
# From here on 'target_gene' is the FlyBase ID (formerly ref_gene_id); the DvirG_* ID remains in gene_fbgn.
merge_1 = merge_1.rename(columns={'ref_gene_id':'target_gene'})

In [13]:
# One-to-one ortholog table: D. melanogaster FBgn, gene symbol, D. virilis FBgn.
# The file has no header row, so column names are supplied here.
dvirorthologs = pd.read_csv(
    '/data/Oliverlab/data/orthologs/flyBase_orthoDB_one2one/dvir.concise.one2one',
    sep='\t',
    header=None,
    names=['Dmel','Symbol','Dvir'],
)
dvirorthologs.head()

Unnamed: 0,Dmel,Symbol,Dvir
0,FBgn0025571,SF1,FBgn0283420
1,FBgn0085468,ND-MWFE,FBgn0283412
2,FBgn0001253,ImpE1,FBgn0283406
3,FBgn0050022,CG30022,FBgn0283403
4,FBgn0037365,CG2104,FBgn0283394


In [14]:
# Keep only D. virilis target genes that have a one-to-one D. melanogaster ortholog.
merge_2 = merge_1.merge(dvirorthologs, left_on='target_gene', right_on='Dvir', how='inner')

In [15]:
# Express the D. virilis TF/target pairs in D. melanogaster gene IDs so they can be compared directly with the Dmel table.
ortholog_target_genes = merge_2[['motif_fbgn','Dmel']].rename(columns={'Dmel':'target_gene'})

In [16]:
# Presence flag: every row is an observed TF/target pair.
ortholog_target_genes['binary'] = 1

## Compare to motifs we have in Dmel

In [17]:
# D. melanogaster motif hits; the file has no header row, so the six column
# names are supplied here.
dmel_motifs = pd.read_csv(
    '../output/motif/tf_merge.bed',
    sep='\t',
    header=None,
    names=['chrom', 'start','end','pval','motif_symbol','motif_FBgn'],
)

In [18]:
dmel_motifs.head()

Unnamed: 0,chrom,start,end,pval,motif_symbol,motif_FBgn
0,chr2L,6782,6789,3.6e-05,br-Z1,FBgn0283451
1,chr2L,17019,17026,3.6e-05,br-Z1,FBgn0283451
2,chr2L,19846,19853,3.6e-05,br-Z1,FBgn0283451
3,chr2L,29403,29410,3.6e-05,br-Z1,FBgn0283451
4,chr2L,30047,30054,3.6e-05,br-Z1,FBgn0283451


In [19]:
# D. melanogaster gene intervals.
# NOTE: this rebinds gene_info, which earlier held the D. virilis gene
# intervals; re-running the D. virilis intersect cell after this one would
# silently use the D. melanogaster genes instead.
gene_info = pybedtools.BedTool('../output/chip/dmel6.12.genes.bed')

In [20]:
# Overlap the D. melanogaster genes with the motif hits; wb=True appends the
# motif record after the gene's own columns.
dmel_motif_bed = pybedtools.BedTool.from_dataframe(dmel_motifs)
targene_dmel = gene_info.intersect(dmel_motif_bed, wb=True).saveas().to_dataframe()
# Columns carry default BED field names. 'name' is the gene; the others are
# picked from the appended motif columns and renamed just below.
# presumably the gene BED has six columns, so the unselected 'blockCount'
# would be the motif p-value -- verify against the file.
targene_dmel = targene_dmel[['name','thickStart','thickEnd','itemRgb','blockSizes','blockStarts']]
targene_dmel.columns = ['target_gene','chrom','start','end','motif_symbol','motif_fbgn']

In [21]:
# Gene-level view of the D. melanogaster hits: target gene and motif only (duplicates are removed in the next cell).
collapse_dmel = targene_dmel[['target_gene','motif_fbgn']]

In [22]:
# One row per (target gene, motif) pair.
# Rebind rather than drop in place: collapse_dmel is a column slice of
# targene_dmel, and the in-place call triggered pandas' SettingWithCopyWarning.
collapse_dmel = collapse_dmel.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
# Restrict the D. melanogaster pairs to target genes that have a one-to-one
# D. virilis ortholog.
merge_on_orthologs = pd.merge(
    collapse_dmel,
    dvirorthologs,
    how='inner',
    left_on='target_gene',
    right_on='Dmel',
)

In [24]:
# D. melanogaster TF/target pairs with a presence flag.
# assign() builds a new frame, so the flag is no longer written onto a slice
# of merge_on_orthologs (which raised SettingWithCopyWarning).
ortho_dmel = merge_on_orthologs[['target_gene','motif_fbgn']].assign(binary=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [27]:
len(ortholog_target_genes.motif_fbgn.unique())

331

In [28]:
len(ortho_dmel.motif_fbgn.unique())

327

In [29]:
# Inner-join the D. melanogaster and D. virilis TF/target-gene tables.
#
# This replaces a per-motif loop that re-filtered both frames for every motif
# and hid any failure behind a bare `except:`. One merge on the shared columns
# yields the same set of distinct pairs.
pair_cols = ['target_gene', 'motif_fbgn', 'binary']
# The loop's groupby() skipped rows with a missing motif, so drop them
# explicitly (merge would otherwise match NaN to NaN). De-duplicating first
# keeps the join from multiplying repeated pairs.
dvir_pairs = ortholog_target_genes.dropna(subset=['motif_fbgn'])[pair_cols].drop_duplicates()
dmel_pairs = ortho_dmel[pair_cols].drop_duplicates()
shared_pairs = dmel_pairs.merge(dvir_pairs, on=pair_cols, how='inner')

# Kept as a one-element list because a later cell calls pd.concat(merged).
merged = [shared_pairs]
# Motifs with D. virilis pairs but none in common with D. melanogaster.
# (Previously this list was only filled if merge() raised an exception.)
not_merged = sorted(set(dvir_pairs.motif_fbgn) - set(shared_pairs.motif_fbgn))

In [30]:
# Stack the merged frames into one table of TF/target pairs found in both species.
innermerge = pd.concat(merged)

In [31]:
# One row per distinct (target gene, motif, binary) combination.
innermerge = innermerge.drop_duplicates()

In [32]:
len(innermerge.motif_fbgn.unique())

295

In [33]:
len(innermerge.target_gene.unique())

9316

In [34]:
innermerge.shape

(856228, 3)

In [None]:
#get count information first

In [36]:
# Helper column of ones so that hits can be counted per (motif, gene) group below.
targene_intersect['binary'] = 1

In [37]:
# Number of D. virilis intersect rows per (motif, target gene); the count ends
# up in the 'binary' column.
# NOTE(review): targene_intersect contains fully identical rows (same gene,
# coordinates and motif), so the same site can be counted more than once --
# confirm whether counts should be taken over unique sites.
vircounts = (
    targene_intersect
    .groupby(['motif_fbgn','target_gene'])['binary']
    .count()
    .reset_index()
)

In [38]:
# Translate the DvirG_* target genes of the count table to FlyBase IDs
# (inner join: genes without a match in completematch are dropped).
fba_vircount = pd.merge(vircounts, completematch, how='inner',
                        left_on='target_gene', right_on='gene_fbgn')
fba_vircount = (
    fba_vircount[['motif_fbgn', 'ref_gene_id', 'binary']]
    .rename(columns={'ref_gene_id':'target_gene'})
)

In [39]:
# Attach the D. melanogaster ortholog to each D. virilis count row; rows
# without a one-to-one ortholog are dropped by the inner join.
vir_worthologs = pd.merge(fba_vircount, dvirorthologs, how='inner',
                          left_on='target_gene', right_on='Dvir')
vir_worthologs = vir_worthologs[['motif_fbgn', 'Dvir', 'Dmel', 'binary']]

In [45]:
# D. virilis rows contribute only to dvir_count; dmel_count is a zero
# placeholder so both species' tables share the same columns.
vir_worthologs = vir_worthologs.assign(
    dmel_count=0,
    dvir_count=vir_worthologs['binary'],
)

In [41]:
# Helper column of ones so that hits can be counted per (motif, gene) group below.
targene_dmel['binary'] = 1 

In [42]:
# Number of D. melanogaster intersect rows per (motif, target gene); the count
# ends up in the 'binary' column.
melcounts = (
    targene_dmel
    .groupby(['motif_fbgn','target_gene'])['binary']
    .count()
    .reset_index()
)

In [43]:
# Attach the D. virilis ortholog to each D. melanogaster count row; rows
# without a one-to-one ortholog are dropped by the inner join.
mel_worthologs = pd.merge(melcounts, dvirorthologs, how='inner',
                          left_on='target_gene', right_on='Dmel')
mel_worthologs = mel_worthologs[['motif_fbgn', 'Dvir', 'Dmel', 'binary']]

In [46]:
# D. melanogaster rows contribute only to dmel_count; dvir_count is a zero
# placeholder so both species' tables share the same columns.
mel_worthologs = mel_worthologs.assign(
    dmel_count=mel_worthologs['binary'],
    dvir_count=0,
)

In [48]:
# The raw count column has been copied into dmel_count / dvir_count, so it is
# no longer needed in either table.
vir_worthologs = vir_worthologs.drop('binary', axis=1)
mel_worthologs = mel_worthologs.drop('binary', axis=1)

In [52]:
# Stack both species' count rows; each (motif, Dvir, Dmel) key can now appear once per species, with the other species' count set to 0.
together = pd.concat([vir_worthologs, mel_worthologs])

In [54]:
# Remove fully identical rows before the per-key sums are taken.
together = together.drop_duplicates()

In [64]:
together[(together.motif_fbgn == 'FBgn0000014') & (together.Dvir == 'FBgn0282451')]

Unnamed: 0,motif_fbgn,Dvir,Dmel,dmel_count,dvir_count
0,FBgn0000014,FBgn0282451,FBgn0004910,0,176
588876,FBgn0000014,FBgn0282451,FBgn0004910,36,0


In [57]:
# Sum the two count columns per (motif, Dvir, Dmel) key, which folds the per-species rows into a single row holding both counts.
sumattempt = together.groupby(['motif_fbgn','Dvir','Dmel'])[['dmel_count','dvir_count']].sum()

In [60]:
# Turn the group keys (motif_fbgn, Dvir, Dmel) back into ordinary columns.
sumattempt = sumattempt.reset_index()

In [65]:
sumattempt[(sumattempt.motif_fbgn == 'FBgn0000014') & (sumattempt.Dvir == 'FBgn0282451')]

Unnamed: 0,motif_fbgn,Dvir,Dmel,dmel_count,dvir_count
11129,FBgn0000014,FBgn0282451,FBgn0004910,36,176


In [67]:
# Single label for the ortholog pair, formatted as '<Dvir FBgn>:<Dmel FBgn>'.
sumattempt['target_gene'] = sumattempt.Dvir + ':' + sumattempt.Dmel

In [69]:
# Final table: one row per motif and ortholog pair, with the hit counts in each species.
table_ = sumattempt[['motif_fbgn','target_gene','dmel_count','dvir_count']]

In [None]:
#want to do this but it seems like it won't run for some reason... 
# NOTE(review): target_gene is a '<Dvir>:<Dmel>' string label, while lmplot
# fits a regression and so presumably needs numeric x and y; plotting
# dmel_count against dvir_count may be what was intended -- confirm.
#sb.lmplot(x='target_gene',y='dmel_count', data=table_)