In [None]:
import sys
import pandas as pd
import numpy as np

sys.path.append('/opt/hydra/')

import library.analysis as hydra

In [None]:
# Input locations, given relative to the notebook's working directory.
# exp_path: tab-separated expression table (loaded with pandas in a later cell);
#           presumably log2(TPM + 1) values judging by the file name -- TODO confirm.
# mm_path:  handed to the hydra enrichment classes as their `mm_path` input.
exp_path = '../data/TH-ALVEOLAR-log2TPM1.tsv'
mm_path = '../data/hydra-alveolar/MultiModalGenes'

In [None]:
# Load the expression table; the first column becomes the row index.
# Later cells reindex rows by gene and treat columns as samples.
exp = pd.read_csv(exp_path, sep='\t', index_col=0)

In [None]:
# Thresholds to scan: 15 evenly spaced values from 0.2 to 0.4 inclusive.
# (A coarser grid, [0.25, 0.33, 0.35], was used previously.)
min_prob_grid = np.linspace(0.2, 0.4, 15)

# Positional arguments are (mm_path, exp_path, 'GO'); 'GO' is presumably the
# gene-set selector -- TODO confirm against hydra.ScanEnrichmentAnalysis.
scan = hydra.ScanEnrichmentAnalysis(
    mm_path,
    exp_path,
    'GO',
    min_prob_range=min_prob_grid,
    K=5
).scan()

In [None]:
# Show the scan result as the cell output.
scan

In [None]:
# Build the enrichment analysis from the expression table and the
# multimodal-gene directory, using the 'GO' gene-set selector.
# NOTE(review): min_prob_filter=0.33 is presumably chosen from the scan
# output shown earlier in the notebook -- confirm.
en = hydra.EnrichmentAnalysis(exp_path=exp_path,
                              mm_path=mm_path,
                              gmt_path='GO',
                              min_prob_filter=0.33)

In [None]:
# Show the enriched terms found by the analysis as the cell output.
en.get_enriched_terms()

In [None]:
# Restrict the expression table to the genes behind the enriched terms,
# then hand that sub-matrix to hydra's hierarchical clustering and plot it.
enriched_exp = exp.reindex(en.get_enriched_term_genes())
hclust = hydra.HClust(enriched_exp)
hclust.plot()

In [None]:
# Fit hydra's multivariate mixture model on the enriched-term genes only.
enriched_exp = exp.reindex(en.get_enriched_term_genes())
clust = hydra.MultivariateMixtureModel(
    data=enriched_exp,
    center=True,
    gamma=5.0,
    variance=2.0,
    K=1
)

In [None]:
# Gene-set file used to characterise each cluster.
gmt_path = '/opt/hydra/gene-sets/Human_GOBP_AllPathways_no_GO_iea_December_01_2018_symbol.gmt'

# Later cells iterate over this result with .items(), one entry per cluster key.
fgsea = clust.get_cluster_features(exp=exp, gmt=gmt_path)

In [None]:
def format_pathway(x):
    """Shorten a '%'-delimited pathway identifier for display.

    The first two '%'-separated fields are rendered as
    ``'<first> (<second>)'``; any further fields are dropped.

    Parameters
    ----------
    x : str
        Pathway identifier, e.g. ``'NAME%SOURCE%ID'``.

    Returns
    -------
    str
        ``'NAME (SOURCE)'``, or ``x`` unchanged when it has no '%'
        delimiter (previously this case raised ``IndexError``).
    """
    fields = x.split('%')
    if len(fields) < 2:
        # Nothing to put in the parentheses; keep the name as it is.
        return x
    return '%s (%s)' % (fields[0], fields[1])

In [None]:
# Collect padj and NES for every cluster into one wide table whose columns
# are a (cluster, feature) MultiIndex, and print each cluster's top pathways.
fgsea_df = None
for key, values in fgsea.items():
    if fgsea_df is None:
        # Allocate the table on the first iteration: rows follow the first
        # cluster's result index, columns cover every cluster key.
        header = pd.MultiIndex.from_product([list(fgsea.keys()),
                                             ['padj', 'NES']],
                                            names=['cluster', 'feature'])

        fgsea_df = pd.DataFrame(index=values.index.values, columns=header)

    # Single-argument print(...) behaves the same on Python 2 and 3; the old
    # `print 'Key: ', key,` statement is a syntax error on Python 3 and ran
    # the key into the first line of the table printed below.
    print('Key: %s' % (key,))
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway)

    # Top 10 positively enriched pathways (NES > 0) with padj < 0.05.
    significant_up = (t['NES'] > 0) & (t['padj'] < 0.05)
    print(t[significant_up].sort_values('NES', ascending=False).head(10))

    # Align every cluster to the row order fixed on the first iteration.
    fgsea_df[(key, 'padj')] = values.loc[fgsea_df.index, 'padj'].values
    fgsea_df[(key, 'NES')] = values.loc[fgsea_df.index, 'NES'].values

In [None]:
# Write the combined per-cluster padj/NES table as a tab-separated file.
pth = '../data/TH-aRMS-Pathways-Enrichment.tsv'
fgsea_df.to_csv(pth, sep='\t')

In [None]:
# Write one LaTeX table per cluster with its top 10 positively enriched
# pathways (NES > 0, padj < 0.05), and echo the same LaTeX to the output.
cpth = '../img/TH-aRMS-top-10-pathways-cluster-%d.tex'
for key, values in fgsea.items():
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)

        # Build the filtered table once and reuse it for both outputs.
        significant_up = (t['NES'] > 0) & (t['padj'] < 0.05)
        top10 = t[significant_up].sort_values('NES', ascending=False).head(10)

        # Format the per-cluster template `cpth`. The previous code formatted
        # `pth`, which here still holds the TSV path (no %d placeholder), so
        # a fresh top-to-bottom run failed with a TypeError.
        with open(cpth % key, 'w') as f:
            print(top10.to_latex(index=False))
            top10.to_latex(f, index=False)

In [None]:
# Ask the fitted model for a cluster assignment per sample, using the same
# enriched-term gene matrix it was fitted on.
enriched_exp = exp.reindex(en.get_enriched_term_genes())
assignments = clust.get_assignments(enriched_exp)

# One row per sample; column 1 holds the assignment shifted by +1.
assign = pd.DataFrame(index=exp.columns, columns=[1])

# Assignments are paired with samples by position in exp.columns.
for sample, assignment in zip(exp.columns, assignments):
    assign.loc[sample, 1] = assignment + 1

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Order samples by cluster label and give each cluster one "Set2" colour.
assign_sorted = assign.sort_values(1)
col_order = assign_sorted.index.values
cmap = sns.color_palette("Set2", max(assign[1].values))
col_colors = [cmap[x - 1] for x in assign_sorted[1].values]

# Heatmap of the enriched-term genes: rows are z-scored (z_score=0) and
# clustered with Ward linkage; columns keep the cluster ordering above.
enriched_exp = exp.reindex(en.get_enriched_term_genes())
g = sns.clustermap(enriched_exp.reindex(col_order, axis=1),
                   method='ward',
                   z_score=0,
                   center=0,
                   cmap=sns.diverging_palette(240, 10, n=7),
                   col_cluster=False,
                   col_colors=col_colors,
                   figsize=(10, 10))

# Strip tick marks and labels from both heatmap axes.
ax = g.ax_heatmap
ax.set_xticklabels([])
ax.set_xticks([])
ax.set_yticklabels([])
ax.set_yticks([])

# Save the figure as SVG, then as PNG.
for fmt, pth in (('svg', '../img/aRMS-expression-heatmap.svg'),
                 ('png', '../img/aRMS-expression-heatmap.png')):
    plt.savefig(pth, format=fmt, bbox_inches='tight')

In [None]:
def format_pathway(x):
    """Shorten a '%'-delimited pathway identifier for display.

    The first two '%'-separated fields are rendered as
    ``'<first> (<second>)'``; any further fields are dropped.

    NOTE: ``format_pathway`` is also defined in an earlier cell of this
    notebook; this definition replaces it for the cells that follow.

    Parameters
    ----------
    x : str
        Pathway identifier, e.g. ``'NAME%SOURCE%ID'``.

    Returns
    -------
    str
        ``'NAME (SOURCE)'``, or ``x`` unchanged when it has no '%'
        delimiter (previously this case raised ``IndexError``).
    """
    fields = x.split('%')
    if len(fields) < 2:
        # Nothing to put in the parentheses; keep the name as it is.
        return x
    return '%s (%s)' % (fields[0], fields[1])

In [None]:
# Print, for each cluster, its top 10 positively enriched pathways
# (NES > 0) with padj < 0.05, highest NES first.
for key, values in fgsea.items():
    # Single-argument print(...) behaves the same on Python 2 and 3; the old
    # `print 'Key: ', key,` statement is a syntax error on Python 3 and ran
    # the key into the first line of the table printed below.
    print('Key: %s' % (key,))
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway)
    significant_up = (t['NES'] > 0) & (t['padj'] < 0.05)
    print(t[significant_up].sort_values('NES', ascending=False).head(10))

In [None]:
# Per-cluster LaTeX export: the top 10 positively enriched pathways
# (NES > 0, padj < 0.05) are echoed to the output and written to a .tex file.
pth = '../img/TH-aRMS-top-10-pathways-cluster-%d.tex'
for key, values in fgsea.items():
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)
        with open(pth % key, 'w') as f:
            # Filter and rank once, then render the same table twice.
            significant_up = (t['NES'] > 0) & (t['padj'] < 0.05)
            top10 = t[significant_up].sort_values('NES', ascending=False).head(10)
            print(top10.to_latex(index=False))
            top10.to_latex(f, index=False)