In [None]:
import sys
import pandas as pd
import numpy as np

# Extend the module search path with a hard-coded, machine-specific directory.
# NOTE(review): presumably this is where the hydra checkout lives - adjust per environment.
sys.path.append('/opt/hydra/')

# Imported after the sys.path change above; presumably `library` is only
# resolvable through that directory - TODO confirm.
import library.analysis as hydra

In [None]:
# Input locations, relative to the notebook's working directory.
# mm_path: passed to hydra as the first positional argument of the enrichment classes.
mm_path = '../data/micro-58-filter/MultiModalGenes'
# exp_path: tab-separated expression matrix, also loaded directly with pandas.
exp_path = '../data/synovial-microarray-58-processed.tsv'

In [None]:
# Load the tab-separated expression matrix; the first column becomes the row index.
exp = pd.read_csv(filepath_or_buffer=exp_path,
                  index_col=0,
                  sep='\t')

In [None]:
# Optional scan over candidate `min_prob_range` values. The `if False:` guard
# keeps it disabled so that running the whole notebook skips this step; flip
# the guard manually to run it.
if False:
    # Five evenly spaced values between 0.1 and 0.15, rounded to two decimals.
    min_prob_range = [round(p, 2) for p in np.linspace(0.1, 0.15, 5)]
    print(min_prob_range)
    scanner = hydra.ScanEnrichmentAnalysis(mm_path,
                                           exp_path,
                                           'GO',
                                           CPU=7,
                                           min_prob_range=min_prob_range)
    scan = scanner.scan()

In [None]:
import numpy as np

# Enrichment analysis object built from the multimodal-gene directory and the
# expression file, using the 'GO' gene-set selector and a 0.14 probability filter.
# NOTE(review): 0.14 was presumably picked from the scan cell above - confirm.
mm = hydra.EnrichmentAnalysis(
    mm_path,
    exp_path,
    gmt_path='GO',
    min_prob_filter=0.14,
)

In [None]:
# Build the mixture model on the expression rows selected by the enriched-term
# genes reported by `mm`. `reindex` keeps the row order returned by
# `mm.get_enriched_term_genes()` and yields NaN rows for labels missing from `exp`.
# NOTE(review): the meaning of gamma / variance / K is defined by hydra - see its docs.
clus = hydra.MultivariateMixtureModel(
    K=1,
    gamma=5.0,
    variance=2.0,
    center=True,
    data=exp.reindex(mm.get_enriched_term_genes()),
)

In [None]:
# Per-cluster feature tables computed by the model for the full expression matrix.
# Later cells treat the result as a mapping: it is iterated with `.items()` and
# indexed by cluster key.
cfeat = clus.get_cluster_features(exp)

In [None]:
def format_pathway(x):
    """Turn a '%'-delimited pathway identifier into 'NAME (ID)' form.

    The string is split on '%'; the first field is used as the name and the
    second as the parenthesised identifier. Any further fields are ignored.

    Parameters
    ----------
    x : str
        Pathway identifier, e.g. 'NAME%ID'.

    Returns
    -------
    str
        'NAME (ID)', or `x` unchanged when it contains no '%' separator
        (previously this case raised IndexError).
    """
    fields = x.split('%')
    if len(fields) < 2:
        # Nothing to put in the parentheses; keep the name as it is.
        return x
    return '%s (%s)' % (fields[0], fields[1])

# For every cluster, write a LaTeX table of its top pathways to ../img/ and
# echo the same table to the cell output. "Top" means NES > 0 and padj < 0.05,
# sorted by decreasing NES, at most 10 rows.
pth = '../img/microarray-58-top-10-pathways-cluster-%d.tex'
for key, values in cfeat.items():
    # display.precision is lowered to 2 only while the tables are rendered.
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)
        with open(pth % key, 'w') as f:
            significant_up = (t['NES'] > 0) & (t['padj'] < 0.05)
            top = t[significant_up].sort_values('NES', ascending=False).head(10)
            print(top.to_latex(index=False))
            top.to_latex(f, index=False)

In [None]:
# Assemble one wide table with a (cluster, feature) column MultiIndex holding
# the 'padj' and 'NES' columns of every cluster's feature table.
# `fgsea_df` stays None when `cfeat` is empty.
fgsea_df = None
for key, values in cfeat.items():
    if fgsea_df is None:
        # Built on the first iteration so the row index can be taken from the
        # first cluster's table; all later clusters are aligned to that index.
        header = pd.MultiIndex.from_product([list(cfeat.keys()),
                                             ['padj', 'NES']],
                                            names=['cluster', 'feature'])

        fgsea_df = pd.DataFrame(index=values.index.values, columns=header)

    # Parenthesised single-argument form is valid in both Python 2 and 3
    # (the former `print 'Key: ', key,` statement is a SyntaxError on Python 3).
    print('Key: %s' % key)
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway)
    # Preview only: top 10 pathways with NES > 0 and padj < 0.05, by decreasing NES.
    print(t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10))

    # Select rows by the shared index so every cluster lines up row for row.
    fgsea_df[(key, 'padj')] = values.loc[fgsea_df.index, 'padj'].values
    fgsea_df[(key, 'NES')] = values.loc[fgsea_df.index, 'NES'].values

In [None]:
# Persist the combined per-cluster padj/NES table as a tab-separated file
# (row index included).
pth = '../data/micro-58-synovial-Pathways-Enrichment.tsv'
fgsea_df.to_csv(path_or_buf=pth, sep='\t')

In [None]:
# Display the first rows of the feature table stored under key 2 of `cfeat`.
# NOTE(review): the key is hard-coded; this raises if `cfeat` has no entry 2.
cfeat[2].head()

In [None]:
# Load the tab-separated sample annotation table. The SECOND column
# (index_col=1) becomes the row index; later cells look rows up by sample name.
pth = '../data/pdata-group-2.tsv'
p2 = pd.read_csv(filepath_or_buffer=pth,
                 index_col=1,
                 sep='\t')
p2.head()

In [None]:
# Build the per-sample survival table: one row per expression column, pairing
# the model's cluster assignment with annotations looked up in `p2`.
# Rows are collected in a list and the DataFrame is created once, instead of
# enlarging it with `surv.loc[len(surv), :]` on every iteration (each
# enlargement copies the frame). Column dtypes are inferred by pandas from the
# collected values.
surv_rows = []
for sample, assignment in zip(exp.columns, clus.get_assignments(exp)):
    # Binary metastasis flag: 1 only when the annotation is exactly 'yes'.
    met = 1 if p2.loc[sample, 'metastasis:ch1'] == 'yes' else 0
    surv_rows.append([sample,
                      assignment,
                      p2.loc[sample, 'fusion transcrit:ch1'],  # column name spelled as in the source table
                      met,
                      p2.loc[sample, 'time:ch1']])

surv = pd.DataFrame(surv_rows,
                    columns=['sample', 'cluster', 'fusion', 'metastasis', 'time'])

In [None]:
# Write the full survival table as a tab-separated file, without the row index.
surv.to_csv(path_or_buf="../data/micro-58-survival-data.tsv",
            index=False,
            sep='\t')

In [None]:
# Write only the samples assigned to cluster 0 or 1 as a tab-separated file,
# without the row index.
surv.loc[surv['cluster'].isin([0, 1])].to_csv(
    "../data/micro-58-cluster-0-or-1-survival-data.tsv",
    index=False,
    sep='\t',
)