In [2]:
import sys
import pandas as pd
import numpy as np

sys.path.append('/opt/hydra/')

import library.analysis as hydra

In [3]:
# Input locations, given relative to the notebook's working directory.
# exp_path: tab-separated expression matrix (read with sep='\t' in the next cell).
exp_path = '../data/synovial-microarray-58-processed.tsv'
# mm_path: handed to the hydra analysis classes as their `mm_path` argument.
# NOTE(review): presumably the output directory of a hydra multimodal-gene filter run -- confirm.
mm_path = '../data/micro-58-filter/MultiModalGenes'

In [4]:
# Load the expression matrix; the first column of the file becomes the row index.
exp = pd.read_csv(exp_path, sep='\t', index_col=0)

In [5]:
# Sanity check: (rows, columns) of the expression matrix. Columns are iterated as
# samples further down; rows are re-indexed by gene when building the mixture model.
exp.shape

(18834, 58)

In [5]:
# Print the signature and public methods of hydra.ScanEnrichmentAnalysis for reference.
help(hydra.ScanEnrichmentAnalysis)

Help on class ScanEnrichmentAnalysis in module library.analysis:

class ScanEnrichmentAnalysis(__builtin__.object)
 |  Methods defined here:
 |  
 |  __init__(self, mm_path, exp_path, gmt_path, min_prob_range=None, min_effect_filter=1.0, **kwargs)
 |      Class to explore the function of multimodally expressed genes.
 |      
 |      :param min_prob_range (iterable): Iterable containing floats between 0 and 0.5
 |  
 |  plot(self)
 |  
 |  scan(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [6]:
# Disabled on purpose: the `if False:` guard means this cell never runs, so neither
# `min_prob_range` nor `scan` is defined for later cells.
# When enabled it would try five min-prob thresholds evenly spaced over [0.10, 0.15]
# (rounded to two decimals) and pass them to ScanEnrichmentAnalysis, with 'GO' as the
# positional `gmt_path` argument.
# NOTE(review): CPU=7 is forwarded via **kwargs; presumably a worker count -- confirm.
if False:
    min_prob_range = [round(x, 2) for x in np.linspace(0.1, 0.15, 5)]
    print(min_prob_range)
    scan = hydra.ScanEnrichmentAnalysis(mm_path, 
                                        exp_path, 
                                        'GO', 
                                        min_prob_range=min_prob_range,
                                        CPU=7).scan()

In [7]:
import numpy as np

# Build the enrichment analysis object for the multimodal genes; it is queried below
# via mm.get_enriched_term_genes().
# NOTE(review): min_prob_filter=0.14 falls inside the 0.10-0.15 range of the disabled
# scan cell; presumably it was picked from that scan -- confirm and record the rationale.
mm = hydra.EnrichmentAnalysis(mm_path,
                              exp_path,
                              min_prob_filter=0.14,
                              gmt_path='GO')

In [8]:
# Build hydra's multivariate mixture model from the expression rows of the genes
# returned by mm.get_enriched_term_genes(). reindex() keeps that gene order and yields
# all-NaN rows for any requested gene that is missing from `exp`.
# NOTE(review): center/gamma/variance/K are passed straight through to hydra; their
# exact meaning (and the effect of K=1) is not visible here -- confirm in hydra's docs.
clus = hydra.MultivariateMixtureModel(data=exp.reindex(mm.get_enriched_term_genes()),
                                      center=True,
                                      gamma=5.0,
                                      variance=2.0,
                                      K=1)

In [9]:
# Per-cluster feature tables. Later cells use `cfeat` as a mapping from cluster key to
# a DataFrame indexed by pathway that carries (at least) 'padj' and 'NES' columns.
cfeat = clus.get_cluster_features(exp)

In [33]:
def format_pathway(x):
    """Turn a '%'-delimited pathway identifier into a short display label.

    Identifiers in the enrichment tables look like ``NAME%SOURCE%ID``
    (e.g. ``'DENDRITIC SPINE DEVELOPMENT%GOBP%GO:0060996'``); the label keeps
    the name and the source as ``'NAME (SOURCE)'`` and drops everything else.

    :param x (str): Pathway identifier.
    :return (str): ``'NAME (SOURCE)'``, or the identifier unchanged when it
        contains no ``'%'`` delimiter (previously this raised IndexError).
    """
    fields = x.split('%')
    if len(fields) < 2:
        # No source field present: fall back to the bare name instead of failing.
        return fields[0]
    return '%s (%s)' % (fields[0], fields[1])

# For every cluster, export the ten most positively enriched pathways
# (NES > 0, padj < 0.05, ranked by descending NES) as a LaTeX table and echo the
# same table in the notebook. The table is computed once so that the printed and
# the written versions cannot drift apart.
pth = '../img/microarray-58-top-10-pathways-cluster-%d.tex'
for key, values in cfeat.items():
    # display.precision is scoped to this block so it only affects these tables.
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)
        with open(pth % key, 'w') as f:
            significant = (t['NES'] > 0) & (t['padj'] < 0.05)
            top = t[significant].sort_values('NES', ascending=False).head(10)
            print(top.to_latex(index=False))
            top.to_latex(f, index=False)

\begin{tabular}{lrr}
\toprule
                                           pathway &      padj &   NES \\
\midrule
                            TRANSLATION (REACTOME) &  3.36e-03 &  4.52 \\
                                TRANSLATION (GOBP) &  3.36e-03 &  4.42 \\
                        RIBOSOME BIOGENESIS (GOBP) &  3.36e-03 &  4.42 \\
 RRNA PROCESSING (REACTOME DATABASE ID RELEASE 66) &  3.36e-03 &  4.29 \\
 RRNA PROCESSING IN THE NUCLEUS AND CYTOSOL (RE... &  3.36e-03 &  4.25 \\
 MAJOR PATHWAY OF RRNA PROCESSING IN THE NUCLEO... &  3.36e-03 &  4.22 \\
               PEPTIDE BIOSYNTHETIC PROCESS (GOBP) &  3.36e-03 &  4.17 \\
                            RRNA PROCESSING (GOBP) &  3.36e-03 &  4.16 \\
                           NCRNA PROCESSING (GOBP) &  3.36e-03 &  4.16 \\
               HALLMARK\_MYC\_TARGETS\_V1 (MSIGDB\_C2) &  3.36e-03 &  4.09 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
                                           pathway &      padj &   NES \\
\midrule
   

In [35]:
# Collect padj and NES of every cluster into one wide table whose columns are a
# (cluster, feature) MultiIndex and whose rows are the pathways of the first cluster.
fgsea_df = None
for key, values in cfeat.items():
    if fgsea_df is None:
        # Created lazily because the row index is taken from the first cluster's table.
        header = pd.MultiIndex.from_product([list(cfeat.keys()),
                                             ['padj', 'NES']],
                                            names=['cluster', 'feature'])

        fgsea_df = pd.DataFrame(index=values.index.values, columns=header)

    # Single-argument print(...) behaves the same on Python 2 and 3; printing the key
    # on its own line keeps the table header aligned with its rows.
    print('Key: %s' % (key,))
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway)
    print(t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10))

    # reindex() aligns on the pathway labels and yields NaN for pathways this cluster
    # does not report, instead of relying on .loc with possibly missing labels.
    aligned = values.reindex(fgsea_df.index)
    fgsea_df[(key, 'padj')] = aligned['padj'].values
    fgsea_df[(key, 'NES')] = aligned['NES'].values

Key:  0                                              pathway      padj       NES
0                             TRANSLATION (REACTOME)  0.003357  4.519679
1                                 TRANSLATION (GOBP)  0.003357  4.424136
2                         RIBOSOME BIOGENESIS (GOBP)  0.003357  4.417018
3  RRNA PROCESSING (REACTOME DATABASE ID RELEASE 66)  0.003357  4.288639
4  RRNA PROCESSING IN THE NUCLEUS AND CYTOSOL (RE...  0.003357  4.252425
5  MAJOR PATHWAY OF RRNA PROCESSING IN THE NUCLEO...  0.003357  4.215953
6                PEPTIDE BIOSYNTHETIC PROCESS (GOBP)  0.003357  4.174270
7                             RRNA PROCESSING (GOBP)  0.003357  4.164961
8                            NCRNA PROCESSING (GOBP)  0.003357  4.157995
9                HALLMARK_MYC_TARGETS_V1 (MSIGDB_C2)  0.003357  4.088252
Key:  1                                              pathway      padj       NES
0     HALLMARK_INTERFERON_GAMMA_RESPONSE (MSIGDB_C2)  0.001768  5.112493
1         HALLMARK_INFLAMMATORY_RES

In [36]:
# Persist the combined per-cluster padj/NES table as a tab-separated file
# (the pathway index and the two-level column header are written as well).
pth = '../data/micro-58-synovial-Pathways-Enrichment.tsv'
fgsea_df.to_csv(pth, sep='\t')

In [39]:
# Peek at the first rows of the enrichment table stored under cluster key 2.
cfeat[2].head()

Unnamed: 0_level_0,pval,padj,ES,NES,nMoreExtreme,size,leadingEdge
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POSITIVE REGULATION OF SYNAPSE ASSEMBLY%GOBP%GO:0051965,0.000123,0.010205,0.601092,2.567875,0,24,NLGN3|CLSTN2|EPHB3|CUX2|CLSTN1|BHLHB9|IL1RAPL1...
DENDRITIC SPINE MORPHOGENESIS%GOBP%GO:0060997,0.000124,0.010205,0.683882,2.501744,0,15,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|WNT7A|EPHB1|EP...
CELL MORPHOGENESIS INVOLVED IN NEURON DIFFERENTIATION%GOBP%GO:0048667,0.000107,0.010205,0.316988,2.440983,0,256,NLGN3|PAK3|NRP2|SPTBN4|EPHA4|CDHR1|APP|ADGRB1|...
DENDRITIC SPINE DEVELOPMENT%GOBP%GO:0060996,0.000122,0.010205,0.622953,2.429497,0,18,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|ARF4|WNT7A|EPH...
CYTOSKELETON-DEPENDENT INTRACELLULAR TRANSPORT%GOBP%GO:0030705,0.000112,0.010205,0.352694,2.41505,0,137,DLG2|KIF3B|APP|CLUAP1|MAP6|IFT88|SSX2IP|KIF3A|...


In [1]:
# Load the sample annotation table; the file's second column (index_col=1) becomes
# the index, which the survival cell below looks up by expression column name.
# NOTE(review): the saved output of this cell is a NameError because it was last run
# before the import cell (execution count 1); re-run the notebook top to bottom.
pth = '../data/pdata-group-2.tsv'
p2 = pd.read_csv(pth, sep='\t', index_col=1)
p2.head()

NameError: name 'pd' is not defined

In [16]:
# Assemble one survival record per sample: cluster assignment from the mixture model
# plus fusion type, metastasis flag (1 for 'yes', otherwise 0) and follow-up time taken
# from the annotation table. Rows are gathered in a list and the frame is built once,
# which avoids growing a DataFrame row by row inside the loop.
records = []
for sample, assignment in zip(exp.columns, clus.get_assignments(exp)):
    met = 1 if p2.loc[sample, 'metastasis:ch1'] == 'yes' else 0
    records.append([sample,
                    assignment,
                    p2.loc[sample, 'fusion transcrit:ch1'],
                    met,
                    p2.loc[sample, 'time:ch1']])

surv = pd.DataFrame(records, columns=['sample', 'cluster', 'fusion', 'metastasis', 'time'])

In [18]:
# Write the full survival table as TSV, without the positional row index.
surv.to_csv("../data/micro-58-survival-data.tsv", sep='\t', index=False)

In [21]:
# Write a second TSV restricted to samples assigned to cluster 0 or cluster 1.
surv[surv['cluster'].isin([0, 1])].to_csv("../data/micro-58-cluster-0-or-1-survival-data.tsv", 
                                         sep='\t', 
                                         index=False)