In [1]:
import sys
import pandas as pd
import numpy as np

sys.path.append('/opt/hydra/')

import library.analysis as hydra

In [2]:
exp_path = '../data/synovial-microarray-58-processed.tsv'
mm_path = '../fit/micro-58-filter/MultiModalGenes'

In [3]:
exp = pd.read_csv(exp_path, sep='\t', index_col=0)

In [4]:
exp.shape

(18834, 58)

In [5]:
help(hydra.ScanEnrichmentAnalysis)

Help on class ScanEnrichmentAnalysis in module library.analysis:

class ScanEnrichmentAnalysis(__builtin__.object)
 |  Methods defined here:
 |  
 |  __init__(self, mm_path, exp_path, gmt_path, min_prob_range=None, min_effect_filter=1.0, **kwargs)
 |      Class to explore the function of multimodally expressed genes.
 |      
 |      :param min_prob_range (iterable): Iterable containing floats between 0 and 0.5
 |  
 |  plot(self)
 |  
 |  scan(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [6]:
if False:
    min_prob_range = [round(x, 2) for x in np.linspace(0.1, 0.15, 5)]
    print(min_prob_range)
    scan = hydra.ScanEnrichmentAnalysis(mm_path, 
                                        exp_path, 
                                        'GO', 
                                        min_prob_range=min_prob_range,
                                        CPU=7).scan()

In [7]:
import numpy as np

mm = hydra.EnrichmentAnalysis(mm_path,
                              exp_path,
                              min_prob_filter=0.14,
                              gmt_path='GO')

In [8]:
clus = hydra.MultivariateMixtureModel(data=exp.reindex(mm.get_enriched_term_genes()),
                                      center=True,
                                      gamma=5.0,
                                      variance=2.0,
                                      K=1)

In [9]:
cfeat = clus.get_cluster_features(exp)

In [33]:
def format_pathway(x):
    fields = x.split('%')
    return '%s (%s)' % (fields[0], fields[1])

pth = '../img/microarray-58-top-10-pathways-cluster-%d.tex'
for key, values in cfeat.items():
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)
        with open(pth % key, 'w') as f:
            print(t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10).to_latex(index=False))
            t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10).to_latex(f, index=False)

\begin{tabular}{lrr}
\toprule
                                           pathway &      padj &   NES \\
\midrule
                            TRANSLATION (REACTOME) &  3.36e-03 &  4.52 \\
                                TRANSLATION (GOBP) &  3.36e-03 &  4.42 \\
                        RIBOSOME BIOGENESIS (GOBP) &  3.36e-03 &  4.42 \\
 RRNA PROCESSING (REACTOME DATABASE ID RELEASE 66) &  3.36e-03 &  4.29 \\
 RRNA PROCESSING IN THE NUCLEUS AND CYTOSOL (RE... &  3.36e-03 &  4.25 \\
 MAJOR PATHWAY OF RRNA PROCESSING IN THE NUCLEO... &  3.36e-03 &  4.22 \\
               PEPTIDE BIOSYNTHETIC PROCESS (GOBP) &  3.36e-03 &  4.17 \\
                            RRNA PROCESSING (GOBP) &  3.36e-03 &  4.16 \\
                           NCRNA PROCESSING (GOBP) &  3.36e-03 &  4.16 \\
               HALLMARK\_MYC\_TARGETS\_V1 (MSIGDB\_C2) &  3.36e-03 &  4.09 \\
\bottomrule
\end{tabular}

\begin{tabular}{lrr}
\toprule
                                           pathway &      padj &   NES \\
\midrule
   

In [35]:
fgsea_df = None
for key, values in cfeat.items():
    if fgsea_df is None:
        header = pd.MultiIndex.from_product([list(cfeat.keys()), 
                                             ['padj', 'NES']],
                                           names=['cluster', 'feature'])
        
        fgsea_df = pd.DataFrame(index=values.index.values, columns=header)
        
    print 'Key: ', key,
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway) 
    print(t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10))
    
    fgsea_df[(key, 'padj')] = values.loc[fgsea_df.index, 'padj'].values
    fgsea_df[(key, 'NES')] = values.loc[fgsea_df.index, 'NES'].values

Key:  0                                              pathway      padj       NES
0                             TRANSLATION (REACTOME)  0.003357  4.519679
1                                 TRANSLATION (GOBP)  0.003357  4.424136
2                         RIBOSOME BIOGENESIS (GOBP)  0.003357  4.417018
3  RRNA PROCESSING (REACTOME DATABASE ID RELEASE 66)  0.003357  4.288639
4  RRNA PROCESSING IN THE NUCLEUS AND CYTOSOL (RE...  0.003357  4.252425
5  MAJOR PATHWAY OF RRNA PROCESSING IN THE NUCLEO...  0.003357  4.215953
6                PEPTIDE BIOSYNTHETIC PROCESS (GOBP)  0.003357  4.174270
7                             RRNA PROCESSING (GOBP)  0.003357  4.164961
8                            NCRNA PROCESSING (GOBP)  0.003357  4.157995
9                HALLMARK_MYC_TARGETS_V1 (MSIGDB_C2)  0.003357  4.088252
Key:  1                                              pathway      padj       NES
0     HALLMARK_INTERFERON_GAMMA_RESPONSE (MSIGDB_C2)  0.001768  5.112493
1         HALLMARK_INFLAMMATORY_RES

In [36]:
pth = '../data/micro-58-synovial-Pathways-Enrichment.tsv'
fgsea_df.to_csv(pth, sep='\t')

In [39]:
cfeat[2].head()

Unnamed: 0_level_0,pval,padj,ES,NES,nMoreExtreme,size,leadingEdge
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POSITIVE REGULATION OF SYNAPSE ASSEMBLY%GOBP%GO:0051965,0.000123,0.010205,0.601092,2.567875,0,24,NLGN3|CLSTN2|EPHB3|CUX2|CLSTN1|BHLHB9|IL1RAPL1...
DENDRITIC SPINE MORPHOGENESIS%GOBP%GO:0060997,0.000124,0.010205,0.683882,2.501744,0,15,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|WNT7A|EPHB1|EP...
CELL MORPHOGENESIS INVOLVED IN NEURON DIFFERENTIATION%GOBP%GO:0048667,0.000107,0.010205,0.316988,2.440983,0,256,NLGN3|PAK3|NRP2|SPTBN4|EPHA4|CDHR1|APP|ADGRB1|...
DENDRITIC SPINE DEVELOPMENT%GOBP%GO:0060996,0.000122,0.010205,0.622953,2.429497,0,18,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|ARF4|WNT7A|EPH...
CYTOSKELETON-DEPENDENT INTRACELLULAR TRANSPORT%GOBP%GO:0030705,0.000112,0.010205,0.352694,2.41505,0,137,DLG2|KIF3B|APP|CLUAP1|MAP6|IFT88|SSX2IP|KIF3A|...


In [40]:
cfeat[1].head(50)

Unnamed: 0_level_0,pval,padj,ES,NES,nMoreExtreme,size,leadingEdge
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HALLMARK_INTERFERON_GAMMA_RESPONSE%MSIGDB_C2%HALLMARK_INTERFERON_GAMMA_RESPONSE,0.000173,0.001768,0.659739,5.112493,0,198,SLAMF7|IL6|CIITA|GZMA|HLA-DMA|SELP|CD69|IL4R|F...
HALLMARK_INFLAMMATORY_RESPONSE%MSIGDB_C2%HALLMARK_INFLAMMATORY_RESPONSE,0.000173,0.001768,0.646391,5.009057,0,198,P2RY2|IL7R|IL6|CD48|SLC31A2|CXCR6|CD69|IL4R|PT...
HALLMARK_TNFA_SIGNALING_VIA_NFKB%MSIGDB_C2%HALLMARK_TNFA_SIGNALING_VIA_NFKB,0.000172,0.001768,0.629679,4.872291,0,196,IL7R|IL6|PLEK|BIRC3|MAP3K8|CXCL2|IER3|CXCL3|CD...
RESPONSE TO INTERFERON-GAMMA%GOBP%GO:0034341,0.000166,0.001768,0.670848,4.769488,0,130,CIITA|CCL18|GBP2|IFI30|PTAFR|ICAM1|TLR2|CCL5|C...
INFLAMMATORY RESPONSE%GOBP%GO:0006954,0.000177,0.001768,0.577929,4.672275,0,249,IL6|CCR5|CD6|CXCL2|CIITA|S100A9|ITGB2|CCL18|CX...
LEUKOCYTE MIGRATION%GOBP%GO:0050900,0.000177,0.001768,0.572611,4.634945,0,250,LGALS3|JCHAIN|IGHA1|IL6|CD48|CD2|CCR5|CXCL2|FC...
HALLMARK_ALLOGRAFT_REJECTION%MSIGDB_C2%HALLMARK_ALLOGRAFT_REJECTION,0.000173,0.001768,0.593128,4.596306,0,198,IL6|FGR|LTB|CD2|CCR5|CD3D|IL2RG|CXCR3|ITGB2|GZ...
CELLULAR RESPONSE TO INTERFERON-GAMMA%GOBP%GO:0071346,0.000166,0.001768,0.660861,4.571965,0,115,CIITA|CCL18|GBP2|IFI30|PTAFR|ICAM1|TLR2|CCL5|C...
IMMUNOREGULATORY INTERACTIONS BETWEEN A LYMPHOID AND A NON-LYMPHOID CELL%REACTOME DATABASE ID RELEASE 66%198933,0.000167,0.001768,0.662514,4.570671,0,114,SLAMF7|HCST|CD3D|SH2D1A|KLRB1|ITGB2|CRTAM|CD96...
MYELOID CELL ACTIVATION INVOLVED IN IMMUNE RESPONSE%GOBP%GO:0002275,0.000189,0.001769,0.509123,4.561737,0,483,LGALS3|FGR|ALOX5|FCER1G|ITGAX|S100A9|ITGB2|SLP...


In [13]:
cfeat[2].head()

Unnamed: 0_level_0,pval,padj,ES,NES,nMoreExtreme,size,leadingEdge
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
POSITIVE REGULATION OF SYNAPSE ASSEMBLY%GOBP%GO:0051965,0.000123,0.010889,0.601092,2.577286,0,24,NLGN3|CLSTN2|EPHB3|CUX2|CLSTN1|BHLHB9|IL1RAPL1...
DENDRITIC SPINE MORPHOGENESIS%GOBP%GO:0060997,0.000126,0.010889,0.683882,2.491609,0,15,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|WNT7A|EPHB1|EP...
CELL MORPHOGENESIS INVOLVED IN NEURON DIFFERENTIATION%GOBP%GO:0048667,0.000107,0.010889,0.316988,2.438336,0,256,NLGN3|PAK3|NRP2|SPTBN4|EPHA4|CDHR1|APP|ADGRB1|...
DENDRITIC SPINE DEVELOPMENT%GOBP%GO:0060996,0.000125,0.010889,0.622953,2.425858,0,18,PAK3|SHANK1|SHANK2|EPHB3|ZNF365|ARF4|WNT7A|EPH...
DENDRITE DEVELOPMENT%GOBP%GO:0016358,0.000119,0.010889,0.450634,2.415156,0,50,PAK3|APP|SHANK1|MAP6|SHANK2|EPHB3|FLRT1|DVL1|N...


In [14]:
cfeat[3].head()

Unnamed: 0_level_0,pval,padj,ES,NES,nMoreExtreme,size,leadingEdge
pathway,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
HALLMARK_G2M_CHECKPOINT%MSIGDB_C2%HALLMARK_G2M_CHECKPOINT,0.00013,0.003052,0.524389,3.897572,0,194,CHAF1A|ILF3|TMPO|RAD21|RACGAP1|MCM3|NUSAP1|CBX...
HALLMARK_E2F_TARGETS%MSIGDB_C2%HALLMARK_E2F_TARGETS,0.00013,0.003052,0.504218,3.753634,0,196,SPAG5|ILF3|TMPO|RAD21|TIMELESS|RACGAP1|MCM3|KI...
REGULATION OF CELL CYCLE G2/M PHASE TRANSITION%GOBP%GO:1902749,0.00013,0.003052,0.531143,3.712578,0,144,FBXL7|CEP72|HAUS2|MSH6|RAD17|ATAD5|ZFYVE19|CDK...
REGULATION OF G2/M TRANSITION OF MITOTIC CELL CYCLE%GOBP%GO:0010389,0.00013,0.003052,0.543359,3.708448,0,129,FBXL7|CEP72|HAUS2|MSH6|RAD17|ZFYVE19|CDK2|HUS1...
MITOTIC PROMETAPHASE%REACTOME DATABASE ID RELEASE 66%68877,0.00013,0.003052,0.48266,3.508886,0,173,CEP72|NUP43|HAUS2|CENPK|KNTC1|RAD21|DSN1|DYNC1...


In [15]:
pth = '../data/pdata-group-2.tsv'
p2 = pd.read_csv(pth, sep='\t', index_col=1)
p2.head()

Unnamed: 0,title,geo_accession,status,submission_date,last_update_date,type,channel_count,source_name_ch1,organism_ch1,characteristics_ch1,...,contact_city,contact_zip/postal_code,contact_country,supplementary_file,data_row_count,age group:ch1,cinsarc group:ch1,fusion transcrit:ch1,metastasis:ch1,time:ch1
SS001,GSM983734,GSM983734,Public on Aug 09 2014,Aug 09 2012,Aug 09 2014,RNA,1,member,Homo sapiens,cinsarc group: C2,...,Bordeaux,33076,France,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM983n...,41078,adult,C2,SSX1,yes,2.612
SS002,GSM983735,GSM983735,Public on Aug 09 2014,Aug 09 2012,Aug 09 2014,RNA,1,member,Homo sapiens,cinsarc group: C1,...,Bordeaux,33076,France,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM983n...,41078,pediatric,C1,SSX1,no,11.406
SS003,GSM983736,GSM983736,Public on Aug 09 2014,Aug 09 2012,Aug 09 2014,RNA,1,trunk,Homo sapiens,cinsarc group: C2,...,Bordeaux,33076,France,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM983n...,41078,adult,C2,SSX2,yes,0.0
SS004,GSM983737,GSM983737,Public on Aug 09 2014,Aug 09 2012,Aug 09 2014,RNA,1,member,Homo sapiens,cinsarc group: C1,...,Bordeaux,33076,France,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM983n...,41078,adult,C1,SSX1,no,0.816
SS005,GSM983738,GSM983738,Public on Aug 09 2014,Aug 09 2012,Aug 09 2014,RNA,1,member,Homo sapiens,cinsarc group: C2,...,Bordeaux,33076,France,ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM983n...,41078,adult,C2,SSX1,yes,0.0


In [16]:
surv = pd.DataFrame(columns=['sample', 'cluster', 'fusion', 'metastasis', 'time'])

for sample, assignment in zip(exp.columns, clus.get_assignments(exp)):
    met = 1 if p2.loc[sample, 'metastasis:ch1'] == 'yes' else 0
    surv.loc[len(surv), :] = [sample, assignment, p2.loc[sample, 'fusion transcrit:ch1'], met, p2.loc[sample, 'time:ch1']]

In [17]:
for i, rows in surv.groupby('cluster'):
    print(i)
    print(rows)

0
   sample cluster fusion metastasis   time
6   SS007       0   SSX1          0  2.174
12  SS013       0   SSX1          1  0.099
16  SS017       0   SSX1          0  6.078
18  SS019       0   SSX2          1  0.539
19  SS020       0   SSX1          0  3.318
20  SS021       0   SSX2          1  0.498
21  SS022       0   SSX1          1      0
23  SS024       0   SSX2          0  2.927
30  SS031       0   SSX1          1  1.076
38  SS039       0   SSX1          1      0
39  SS040       0   SSX1          1      0
40  SS041       0   SSX1          0   2.62
46  SS047       0   SSX1          1    0.2
48  SS049       0   SSX2          1  0.791
52  SS053       0   SSX1          0  0.008
56  SS057       0   SSX1          1  1.927
1
   sample cluster fusion metastasis    time
0   SS001       1   SSX1          1   2.612
3   SS004       1   SSX1          0   0.816
5   SS006       1   SSX2          0   2.045
8   SS009       1   SSX1          0   1.109
26  SS027       1   SSX2          1   0.903
2

In [18]:
surv.to_csv("../data/micro-58-survival-data.tsv", sep='\t', index=False)

In [21]:
surv[surv['cluster'].isin([0, 1])].to_csv("../data/micro-58-cluster-0-or-1-survival-data.tsv", 
                                         sep='\t', 
                                         index=False)