In [None]:
%matplotlib inline
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import bnpy
import warnings
from scipy import stats
import random

from scipy.stats import kruskal, f_oneway
import collections
import scikit_posthocs as sp

warnings.filterwarnings('ignore')

In [None]:
# Expression matrix read from a tab-separated file; the first column becomes the row index.
# NOTE(review): later cells select samples by column (exp[samp]), so rows are presumably genes — confirm.
pth = '../data/expression/target-high-risk-nbl-mycn-na-exp-2018-11-12.tsv'

exp = pd.read_csv(pth, sep='\t', index_col=0) 

In [None]:
# Cluster assignment table (tab-separated), indexed by its first column.
# Later cells read the cluster label from the column named '1' and use the index values as sample IDs.
pth = '../data/archive/mycn-na-nbl-0.2-assign-2019-03-02.tsv'

assign = pd.read_csv(pth, sep='\t', index_col=0)

In [None]:
# Keep only the expression columns of the samples whose cluster label (column '1') is 1
# and save that subset as a tab-separated file in the current working directory.
# NOTE(review): the output name says "cluster0" while the selected label is 1 — confirm the offset is intended.
selected_samples = assign.loc[assign['1'] == 1].index.values
selected_exp = exp.reindex(selected_samples, axis=1)
selected_exp.to_csv('target-high-risk-nbl-mycn-na-exp-2018-11-12-cluster0.tsv', sep='\t')

In [None]:
def n1(key, zscore):
    """Run the fgsea R script on one ranked score vector and return its output.

    The scores are sorted in descending order, written as a headerless,
    tab-separated ``<key>.rnk`` file inside a fresh temporary directory, and
    passed to ``/opt/hydra/bin/fgsea.R`` together with the GMT gene-set file.
    The ``<key>.fgsea`` file produced by the script is read back and the
    temporary directory is removed before returning.

    Parameters
    ----------
    key : str
        Label used to name the intermediate ``.rnk`` / ``.fgsea`` files.
    zscore : pandas.Series
        Scores to rank; must support ``sort_values`` and ``to_csv``.
        Presumably indexed by gene symbol (the GMT file name ends in
        "symbol") — confirm against the caller.

    Returns
    -------
    pandas.DataFrame
        Contents of the script's output file, with the first column as index.

    Raises
    ------
    subprocess.CalledProcessError
        If ``Rscript`` exits with a non-zero status.
    """
    import shutil
    import subprocess
    import tempfile

    # mkdtemp gives a unique, private directory under the platform temp dir
    # instead of a hand-built '/tmp/<uuid>' path.
    _dir = tempfile.mkdtemp()

    try:
        rnk_pth = os.path.join(_dir, "%s.rnk" % key)
        out_pth = os.path.join(_dir, "%s.fgsea" % key)

        cmd = ["Rscript",
               "/opt/hydra/bin/fgsea.R",
               "/opt/hydra/data/Human_GO_AllPathways_no_GO_iea_October_01_2018_symbol.gmt",
               rnk_pth,
               out_pth]

        zscore = zscore.sort_values(ascending=False)

        zscore.to_csv(rnk_pth,
                      header=None,
                      sep='\t')

        subprocess.check_call(cmd)

        return pd.read_csv(out_pth, index_col=0)

    finally:
        # The result is fully loaded into memory by now, so the scratch files
        # can go; previously one directory leaked per call.
        shutil.rmtree(_dir, ignore_errors=True)

In [None]:
# Set to True to regenerate the per-sample fgsea results under ../data/gsea/.
# Each sample gets one n1() call per background, i.e. one external Rscript run each.
RUN = False

if RUN:
    _dir = '../data/gsea/'
    for cluster, rows in assign.groupby('1'):
        samps = rows.index.values

        # Set membership avoids scanning the sample array once per column.
        members = set(samps)
        back = [x for x in exp.columns if x not in members]

        # The out-of-cluster background is the same for every sample in the
        # cluster, so its mean/std are computed once per cluster.
        back_mean = exp[back].mean(axis=1)
        back_std = exp[back].std(axis=1) + 0.05

        # to_csv does not create missing directories.
        cohort_dir = os.path.join(_dir, 'sub_cohort', str(cluster))
        clust_dir = os.path.join(_dir, 'sub_clust', str(cluster))
        for out_dir in (cohort_dir, clust_dir):
            if not os.path.isdir(out_dir):
                os.makedirs(out_dir)

        for samp in samps:
            # Sample vs. all samples outside its cluster ("sub_cohort").
            zscore = (exp[samp] - back_mean) / back_std
            res = n1(samp, zscore)

            pth = os.path.join(cohort_dir, samp)
            res.to_csv(pth)

            # Sample vs. the other samples of its own cluster ("sub_clust").
            # NOTE(review): a single-sample cluster leaves cback empty — confirm this cannot occur.
            cback = [x for x in samps if x != samp]
            zscore = (exp[samp] - exp[cback].mean(axis=1)) / (exp[cback].std(axis=1) + 0.05)
            res2 = n1(samp, zscore)

            pth = os.path.join(clust_dir, samp)
            res2.to_csv(pth)

else:
    print("Run already")

In [None]:
# Read the per-sample fgsea results back and collect, per cluster, the
# gene sets that are significant with positive enrichment
# (padj < 0.05 and ES > 0) under each of the two backgrounds:
#   sub_cohort - sample scored against the samples outside its cluster
#   sub_clust  - sample scored against the other samples of its own cluster
_dir = '../data/gsea/'

# cluster label -> list of gene-set names, one entry per (sample, gene set) hit
sub_cohort = collections.defaultdict(list)
sub_cohort_unique = collections.defaultdict(list)   # hits in sub_cohort only
sub_clust_unique = collections.defaultdict(list)    # hits in sub_clust only

# Every gene set that was a hit for at least one sample under either background.
sigs = set()

interest = ["REGULATION OF VASCULATURE DEVELOPMENT%GOBP%GO:1901342", 
            "POSITIVE REGULATION OF LOCOMOTION%GOBP%GO:0040017",
            "PLATELET DEGRANULATION%GOBP%GO:0002576",
            "RESPONSE TO ELEVATED PLATELET CYTOSOLIC CA2+%REACTOME DATABASE ID RELEASE 66%76005",
            "POSITIVE REGULATION OF VASCULATURE DEVELOPMENT%GOBP%GO:1904018"]

# Cluster 1 samples with at least one `interest` gene set among their sub_clust hits.
cluster1_metastatic = []
for cluster, rows in assign.groupby('1'):
    samps = rows.index.values
    for samp in samps:
        pth = os.path.join(_dir, 'sub_cohort', str(cluster), samp)        
        f1 = pd.read_csv(pth, index_col=0)
        f1 = f1[(f1['padj'] < 0.05) & (f1['ES'] > 0.0)]
        sigs.update(f1.index.values)
        
        pth = os.path.join(_dir, 'sub_clust', str(cluster), samp)
        f2 = pd.read_csv(pth, index_col=0)
        f2 = f2[(f2['padj'] < 0.05) & (f2['ES'] > 0.0)]
        sigs.update(f2.index.values)
        
        if cluster == 1:
            # Single-argument print calls produce the same output under
            # Python 2 and also parse under Python 3.
            print(samp)
            print(f1.sort_values('NES', ascending=False)['NES'].head(50))
            print('\n')
            print(f2.sort_values('NES', ascending=False)['NES'].head(50))
            print('\n\n')
            
            if len(f2.index.intersection(interest)) > 0:
                cluster1_metastatic.append(samp)
        
        sub_cohort[cluster].extend(list(f1.index.values))
        sub_cohort_unique[cluster].extend(list(set(f1.index.values) - set(f2.index.values)))
        sub_clust_unique[cluster].extend(list(set(f2.index.values) - set(f1.index.values)))

In [None]:
# Number of distinct gene sets that passed the filter (padj < 0.05, ES > 0) for at least one sample under either background.
len(sigs)

In [None]:
# For each cluster, print the cluster size and then the 50 gene sets that were
# most often significant only under the sub_cohort background, with the number
# of samples in which each occurred.
for cluster, counts in sub_cohort_unique.items():
    # Formatting into one string keeps the Python 2 output ("a b") while
    # remaining valid syntax under Python 3.
    print('%s %s' % (cluster, len(assign[assign['1'] == cluster])))
    
    counter = collections.Counter(counts)
    
    for gs, n in counter.most_common(50):
        print('%s %s' % (gs, n))
    
    print('\n\n')

In [None]:
# Gene-set identifiers covering locomotion, cell migration/motility, vasculature development and focal adhesion.
# NOTE(review): `met` is not referenced by any other visible cell — confirm it is still needed.
met = ['POSITIVE REGULATION OF LOCOMOTION%GOBP%GO:0040017',
       'POSITIVE REGULATION OF CELL MIGRATION%GOBP%GO:0030335',
       'POSITIVE REGULATION OF VASCULATURE DEVELOPMENT%GOBP%GO:1904018',
       'REGULATION OF ENDOTHELIAL CELL MIGRATION%GOBP%GO:0010594',
       'POSITIVE REGULATION OF CELLULAR COMPONENT MOVEMENT%GOBP%GO:0051272',
       'POSITIVE REGULATION OF CELL MOTILITY%GOBP%GO:2000147',
       'FOCAL ADHESION%GOCC%GO:0005925']

In [None]:
# For each cluster, print the cluster size and the 10 gene sets most often
# significant only under the sub_clust background; the cluster 1 gene sets
# are also collected in `top5`.
# NOTE: despite its name, `top5` receives up to ten entries (most_common(10)).
top5 = []

# NOTE(review): gs_plot is created empty and never filled in this cell — confirm whether it is still needed.
gs_plot = pd.DataFrame(columns=['cluster', 'gs', 'percent'])
for cluster, counts in sub_clust_unique.items():
    N = len(assign[assign['1'] == cluster])
    
    # Formatting into one string keeps the Python 2 output ("a b") while
    # remaining valid syntax under Python 3.
    print('%s %s' % (cluster, N))
    
    counter = collections.Counter(counts)
    
    for gs, n in counter.most_common(10):
        print('%s %s' % (gs, n))
        if cluster == 1:
            top5.append(gs)
        
    print('\n\n')

In [None]:
# Show the gene sets collected for cluster 1 (up to ten entries, despite the name).
top5

In [None]:
def get_event(event):
    """Encode a 'First Event' label as a binary event indicator.

    Returns 0 for 'Censored' and 1 for any of the recognised event labels.
    A null input prints "NULL" and yields NaN; any other value raises
    ValueError carrying the offending label.
    """
    if pd.isnull(event):
        print("NULL")
        return np.nan

    if event == 'Censored':
        return 0

    observed = ('Relapse',
                'Death',
                'Progression',
                'Event',
                'Second Malignant Neoplasm')

    if event not in observed:
        raise ValueError(event)

    return 1
        
def get_vital(vital):
    """Encode a 'Vital Status' label as a binary death indicator.

    Returns 0 for 'Alive' and 1 for 'Dead'. A null input prints "NULL" and
    yields NaN; any other value raises ValueError carrying the offending label.
    """
    if pd.isnull(vital):
        print("NULL")
        return np.nan

    # Checked in order; the first matching label decides the code.
    for label, code in (('Alive', 0), ('Dead', 1)):
        if vital == label:
            return code

    raise ValueError(vital)

# Build the survival table for the cluster 1 samples from the clinical spreadsheet.
pth = '../data/meta/TARGET_NBL_ClinicalData_Discovery_20170525.xlsx'
clinical = pd.read_excel(pth, index_col=0)

# The table index and the fill loop must cover the same samples. The index was
# previously built from cluster 2 while the loop filled cluster 1, so .loc
# enlargement appended the cluster 1 rows and left all-NaN cluster 2 rows in
# the table.
cluster1_samples = assign[assign['1'] == 1].index

surv = pd.DataFrame(index=cluster1_samples,
                    columns=['OS', 'vital', 'EFS', 'event', 'immune'])

for sample in cluster1_samples:
    # Clinical rows are keyed by the first three dash-separated fields of the sample ID.
    root = '-'.join(sample.split('-')[:3])
    if root not in clinical.index:
        # Same text as the former `print "Missing: ", sample`, valid on Python 2 and 3.
        print("Missing:  %s" % sample)
        continue
    
    OS = clinical.loc[root, 'Overall Survival Time in Days'].item()
    vital = get_vital(clinical.loc[root, 'Vital Status'])
    
    EFS = clinical.loc[root, 'Event Free Survival Time in Days'].item()
    event = get_event(clinical.loc[root, 'First Event'])
    
    
    # The 'immune' column flags membership in cluster1_metastatic.
    immune = 1 if sample in cluster1_metastatic else 0
    surv.loc[sample, :] = [OS, vital, EFS, event, immune]

In [None]:
import uuid

# The survival table is written to disk and only its path is handed to the R
# cells, because passing the DataFrame itself to R did not work for the author.
survpth = '../data/mycn-na-tme-signal-subtract-cluster1-survival.tsv'
surv.to_csv(survpth, sep='\t')

%%R -i survpth

# Attach the packages used by this and the following R cells.
# The package name was misspelled as "dpylr"; require() only warns on an
# unknown package, so dplyr was silently never attached.
require(dplyr)
require(survival)
require(survminer)

# Read the survival table written by the Python cell (tab-separated, with header).
surv <- read.table(survpth,
                  sep='\t',
                  header=T)

surv <- dplyr::as_tibble(surv)

%%R

# Overall survival: time = OS, status = vital (1 = dead), stratified by the `immune` flag.
# The plot is drawn without confidence bands, with a p-value and a risk table.
fit <- survfit(Surv(OS, vital) ~ immune, data=surv)
ggsurvplot(fit, conf.int=F, pval=T, risk.table=TRUE)

%%R

# Event-free survival: time = EFS, status = event (1 = event observed), stratified by the `immune` flag.
# The plot is drawn without confidence bands, with a p-value and a risk table.
fit <- survfit(Surv(EFS, event) ~ immune, data=surv)
ggsurvplot(fit, conf.int=F, pval=T, risk.table=TRUE)