In [None]:
%matplotlib inline
import pandas as pd
import itertools
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import bnpy
import warnings
from scipy import stats
import random

from scipy.stats import kruskal, f_oneway
import collections
import scikit_posthocs as sp

warnings.filterwarnings('ignore')

In [None]:
# Load the tab-separated expression table; the first column becomes the index.
pth = '../data/expression/target-high-risk-nbl-mycn-na-exp-2018-11-12.tsv'

exp = pd.read_csv(pth, sep='\t', index_col=0) 

In [None]:
# Load the archived assignment table (first column -> index). Later cells
# group it by its '1' column and treat those values as cluster labels, with
# the index values used as sample identifiers.
pth = '../data/archive/mycn-na-nbl-0.2-assign-2019-03-02.tsv'
assign = pd.read_csv(pth, sep='\t', index_col=0)

In [None]:
# Display the assignment table loaded above.
assign

In [None]:
# Root directory holding the per-sample GSEA result tables, laid out as
# <_dir>/<analysis>/<cluster>/<sample>.
_dir = '../data/gsea/'


def _load_enriched(path, alpha=0.05, min_es=0.0):
    """Read one GSEA result table and keep the enriched rows.

    The first CSV column becomes the index. A row is kept when its ``padj``
    is below ``alpha`` and its ``ES`` is above ``min_es``.
    """
    table = pd.read_csv(path, index_col=0)
    return table[(table['padj'] < alpha) & (table['ES'] > min_es)]


# Declared here but not filled anywhere in this cell.
sub_cohort = collections.defaultdict(list)
# cluster -> gene sets enriched in the 'sub_cohort' table but not in the
# 'sub_clust' table; one entry per sample, so names repeat across samples.
sub_cohort_unique = collections.defaultdict(list)
# cluster -> gene sets enriched in the 'sub_clust' table but not in the
# 'sub_cohort' table; one entry per sample.
sub_clust_unique = collections.defaultdict(list)

# Gene sets of interest; a cluster-2 sample is flagged when any of them is
# enriched in its 'sub_clust' table.
interest = ['HUMORAL IMMUNE RESPONSE MEDIATED BY CIRCULATING IMMUNOGLOBULIN%GOBP%GO:0002455',
 'COMPLEMENT ACTIVATION, CLASSICAL PATHWAY%GOBP%GO:0006958',
 'IMMUNOGLOBULIN COMPLEX%GOCC%GO:0019814',
 'LEUKOCYTE DEGRANULATION%GOBP%GO:0043299',
 'MYELOID LEUKOCYTE MEDIATED IMMUNITY%GOBP%GO:0002444',
 'IMMUNOGLOBULIN RECEPTOR BINDING%GOMF%GO:0034987',
 'GRANULOCYTE ACTIVATION%GOBP%GO:0036230',
 'INFLAMMATORY RESPONSE%GOBP%GO:0006954',
 'CELL ACTIVATION INVOLVED IN IMMUNE RESPONSE%GOBP%GO:0002263',
 'NEUTROPHIL ACTIVATION%GOBP%GO:0042119']

cluster2_immune = []
for cluster, rows in assign.groupby('1'):
    samps = rows.index.values
    for samp in samps:
        pth = os.path.join(_dir, 'sub_cohort', str(cluster), samp)
        f1 = _load_enriched(pth)

        pth = os.path.join(_dir, 'sub_clust', str(cluster), samp)
        f2 = _load_enriched(pth)

        if cluster == 2:
            # Single-argument print calls give the same output on Python 2
            # and Python 3.
            print(samp)
            print(f1.sort_values('NES', ascending=False)['NES'].head(10))
            print('\n')
            print(f2.sort_values('NES', ascending=False)['NES'].head(50))
            print('\n\n')

            if len(f2.index.intersection(interest)) > 0:
                cluster2_immune.append(samp)

        cohort_sets = set(f1.index.values)
        clust_sets = set(f2.index.values)
        sub_cohort_unique[cluster].extend(list(cohort_sets - clust_sets))
        sub_clust_unique[cluster].extend(list(clust_sets - cohort_sets))

In [None]:
# Number of cluster-2 samples whose 'sub_clust' enrichment included at least
# one gene set from `interest`.
len(cluster2_immune)

In [None]:
# For each cluster, print the cluster label and its sample count, followed by
# the 10 gene sets that most often appear only in the 'sub_cohort' results.
for cluster, counts in sub_cohort_unique.items():
    # '%s %s' reproduces the space-separated output of the old
    # `print a, b` statement while remaining valid on Python 3.
    print('%s %s' % (cluster, len(assign[assign['1'] == cluster])))

    counter = collections.Counter(counts)

    for gs, n in counter.most_common(10):
        print('%s %s' % (gs, n))

    print('\n\n')

In [None]:
# For each cluster, print the cluster label and its sample count, followed by
# the 10 gene sets that most often appear only in the 'sub_clust' results.
# The names printed for cluster 2 are also collected in `top5`.
# NOTE(review): despite its name, `top5` receives up to 10 entries because it
# is filled from most_common(10) - confirm which size is intended.
top5 = []
for cluster, counts in sub_clust_unique.items():
    # '%s %s' reproduces the space-separated output of the old
    # `print a, b` statement while remaining valid on Python 3.
    print('%s %s' % (cluster, len(assign[assign['1'] == cluster])))

    counter = collections.Counter(counts)

    for gs, n in counter.most_common(10):
        print('%s %s' % (gs, n))
        if cluster == 2:
            top5.append(gs)

    print('\n\n')

In [None]:
# Gene-set names collected for cluster 2 by the loop above (up to 10 entries).
top5

In [None]:
def get_event(event):
    """Encode a 'First Event' label as a binary event indicator.

    Returns 0 for 'Censored', 1 for any recognised event label, and NaN
    (after printing "NULL") when the value is null.

    Raises:
        ValueError: if the label is neither null, 'Censored', nor one of the
            recognised event labels.
    """
    # Null entries are reported and passed through as NaN.
    if pd.isnull(event):
        print("NULL")
        return np.nan

    if event == 'Censored':
        return 0

    recognised = ('Relapse',
                  'Death',
                  'Progression',
                  'Event',
                  'Second Malignant Neoplasm')
    if event not in recognised:
        raise ValueError(event)

    return 1
        
def get_vital(vital):
    """Encode a 'Vital Status' label as a binary indicator.

    Returns 0 for 'Alive', 1 for 'Dead', and NaN (after printing "NULL")
    when the value is null.

    Raises:
        ValueError: if the label is neither null, 'Alive', nor 'Dead'.
    """
    # Null entries are reported and passed through as NaN.
    if pd.isnull(vital):
        print("NULL")
        return np.nan

    for label, code in (('Alive', 0), ('Dead', 1)):
        if vital == label:
            return code

    raise ValueError(vital)

# Clinical annotations; the first spreadsheet column becomes the index.
pth = '../data/meta/TARGET_NBL_ClinicalData_Discovery_20170525.xlsx'
clinical = pd.read_excel(pth, index_col=0)

# One row per cluster-2 sample. Rows for samples without a clinical record
# are never filled and therefore stay NaN in every column.
cluster2_samples = assign[assign['1'] == 2].index
surv = pd.DataFrame(index=cluster2_samples,
                    columns=['OS', 'vital', 'EFS', 'event', 'immune'])

# Set membership instead of scanning the list once per sample.
immune_samples = set(cluster2_immune)

for sample in cluster2_samples:
    # The clinical table is looked up by the first three dash-separated
    # fields of the sample name.
    root = '-'.join(sample.split('-')[:3])
    if root not in clinical.index:
        # Two spaces after the colon match the output of the former
        # `print "Missing: ", sample` statement; the call form also runs on
        # Python 3.
        print('Missing:  %s' % sample)
        continue

    OS = clinical.loc[root, 'Overall Survival Time in Days'].item()
    vital = get_vital(clinical.loc[root, 'Vital Status'])

    EFS = clinical.loc[root, 'Event Free Survival Time in Days'].item()
    event = get_event(clinical.loc[root, 'First Event'])

    immune = 1 if sample in immune_samples else 0
    surv.loc[sample, :] = [OS, vital, EFS, event, immune]

In [None]:
import uuid

# Write the cluster-2 survival table to disk as a tab-separated file.
# Original author's note: can't figure out how to pass this dataframe
# (presumably to a downstream tool that reads this file - TODO confirm).
survpth = '../data/mycn-na-tme-signal-subtract-cluster2-survival.tsv'
surv.to_csv(survpth, sep='\t')

In [None]:
# Show the cluster-2 samples that were appended to `cluster2_immune` by the
# GSEA loop.
cluster2_immune

In [None]:
# Hard-coded list of sample identifiers that is compared against
# `cluster2_immune` in the next cell.
# NOTE(review): presumably the sample list from an earlier run of this
# analysis - confirm its provenance.
og_list = ['TARGET-30-PALNVP-01',
 'TARGET-30-PALUYS-01',
 'TARGET-30-PALXTB-01',
 'TARGET-30-PANUIF-01',
 'TARGET-30-PAPEAV-01',
 'TARGET-30-PAPLSD-01',
 'TARGET-30-PARGKK-01',
 'TARGET-30-PARHAM-02',
 'TARGET-30-PASPER-01',
 'TARGET-30-PASUYG-01',
 'TARGET-30-PATCFL-01',
 'TARGET-30-PATYIL-02']

In [None]:
# Samples present in `og_list` but absent from `cluster2_immune`.
set(og_list) - set(cluster2_immune) 