In [None]:
import collections
import sys
import subprocess
import pandas as pd
import os
import numpy as np
import uuid
import tempfile
import bnpy
import scikit_posthocs as sp
import itertools
import logging

from scipy.stats import kruskal, f_oneway

sys.path.append('/opt/hydra/')

src = os.environ["HYDRA_SRC"]

import library.analysis as hy

In [None]:
# Write log records at DEBUG level and above to 'example.log'
# (path is relative to the notebook's working directory).
logging.basicConfig(filename='example.log',level=logging.DEBUG)

In [None]:
# This cmd was used to identify multimodal genes
# NOTE: the argument list is only recorded here for reference -- this cell
# assigns it but does not execute it.
cmd = ["docker" ,
       "run",
       "-v", "$PWD:/data",
       "jpfeil/hydra@sha256:123bee0aa2b3e63084c773a13a16d247076462af910a104cd5776ba5e6d4b29d",
       "filter",
       "-e", "data/target-high-risk-nbl-mycn-na-exp-2018-11-12.tsv",
       "--output-dir", "output",
       "--CPU", "15"]

In [None]:
# Input locations used throughout the notebook:
#   mm_genes -- directory whose entries are listed to collect multimodal genes
#   exp_path -- tab-separated expression matrix
mm_genes = '../data/output/MultiModalGenes/'
exp_path = '../data/expression/target-high-risk-nbl-mycn-na-exp-2018-11-12.tsv'

In [None]:
# Read in expression data (tab-separated; the first column becomes the index).
# Later cells index rows by gene (`exp.reindex(genes)`) and treat
# `exp.columns` as the sample identifiers.
exp = pd.read_csv(exp_path, sep='\t', index_col=0)

In [None]:
# Collect the entry names found in the multimodal-gene output directory
mms = list(os.listdir(mm_genes))

In [None]:
# Scan minimum probability thresholds
# Disabled by the `if False` guard; change the guard to re-run the scan over
# the listed `min_prob_range` values.
if False:
    scan = hy.ScanEnrichmentAnalysis(mm_genes, 
                                     exp_path, 
                                     'GO', 
                                     min_prob_range=[0.10, 0.15, 0.20, 0.25, 0.3], 
                                     CPU=7,
                                     K=5).scan()

In [None]:
# Perform enrichment analysis - same as enrich command
# `res` is queried by the next two cells for enriched terms and their genes.
# NOTE(review): later cells reuse the name `res` for other objects, so those
# two cells only work if they run before the name is rebound.
res = hy.EnrichmentAnalysis(exp_path=exp_path,
                            mm_path=mm_genes,
                            min_prob_filter=0.2,
                            gmt_path='GO')

In [None]:
# Load enriched gene set terms from the enrichment analysis object
terms = res.get_enriched_terms()

In [None]:
# Extract enriched GO term genes; `genes` selects the expression rows used by
# the clustering and heatmap cells below.
genes = res.get_enriched_term_genes()

# Display how many genes were returned
len(genes)

In [None]:
# Perform multivariate DP-GMM analysis on the expression rows for the
# enriched-term genes. `clus` is later used for cluster assignments and
# per-cluster GSEA.
clus = hy.MultivariateMixtureModel(data=exp.reindex(genes),
                                   center=True,
                                   gamma=5.0,
                                   variance=2.0,
                                   K=5, 
                                   verbose=True)

In [None]:
# Format pathway names for printing
def format_pathway(x):
    """Shorten a '%'-delimited pathway identifier to 'NAME (SOURCE)'.

    The first two '%'-separated fields are kept and any further fields are
    dropped. A name with no '%' separator is returned unchanged instead of
    raising IndexError.
    """
    fields = x.split('%')
    if len(fields) < 2:
        # Nothing to reformat: the name carries no second field.
        return x
    return '%s (%s)' % (fields[0], fields[1])

In [None]:
# Perform GSEA on each cluster
# The result is iterated below as a mapping of cluster key -> result table
# with 'padj' and 'NES' columns.
fgsea = clus.get_cluster_features(exp,
                                  gmt='/opt/hydra/gene-sets/Human_GOBP_AllPathways_no_GO_iea_December_01_2018_symbol.gmt')

In [None]:
# Create dataframe with enriched pathways
# Layout: one row per pathway (index taken from the first cluster's result
# table) and a two-level column header (cluster, feature) where feature is
# 'padj' or 'NES'.
fgsea_df = None
for key, values in fgsea.items():
    if fgsea_df is None:
        # Build the empty frame lazily on the first iteration, once a result
        # table is available to supply the pathway index.
        header = pd.MultiIndex.from_product([list(fgsea.keys()), 
                                             ['padj', 'NES']],
                                           names=['cluster', 'feature'])
        
        fgsea_df = pd.DataFrame(index=values.index.values, columns=header)
        
    print 'Key: ', key,
    # Preview: up to ten pathways with the highest positive NES at padj < 0.05.
    t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
    t['pathway'] = t['pathway'].apply(format_pathway) 
    print(t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10))
    
    # Align this cluster's results to the shared pathway index before storing.
    fgsea_df[(key, 'padj')] = values.loc[fgsea_df.index, 'padj'].values
    fgsea_df[(key, 'NES')] = values.loc[fgsea_df.index, 'NES'].values

In [None]:
# Save dataframe for downstream analysis (tab-separated).
# NOTE: `pth` is a scratch variable that many later cells reassign.
pth = '../data/TARGET-MYCN-NA-Pathways-Enrichment.tsv'
fgsea_df.to_csv(pth, sep='\t')

In [None]:
# Create LaTeX tables with top 10 enriched pathways
# One .tex file is written per cluster; `cpth` is a '%d' template filled with
# the cluster key.
cpth = '../img/TARGET-MYCN-NA-top-10-pathways-cluster-%d.tex'
for key, values in fgsea.items():
    with pd.option_context('display.precision', 2):
        t = values.reset_index().reindex(['pathway', 'padj', 'NES'], axis=1)
        t['pathway'] = t['pathway'].apply(format_pathway)
        # Positively enriched, significant pathways ranked by NES.
        top = t[(t['NES'] > 0) & (t['padj'] < 0.05)].sort_values('NES', ascending=False).head(10)
        # Use the per-cluster template `cpth`; the scratch variable `pth`
        # holds an unrelated path without a '%d' placeholder.
        with open(cpth % key, 'w') as f:
            top.to_latex(f, index=False)

In [None]:
# Build the cluster assignment table: one row per sample (expression column),
# with a single column labelled 1 that holds the cluster label.
assignments = clus.get_assignments(exp.reindex(genes))

assign = pd.DataFrame(index=exp.columns,
                      columns=[1])

# Labels from the model are shifted by one before being stored
# (presumably 0-based -> 1-based; confirm against get_assignments).
for sample, assignment in zip(exp.columns, assignments):
    assign.loc[sample, 1] = assignment + 1

In [None]:
# Display the sample -> cluster table
assign

In [None]:
# Save assignments as a headerless, tab-separated file (sample, cluster)
assign.to_csv('../data/assignments.tsv', sep='\t', header=None)

In [None]:
# Perform hierarchical clustering on the expression rows for the
# enriched-term genes
hclust = hy.HClust(data=exp.reindex(genes))

In [None]:
# Plot row linkage with a distance of 17, then fetch the row groups for the
# same distance value
hclust.plot_row_linkage(17)
row_groups = hclust.get_row_groups(17)

In [None]:
# Load M3C analysis
def _read_m3c_labels(label_path):
    """Read an M3C label table and align its rows to the expression samples."""
    labels = pd.read_csv(label_path, sep='\t')
    # Row labels use '.' where the expression columns use '-'
    # (presumably an R-side name mangling -- confirm against the files).
    labels.index = [name.replace('.', '-') for name in labels.index.values]
    return labels.reindex(exp.columns)


pth = '../data/cluster-analysis/MYCN-NA-M3C-MAD-5000-labels-2.tsv'
m3c5000 = _read_m3c_labels(pth)

pth = '../data/cluster-analysis/MYCN-NA-M3C-MAD-500-labels-3.tsv'
m3c500 = _read_m3c_labels(pth)
m3c500.head()

In [None]:
from scipy.stats import ttest_ind

# M3C fgsea analysis
# For every consensus cluster, rank genes by the t-statistic of in-cluster vs
# out-of-cluster samples and pass the ranking to hy.n1 with the GO gene sets.
m3c5000_feats = {}
for cluster, rows in m3c5000.groupby('consensuscluster'):
    ins = rows.index.values
    outs = m3c5000[m3c5000['consensuscluster'] != cluster].index.values
    
    # Deliberately not named `res`: that name holds the EnrichmentAnalysis
    # object created earlier, and rebinding it breaks re-running those cells.
    tvalues = ttest_ind(exp[ins], exp[outs], axis=1).statistic
    # Genes whose statistic is NaN are dropped before ranking.
    tstats = pd.DataFrame(index=exp.index, data=tvalues).dropna()
    tstats = tstats.sort_values(0, ascending=False)
    
    m3c5000_feats[cluster] = hy.n1(tstats, 
                                   gmt='/opt/hydra/gene-sets/Human_GOBP_AllPathways_no_GO_iea_December_01_2018_symbol.gmt')

In [None]:
# M3C (MAD 5000) cluster 1: significant (padj < 0.05), positively enriched
# (ES > 0) pathways, ranked by NES; shows the first five.
t = m3c5000_feats[1]
t[(t['padj'] < 0.05) & (t['ES'] > 0.0)].sort_values('NES', ascending=False).head()

In [None]:
# M3C (MAD 5000) cluster 2: significant (padj < 0.05), positively enriched
# (ES > 0) pathways, ranked by NES; shows the first five.
t = m3c5000_feats[2]
t[(t['padj'] < 0.05) & (t['ES'] > 0.0)].sort_values('NES', ascending=False).head()

In [None]:
from scipy.stats import ttest_ind

# M3C fgsea analysis with a threshold of 500MAD
# Same procedure as the 5000-MAD analysis: rank genes by the in-cluster vs
# out-of-cluster t-statistic and pass the ranking to hy.n1.
m3c500_feats = {}
for cluster, rows in m3c500.groupby('consensuscluster'):
    ins = rows.index.values
    outs = m3c500[m3c500['consensuscluster'] != cluster].index.values
    
    # Deliberately not named `res`: that name holds the EnrichmentAnalysis
    # object created earlier, and rebinding it breaks re-running those cells.
    tvalues = ttest_ind(exp[ins], exp[outs], axis=1).statistic
    # Genes whose statistic is NaN are dropped before ranking.
    tstats = pd.DataFrame(index=exp.index, data=tvalues).dropna()
    tstats = tstats.sort_values(0, ascending=False)
    
    m3c500_feats[cluster] = hy.n1(tstats, 
                                  gmt='/opt/hydra/gene-sets/Human_GOBP_AllPathways_no_GO_iea_December_01_2018_symbol.gmt')

In [None]:
# M3C (MAD 500) cluster 1: significant (padj < 0.05), positively enriched
# (ES > 0) pathways, ranked by NES; shows the first five.
t = m3c500_feats[1]
t[(t['padj'] < 0.05) & (t['ES'] > 0.0)].sort_values('NES', ascending=False).head()

In [None]:
# M3C (MAD 500) cluster 2: significant (padj < 0.05), positively enriched
# (ES > 0) pathways, ranked by NES; shows the first twenty.
t = m3c500_feats[2]
t[(t['padj'] < 0.05) & (t['ES'] > 0.0)].sort_values('NES', ascending=False).head(20)

In [None]:
# M3C (MAD 500) cluster 3: significant (padj < 0.05), positively enriched
# (ES > 0) pathways, ranked by NES; shows the first twenty.
t = m3c500_feats[3]
t[(t['padj'] < 0.05) & (t['ES'] > 0.0)].sort_values('NES', ascending=False).head(20)

In [None]:
# Map cluster labels to matplotlib single-letter colours for the heatmap
# column annotations. Label 4 uses 'm' (magenta): 'p' is not a valid
# matplotlib colour shorthand and fails as soon as that label is drawn.
cmap = {0: 'r', 1:'b', 2:'g', 3:'y', 4: 'm'}

# Raw model assignments (before the +1 shift stored in `assign`)
col_colors = [cmap[i] for i in assignments]

m3c5000_colors = [cmap[i] for i in m3c5000['consensuscluster'].values]

m3c500_colors = [cmap[i] for i in m3c500['consensuscluster'].values]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

#col_order = assign.sort_values(1).index.values

# Clustered heatmap of the enriched-term genes (rows with missing values
# dropped, rows z-scored). The three column colour bars are, in order: Hydra
# assignments, M3C (MAD 5000) clusters, M3C (MAD 500) clusters.
g = sns.clustermap(exp.reindex(genes).dropna(),
                   z_score=0,
                   method='ward',
                   center=0,
                   col_colors=[col_colors, m3c5000_colors, m3c500_colors],
                   cmap=sns.diverging_palette(240, 10, n=7),
                   figsize=(10, 10))

ax = g.ax_heatmap

# Hide tick marks and labels on both heatmap axes
ax.set_xticklabels([])
ax.set_xticks([])
ax.set_yticklabels([])
ax.set_yticks([])

pth = '../img/NBL-expression-heatmap.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

In [None]:
# Correlate clusters with NBL clinical features
# `features` is a long-format table; the code below reads its 'root',
# 'feature' and 'value' columns.
pth = '../data/meta/target-features-v2.tsv'
features = pd.read_csv(pth, sep='\t')
def feature_map(df, features):
    """Attach clinical feature records to cluster assignments.

    Parameters
    ----------
    df : DataFrame with an 'index' column of sample ids and a column
        labelled 1 holding the cluster label.
    features : DataFrame with 'root', 'feature' and 'value' columns.

    Returns
    -------
    DataFrame with columns ['cluster', 'sample', 'feature', 'value'], one row
    per feature record whose 'root' matches a sample of that cluster. The
    'sample' column holds the record's 'root'.
    """
    columns = ['cluster', 'sample', 'feature', 'value']

    # Collect rows in a list and build the frame once; appending through
    # `.loc[len(output)]` re-allocates on every row.
    records = []
    for cluster, rows in df.groupby(1):
        # A sample's root is the first three '-'-separated fields of its id.
        roots = ['-'.join(s.split('-')[:3]) for s in rows['index'].values]
        matched = features[features['root'].isin(roots)]
        for _, record in matched.iterrows():
            records.append([cluster,
                            record['root'],
                            record['feature'],
                            record['value']])

    return pd.DataFrame(records, columns=columns)

# Long-format table of (cluster, sample root, feature, value)
data = feature_map(assign.reset_index(),
                   features)

from collections import Counter

# For each (feature, value) pair: how many records fall in each cluster, and
# what fraction of that pair's records this represents.
feature_clusters = pd.DataFrame(columns=['feature', 'value', 'cluster', 'fraction', 'count'])

for (feature, value), rows in data.groupby(['feature', 'value']):
    c = Counter(rows['cluster'].values)
    for cluster, count in c.items():
        # `+ 0.0` forces float division under Python 2
        frac = (count + 0.0) / sum(c.values())
        feature_clusters.loc[len(feature_clusters), :] = [feature, 
                                                          value, 
                                                          cluster, 
                                                          frac, 
                                                          count]
        
        
def _fisher(m):
    """Run the project's fisher.R script on matrix `m` and return its output.

    The matrix is written tab-delimited to a uniquely named file in the
    system temp directory, and the file path is passed to
    `Rscript <src>/bin/fisher.R`. The script's stdout is parsed as a float
    (presumably the test's p-value -- confirm against fisher.R).

    Raises ValueError if stdout cannot be parsed as a float.
    """
    temp = os.path.join(tempfile.gettempdir(), 'M' + str(uuid.uuid4()))

    cmd = ['Rscript',
           os.path.join(src, 'bin', 'fisher.R'),
           temp]

    try:
        np.savetxt(temp, m, delimiter='\t')
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        # stderr is not piped, so the second element is always None
        stdout, _ = p.communicate()
    finally:
        # Remove the scratch file; it used to be left behind on every call.
        if os.path.exists(temp):
            os.remove(temp)

    return float(stdout)
        
def fisher(feature, value, df, background=None, mod=None):
    """Build a 2 x K feature-vs-background count table and test it.

    Parameters
    ----------
    feature, value : the feature name and the value counted in row 0.
    df : long-format frame with 'cluster', 'feature' and 'value' columns;
        cluster labels are used as 1-based column positions.
    background : if None, row 1 counts records of the same feature with any
        other known value; otherwise only records whose value equals
        `background`.
    mod : optional sequence added element-wise to row 0.

    Returns
    -------
    (result of `_fisher` on the table, the table itself)
    """
    n_clusters = len(df['cluster'].unique())
    m = np.zeros((2, n_clusters))

    for cluster, rows in df.groupby('cluster'):
        same_feature = rows['feature'] == feature
        # Records with a usable value: not the literal 'unknown', not null
        known = (rows['value'] != 'unknown') & (~pd.isnull(rows['value']))

        hits = same_feature & (rows['value'] == value) & known

        if background is None:
            others = same_feature & (rows['value'] != value) & known
        else:
            others = same_feature & (rows['value'] == background) & known

        # 1-indexed cluster labels -> 0-indexed columns
        column = cluster - 1
        m[0, column] += rows[hits].shape[0]
        m[1, column] += rows[others].shape[0]

    if m[0, :].sum() > m[1, :].sum():
        print("WARNING: Feature count is greater than background!")

    if mod is not None:
        for i, v in enumerate(mod):
            m[0, i] += v

    return _fisher(m), m


# Convert the values of the '%tumor' and 'age' features to numbers; the
# tumor-purity cell below compares '%tumor' values against a numeric cutoff.
features.loc[features['feature'] == '%tumor', 'value'] = pd.to_numeric(features.loc[features['feature'] == '%tumor', 'value'])

features.loc[features['feature'] == 'age', 'value'] = pd.to_numeric(features.loc[features['feature'] == 'age', 'value'])

In [None]:
# Test each molecular feature for association with the clusters.
# NOTE(review): the column list hard-codes three clusters; each row written
# below is [feature, alteration, p] plus the flattened 2 x K table, so the
# assignment only fits when K == 3.
mf = pd.DataFrame(columns=['feature', 
                           'alteration', 
                           'p-value',
                           'cluster1',
                           'cluster2',
                           'cluster3',
                           'background1',
                           'background2',
                           'background3'])

# (feature, value counted as the alteration, background value or None)
molecular_features = [('ATRX', 'deleted', None), 
                      ('ALK', 'mutated', None),
                      ('1q+', 'gain', None),
                      ('1p-', 'loss', None),
                      ('11q-', 'loss', None),
                      ('17q+', 'gain', None),
                      ('MKI', 'low', None),
                      ('MKI', 'intermediate', None),
                      ('MKI', 'high', None),
                      ('Grade', 'differentiating', None)]

for feature, alteration, background in molecular_features:
    print feature, alteration
    p, m = fisher(feature, alteration, data, background=background)
    print p
    print m
    
    mf.loc[len(mf), :] = [feature, alteration, p] + list(m.flatten())

# Write the table sorted by ascending p-value
mf.sort_values('p-value').to_csv('../data/meta/molecular-feature-table.tsv', 
                                 sep='\t',
                                 index=False)

In [None]:
# Load TME profiling tools
# CIBERSORT: indexed by its first column; rows are looked up by sample below.
pth = '../data/immune/CIBERSORT.Output_Job17-target-mycn-na.txt'
ciber = pd.read_csv(pth, index_col=0, sep='\t')

# ESTIMATE: '#' lines are skipped, the header is taken from row 1, and the
# 'Description' column is dropped. '.' in column names is replaced with '-'
# (presumably to match the expression sample ids -- confirm).
pth = '../data/immune/mycn-na-target-estimate.tsv'
est = pd.read_csv(pth, sep='\t', comment='#', header=1, index_col=0)
est.drop('Description', axis=1, inplace=True)
est.columns = [x.replace('.', '-') for x in est.columns]

# xCell: columns are looked up by sample below; same '.' -> '-' renaming.
pth = '../data/immune/xCell_target-high-risk-nbl-mycn-na-exp-2018-11-12_xCell_1754032119.txt'
xcell = pd.read_csv(pth, sep='\t', index_col=0)
xcell.columns = [x.replace('.', '-') for x in xcell.columns]

In [None]:
# Save TME profiling data in dataframe
immune_groups = dict((group, []) for group in assign[1].unique())
immune = pd.DataFrame(columns=['sample', 'cluster', 'source', 'feature', 'value'])
for sample in assign.index.values:
    if sample not in ciber.index.values:
        print 'Misisng: ', sample
        continue
        
    assignment = assign.loc[sample, 1]
    
    immune_groups[assignment].append(sample)
    
    for score in ['StromalScore', 'ImmuneScore', 'ESTIMATEScore']:
        _est = est.loc[score, sample].item()
        immune.loc[len(immune), :] = [sample, assignment, 'Estimate', score, _est]
        
    immune.loc[len(immune), :] = [sample, assignment, 
                                'Estimate', 'TumorPurity', 
                                np.cos(0.6049872018 + 0.0001467884 * _est)]
    
    pvalue = ciber.loc[sample, 'P-value'].item()
    for cell, value in ciber.loc[sample, :].iteritems():
        immune.loc[len(immune), :] = [sample, assignment, 'CIBERSORT', cell, value]
        
    for cell, value in xcell[sample].iteritems():
        immune.loc[len(immune), :] = [sample, assignment, 'xCell', cell, value]

skip = ['P-value', 'Pearson Correlation', 'RMSE']        
immune = immune[~immune['feature'].isin(skip)]        
immune['value'] = pd.to_numeric(immune['value'])

In [None]:
# Identify statistically significant correlations
sigs = collections.defaultdict(list)

p_values = pd.DataFrame(columns=['source', 
                                 'feature', 
                                 'A', 
                                 'B', 
                                 'holm p-value'])

for source, rows in immune.groupby('source'):
    print source
    nfeatures = len(rows['feature'].unique())
    alpha = 1 - (1 - 0.05) ** (1.0 / nfeatures)
    for feature in rows['feature'].unique():
        print(feature)
        groups3 = []
        for i, rows in immune[immune['feature'] == feature].groupby('cluster'):
            groups3.append(list(rows['value'].values))
            print i, rows['value'].mean()
    
        try:
            stat, pvalue = kruskal(*groups3)
            print(feature, pvalue, pvalue < alpha)
        
        except Exception as e:
            print e
            continue
    
        if pvalue < alpha:
            sigs[source].append(feature)
            
            try:
                res = sp.posthoc_mannwhitney(groups3, 
                                             p_adjust='holm')
                
            except Exception as e:
                print e
                continue
            
            for i, j in itertools.combinations(range(len(groups3)),
                                               2):
                
                if i == j:
                    continue 
                    
                p_values.loc[len(p_values), :] = [source,
                                                  feature,
                                                  i, 
                                                  j,
                                                  res.iloc[i, j]]

In [None]:
# Plot significant enrichment CIBERSORT
# One box-plot facet per significant CIBERSORT feature, with the individual
# samples overlaid as a swarm plot.
sns.set(font_scale=1.5, style='white')

mask = (immune['source'] == 'CIBERSORT') & (immune['feature'].isin(sigs['CIBERSORT']))

t = immune[mask]

g = sns.catplot(x='cluster', 
                y='value', 
                col='feature',
                kind='box',
                col_wrap=3,
                color='white',
                sharex=False,
                sharey=False,
                data=t,
                aspect=1.25)

# NOTE(review): assumes the facets are laid out in the order the features
# first appear in `t` -- confirm against seaborn's default column ordering.
for i, feature in enumerate(t['feature'].unique()):
    print i, feature
    sns.swarmplot(x='cluster',
                  y='value',
                  color='k',
                  size=5,
                  data=t[t['feature'] == feature],
                  ax=g.axes[i])

for i in range(len(g.axes)):
    g.axes[i].set_xlabel('Hydra Cluster')
    g.axes[i].set_ylabel('CIBERSORT Score')
    
plt.subplots_adjust(hspace=0.3, wspace=0.4)

# Save the same figure as both SVG and PNG
pth = '../img/CIBERSORT-Plots.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

pth = '../img/CIBERSORT-Plots.png'
plt.savefig(pth, format='png', bbox_inches='tight')

In [None]:
# Plot significant enrichment xCELL
# Exploratory view of every significant xCell feature; this figure is not
# saved to disk.
mask = (immune['source'] == 'xCell') & (immune['feature'].isin(sigs['xCell']))

t = immune[mask]

g = sns.catplot(x='cluster', 
                y='value', 
                col='feature',
                kind='box',
                col_wrap=3,
                sharex=False,
                sharey=False,
                data=t)

for i in range(len(g.axes)):
    g.axes[i].set_xlabel('Hydra Cluster')
    g.axes[i].set_ylabel('xCell Enrichment Score')
    
plt.subplots_adjust(hspace=0.3, wspace=0.4)

In [None]:
# Box + swarm plots for three hand-picked xCell features
sns.set(font_scale=1.5, style='white')

interest = ["B-cells",
            "CD8+ naive T-cells",
            "Fibroblasts"]

mask = (immune['source'] == 'xCell') & (immune['feature'].isin(interest))

t = immune[mask]

g = sns.catplot(data=t,
                x='cluster',
                y='value',
                col='feature',
                col_order=interest,
                col_wrap=3,
                kind='box',
                color='white',
                sharex=False,
                sharey=False)

# Overlay the individual samples on each facet; facets follow `interest`
# order because it is passed as `col_order`.
for panel, feature in zip(g.axes, interest):
    sns.swarmplot(x='cluster',
                  y='value',
                  color='k',
                  size=5,
                  data=t[t['feature'] == feature],
                  ax=panel)

for panel in g.axes:
    panel.set_xlabel('Hydra Cluster')
    panel.set_ylabel('xCell Score')

plt.subplots_adjust(hspace=0.3, wspace=0.4)

# Save the same figure as both SVG and PNG
pth = '../img/xCell-Plots.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

pth = '../img/xCell-Plots.png'
plt.savefig(pth, format='png', bbox_inches='tight')

In [None]:
# Plot significant enrichment ESTIMATE
# Exploratory view of every significant ESTIMATE feature; this figure is not
# saved to disk.
mask = (immune['source'] == 'Estimate') & (immune['feature'].isin(sigs['Estimate']))

t = immune[mask]

g = sns.catplot(x='cluster', 
                y='value', 
                col='feature',
                kind='box',
                col_wrap=2,
                sharex=False,
                sharey=False,
                data=t)

for i in range(len(g.axes)):
    g.axes[i].set_xlabel('Hydra Cluster')
    g.axes[i].set_ylabel('ESTIMATE Enrichment Score')
    
plt.subplots_adjust(hspace=0.3, wspace=0.4)

In [None]:
# Box + swarm plots for three ESTIMATE-derived features
sns.set(font_scale=1.5, style='white')

interest = ["ImmuneScore",
            "StromalScore",
            "TumorPurity"]

# All ESTIMATE rows are kept; `col_order` below limits which facets are drawn
mask = (immune['source'] == 'Estimate') #& (immune['feature'].isin(sigs['Estimate']))

t = immune[mask]

g = sns.catplot(data=t,
                x='cluster',
                y='value',
                col='feature',
                col_order=interest,
                col_wrap=3,
                kind='box',
                color='white',
                sharex=False,
                sharey=False)

# Overlay the individual samples on each facet; facets follow `interest`
# order because it is passed as `col_order`.
for panel, feature in zip(g.axes, interest):
    sns.swarmplot(x='cluster',
                  y='value',
                  color='k',
                  size=5,
                  data=t[t['feature'] == feature],
                  ax=panel)

for panel in g.axes:
    panel.set_xlabel('Hydra Cluster')
    panel.set_ylabel('ESTIMATE Score')

plt.subplots_adjust(hspace=0.3, wspace=0.4)

# Save the same figure as both SVG and PNG
pth = '../img/ESTIMATE-Plots.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

pth = '../img/ESTIMATE-Plots.png'
plt.savefig(pth, format='png', bbox_inches='tight')

In [None]:
# Load ATRX features: the 'root' ids whose ATRX feature has the value
# 'deleted'
mask = (features['feature'] == 'ATRX') & (features['value'] == 'deleted')
atrx = features.loc[mask, 'root'].values

In [None]:
# Check for relationship between tumor purity
# and clustering. 
# `m` is a 2 x K table (K = largest cluster label): row 1 counts samples with
# %tumor > 75, row 0 counts the remaining samples with a known %tumor.
m = np.zeros((2, max(assign[1])))

tperc = pd.DataFrame(columns=['sample', '%tumor', 'cluster'])
for sample in exp.columns:
    # Sample root = first three '-'-separated fields of the sample id
    root = '-'.join(sample.split('-')[:3])
    mask = (features['root'] == root) & (features['feature'] == '%tumor')
    # .item() requires exactly one matching record per sample
    v = features.loc[mask, 'value'].item()
    if pd.isnull(v):
        continue
        
    c = assign.loc[sample, 1]
        
    if v > 75:
        m[1, c - 1] += 1
        
    else:
        
        m[0, c - 1] += 1
        
    # Print purity and cluster for the ATRX-deleted samples
    if root in atrx:
        print sample, v, assign.loc[sample, 1]
        
    tperc.loc[len(tperc), :] = [sample, v, c]
        
tperc['%tumor'] = pd.to_numeric(tperc['%tumor'])

print m
        
# Displayed as the cell output
_fisher(m)

In [None]:
# Plot percent tumor values across clusters (box plot with the individual
# samples overlaid)
fig, ax = plt.subplots(1, figsize=(6, 5))

sns.boxplot(x='cluster', 
            y ='%tumor', 
            data=tperc,
            color='white',
            ax=ax)

sns.swarmplot(x='cluster', 
              y ='%tumor', 
              data=tperc,
              color='k',
              ax=ax)

# Per-cluster lists of %tumor values for the tests below
tgroups = []
for c, rows in tperc.groupby('cluster'):
    tgroups.append(list(rows['%tumor'].values))
    
ax.set_xlabel('Cluster')
ax.set_ylabel('Percent Tumor')

pth = '../img/perc-tumor-dist.svg'

plt.savefig(pth,
            format='svg',
            bbox_inches='tight')

# Kruskal-Wallis across clusters, then pairwise Mann-Whitney tests with Holm
# adjustment (displayed as the cell output)
print kruskal(*tgroups)

sp.posthoc_mannwhitney(tgroups, 
                       p_adjust='holm')

In [None]:
# Create colorbar for heatmap
# Builds four parallel colour lists, one entry per sample in order of
# ascending Hydra cluster: Hydra cluster, M3C (5000), M3C (500), ATRX status.

annotations = []

atrx_a = []
atrx_samples = []

hydra_cols = []

m3c5000_cols = []
m3c500_cols = []

for sample in assign.sort_values(1).index.values:
    root = '-'.join(sample.split('-')[:3])
    cluster = assign.loc[sample, 1]
    
    if cluster == 1:
        hydra_cols.append("#3274a1")
        
    elif cluster == 2:
        hydra_cols.append("#e1812c")
        
    elif cluster == 3:
        hydra_cols.append("#3a923a")
        
    elif cluster == 4:
        hydra_cols.append("orange")

    else:
        # Without this branch an unexpected label adds no colour, leaving
        # hydra_cols shorter than the other lists and misaligned with the
        # samples. Fail loudly, like the M3C branches below.
        raise ValueError('No colour defined for Hydra cluster %r' % cluster)
     
    # Red marks samples whose root is in the ATRX-deleted list
    if root in atrx:
        atrx_a.append('red')
        atrx_samples.append(sample)
         
    else:
        atrx_a.append('gray')

    m3c5000_c = m3c5000.loc[sample, 'consensuscluster']
    m3c500_c = m3c500.loc[sample, 'consensuscluster']
    
    if m3c5000_c == 1:
        m3c5000_cols.append('#0e4220')
        
    elif m3c5000_c == 2:
        m3c5000_cols.append('#542788')
        
    else:
        raise ValueError()
        
    if m3c500_c == 1:
        m3c500_cols.append('#7F002B')
        
    elif m3c500_c == 2:
        m3c500_cols.append('#C8C8C8')
        
    elif m3c500_c == 3:
        m3c500_cols.append('#8BC3C0')
        
    else:
        raise ValueError()


# Order of the colour bars on the heatmap
annotations.append(hydra_cols)
annotations.append(m3c5000_cols)
annotations.append(m3c500_cols)
annotations.append(atrx_a)

In [None]:
# Make dendrogram for clustering

import collections

from scipy.spatial import distance
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster, cophenet, linkage, dendrogram
from scipy.spatial.distance import pdist;

def fancy_dendrogram(*args, **kwargs):
    """Draw a scipy dendrogram, label merge heights and mark a cut-off.

    Accepts everything `dendrogram` accepts, plus two extra keywords that are
    removed before the call:

    max_d : cut-off distance. When truthy it is drawn as a horizontal line
        and used as `color_threshold` unless the caller supplied one.
    annotate_above : only merges whose height is greater than this value are
        annotated (default 0).

    Returns the dictionary produced by `dendrogram`. Nothing is drawn here
    when `no_plot=True` is passed.
    """
    max_d = kwargs.pop('max_d', None)
    annotate_above = kwargs.pop('annotate_above', 0)
    if max_d:
        kwargs.setdefault('color_threshold', max_d)

    ddata = dendrogram(*args, **kwargs)

    if kwargs.get('no_plot', False):
        return ddata

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index or (cluster size)')
    plt.ylabel('distance')

    links = zip(ddata['icoord'], ddata['dcoord'], ddata['color_list'])
    for xs, ys, color in links:
        height = ys[1]
        if not height > annotate_above:
            continue
        # Horizontal midpoint of the link's top bar
        mid = 0.5 * sum(xs[1:3])
        plt.plot(mid, height, 'o', c=color)
        plt.annotate("%.3g" % height, (mid, height), xytext=(0, -5),
                     textcoords='offset points',
                     va='top', ha='center')

    if max_d:
        plt.axhline(y=max_d, c='k')

    return ddata


# Row-wise z-score of the enriched-term gene expression (rows with missing
# values dropped first)
zscore_df = exp.reindex(genes).dropna().apply(lambda x: (x - x.mean()) / x.std(), axis=1) 

method = 'ward'
metric = 'euclidean'

# Linkage over genes (rows) ...
row_linkage = hierarchy.linkage(
    distance.pdist(zscore_df.values), 
    method=method, metric=metric)

# ... and over samples (columns)
col_linkage = hierarchy.linkage(
    distance.pdist(zscore_df.values.T), 
    method=method, metric=metric);

# Distance cut-off used both for drawing and for flat cluster extraction
dist = 17

fancy_dendrogram(
    col_linkage,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=dist,
)
plt.show()

clusters = fcluster(col_linkage, dist, criterion='distance')

# NOTE: this rebinds `cmap`, which an earlier cell defined as a
# label -> colour dict; re-running that cell's list comprehensions after
# this one requires re-running its `cmap` definition as well.
cmap = sns.color_palette("Set2", max(clusters))

# fcluster labels start at 1, palette positions at 0
rcolors = [cmap[i - 1] for i in clusters]

# cluster label -> list of samples
groups = collections.defaultdict(list)
for sample, cluster in zip(zscore_df.columns, clusters):
    groups[cluster].append(sample);

In [None]:
# Cluster rows and assign function
# Same procedure as for the columns, applied to the gene (row) linkage with a
# cut-off of 18.

dist = 18

fancy_dendrogram(
    row_linkage,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    show_contracted=True,
    annotate_above=10,
    max_d=dist,
)
plt.show()

rclusters = fcluster(row_linkage, dist, criterion='distance')

rcmap = sns.color_palette("Set2", max(rclusters))

# fcluster labels start at 1, palette positions at 0
_rcolors = [rcmap[i -1] for i in rclusters]

# row cluster label -> list of genes (the loop variable is named `sample`
# but iterates over zscore_df.index)
rgroups = collections.defaultdict(list)
for sample, cluster in zip(zscore_df.index, rclusters):
    rgroups[cluster].append(sample);

In [None]:
import seaborn as sns

# Samples ordered by ascending Hydra cluster; `annotations` was built in the
# same order, so the colour bars line up with the columns.
col_order = assign.sort_values(1).index.values

# Columns keep the given order (no column clustering); rows use the
# precomputed linkage and are coloured by row cluster.
g = sns.clustermap(exp.reindex(genes).dropna().reindex(col_order, axis=1),
                   col_cluster=False,
                   row_linkage=row_linkage,
                   col_colors=annotations,
                   row_colors=_rcolors,
                   z_score=0,
                   method='ward',
                   center=0,
                   cmap=sns.diverging_palette(240, 10, n=7),
                   figsize=(10, 10))

ax = g.ax_heatmap

# Hide tick marks and labels on both heatmap axes
ax.set_xticklabels([])
ax.set_xticks([])
ax.set_yticklabels([])
ax.set_yticks([])

# Save the same figure as both SVG and PNG
pth = '../img/micro-expression-heatmap.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

pth = '../img/micro-expression-heatmap.png'
plt.savefig(pth, format='png', bbox_inches='tight')

In [None]:
import scikit_posthocs as sp
from scipy.stats import mannwhitneyu

# Expression of two genes (CD274, CTLA4) per sample, in long format with the
# sample's Hydra cluster attached.
checkpoint = pd.DataFrame(columns=['sample',
                                   'cluster', 
                                   'gene', 
                                   'value'])

for sample in exp.columns:
    cluster = assign.loc[sample, 1]
    for gene in ['CD274', 'CTLA4']:
        checkpoint.loc[len(checkpoint), :] = [sample,
                                              cluster, 
                                              gene, 
                                              exp.loc[gene, sample]]
        
checkpoint['value'] = pd.to_numeric(checkpoint['value'])

sns.set(font_scale=1.5, style='white')

g = sns.catplot(x='cluster', 
                y='value', 
                col='gene',
                kind='box',
                col_wrap=2,
                color='white',
                sharex=False,
                sharey=False,
                data=checkpoint)

# CD274
sns.swarmplot(x='cluster',
              y='value',
              color='k',
              size=5,
              data=checkpoint[checkpoint['gene'] == 'CD274'],
              ax=g.axes[0])

# CTLA4
sns.swarmplot(x='cluster',
              y='value',
              color='k',
              size=5,
              data=checkpoint[checkpoint['gene'] == 'CTLA4'],
              ax=g.axes[1])

for i in range(len(g.axes)):
    g.axes[i].set_xlabel('Hydra Cluster')
    g.axes[i].set_ylabel('Expression (log2(TPM + 1))')
    
plt.subplots_adjust(hspace=0.3, wspace=0.4)

pth = '../img/Checkpoint-Plots-V1.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

# Pairwise Mann-Whitney tests between clusters for each gene, with
# Benjamini-Hochberg ('fdr_bh') adjustment
for gene in ['CD274', 'CTLA4']:
    print gene
    print sp.posthoc_mannwhitney(checkpoint[checkpoint['gene'] == gene],
                           val_col='value',
                           group_col='cluster',
                           p_adjust='fdr_bh')

In [None]:
# Mutation burden per sample, grouped by Hydra cluster.
# The spreadsheet is indexed by its first column (looked up by sample root
# below), with the header taken from row 1.
pth = '../data/meta/ng.2529-S2.xlsx'

pugh = pd.read_excel(pth, index_col=0, header=1)

burden = pd.DataFrame(columns=['sample', 'burden', 'cluster'])

for sample in exp.columns:
    
    cluster = assign.loc[sample, 1]
    # Sample root = first three '-'-separated fields of the sample id
    root = '-'.join(sample.split('-')[:3])
    
    try:
        bd = pugh.loc[root, 'Nonsilent per Mb']
        
    except KeyError:
        # Sample root not present in the spreadsheet: report and skip
        print 'Missing: ', sample
        continue
        
    burden.loc[len(burden), :] = [sample, bd, cluster]
    
burden['burden'] = pd.to_numeric(burden['burden'])

fig, ax = plt.subplots(1, 
                       figsize=(5, 5))

# sns.boxplot returns the Axes, so `g` is the same object as `ax`
g = sns.boxplot(x='cluster',
                y='burden',
                data=burden,
                color='white',
                ax=ax)


sns.swarmplot(x='cluster',
              y='burden',
              data=burden,
              color='k',
              size=5,
              ax=ax)

g.set_xlabel('Hydra Cluster')
g.set_ylabel('Nonsilent Mutations / Mb')

pth = '../img/mutation-burden.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

In [None]:
# Display the sample -> cluster table again for reference
assign

In [None]:
# List the label files that the clustering comparison below reads
!ls ../data/cluster-analysis/

In [None]:
import glob 
from sklearn.metrics import adjusted_rand_score

# Clustering Analysis
# Compare every alternative clustering label file against the Hydra
# assignments. NOTE: the 'Rand Index' column holds the value returned by
# adjusted_rand_score.

_dir = '../data/cluster-analysis/'

cluster_screen = pd.DataFrame(columns=['Method', 'Threshold', 'Clusters', 'Rand Index'])
for pth in glob.glob(os.path.join(_dir, 'MYCN-NA-*labels*')):
    print(pth)
    if "MYCN-NA-M3C.tsv" in pth:
        continue
        
    # File names are parsed positionally after splitting on '-':
    # field 2 -> method, field 4 -> threshold, last field -> cluster count
    # (e.g. 'MYCN-NA-M3C-MAD-5000-labels-2.tsv').
    b = os.path.basename(pth)
    fields = b.split('-')
    
    method = fields[2] if fields[2] == 'M3C' else 'Gap Statistic\nKmeans'
    threshold = fields[4]
    clusters = fields[-1].replace('.tsv', '')
    
    
    labels = pd.read_csv(pth, sep='\t')
    samples = [x.replace('.', '-') for x in labels.index.values]
    labels.index = samples
    
    # The label column name depends on the method that produced the file
    if method == 'M3C':
        clustering = labels.reindex(assign.index)['consensuscluster'].values
            
    else:
        clustering = labels.reindex(assign.index)['x'].values
    
    hydra_clusters = assign[1].values
    ri = adjusted_rand_score(clustering, hydra_clusters)
    
    cluster_screen.loc[len(cluster_screen), :] = [method, threshold, clusters, ri]
    
# Values were collected as strings/objects; convert for plotting
cluster_screen['Threshold'] = pd.to_numeric(cluster_screen['Threshold'])
cluster_screen['Clusters'] = pd.to_numeric(cluster_screen['Clusters'])
cluster_screen['Rand Index'] = pd.to_numeric(cluster_screen['Rand Index'])

In [None]:
# Display the comparison table
cluster_screen

In [None]:
# Bars (left axis): number of clusters per MAD threshold and method.
# Points (right, twin axis): the 'Rand Index' column for the same thresholds.
fig, ax = plt.subplots(1, figsize=(8, 6))

sns.barplot(x='Threshold', 
            y='Clusters', 
            hue='Method',
            data=cluster_screen, ax=ax)

ax.set_yticks(range(7))
ax.set_yticklabels(range(7))
ax.set_xlabel("Median Absolute Deviation (MAD) Threshold")

# Place the legend outside the plotting area
ax.legend(bbox_to_anchor=(1.5, 1.0), frameon=False)

ax2 = ax.twinx()
sns.pointplot(x='Threshold', 
            y='Rand Index', 
            hue='Method', 
            data=cluster_screen,
            ax=ax2)

# The twin axis would otherwise draw a second, duplicate legend
ax2.get_legend().remove()

plt.savefig('../img/clustering-screen.svg', 
            format='svg', 
            bbox_inches='tight')

In [None]:
def get_event(event):
    """Encode a 'First Event' value as an event indicator.

    Returns 0 for 'Censored', 1 for any of the recognised event names, and
    NaN (after printing "NULL") for a missing value. Any other value raises
    ValueError carrying the offending value.
    """
    if pd.isnull(event):
        print("NULL")
        return np.nan

    if event == 'Censored':
        return 0

    observed = ('Relapse',
                'Death',
                'Progression',
                'Event',
                'Second Malignant Neoplasm')

    if event not in observed:
        raise ValueError(event)

    return 1
        
def get_vital(vital):
    """Encode a 'Vital Status' value as a death indicator.

    Returns 0 for 'Alive', 1 for 'Dead', and NaN (after printing "NULL") for
    a missing value. Any other value raises ValueError carrying the
    offending value.
    """
    if pd.isnull(vital):
        print("NULL")
        return np.nan

    if vital == 'Dead':
        return 1

    if vital == 'Alive':
        return 0

    raise ValueError(vital)

# Build the survival table: one row per sample with overall survival time,
# vital status, event-free survival time, event indicator and Hydra cluster.
# Rows that are skipped below keep their initial NaN values.
pth = '../data/meta/TARGET_NBL_ClinicalData_Discovery_20170525.xlsx'
clinical = pd.read_excel(pth, index_col=0)

surv = pd.DataFrame(index=assign.index, 
                    columns=['OS', 'vital', 'EFS', 'event', 'cluster'])

for sample in assign.index.values:
    # Clinical records are looked up by sample root (first three '-' fields)
    root = '-'.join(sample.split('-')[:3])
    if root not in clinical.index:
        print "Missing: ", sample
        continue
    
    OS = clinical.loc[root, 'Overall Survival Time in Days'].item()
    vital = get_vital(clinical.loc[root, 'Vital Status'])
    
    EFS = clinical.loc[root, 'Event Free Survival Time in Days'].item()
    event = get_event(clinical.loc[root, 'First Event'])
    
    cluster = assign.loc[sample, 1]
    # NOTE(review): samples in cluster 3 are left out of the survival table
    # (their rows stay NaN); the reason is not stated here -- confirm intent.
    if cluster == 3:
        continue
    
    surv.loc[sample, :] = [OS, vital, EFS, event, cluster]

In [None]:
# Save the survival table (tab-separated, sample ids as the index column)
survpth = '../data/mycn-na-survival.tsv'
surv.to_csv(survpth, sep='\t')

In [None]:
# Display the survival table
surv