In [None]:
%matplotlib inline
import bnpy
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import random
import os
import pickle
import matplotlib.pyplot as plt
import gzip

from subprocess import Popen, PIPE
from threading import Timer
from scipy.stats import iqr, percentileofscore
from statsmodels import robust
from sklearn.metrics import roc_curve, auc

# Date stamp for this session in ISO 8601 form (YYYY-MM-DD).
# NOTE: the `for date in dates:` loop further down rebinds this name.
date = datetime.date.today().isoformat()

In [None]:
# Expression table for the TCGA glioblastoma cohort.  The first column of the
# TSV becomes the row index; the gene-set cell below looks rows up by gene
# symbol, so rows are presumably genes and columns samples.
pth = '../data/TCGA-glioblastoma-multiforme-log2TPM1.tsv'

nbl = pd.read_csv(
    pth,
    sep='\t',
    index_col=0,
)

In [None]:
pth = '../data/h.all.v6.2.symbols.gmt'

# gene-set name -> list of member gene symbols
gss = {}
# gene-set name -> mean, over the member genes that have complete data in
# `nbl`, of each gene's mean expression across samples
gss_mean_mean = {}
# Declared alongside the above but not filled in by this cell.
gss_median_mean = {}
gss_mean_mad = {}
with open(pth) as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue

        # GMT rows are tab-delimited: name, description, gene, gene, ...
        # Splitting on tabs only keeps the gene list intact even when the
        # description column contains spaces.
        fields = line.split('\t')
        genes = [_gene for _gene in fields[2:] if _gene]
        gss[fields[0]] = genes

        # reindex() inserts all-NaN rows for genes absent from `nbl`;
        # dropna() then removes those along with any gene that has a missing
        # value in some sample.
        _mean = nbl.reindex(genes).dropna().mean(axis=1).mean()
        gss_mean_mean[fields[0]] = _mean

In [None]:
# Display name for each scoring method, keyed by its lowercase identifier.
mformat = dict(hydra='Hydra', ssgsea='ssGSEA', gsva='GSVA')

# Hex colour associated with each scoring method.
mcolor = dict(hydra='#003050', ssgsea='#7096a0', gsva='#b0b7a7')

In [None]:
import uuid
import numpy as np


def fit(xdata, gamma=5.0, sF=2.0, K=2, nLap=1000):
    """
    Fit a bnpy 'DPMixtureModel' with 'Gauss' observations using 'memoVB'.

    The defaults reproduce the previously hard-coded settings, so `fit(xdata)`
    behaves as before.  No random seed is passed to bnpy here, so repeated
    calls are not guaranteed to give the same model.

    :param xdata: dataset object handed straight to `bnpy.run`
                  (callers in this notebook pass a `bnpy.data.XData`)
    :param gamma: forwarded to bnpy as `gamma0`
    :param sF: forwarded to bnpy as `sF` (used together with ECovMat='eye')
    :param K: forwarded to bnpy as `K` (presumably the initial number of
              components -- confirm against the bnpy documentation)
    :param nLap: forwarded to bnpy as `nLap`
    :return: tuple (fitted model, the `xdata` that was passed in)
    """
    # A fresh UUID directory per call keeps runs from sharing bnpy output.
    # The folder name mentions only "merge,shuffle" although all four moves
    # are enabled below; it is kept unchanged so output paths stay the same.
    output_path = ('/tmp/%s/' % uuid.uuid4() +
                   'trymoves-K=%d-gamma=%s-ECovMat=%s*eye-moves=merge,shuffle/' % (
                       K, gamma, sF))

    hmodel, _ = bnpy.run(
        xdata, 'DPMixtureModel', 'Gauss', 'memoVB',
        output_path=output_path,
        nLap=nLap, nTask=1, nBatch=1,
        gamma0=gamma, sF=sF, ECovMat='eye',
        K=K, initname='randexamplesbydist',
        moves='birth,merge,delete,shuffle',
        b_startLap=0,
        m_startLap=2,
        d_startLap=2,
        doWriteStdOut=False)

    return hmodel, xdata

def get_assignments(model, data):
    """
    Assign every sample in `data` to a mixture component of `model`.

    A sample receives the index of the component with the largest
    responsibility.  If that largest responsibility is smaller than the
    probability mass not covered by the active components
    (1 - sum of active component probabilities), the sample is labelled -1
    and a message is printed.

    :param model: object exposing `allocModel.get_active_comp_probs()` and
                  `calc_local_params(data)` returning a dict with a 2-D
                  'resp' array (rows are samples, columns are components)
    :param data: dataset passed unchanged to `model.calc_local_params`
    :return: list with one entry per row of 'resp': a component index or -1
    """
    # Probability mass left over once all active components are accounted for.
    unclass = 1 - np.sum(model.allocModel.get_active_comp_probs())

    resp = model.calc_local_params(data)['resp']

    asnmts = []
    for row in range(resp.shape[0]):
        weights = resp[row, :]
        if np.max(weights) < unclass:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('Could not classify sample')
            asnmts.append(-1)
        else:
            asnmts.append(np.argmax(weights))

    return asnmts

In [None]:
def get_hydra_auc(hydra_dir, tag, gs, test):
    """
    ROC/AUC of a Hydra-style mixture model for one gene set.

    Steps:
      1. Try to load the stored model under
         <hydra_dir>/<tag>/<gs>/MultivariateAnalysis/<gs>.  This is only an
         existence check: if loading raises IOError a warning is printed and
         three NaNs are returned.  The loaded model itself is not used.
      2. Read 'training-data.tsv' from the same directory, centre every row on
         its mean across columns and refit a model with `fit`.
      3. Take the component whose mean vector has the largest Euclidean norm
         as the "active" component.
      4. Reindex `test` to the training rows, centre it with the *training*
         row means, and use each sample's responsibility for that component
         as its score.

    :param hydra_dir: root directory of the Hydra output
    :param tag: configuration sub-directory name
    :param gs: gene-set name (directory name and model prefix)
    :param test: DataFrame with samples in columns; a column name containing
                 'active' is labelled 1, one containing 'normal' is labelled -1
    :return: (auc, fpr, tpr), or (nan, nan, nan) when no stored model exists
    :raises ValueError: if a test column name contains neither marker
    """
    analysis_dir = os.path.join(hydra_dir,
                                tag,
                                gs,
                                'MultivariateAnalysis',
                                gs)

    try:
        # Result deliberately discarded: the model scored below is refit.
        bnpy.ioutil.ModelReader.load_model_at_prefix(analysis_dir,
                                                     prefix=gs)
    except IOError:
        # One formatted string prints the same on Python 2 and 3 (a
        # parenthesised argument list is printed as a tuple on Python 2).
        print('WARNING: Missing Model! %s %s' % (gs, tag))
        return np.nan, np.nan, np.nan

    train = pd.read_csv(os.path.join(analysis_dir, 'training-data.tsv'),
                        sep='\t',
                        index_col=0)

    train_mean = train.mean(axis=1)
    train_center = train.sub(train_mean, axis=0)
    # XData receives the transposed matrix, i.e. one row per sample.
    model, _ = fit(bnpy.data.XData(train_center.values.T))

    # Component with the largest-norm mean; on ties the first one wins.
    # The explicit None check avoids comparing a float with None, which is a
    # TypeError on Python 3.
    maxi = None
    max_norm = None
    for i in range(len(model.allocModel.get_active_comp_probs())):
        norm = np.linalg.norm(model.obsModel.get_mean_for_comp(i))
        if max_norm is None or norm > max_norm:
            maxi = i
            max_norm = norm

    test = test.reindex(train.index)
    test_center = test.sub(train_mean, axis=0)
    test_xdata = bnpy.data.XData(test_center.values.T)

    probs = model.calc_local_params(test_xdata)['resp']

    test_labels = []
    for sample in test.columns:
        if 'active' in sample:
            test_labels.append(1)
        elif 'normal' in sample:
            test_labels.append(-1)
        else:
            raise ValueError('Cannot infer class label from sample name %r' % (sample,))

    scores = probs[:, maxi].flatten()
    fpr, tpr, thresholds = roc_curve(test_labels, scores, pos_label=1)

    return auc(fpr, tpr), fpr, tpr


def get_ssgsea_auc(tag, gs, test, date):
    """
    ROC/AUC of the ssGSEA score of gene set `gs` on the test samples.

    Scores are read from ../data/output/<date>/ssGSEA/<tag>/<gs>, a
    tab-separated table whose first column is the row index and whose
    remaining columns are samples.  The row labelled `gs` is the score vector.

    :param tag: configuration sub-directory name
    :param gs: gene-set name (file name and row label)
    :param test: DataFrame with samples in columns; a column name containing
                 'active' is labelled 1, one containing 'normal' is labelled -1
    :param date: name of the output sub-directory to read from
    :return: (auc, fpr, tpr)
    :raises ValueError: if a test column name contains neither marker
    """
    ssgsea_pth = os.path.join('../data/output', date, 'ssGSEA', tag, gs)
    ssgsea = pd.read_csv(ssgsea_pth, sep='\t', index_col=0)
    # Score files carry '.' where the test sample names have '-'
    # (presumably rewritten by the scoring tool -- confirm).
    ssgsea.columns = [x.replace('.', '-') for x in ssgsea.columns]

    scores = ssgsea.loc[gs]
    # Labels below follow the order of test.columns, so put the scores in
    # that order too whenever every test sample can be found by name.
    # Otherwise keep the file's column order, as the original code did.
    if scores.index.is_unique and test.columns.isin(scores.index).all():
        scores = scores.reindex(test.columns)

    test_labels = []
    for sample in test.columns:
        if 'active' in sample:
            test_labels.append(1)
        elif 'normal' in sample:
            test_labels.append(-1)
        else:
            raise ValueError('Cannot infer class label from sample name %r' % (sample,))

    fpr, tpr, thresholds = roc_curve(test_labels, scores.values, pos_label=1)
    return auc(fpr, tpr), fpr, tpr


def get_gsva_auc(tag, gs, test, date):
    """
    ROC/AUC of the GSVA score of gene set `gs` on the test samples.

    Scores are read from ../data/output/<date>/GSVA/<tag>/<gs>, a
    tab-separated table whose first column is the row index and whose
    remaining columns are samples.  The row labelled `gs` is the score vector.

    :param tag: configuration sub-directory name
    :param gs: gene-set name (file name and row label)
    :param test: DataFrame with samples in columns; a column name containing
                 'active' is labelled 1, one containing 'normal' is labelled -1
    :param date: name of the output sub-directory to read from
    :return: (auc, fpr, tpr)
    :raises ValueError: if a test column name contains neither marker
    """
    gsva_pth = os.path.join('../data/output', date, 'GSVA', tag, gs)
    gsva = pd.read_csv(gsva_pth, sep='\t', index_col=0)
    # Score files carry '.' where the test sample names have '-'
    # (presumably rewritten by the scoring tool -- confirm).
    gsva.columns = [x.replace('.', '-') for x in gsva.columns]

    scores = gsva.loc[gs]
    # Labels below follow the order of test.columns, so put the scores in
    # that order too whenever every test sample can be found by name.
    # Otherwise keep the file's column order, as the original code did.
    if scores.index.is_unique and test.columns.isin(scores.index).all():
        scores = scores.reindex(test.columns)

    test_labels = []
    for sample in test.columns:
        if 'active' in sample:
            test_labels.append(1)
        elif 'normal' in sample:
            test_labels.append(-1)
        else:
            raise ValueError('Cannot infer class label from sample name %r' % (sample,))

    fpr, tpr, thresholds = roc_curve(test_labels, scores.values, pos_label=1)
    return auc(fpr, tpr), fpr, tpr

In [None]:
import glob
import re

# Most recent AUC per gene set for every method.  Keyed by gene set only, so
# a later (effect, %DEG) configuration overwrites an earlier one.
aucs = {'hydra': {}, 'ssgsea': {}, 'gsva': {}}

# ROC curves, laid out as
# {method: {%DEG fraction: {effect size: {gene set: (fpr, tpr)}}}}
datar = {}
for m in ['hydra', 'ssgsea', 'gsva']:
    datar[m] = {}
    for d in ['0.10', '0.25']:
        datar[m][d] = {}
        for e in ['0.25', '0.50', '0.75', '1.00', '1.50', '2.00', '2.50', '3.00']:
            datar[m][d][e] = {}

# Result rows are collected in a list and turned into a DataFrame once at the
# end, instead of growing the frame one row at a time.
plot_columns = ['method', 'gene-set', 'auc', 'gs_mean', 'effect', 'difffrac']
plot_rows = []

# Raw string so that \w, \d and \. reach the regex engine untouched (as a
# normal string literal they are invalid escape sequences on Python 3).
regex = re.compile(r'(?P<gs>HALLMARK_\w*).*-eff-(?P<eff>\d\.\d*)-diff-(?P<diff>\d\.\d*)-frac-(?P<frac>\d\.\d*)')

dates = ['2019-09-07', '2019-10-02']
for date in dates:
    input_dir = os.path.join('../data/input', date)
    output_dir = os.path.join('../data/output', date)
    hydra_dir = os.path.join(output_dir, 'Hydra')

    train_pths = os.path.join('../data/input', date, '*train*eff-*')
    for _pth in glob.glob(train_pths):
        print(_pth)
        m = regex.search(_pth)
        if not m:
            raise ValueError('Unrecognised training file name: %s' % _pth)

        # groups() is (gs, eff, diff, frac); the tag uses the last three.
        tag = "eff-%s-diff-%s-frac-%s" % m.groups()[1:]
        exp = pd.read_csv(_pth, sep='\t', index_col=0)
        gs = m.group('gs')
        print('%s %s' % (gs, tag))

        # Pull in degs (loaded for reference; not used further in this cell)
        deg_pth = os.path.join(input_dir, 'synthetic-%s-degs-%s-%s.tsv' % (gs, tag, date))
        with open(deg_pth, 'r') as f:
            degs = [line.strip() for line in f]

        # The test matrix sits next to the training one under the same name
        # with 'train' replaced by 'test'.
        test = pd.read_csv(_pth.replace('train', 'test'),
                           sep='\t',
                           index_col=0)

        # (key into aucs/datar, display name, zero-argument scorer)
        scorers = [
            ('hydra', 'Hydra', lambda: get_hydra_auc(hydra_dir, tag, gs, test)),
            ('ssgsea', 'ssGSEA', lambda: get_ssgsea_auc(tag, gs, test, date)),
            ('gsva', 'GSVA', lambda: get_gsva_auc(tag, gs, test, date)),
        ]

        try:
            for key, label, score in scorers:
                aucs[key][gs], fpr, tpr = score()
                plot_rows.append([label,
                                  gs,
                                  aucs[key][gs],
                                  gss_mean_mean[gs],
                                  m.group('eff'),
                                  m.group('diff')])
                print('%s AUC:  %s' % (label, aucs[key][gs]))
                datar[key][m.group('diff')][m.group('eff')][gs] = (fpr, tpr)

        except IOError:
            # A missing output file abandons the remaining methods for this
            # configuration; rows already recorded for it are kept.
            print('Missing: %s' % gs)
            continue

plot_df = pd.DataFrame(plot_rows, columns=plot_columns)

In [None]:
# Cast the measurement columns to numeric dtypes.
for _col in ('auc', 'gs_mean', 'effect'):
    plot_df[_col] = pd.to_numeric(plot_df[_col])

In [None]:
# One colour per method, in the order the plots below pair them:
# Hydra, ssGSEA, GSVA.
palette = [
    '#0e3b59',
    '#7096a0',
    '#b0b7a7',
]

In [None]:
# Show the configurations whose AUC is missing.
plot_df.loc[plot_df['auc'].isna(), ['gene-set', 'effect', 'difffrac']]

In [None]:
# Configurations (gene set, effect size, %DEG fraction) with a missing AUC.
lost_gs = plot_df.loc[pd.isna(plot_df['auc']), ['gene-set', 'effect', 'difffrac']]

# Remove every row of such a configuration, for all methods, so that the
# methods are compared on the same set of configurations.
for _, gs, eff, diff in lost_gs.itertuples():
    same_set = plot_df['gene-set'] == gs
    same_eff = plot_df['effect'] == eff
    same_diff = plot_df['difffrac'] == diff
    mask = same_set & same_eff & same_diff
    plot_df = plot_df.loc[~mask]

In [None]:
# Human-readable '%DEG' label derived from the 'difffrac' string.
for _frac, _pct in (('0.10', '10%'), ('0.25', '25%')):
    plot_df.loc[plot_df['difffrac'] == _frac, '%DEG'] = _pct

In [None]:
def mean_confidence_interval(data, confidence=0.95):
    """
    Mean of `data` with a two-sided Student-t confidence interval.

    The half-width is sem(data) * t.ppf((1 + confidence) / 2, n - 1), where
    n is the number of observations.

    :param data: one-dimensional sequence of numbers
    :param confidence: confidence level, strictly between 0 and 1
    :return: tuple (mean, lower bound, upper bound)
    """
    import numpy as np
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` has been loaded.
    from scipy import stats

    a = np.asarray(data, dtype=float)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, m - h, m + h

def mean_confidence_interval2(data, confidence=0.95):
    """
    Mean of `data` and the half-width of its two-sided Student-t interval.

    The half-width is sem(data) * t.ppf((1 + confidence) / 2, n - 1), where
    n is the number of observations, so the interval is mean +/- half-width.

    :param data: one-dimensional sequence of numbers
    :param confidence: confidence level, strictly between 0 and 1
    :return: tuple (mean, half-width)
    """
    import numpy as np
    # Import the submodule explicitly: a bare `import scipy` does not
    # guarantee that `scipy.stats` has been loaded.
    from scipy import stats

    a = np.asarray(data, dtype=float)
    n = len(a)
    m, se = np.mean(a), stats.sem(a)
    h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
    return m, h

In [None]:
# Mean AUC with its 95% confidence bounds, per method, over all remaining rows.
# A single formatted string prints identically on Python 2 and Python 3.
for _name in ('Hydra', 'ssGSEA', 'GSVA'):
    _ci = mean_confidence_interval(plot_df.loc[(plot_df['method'] == _name), 'auc'])
    print('%s:  %s' % (_name, str(_ci)))

In [None]:
sns.set(style='white', font_scale=1.5)

# based on:
# https://stats.stackexchange.com/questions/186337/average-roc-for-repeated-10-fold-cross-validation-with-probability-estimates

fig, ax = plt.subplots(1, 2, figsize=(13, 5))

# Common FPR grid onto which every individual ROC curve is interpolated.
base_fpr = np.linspace(0, 1, 101)

# %DEG fraction -> panel index, and -> value of the '%DEG' column in plot_df.
diff_ind = {'0.10': 0, '0.25': 1}
deg_label = {'0.10': '10%', '0.25': '25%'}

for color, method, name in zip(palette,
                               ['hydra', 'ssgsea', 'gsva'],
                               ['Hydra', 'ssGSEA', 'GSVA']):

    for diff, l1 in datar[method].items():
        if diff not in diff_ind:
            continue

        curves = []
        for eff, l2 in l1.items():
            # Only effect sizes of at least 1.0 contribute to the mean curve.
            if float(eff) < 1.0:
                continue
            for gs, (fpr, tpr) in l2.items():
                mask = (plot_df['gene-set'] == gs) & (plot_df['effect'] == float(eff)) & (plot_df['difffrac'] == diff)
                if not mask.any():
                    # Configuration was dropped from plot_df earlier on.
                    print('Skipping Lost Model %s' % gs)
                    continue

                curves.append(np.interp(base_fpr, fpr, tpr))

        if not curves:
            print('No ROC curves for %s at %%DEG fraction %s' % (name, diff))
            continue

        # Always 2-D (curves x grid points), so the column-wise mean is a
        # curve even when only one curve was collected.
        tprs = np.vstack(curves)
        mean_tprs = tprs.mean(axis=0)

        # NOTE(review): the AUC summary in the legend uses every effect size
        # in plot_df, while the curve above only uses effect sizes >= 1.0.
        in_panel = (plot_df['method'] == name) & (plot_df['%DEG'] == deg_label[diff])
        auc_mean, auc_half = mean_confidence_interval2(plot_df.loc[in_panel, 'auc'])

        # Prepend the origin so each curve starts at (0, 0).  np.concatenate
        # is required: `[0.] + array` adds 0 element-wise and prepends nothing.
        ax[diff_ind[diff]].plot(np.concatenate(([0.], base_fpr)),
                                np.concatenate(([0.], mean_tprs)),
                                color=color,
                                label=r"%s ($%.2f \pm %.2f$)" % (name, auc_mean, auc_half),
                                linewidth=2.5)

for i in [0, 1]:
    # Chance diagonal
    ax[i].plot([0, 1], [0, 1], 'r--')
    ax[i].set_xlim([-0.05, 1.02])
    ax[i].set_ylim([-0.05, 1.02])

    ax[i].set_ylabel("True Positive Rate")
    ax[i].set_xlabel("False Positive Rate")
    ax[i].set_aspect("equal", 'datalim')

    # Legend entries come from the labels given at plot time, so each text
    # always matches its own line.
    L = ax[i].legend(title='Method',
                     frameon=False,
                     bbox_to_anchor=(0.35, 0.45))

ax[0].set_title("%DEG = 10%")
ax[1].set_title("%DEG = 25%")

sns.despine()

plt.subplots_adjust(wspace=0.4)

pth = '../img/ROC-plot-per-diff.svg'
plt.savefig(pth, format='svg', bbox_inches='tight')

pth = '../img/ROC-plot-per-diff.png'
plt.savefig(pth, format='png', bbox_inches='tight')

In [None]:
# Display the results table (columns: method, gene-set, auc, gs_mean, effect,
# difffrac and the derived %DEG label).
plot_df

In [None]:
sns.set(style='white', font_scale=1.5)

# Point plot of AUC against effect size: one facet per %DEG level, one hue
# per method.
g = sns.catplot(data=plot_df,
                kind='point',
                x='effect',
                y='auc',
                hue='method',
                col='%DEG',
                col_order=['10%', '25%'],
                palette=palette,
                scatter_kws={'alpha': 1.0},
                line_kws={'alpha': 1.0},
                sharex=False,
                sharey=False,
                legend=False,
                aspect=1.2)

axes = g.axes

# Same axis range, labels and ticks on every facet.
for ax in axes.flat:
    ax.set_ylim(0.4, 1.1)
    ax.set_xlabel('Effect Size')
    ax.set_ylabel('AUC')
    ax.set_yticks([0.5, 0.6, 0.7, 0.8, 0.9, 1.0])

plt.subplots_adjust(wspace=0.4)

# Drawn through pyplot, so the legend lands on the current (last) axes.
plt.legend(title='Method', frameon=False, loc=(0.6, 0.1))

for pth, _fmt in (('../img/auc-vs-eff.svg', 'svg'),
                  ('../img/auc-vs-eff.png', 'png')):
    plt.savefig(pth, format=_fmt, bbox_inches='tight')