In [None]:
from __future__ import print_function
import pickle
import pandas as pd
import re
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
import pyensae
import matplotlib.pyplot as plt
from pyensae.graphhelper import Corrplot
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
#plt.style.use('ggplot')

In [None]:
# read sample data
# NOTE(review): pickle.load can execute arbitrary code while deserializing; only
# load files that were produced by this project's own pipeline.
# The `with` block guarantees the file handle is closed once loading finishes.
with open("result_04_processed_data_no_scale_Final.obj", "rb") as processed_file:
    processed_data = pickle.load(processed_file)
# The pickled object is indexed like a dict with (at least) these three keys.
cytof_files = processed_data["cytof_files"]   # per-file metadata; used as a DataFrame by later cells
expr_list = processed_data["expr_list"]       # expression array; reshaped by later cells
coln = processed_data["marker_names"]         # marker names; later used as expression column labels

In [None]:
# prepare subject data
# Build a subject-by-study presence table: one row per subject, one column per
# study, 1 where the subject has at least one file in that study and 0 otherwise.
subject_study_pairs = (
    cytof_files[['subject_accession', 'study_accession']]
    .drop_duplicates()
    .assign(presence=1)
)
subject_df = (
    subject_study_pairs
    .pivot(index='subject_accession',
           columns='study_accession',
           values='presence')
    .fillna(0)  # subject/study combinations that never occur become 0
)
# Drop the "study_accession" label left on the column axis by pivot.
subject_df.columns.name = None
# Turn the subject_accession index back into an ordinary column.
subject_df = subject_df.reset_index()

In [None]:
# find overlap between studies
# For every pair of studies compute shared subjects / subjects in either study.
subject_mx = subject_df.drop("subject_accession", axis=1)
col = subject_mx.columns
subject_mx = subject_mx.values

# shared[i, j] = number of subjects present in both study i and study j
shared = subject_mx.transpose().dot(subject_mx)
# per_study[i] = number of subjects present in study i
per_study = np.sum(subject_mx, axis=0)
# either[i, j] = per_study[i] + per_study[j] - shared[i, j], built by broadcasting
# a column vector against a row vector.
either = np.subtract(
    np.add(per_study[np.newaxis, :], per_study[:, np.newaxis]),
    shared,
)
overlap = pd.DataFrame(np.divide(shared, either), index=col, columns=col)

In [None]:
# plot overlap
# Draw the study-by-study overlap table with pyensae's Corrplot and save it as a PDF.
overlap_corrplot = Corrplot(overlap)
p = overlap_corrplot.plot(method="circle", binarise_color=True)
# The figure handle is fetched but not used; plt.savefig below writes the
# current pyplot figure.
p.get_figure()
plt.savefig("result_07_subject_overlap.pdf")

In [None]:
# plot age
# Label every file with its data split, derived from the study it belongs to:
# SDY515 -> "validation", SDY519 -> "test", every other study -> "train".
x = cytof_files.shape[0]
study = cytof_files["study_accession"]
is_valid = (study == "SDY515").values
is_test = (study == "SDY519").values

# Positional row indices of each split, kept as plain lists under their original names.
valid_id = np.flatnonzero(is_valid).tolist()
test_id = np.flatnonzero(is_test).tolist()
train_id = np.flatnonzero(~(is_valid | is_test)).tolist()

# One whole-column assignment replaces the former chained
# `cytof_files.group.iloc[ids] = ...` writes, which raise SettingWithCopyWarning
# on older pandas and have no effect at all under pandas Copy-on-Write.
cytof_files["group"] = np.where(is_valid, "validation",
                                np.where(is_test, "test", "train"))

# Age distribution of the files in each split.
sns.stripplot(x="group", y="age", data=cytof_files,
              size=4, jitter=True, edgecolor="gray")
plt.show()

In [None]:
# plot CMV proportion
# Percentage of files with CMV_Ab > 2 versus not, within each data split.
cmv_flags = cytof_files.loc[:, ["CMV_Ab", "group"]]
# NOTE(review): a missing CMV_Ab compares as False here and is therefore counted
# together with values <= 2 -- confirm that this is intended.
cmv_flags["CMV_Ab"] = cmv_flags["CMV_Ab"] > 2

# Files per split (denominator) and files per split/CMV flag (numerator).
CMV_df_group = (cmv_flags.groupby(["group"])
                .size()
                .reset_index(name='group_counts'))
cmv_counts = (cmv_flags.groupby(["group", "CMV_Ab"])
              .size()
              .reset_index(name='counts'))

CMV_df = cmv_counts.merge(CMV_df_group, on=["group"]).reset_index()
CMV_df["percent"] = CMV_df["counts"] / CMV_df["group_counts"] * 100
display(CMV_df)

# One row per split, one column per CMV flag, then a stacked bar per split in a
# fixed train / validation / test order.
pivot_df = CMV_df.pivot(index='group', columns='CMV_Ab', values='percent')
split_order = ["train", "validation", "test"]
p = pivot_df.loc[split_order, :].plot.bar(stacked=True)
p.get_figure()
plt.savefig("result_07_CMV_percent.pdf")

In [None]:
# identify studies for cytof data
# Flatten the expression array into a 2-D table and tag every row with the
# metadata of the file it came from.
# NOTE(review): presumably expr_list is (files, cells, markers, 1) with axis 0 in
# the same order as the rows of cytof_files -- TODO confirm.
expr_3d = expr_list.reshape(expr_list.shape[0:3])
n_blocks, rows_per_block, n_features = expr_3d.shape
expr_df = pd.DataFrame(expr_3d.reshape([n_blocks * rows_per_block, n_features]))

# Repeat each cytof_files row's metadata once per row of its block.
meta_columns = ["file_info_id", "study_accession", "group"]
for meta_name in meta_columns:
    expr_df[meta_name] = (cytof_files[meta_name]
                          .repeat(expr_list.shape[1])
                          .values)

# Label the expression columns with the marker names, followed by the metadata columns.
expr_df.columns = np.append(coln.values,
                            ["file_info_id","study_accession","group"])

In [None]:
# plot batch effect
# Project a random subset of rows from two studies onto their first two
# principal components and colour the points by study.
B_df = expr_df[(expr_df.study_accession.isin(["SDY315","SDY311"]))]
# Fixed seed so the saved figure is reproducible on re-run; the size is capped so
# that having fewer than 5000 matching rows no longer raises a ValueError.
B_df = B_df.sample(min(5000, len(B_df)), random_state=0)

# Numeric expression matrix only (metadata columns removed).
Bx = B_df.drop(["file_info_id","study_accession","group"],axis=1).values

X_embedded = PCA(n_components=2).fit_transform(Bx)
plt.figure(figsize=(5, 5))

p = sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1],
                    hue=B_df.study_accession, s=15)
p.get_figure()
plt.savefig("result_07_PCA.pdf")


In [None]:
# plot ROC curve
import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier

# (legend label template, pickled result file) for each model that is compared.
# Each file is read as a dict holding the true labels under "true" and the
# predicted scores under "score".
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
roc_sources = [
    ('CNN (AUC = %0.2f)', "result_05_deep_learning_ROC.obj"),
    ('CNN + demographics (AUC = %0.2f)', "result_09_deep_learning_ROC.obj"),
    ('Gating Result + RandomForest  (AUC = %0.2f)', "result_08_10k_RF_ROC.obj"),
    ('FlowSOM + RandomForest  (AUC = %0.2f)', "result_08_flowSOM_RF_ROC.obj"),
]

plt.figure(figsize=(5,5))

roc_results = []
roc_aucs = []
for label_template, roc_path in roc_sources:
    with open(roc_path, "rb") as roc_file:
        roc_result = pickle.load(roc_file)
    fpr, tpr, _ = roc_curve(roc_result["true"], roc_result["score"])
    roc_area = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=label_template % roc_area)
    roc_results.append(roc_result)
    roc_aucs.append(roc_area)

# Keep the original global names; later cells read roc1 and roc5.
roc1, roc1_2, roc4, roc5 = roc_results
roc_auc1, roc_auc2, roc_auc4, roc_auc5 = roc_aucs

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Curve')
plt.legend(loc="lower right")
plt.savefig("result_07_AUCs.pdf")

# Summary of the AUCs plotted above. The former "CD8+ Effector T cells" entry
# referenced roc_auc3, which is never computed in this notebook (NameError), so
# it has been removed to keep both lists aligned with the curves drawn.
auc_df = {"x": ["CNN", "CNN + demographics",
                "Gating Result + RandomForest", "FlowSOM + RandomForest"],
          "AUC": [roc_auc1, roc_auc2, roc_auc4, roc_auc5]}


In [None]:
# bootstrap test (previously labelled "permutation test")
# Compares the ROC AUC of roc1 (CNN) against roc5 (FlowSOM + RandomForest): both
# result sets are resampled with replacement, the AUC is recomputed on each
# resample, and the cell reports the fraction of resamples in which roc1's AUC
# is NOT greater than roc5's.
N_BOOTSTRAP = 100000
rng = np.random.RandomState(0)  # fixed seed so the reported value is reproducible

# Sample sizes do not change between iterations, so compute them once.
n1 = roc1["true"].shape[0]
n2 = roc5["true"].shape[0]

pv = []
for _ in range(N_BOOTSTRAP):
    r1 = rng.choice(n1, replace=True, size=n1)
    t1 = roc1["true"][r1]
    s1 = roc1["score"][r1]
    fpr, tpr, _thresholds = roc_curve(t1, s1)
    roc_auc_1 = auc(fpr, tpr)

    r2 = rng.choice(n2, replace=True, size=n2)
    t2 = roc5["true"][r2]
    s2 = roc5["score"][r2]
    fpr, tpr, _thresholds = roc_curve(t2, s2)
    roc_auc_2 = auc(fpr, tpr)

    # append() is O(1); the former `pv = pv + [...]` copied the list every iteration.
    pv.append(roc_auc_1 > roc_auc_2)

1 - np.mean(pv)
# pre-calculated result = 0.00016 (obtained from an earlier, unseeded run)

In [None]:
# plot precision recall curve

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score

# (legend label template, pickled result file) for each model that is compared.
# The number shown in each legend entry is the average precision score.
# The "Gating Result" file name previously read "result_06=8_10k_RF_ROC.obj", a
# typo; it now matches the file loaded for the ROC plot.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
pr_sources = [
    ('CNN (AUC = %0.2f)', "result_05_deep_learning_ROC.obj"),
    ('CNN + demographics (AUC = %0.2f)', "result_09_deep_learning_ROC.obj"),
    ('Gating Result + RandomForest  (AUC = %0.2f)', "result_08_10k_RF_ROC.obj"),
    ('FlowSOM + RandomForest  (AUC = %0.2f)', "result_08_flowSOM_RF_ROC.obj"),
]

# One precision-recall curve per model on a shared axis.
plt.figure(figsize=(5,5))

pr_results = []
for label_template, pr_path in pr_sources:
    with open(pr_path, "rb") as pr_file:
        pr_result = pickle.load(pr_file)
    # precision_recall_curve returns (precision, recall, thresholds).
    precision, recall, _ = precision_recall_curve(pr_result["true"], pr_result["score"])
    roc_auc = average_precision_score(pr_result["true"], pr_result["score"])
    plt.plot(recall, precision, label=label_template % roc_auc)
    pr_results.append(pr_result)

# Keep the original global names; the next cell reads roc1 and roc5.
roc1, roc1_2, roc4, roc5 = pr_results

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision Recall Curve')
plt.legend(loc="lower right")
plt.savefig("result_07_AUCs_RP.pdf")

In [None]:
# bootstrap test (previously labelled "permutation test")
# Compares the average precision of roc1 (CNN) against roc5 (FlowSOM +
# RandomForest): both result sets are resampled with replacement, the score is
# recomputed on each resample, and the cell reports the fraction of resamples in
# which roc1's score is NOT greater than roc5's.
N_BOOTSTRAP = 100000
rng = np.random.RandomState(0)  # fixed seed so the reported value is reproducible

# Sample sizes do not change between iterations, so compute them once.
n1 = roc1["true"].shape[0]
n2 = roc5["true"].shape[0]

pv = []
for _ in range(N_BOOTSTRAP):
    r1 = rng.choice(n1, replace=True, size=n1)
    t1 = roc1["true"][r1]
    s1 = roc1["score"][r1]
    # Score the RESAMPLED labels/scores. The previous code scored the full,
    # un-resampled data, so every iteration produced the same comparison.
    roc_auc_1 = average_precision_score(t1, s1)

    r2 = rng.choice(n2, replace=True, size=n2)
    t2 = roc5["true"][r2]
    s2 = roc5["score"][r2]
    roc_auc_2 = average_precision_score(t2, s2)

    # append() is O(1); the former `pv = pv + [...]` copied the list every iteration.
    pv.append(roc_auc_1 > roc_auc_2)

1 - np.mean(pv)