In [None]:
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation, Concatenate
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, Input
from keras import backend as K
from keras.models import load_model
import pickle
import pandas as pd
import re
import numpy as np
import random
from keras.utils.vis_utils import plot_model
import keras.callbacks
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn import datasets, linear_model
import seaborn as sns
from numpy.random import seed; seed(111)
from tensorflow import set_random_seed; set_random_seed(111)
from sklearn.metrics import roc_auc_score
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from scipy.stats import kruskal
from sklearn.tree import DecisionTreeRegressor

In [None]:
##### load data #####
# Read the pre-processed bundle. The file is opened in a `with` block so the
# handle is closed as soon as unpickling finishes.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
with open("result_04_processed_data_no_scale.obj", "rb") as data_file:
    processed_data = pickle.load(data_file)
cytof_files = processed_data["cytof_files"]
expr_list = processed_data["expr_list"]

# A missing value (NaN) compares unequal to itself, so `t1==t1` is False
# exactly for rows whose CMV_Ab entry is NaN.
r1 = [t1==t1 for t1 in cytof_files.CMV_Ab]
# Per-row count of missing metadata fields compared against 0; a count is
# never negative, so this mask is True for every row and filters nothing.
w1 = (cytof_files.isna().sum(axis=1).values)>=0

# Combine both masks once and apply the same selection to the metadata table
# and to the expression array so the two stay row-aligned.
keep_rows = r1&w1
cytof_files = cytof_files.loc[keep_rows,:]
expr_list = expr_list[keep_rows]
print(expr_list.shape)

In [None]:
# get demographic covariates (only gender, race and age are selected here;
# no cytokine columns are used)
col = ['gender', 'race', 'age']
x2 = cytof_files.loc[:,col]
# One-hot encode the categorical columns; drop_first removes one reference
# level per variable.
x2 = pd.get_dummies(x2, prefix=['gender', 'race'],drop_first=True)
# Fix the column set and order. This raises KeyError if any of these dummy
# columns is absent from the data.
x2 = x2[["age","gender_Male","race_Asian","race_Black or African American","race_White"]]


def norm_fun(x):
    """Standardise `x` to zero mean and unit spread: (x - mean) / std.

    Uses whatever `mean`/`std` methods `x` provides. pandas objects default
    to the sample standard deviation (ddof=1) while NumPy arrays default to
    the population one (ddof=0), so the two call sites below are scaled
    slightly differently.
    """
    return (x - x.mean()) / x.std()


# NOTE(review): the statistics are computed over every row, including rows
# that are later assigned to the validation and test splits.
x2 = x2.transform(norm_fun)

# Drop any column that contains a NaN after scaling (e.g. a constant column,
# whose std is 0, or a column that already had a missing value).
x2 = x2.dropna(axis='columns')
display(x2.head())
x2 = x2.values

# Binary label: CMV antibody value strictly greater than 2.
y = cytof_files.CMV_Ab.values> 2

x = expr_list
x_shape = x.shape
print(x_shape)
# Merge the first two axes so that each (axis-2, axis-3) position is
# standardised over all entries of the merged axis, then restore the shape.
# presumably the axes are (sample, cell, marker, channel) -- TODO confirm
x = x.reshape((x_shape[0]*x_shape[1],x_shape[2],x_shape[3]))
x = np.apply_along_axis(norm_fun, 0, x)
x = x.reshape(x_shape)

In [None]:
##### split train, validation and test######
# Assign every sample index to exactly one split according to its study:
# SDY515 -> validation, SDY519 -> test, anything else -> train.
train_id, valid_id, test_id = [], [], []
for i in range(len(x)):
    study = cytof_files.study_accession.iloc[i]
    if study == "SDY515":
        valid_id.append(i)
    elif study == "SDY519":
        test_id.append(i)
    else:
        train_id.append(i)

# Slice the expression tensor, the covariates and the labels with the same
# index lists so the three arrays of each split stay aligned.
x_train, x2_train, y_train = x[train_id], x2[train_id], y[train_id]
x_valid, x2_valid, y_valid = x[valid_id], x2[valid_id], y[valid_id]
x_test, x2_test, y_test = x[test_id], x2[test_id], y[test_id]

In [None]:

##### define model #####
def _bn_relu(tensor):
    """Apply batch normalisation followed by a ReLU activation."""
    normalised = BatchNormalization()(tensor)
    return Activation("relu")(normalised)


# --- left branch: expression tensor ---------------------------------------
left_input = Input(shape=x_train[0].shape)

# Two 3-filter convolutions, each followed by BN + ReLU. The first kernel
# spans the full width of the input (x_train.shape[2]); the second is 1x1.
left_branch = left_input
for kernel in [(1, x_train.shape[2]), (1, 1)]:
    left_branch = Conv2D(3, kernel_size=kernel, activation=None)(left_branch)
    left_branch = _bn_relu(left_branch)

# Average over the whole height of the input (x_train.shape[1]) and flatten.
left_branch = AveragePooling2D(pool_size=(x_train.shape[1], 1))(left_branch)
left_branch = Flatten()(left_branch)

# --- right branch: covariate vector ---------------------------------------
right_input = Input(shape=x2[0].shape)
right_branch = Dense(1, activation=None)(right_input)
right_branch = _bn_relu(right_branch)

# --- merge both branches and classify -------------------------------------
merged = Concatenate()([left_branch, right_branch])
merged = Dense(3, activation=None)(merged)
merged = _bn_relu(merged)
merged = Dense(1, activation="sigmoid")(merged)

model = keras.models.Model(inputs=[left_input, right_input], outputs=merged)
model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(lr=0.0001),
    metrics=['accuracy'],
)

# The checkpoint keeps the weights with the lowest validation loss on disk.
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath='result_09_weights.hdf5',
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
)
# Early stopping watches the *training* loss, not the validation loss.
earlyStop = keras.callbacks.EarlyStopping(
    monitor='loss',
    min_delta=0.00000001,
    patience=100,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=True,
)

model.fit(
    [x_train, x2_train],
    y_train,
    batch_size=30,
    epochs=10000,
    verbose=1,
    callbacks=[checkpointer, earlyStop],
    validation_data=([x_valid, x2_valid], y_valid),
)


In [None]:
# plot train and validation loss / accuracy
fit_history = model.history.history

# Older Keras releases log the accuracy metric as 'acc'/'val_acc'; from
# Keras 2.3 on it is logged under the name passed to compile ('accuracy').
# Use whichever key is present so the cell works with both.
acc_key = 'acc' if 'acc' in fit_history else 'accuracy'

# One figure per metric: (key in the history dict, label used in the plot).
for metric_key, metric_label in (('loss', 'loss'), (acc_key, 'accuracy')):
    plt.plot(fit_history[metric_key])
    plt.plot(fit_history['val_' + metric_key])
    plt.title('model train vs validation ' + metric_label)
    plt.ylabel(metric_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper right')
    plt.show()

In [None]:
# Reload the checkpoint written during training.
best_model = load_model('result_09_weights.hdf5')

# print result of the best model
# Each entry: (loss label, accuracy label, model inputs, labels).
evaluations = (
    ('Train loss:', 'Train accuracy:', [x_train, x2_train], y_train),
    ('Valid loss:', 'Valid accuracy:', [x_valid, x2_valid], y_valid),
    ('Test loss:', 'Test accuracy:', [x_test, x2_test], y_test),
)
for loss_label, acc_label, split_inputs, split_labels in evaluations:
    # evaluate() returns [loss, accuracy] for the compiled metrics.
    score = best_model.evaluate(split_inputs, split_labels, verbose=0)
    print(loss_label, score[0])
    print(acc_label, score[1])

In [None]:
# ROC AUC of the best model on each split, in the order test, train, valid.
# The test-split labels and scores are also pickled to disk.
roc_splits = (
    ("test", [x_test, x2_test], y_test),
    ("train", [x_train, x2_train], y_train),
    ("valid", [x_valid, x2_valid], y_valid),
)
for split_name, split_inputs, y_true in roc_splits:
    y_scores = best_model.predict(split_inputs)
    print(roc_auc_score(y_true, y_scores))

    if split_name == "test":
        with open("result_09_deep_learning_ROC.obj", "wb") as f:
            pickle.dump({"true":y_true,"score":y_scores}, f)