# **Libraries**

In [None]:
#python functionalities
import os
import pickle
import pandas
import numpy as np

#display results
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

#T-SNE
import cv2
from sklearn import manifold

#one-hot-encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#model functionalities
import tensorflow as tf
from keras import regularizers
from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, Conv2DTranspose, BatchNormalization, Reshape, Flatten, Dense, Dropout, Concatenate, Activation
from keras.preprocessing.image import ImageDataGenerator

#classification
from sklearn import svm, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# **Import the NCIA dataset from Google Drive.**

In [None]:
dir_MCGILL = '/content/drive/My Drive/Bachelor Thesis/Model-Data/128/MCGILL'
dir_MAASTRO = '/content/drive/My Drive/Bachelor Thesis/Model-Data/128/MAASTRO'


def _load_pickled_array(path):
  """Unpickle the object stored at `path` and return it as a NumPy array."""
  # NOTE: unpickling can execute arbitrary code; only load files you trust.
  with open(path, 'rb') as handle:
    return np.array(pickle.load(handle))


# read the training data (all four files come from the MCGILL directory)
train_data = _load_pickled_array(dir_MCGILL + '/train_data.pxl')
train_label = _load_pickled_array(dir_MCGILL + '/train_label.pxl')
train_contour = _load_pickled_array(dir_MCGILL + '/train_contour.pxl')
train_clinical = _load_pickled_array(dir_MCGILL + '/train_clinical.pxl')

# read the testing data (same directory, `test_` prefix)
test_data = _load_pickled_array(dir_MCGILL + '/test_data.pxl')
test_label = _load_pickled_array(dir_MCGILL + '/test_label.pxl')
test_contour = _load_pickled_array(dir_MCGILL + '/test_contour.pxl')
test_clinical = _load_pickled_array(dir_MCGILL + '/test_clinical.pxl')

# **Prepare the data.**

## Radiomic Data.

In [None]:
#separate labels according to the metastasis type
def vectorize_labels(labels):
  """Split per-sample outcome rows into three float32 vectors.

  Every entry of `labels` is indexed at positions 0, 1 and 2; those values
  are collected into the local, distant and death vectors respectively.

  Returns:
    A tuple `(local, distant, death)` of float32 NumPy arrays holding one
    value per entry of `labels`.
  """
  rows = list(labels)

  def column(index):
    # gather position `index` of every row and cast the result to float32
    return np.asarray([row[index] for row in rows]).astype('float32')

  return column(0), column(1), column(2)

#cast to float32, divide by 2000 and reshape to (samples, 128, 128, 1)
train_x = (train_data.astype('float32') / 2000.).reshape((len(train_data), 128, 128, 1))
test_x  = (test_data.astype('float32') / 2000.).reshape((len(test_data), 128, 128, 1))

#one float32 vector per outcome (local, distant, death)
train_local, train_distant, train_death = vectorize_labels(train_label)
test_local, test_distant, test_death = vectorize_labels(test_label)

#full label matrices, also as float32
train_y, test_y = (np.asarray(lbl).astype('float32') for lbl in (train_label, test_label))

#display data shape
for caption, array in (("Input data shape", train_x),
                       ("Input label shape", train_y),
                       ("Clinical data shape", train_clinical)):
  print(caption, array.shape)

## Clinical Data.

In [None]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one LabelEncoder per column.

    A fresh `LabelEncoder` is fitted on each column every time `transform`
    is called, so the integer codes depend only on the values present in the
    frame that is passed in; nothing is remembered between calls.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return self (keeps the fit/transform call shape)."""
        return self

    def transform(self, X):
        """
        Transforms columns of X specified in self.columns using LabelEncoder().
        If no columns specified, transforms all columns in X.

        X itself is not modified; a copy with the encoded columns is returned.
        """
        output = X.copy()

        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent iterator and exists in older releases as well.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)

        return output

    def fit_transform(self, X, y=None):
        """Shorthand for `fit(X, y)` followed by `transform(X)`."""
        return self.fit(X, y).transform(X)

In [None]:
# Encoders and age buckets used by one_hot_encoding().
label_encoder   = MultiColumnLabelEncoder()
try:
  # scikit-learn >= 1.2 calls the dense-output switch `sparse_output`
  one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
  # older scikit-learn releases only accept the original `sparse` keyword
  one_hot_encoder = OneHotEncoder(sparse=False)

# Age bucket edges and the label assigned to each bucket (8 buckets, 9 edges).
# NOTE(review): pandas.cut is called without `right=False`, so buckets are
# closed on the right (an age of exactly 20 lands in '0-19') - confirm intent.
bins = [0, 20, 30, 40, 50, 60, 70, 80, 120]
labels = ['0-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80+']

def one_hot_encoding(data):
  """One-hot encode a clinical table.

  Columns 0-3 of `data` are read as age, location, t-stage and n-stage. Age
  is converted to float and bucketed with the module-level `bins`/`labels`;
  all four columns are then label-encoded and one-hot encoded.

  Both encoders are re-fitted on every call, so the resulting column layout
  depends on the values present in `data`.

  Returns:
    Whatever `one_hot_encoder.fit_transform` returns for the encoded table.
  """
  column_names = ('age', 'location', 't-stage', 'n-stage')
  frame = pandas.DataFrame({name: data[:, idx] for idx, name in enumerate(column_names)})

  # numeric age -> categorical age bucket
  numeric_age = frame['age'].astype('float')
  frame['age'] = pandas.cut(numeric_age, bins, labels=labels, include_lowest = True)

  integer_codes = label_encoder.fit_transform(frame)
  return one_hot_encoder.fit_transform(integer_codes)

# Append one sentinel row per bin edge to both tables before encoding, then
# strip those rows again afterwards. Together the edges fall into every age
# bucket, so both tables see the full set of age categories.
# NOTE(review): the sentinel n-stage is 'NO' (letter O); if the data spells
# it 'N0' the sentinels add an extra n-stage category - confirm.
n = len(bins)
sentinel_rows = [[str(edge), 'Oropharynx', 'T1', 'NO'] for edge in bins]

train_clinical = np.append(train_clinical, sentinel_rows, axis=0)
test_clinical  = np.append(test_clinical, sentinel_rows, axis=0)

# encode with the sentinels present, then drop the last n (sentinel) rows
train_ohe = one_hot_encoding(train_clinical)[:-n, :]
test_ohe  = one_hot_encoding(test_clinical)[:-n, :]
train_clinical = train_clinical[:-n, :]
test_clinical  = test_clinical[:-n, :]

print("Clinical data one-hot encoding shape", train_ohe.shape)

# **Build convolutional autoencoder.**

## Encoder and Decoder.

Encoder and Decoder.

In [None]:
def custom_conv2d(layer, filters, stride, name):
  """Apply a 3x3 ReLU convolution and then batch normalisation to `layer`.

  Args:
    layer: tensor the convolution is applied to.
    filters: number of convolution filters.
    stride: value passed as the convolution's `strides`.
    name: suffix for the layer names 'encoder_conv_<name>' and
      'encoder_bn_<name>'.

  Returns:
    The output tensor of the batch-normalisation layer.
  """
  convolved = Conv2D(
      filters,
      kernel_size=(3,3),
      strides=stride,
      activation='relu',
      padding='same',
      name=('encoder_conv_%s' % name),
  )(layer)

  return BatchNormalization(name=('encoder_bn_%s' % name))(convolved)

def custom_conv2d_transpose(layer, filters, stride, name):
  """Apply a 3x3 ReLU transposed convolution and then batch normalisation.

  Args:
    layer: tensor the transposed convolution is applied to.
    filters: number of filters.
    stride: value passed as the layer's `strides`.
    name: suffix for the layer names 'decoder_conv_<name>' and
      'decoder_bn_<name>'.

  Returns:
    The output tensor of the batch-normalisation layer.
  """
  upsampled = Conv2DTranspose(
      filters,
      kernel_size=(3,3),
      strides=stride,
      activation='relu',
      padding='same',
      name=('decoder_conv_%s' % name),
  )(layer)

  return BatchNormalization(name=('decoder_bn_%s' % name))(upsampled)

def build_encoder_layers(layer, filters_per_layer):
  """Stack the encoder stages on top of `layer`.

  For every entry of `filters_per_layer` three `custom_conv2d` blocks are
  added: two with stride (1, 1) and a final one with stride (2, 2).

  Returns:
    The output of the last block, or `layer` unchanged when
    `filters_per_layer` is empty.
  """
  # (stride, name template) of the three blocks of one stage, in order
  stage_plan = (
      ((1, 1), '1-1_%d'),
      ((1, 1), '1-2_%d'),
      ((2, 2), '2-1_%d'),
  )

  for stage, filters in enumerate(filters_per_layer):
    for stride, template in stage_plan:
      layer = custom_conv2d(layer, filters, stride, template % stage)

  return layer

def build_decoder_layers(layer, filters_per_layer):
  """Stack the decoder stages on top of `layer` and add the output layer.

  `filters_per_layer` is walked in reverse order. Each entry adds three
  `custom_conv2d_transpose` blocks: one with stride (2, 2) followed by two
  with stride (1, 1). A final 3x3 sigmoid convolution with a single filter
  produces the returned tensor.
  """
  # (stride, name template) of the three blocks of one stage, in order
  stage_plan = (
      ((2, 2), '2-1_%d'),
      ((1, 1), '1-2_%d'),
      ((1, 1), '1-1_%d'),
  )

  for stage, filters in enumerate(reversed(filters_per_layer)):
    for stride, template in stage_plan:
      layer = custom_conv2d_transpose(layer, filters, stride, template % stage)

  return Conv2D(1, (3, 3), activation='sigmoid', padding='same', name=('decoder_conv_output'))(layer)

Build Model.

In [None]:
# `input_img` and `nb_filters` are otherwise first assigned in the later
# "Learning parameters" cell, so running the notebook top to bottom raised a
# NameError here. Define them before use; the values mirror that cell.
input_img  = Input(shape=(128, 128, 1))
nb_filters = (32, 64, 128)

# encoder followed by its mirrored decoder
encoded = build_encoder_layers(input_img, nb_filters)
decoded = build_decoder_layers(encoded, nb_filters)

# autoencoder: image in, reconstruction out, trained on mean squared error
ae = Model(input_img, decoded)
ae.compile(loss='mean_squared_error', optimizer='rmsprop')
ae.summary()

## Autoencoder Classification

Learning parameters.

In [None]:
#hyper-parameters
# NOTE(review): epochs_AE, epochs_FULL and batch_size are not referenced by
# the fit() calls in the visible cells, which pass literal values - confirm.
epochs_AE = 300
epochs_FULL = 150
batch_size = 64
# single-channel 128x128 input, matching the reshape applied to train_x/test_x
input_img  = Input(shape=(128, 128, 1))
# filters per encoder stage; the decoder walks this tuple in reverse
nb_filters = (32, 64, 128)

#model layers
# number of leading layers create_model() copies from the pretrained model
# and freezes
l = 18

Define the fully connected layers that will be stacked on top of the encoder.

In [None]:
def m_encoder(enco):
  """Attach the classification head to the encoder output `enco`.

  The head is: flatten, dropout 0.25, a 512-unit ReLU dense layer with L2
  (0.001) kernel regularisation, dropout 0.5, and a single sigmoid unit.

  Returns:
    The output tensor of the sigmoid unit.
  """
  features = Flatten()(enco)
  features = Dropout(0.25)(features)
  features = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001))(features)
  features = Dropout(0.5)(features)

  return Dense(1, activation='sigmoid')(features)

def m_sequential(input):
  """Build the small dense network used for the one-hot clinical features.

  Args:
    input: number of input features (width of the one-hot encoded table).

  Returns:
    An uncompiled `Sequential` model made of two 16-unit ReLU dense layers,
    each followed by dropout 0.5. No output layer is added here.
  """
  return Sequential([
      Dense(16, activation='relu', input_shape=(input,)),
      Dropout(0.5),
      Dense(16, activation='relu'),
      Dropout(0.5),
  ])

#create augmented image generator
augmentation_options = dict(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
datagen = ImageDataGenerator(**augmentation_options)

#compute quantities required for featurewise normalization
# NOTE(review): no featurewise option is enabled above, so this call
# presumably has nothing to compute - confirm.
datagen.fit(train_x)

In [None]:
# Saved Keras model whose leading layers create_model() copies weights from.
model_cf = tf.keras.models.load_model('/content/drive/My Drive/Bachelor Thesis/autoencoder_300_128_full.h5')

def create_model():
  """Build and compile the two-input (clinical + image) binary classifier.

  The image branch is the encoder plus the `m_encoder` head; the clinical
  branch is `m_sequential` sized to the width of `train_ohe`. Their outputs
  are concatenated and fed to a single sigmoid unit.

  The first `l` layers of the combined model receive the weights of the
  first `l` layers of `model_cf` and are frozen.

  Reads the module-level names `input_img`, `nb_filters`, `train_ohe`, `l`
  and `model_cf`.

  Returns:
    The compiled model (binary cross-entropy loss, Adam optimizer). Its
    inputs are ordered `[clinical one-hot, image]`.
  """
  #create the radiomic and clinical data models
  cnn = Model(input_img, m_encoder(build_encoder_layers(input_img, nb_filters)))
  seq = m_sequential(train_ohe.shape[1])

  #merge models
  merged = Concatenate()([seq.output, cnn.output])
  # The layer takes its input width from `merged` (16 clinical features plus
  # 1 image score); the former `input_dim=2` did not match that and has been
  # dropped.
  output = Dense(1, activation='sigmoid', use_bias=True)(merged)
  full_model = Model(inputs=[seq.input, cnn.input], outputs=output)

  #set layer weights of the autoencoder
  # NOTE(review): layers are paired purely by position in `.layers`; this
  # assumes the first `l` layers of both models line up - confirm.
  for l1, l2 in zip(full_model.layers[:l], model_cf.layers[:l]):
    l1.set_weights(l2.get_weights())
    l1.trainable = False

  #compile the model
  full_model.compile(loss='binary_crossentropy', optimizer='adam')

  return full_model

Train the model

In [None]:

# Train the combined clinical + image model on the death outcome and score
# it on the test split.
tmp_model = create_model()
tmp_model.fit([train_ohe, train_x], train_death, batch_size=64, epochs=100, verbose=0,validation_data=([test_ohe, test_x], test_death))
pred = tmp_model.predict([test_ohe, test_x])
fpr, tpr, _ = roc_curve(test_death, pred)
roc_auc = auc(fpr, tpr)

print(roc_auc)

# This run is trained and evaluated on the death labels, so keep the curve
# under the OS_* names the ROC plot reads for its 'Death' line. They were
# previously stored as DM_* (distant metastasis), leaving OS_* undefined.
OS_fpr = fpr
OS_tpr = tpr
OS_auc = roc_auc

In [None]:
# Clinical-only baseline: the dense clinical network plus a sigmoid output,
# trained on the local-recurrence labels.
clinical = m_sequential(train_ohe.shape[1])
clinical.add(Dense(1, activation='sigmoid'))
clinical.compile(loss='binary_crossentropy', optimizer='adam')

clinical.fit(
    train_ohe,
    train_local,
    batch_size=256,
    epochs=100,
    verbose=0,
    validation_data=(test_ohe, test_local),
)

# ROC curve and AUC on the test split
pred = clinical.predict(test_ohe)
fpr, tpr, _ = roc_curve(test_local, pred)
roc_auc = auc(fpr, tpr)
print(roc_auc)

# keep the local-recurrence curve for the ROC plot
LR_fpr, LR_tpr, LR_auc = fpr, tpr, roc_auc

# **Binary Classification**

## ROC Curve

In [None]:
# Draw on this one 8x6 figure. A second plt.figure() call used to follow,
# which left this figure empty and drew the plot at the default size.
fig = plt.figure(figsize=(8,6))

# OS_fpr / OS_tpr / OS_auc must already be set by an earlier cell.
#plt.plot(LR_fpr, LR_tpr, 'blue', label='Local recurrence, AUC={:.2f}'.format(LR_auc))
#plt.plot(DM_fpr, DM_tpr, 'green', label='Distant metastasis, AUC={:.2f}'.format(DM_auc))
plt.plot(OS_fpr, OS_tpr, 'red', label='Death, AUC={:.2f}'.format(OS_auc))

# diagonal reference line from (0, 0) to (1, 1)
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

# **Display Encoding.**

Autoencoder image input.

In [None]:
fig = plt.figure(figsize=(30, 15))

# train_x is reshaped to (samples, 128, 128, 1) during data preparation, so
# the former reshape(160, 160) raised a ValueError. Select the single channel
# instead of hard-coding the image size, and never index past the last
# sample of the 4x8 grid.
for i in range(min(32, len(train_x))):
  sub = fig.add_subplot(4, 8, i+1)
  sub.imshow(train_x[i, :, :, 0])

plt.show()

Autoencoder image reconstruction.

In [None]:
# Load a saved autoencoder and reconstruct the test scans with it.
autoencoder_rec = tf.keras.models.load_model('/content/drive/My Drive/Bachelor Thesis/autoencoder_300_clinical_datagen.h5')

decoded_imgs = autoencoder_rec.predict(test_x)

# number of scans to display (never more than are available)
scans = min(8, len(test_x))

# Height and width come from the data: test_x is (samples, 128, 128, 1), so
# the former hard-coded reshape(160, 160) raised a ValueError.
# NOTE(review): assumes the reconstruction has the same size as its input -
# confirm.
img_shape = test_x.shape[1:3]

plt.figure(figsize=(30, 8))
for i in range(scans):
    # display original (top row)
    ax = plt.subplot(2, scans, i + 1)
    plt.imshow(test_x[i].reshape(img_shape), cmap='gray')
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    # display reconstruction (bottom row)
    ax = plt.subplot(2, scans, i + scans + 1)
    plt.imshow(decoded_imgs[i].reshape(img_shape), cmap='gray')
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

plt.show()

Autoencoder loss progression.

In [None]:
# NOTE(review): `hist` is presumably the History object returned by a Keras
# fit() call; no visible cell assigns it - confirm it is set before this runs.
loss = hist.history['loss']
val_loss = hist.history['val_loss']

# One x position per recorded epoch. Taking the length from the history
# replaces range(epochs), which read a name that was never defined, and
# keeps the axis in step with `loss`.
epochs = range(len(loss))

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()