In [112]:
%load_ext autoreload
%autoreload 1
import time
import pandas as pd
import numpy as np
import random as rn
from tqdm import tqdm
import os

import sys

sys.path.append("../utils/")
%aimport utils

from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Dense
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow import set_random_seed

# reproducibility: seed every random number generator this notebook relies on
# (Python's `random`, NumPy and TensorFlow) with the same value.

seed = 42
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably does not affect hashing inside this already-running
# kernel — confirm whether it has to be exported before launching Jupyter.
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
set_random_seed(seed)
rn.seed(seed)

# maximum number of cores
n_cores = 10

# Give Keras a TensorFlow session whose intra-op and inter-op thread settings
# are both set to n_cores.
K.set_session(K.tf.Session(config=K.tf.ConfigProto(
    intra_op_parallelism_threads=n_cores, 
    inter_op_parallelism_threads=n_cores
)))

# Integer class labels; the label vectors loaded below are compared against them.
TUMOR = 0
NORMAL = 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Name of this run's results folder: "<YYYYMMDD_HHMM>_<description>".
# NOTE(review): the description mentions BRCA while the cancer loaded further
# down is "BLCA" — confirm which one is intended.
description = "BRCA_with_OV_UCEC_merge_NN"
now = time.strftime('%Y%m%d_%H%M')
folder = "_".join([now, description])

In [None]:
# Each run writes into its own sub-directory of ./results/; an already
# existing directory is accepted (exist_ok=True).
results_root = "./results/"
output_folder = os.path.join(results_root, folder)
os.makedirs(output_folder, exist_ok=True)

In [None]:
# One Excel workbook per run; the report cells below each write a sheet into it.
results_path = os.path.join(output_folder, "results.xlsx")
writer = pd.ExcelWriter(results_path, engine='xlsxwriter')

## Data

In [None]:
def get_filtered_features(X):
    """Return the column indices of the features to keep.

    The current implementation ignores ``X`` and always selects the first
    ten columns (indices 0..9).
    """
    n_selected = 10
    # Alternative kept from the original: the 5 columns with the largest std.
    #   X.std(0).argsort()[::-1][:5]
    return np.arange(n_selected)

def preprocess(X):
    """Run ``utils.pre_process`` on ``X`` with this notebook's defaults.

    A new ``MinMaxScaler`` and the ``get_filtered_features`` selector are
    handed to ``utils.pre_process`` and its result is returned unchanged.
    Callers in this notebook unpack that result as
    ``(X_processed, scaler, selected_features)`` — presumably the scaler is
    the one created here; verify against ``utils.pre_process``.
    """
    return utils.pre_process(X, get_filtered_features, MinMaxScaler())

## Procedure

In [127]:
# Load the data of the target cancer and print a short size / class-balance summary.
cancer_name = "BLCA"
X_c, y_c = utils.get_cancer_data(cancer_name)

n_tumor = y_c[y_c == TUMOR].shape[0]
n_normal = y_c[y_c == NORMAL].shape[0]
summary_lines = [
    "Cancer: {}".format(cancer_name),
    "\t#samples: {}".format(X_c.shape[0]),
    "\t#genes: {}".format(X_c.shape[1]),
    "\t#TUMORS: {}\t#NORMAL: {}".format(n_tumor, n_normal),
]
for line in summary_lines:
    print(line)

Cancer: BLCA
	#samples: 426
	#genes: 20530
	#TUMORS: 407	#NORMAL: 19


In [128]:
# Quick sanity-check experiment on a reduced view of the data.
# The reduced matrix gets its own name so that X_c keeps every gene: later
# cells (building X_others, the cross-validation runs) use the full X_c.
N_QUICK_GENES = 5000
X_quick = X_c[:, :N_QUICK_GENES]

# MinMaxScaler rescales column-wise; transposing before and after makes it
# rescale each sample (row) over its own genes instead.
scaler = MinMaxScaler()
X_quick = scaler.fit_transform(X_quick.T).T

# RandomUnderSampler balances the classes by dropping majority-class samples
# (the original variable was misleadingly called `oversampler`).
undersampler = RandomUnderSampler(random_state=seed)

# random_state makes the split identical every time the cell is re-run.
X_train, X_test, y_train, y_test = train_test_split(X_quick, y_c, test_size=0.25,
                                                    stratify=y_c, random_state=seed)
X_train, y_train = undersampler.fit_sample(X_train, y_train)

# NOTE(review): the test set is balanced as well, so the metrics computed on it
# describe a balanced population rather than the original class ratio.
X_test, y_test = undersampler.fit_sample(X_test, y_test)

In [129]:
# Build the single-tumor network with one input per feature column.
# NOTE(review): tumor_alone_model is defined in a cell further down this
# notebook, so that cell must be executed before this one.
n_features = X_train.shape[1]
model = tumor_alone_model(n_features)

In [130]:
# Keras' validation_split takes the *last* rows of the arrays before any
# shuffling, so the class mix of that validation set depends on the row order
# left by the resampler. Use an explicit stratified hold-out of the same size.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25,
                                              stratify=y_train, random_state=seed)

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
# Keep the History object in a variable so the cell does not echo its repr.
history = model.fit(X_fit, y_fit, validation_data=(X_val, y_val), epochs=100, verbose=1)

Train on 21 samples, validate on 7 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fbe6a829d68>

In [131]:
# Predicted class labels of the fitted model on the held-out test samples.
y_pred = model.predict_classes(
    X_test
)

In [132]:
from sklearn.metrics import f1_score, precision_score, confusion_matrix

In [133]:
# F1 of the positive class; scikit-learn's default pos_label is 1, i.e. NORMAL here.
f1_score(y_true=y_test, y_pred=y_pred)

0.888888888888889

In [134]:
# Precision of the positive class; scikit-learn's default pos_label is 1, i.e. NORMAL here.
precision_score(y_true=y_test, y_pred=y_pred)

1.0

In [135]:
# Rows are true labels, columns are predictions, both in sorted label order
# (0 = TUMOR, 1 = NORMAL).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5, 0],
       [1, 4]])

In [None]:
# Stack the samples of the auxiliary cancer types into one dataset.
# Alternative: every known tumor except the target one:
#others = list(set(utils.all_tumor_names) - {cancer_name})
others = ['UCEC', 'OV']

# Collect the per-cancer arrays and concatenate once at the end. This avoids
# re-copying the growing matrix on every iteration and no longer ties the
# column count to the current shape of X_c.
X_parts, y_parts = [], []
for o in others:
    print(o)
    X_o, y_o = utils.get_cancer_data(o)
    X_parts.append(X_o)
    y_parts.append(np.ravel(y_o))

X_others = np.concatenate(X_parts, axis=0)
y_others = np.concatenate(y_parts)

### Tumor alone

In [3]:
def tumor_alone_model(input_size):
    """Build the small dense network used to classify a single tumor type.

    Architecture: ``input_size`` inputs -> Dense(100, relu) -> Dense(20, relu)
    -> Dense(1, sigmoid). The model is returned uncompiled.
    """
    layers = [
        Dense(100, input_shape=(input_size,), activation='relu'),
        Dense(20, activation='relu'),
        Dense(1, activation="sigmoid"),
    ]
    return Sequential(layers)

In [None]:
# Cross-validate the single-tumor network on the target cancer alone.
cvscores_c, histories_c = utils.cross_validation(
    X=X_c,
    y=y_c,
    preprocess=preprocess,
    seed=seed,
    create_model=tumor_alone_model,
    get_measures=utils.get_measures,
)
# Mean of every measure over the splits, shown as a one-row table without the "split" column.
mean_scores_c = cvscores_c.mean().to_frame().T
mean_scores_c.drop("split", axis=1)

In [None]:
# Write the single-tumor scores to their own sheet of the results workbook.
alone_sheet = "{}_alone".format(cancer_name)
utils.report(cvscores_c, writer=writer, sheet_name=alone_sheet)

### Others alone

In [None]:
def others_alone_model(input_size):
    """Build the deeper dense network trained on the other-cancer data.

    Architecture: ``input_size`` inputs -> Dense(500) -> Dense(200) ->
    Dense(100) -> Dense(50), all relu, followed by Dense(1, sigmoid).
    The model is returned uncompiled.
    """
    hidden_units = (500, 200, 100, 50)

    model = Sequential()
    for depth, units in enumerate(hidden_units):
        if depth == 0:
            # Only the first layer declares the input shape.
            model.add(Dense(units, input_shape=(input_size, ), activation="relu"))
        else:
            model.add(Dense(units, activation="relu"))
    model.add(Dense(1, activation="sigmoid"))
    return model

In [None]:
# Cross-validate the deeper network on the other-cancer dataset alone.
cvscores_others, histories_others = utils.cross_validation(
    X=X_others,
    y=y_others,
    preprocess=preprocess,
    seed=seed,
    create_model=others_alone_model,
    get_measures=utils.get_measures,
)
# Mean of every measure over the splits, shown as a one-row table without the "split" column.
mean_scores_others = cvscores_others.mean().to_frame().T
mean_scores_others.drop("split", axis=1)

In [None]:
# Write the other-cancer scores to their own sheet of the results workbook.
others_sheet = "{}_others".format(cancer_name)
utils.report(cvscores_others, writer=writer, sheet_name=others_sheet)

### Transfer learning

In [None]:
def create_other_network(input_size):
    """Build the classifier for the other-cancer data together with its encoder.

    The classifier stacks named Dense layers h1(500) -> h2(200) -> h3(100) ->
    h4(50), all relu, and a sigmoid layer ``out`` with one unit. The encoder
    shares those layers and maps the model input to the output of ``h3``.

    Returns:
        (model, encoder): the uncompiled Sequential classifier and the
        ``Model`` producing the ``h3`` activations.
    """
    layer_specs = [("h1", 500), ("h2", 200), ("h3", 100), ("h4", 50)]

    model = Sequential()
    first_name, first_units = layer_specs[0]
    # Only the first layer declares the input shape.
    model.add(Dense(first_units, input_shape=(input_size, ), activation="relu", name=first_name))
    for layer_name, units in layer_specs[1:]:
        model.add(Dense(units, activation="relu", name=layer_name))
    model.add(Dense(1, activation="sigmoid", name='out'))

    code_layer = model.get_layer("h3")
    encoder = Model(inputs=model.input, outputs=code_layer.output)

    return model, encoder

In [None]:
def create_additional_network(input_size):
    """Build the small classifier passed as ``create_model`` to the transfer-learning run.

    Architecture: ``input_size`` inputs -> Dense(50, relu, 'h1') ->
    Dense(10, relu, 'h2') -> Dense(1, sigmoid, 'out'). Returned uncompiled.
    """
    layers = [
        Dense(50, input_shape=(input_size, ), activation="relu", name='h1'),
        Dense(10, activation="relu", name='h2'),
        Dense(1, activation="sigmoid", name='out'),
    ]
    return Sequential(layers)

In [None]:
def tl_data_merging(X, y, train, test, preprocess, validation_split, seed, X_other, y_other):
    """Prepare one fold's data, adding the other-cancer samples to the training set.

    Args:
        X, y: samples and labels of the single tumor dataset.
        train, test: indices selecting the training and testing rows of ``X``/``y``.
        preprocess: callable returning ``(X_processed, scaler, selected_features)``
            for the matrix it is given.
        validation_split: fraction of the training rows held out for validation.
        seed: random state of the stratified train/validation split.
        X_other, y_other: samples and labels of the other cancers; they are
            appended to the training portion only.

    Returns:
        ``(X_train_merged, X_val, X_test, y_train_merged, y_val, y_test)``.
    """
    # Splitting the single tumor dataset
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # get the validation set in a stratified fashion from the training set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_split,
                                                      random_state=seed, stratify=y_train)
    # Merge the single cancer training set with the other set
    X_train_merged = np.append(X_train, X_other, axis=0)
    y_train_merged = np.append(y_train, y_other)

    # preprocess merged training set and get features and scaler
    X_train_merged, scaler, sel_features = preprocess(X_train_merged)
    # Apply the scaling learned on the training data. The scaler must NOT be
    # refitted here: fit_transform would rescale the test and validation sets
    # by their own ranges, which differ from the training scaling.
    X_test = scaler.transform(X_test[:, sel_features])
    X_val = scaler.transform(X_val[:, sel_features])

    return X_train_merged, X_val, X_test, y_train_merged, y_val, y_test

In [None]:
def transfer_learning(X, y, train, test, preprocess, validation_split, seed, X_other, y_other):
    """Prepare one fold's data by encoding it with a network trained on the other cancers.

    A classifier is fitted on the other-cancer data (restricted to the
    features selected on the single-tumor training set); the single-tumor
    train/validation/test sets are then passed through that classifier's
    encoder, and the encoded matrices are returned.

    Args:
        X, y: samples and labels of the single tumor dataset.
        train, test: indices selecting the training and testing rows of ``X``/``y``.
        preprocess: callable returning ``(X_processed, scaler, selected_features)``
            for the matrix it is given.
        validation_split: fraction held out for validation, used for both the
            single-tumor split and the other-cancer split.
        seed: random state of both stratified splits.
        X_other, y_other: samples and labels of the other cancers.

    Returns:
        ``(X_train_code, X_val_code, X_test_code, y_train, y_val, y_test)``.
    """
    # Splitting the single tumor dataset
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # get the validation set in a stratified fashion from the training set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_split,
                                                      random_state=seed, stratify=y_train)
    # preprocess training set and get features and scaler
    X_train, scaler, sel_features = preprocess(X_train)
    # Apply the scaling learned on the training data. The scaler must NOT be
    # refitted here: fit_transform would rescale the test and validation sets
    # by their own ranges, which differ from the training scaling.
    X_test = scaler.transform(X_test[:, sel_features])
    X_val = scaler.transform(X_val[:, sel_features])

    # for the other set we use a brand new scaler but the same features
    other_scaler = MinMaxScaler()
    X_other = other_scaler.fit_transform(X_other[:, sel_features])
    # splitting other set in training and validation (no test...useless)
    X_other_train, X_other_val, \
    y_other_train, y_other_val = train_test_split(X_other, y_other, test_size=validation_split,
                                                  random_state=seed, stratify=y_other)

    # create and fit the OTHER model
    other_model, encoder = create_other_network(input_size=X_other_train.shape[1])
    other_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    other_model.fit(X_other_train, y_other_train,
                    epochs=100, batch_size=60,
                    verbose=0, validation_data=(X_other_val, y_other_val),
                    callbacks=[utils.get_early_stopping_condition()])

    # embedding of data: the encoder shares the layers just trained in other_model
    X_train_code = encoder.predict(X_train)
    X_val_code = encoder.predict(X_val)
    X_test_code = encoder.predict(X_test)

    return X_train_code, X_val_code, X_test_code, y_train, y_val, y_test

In [None]:
# Cross-validation with the other-cancer samples merged into each training fold.
# The results get their own names: the following cell assigns cvscores_tl /
# histories_tl for the transfer-learning run, which used to silently overwrite
# the outcome of this run.
cvscores_merge, histories_merge = utils.cross_validation(X=X_c, y=y_c, preprocess=preprocess, seed=seed,
                                                         create_model=others_alone_model,
                                                         get_measures=utils.get_measures,
                                                         data_preparation=tl_data_merging,
                                                         X_other=X_others, y_other=y_others)

In [None]:
# Cross-validation where each fold is first encoded by the network trained on
# the other cancers (data_preparation=transfer_learning).
cvscores_tl, histories_tl = utils.cross_validation(
    X=X_c,
    y=y_c,
    preprocess=preprocess,
    seed=seed,
    create_model=create_additional_network,
    get_measures=utils.get_measures,
    data_preparation=transfer_learning,
    X_other=X_others,
    y_other=y_others,
)

In [None]:
# Mean of every measure over the splits, shown as a one-row table without the "split" column.
mean_scores_tl = cvscores_tl.mean().to_frame().T
mean_scores_tl.drop("split", axis=1)

In [None]:
# Report the transfer-learning scores. The original call passed cvscores_others,
# which wrote the "others alone" results a second time under the _TL sheet name.
utils.report(cvscores_tl, writer=writer, sheet_name="{}_TL".format(cancer_name))

In [None]:
# Flush the workbook to disk. close() saves the file and, unlike save(), is
# still available in current pandas releases.
writer.close()