In [1]:
%load_ext autoreload
%autoreload 1
import time
import pandas as pd
import numpy as np
import random as rn
from tqdm import tqdm
import os

import sys

sys.path.append("../utils/")
%aimport utils

from keras import backend as K
from keras.models import Sequential, Model
from keras.layers import Dense

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from tensorflow import set_random_seed

# reproducibility

seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
set_random_seed(seed)
rn.seed(seed)

# maximum number of cores
n_cores = 10

K.set_session(K.tf.Session(config=K.tf.ConfigProto(
    intra_op_parallelism_threads=n_cores, 
    inter_op_parallelism_threads=n_cores
)))

TUMOR = 0
NORMAL = 1

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [13]:
now = time.strftime('%Y%m%d_%H%M')
description = "BRCA_1vsALL_NN_NN"
folder = now + "_" + description

In [41]:
output_folder = os.path.join("./results/", folder)
os.makedirs(output_folder, exist_ok=True)

In [1]:
writer = pd.ExcelWriter(os.path.join(output_folder, "results.xlsx"), engine='xlsxwriter')

NameError: name 'pd' is not defined

## Data

In [15]:
def get_filtered_features(X):
    #return np.arange(X.shape[1])
    return X.std(0).argsort()[::-1][:5000]

def preprocess(X):
    scaler = MinMaxScaler()
    return utils.pre_process(X, get_filtered_features, scaler)

## 1 vs ALL

In [16]:
cancer_name = "BRCA"
X_c, y_c = utils.get_cancer_data(cancer_name)
print("Cancer: {}".format(cancer_name))
print("\t#samples: {}".format(X_c.shape[0]))
print("\t#genes: {}".format(X_c.shape[1]))
print("\t#TUMORS: {}\t#NORMAL: {}".format(y_c[y_c == TUMOR].shape[0], y_c[y_c == NORMAL].shape[0]))

Cancer: BRCA
	#samples: 1218
	#genes: 20530
	#TUMORS: 1104	#NORMAL: 114


In [18]:
others = list(set(utils.all_tumor_names) - {cancer_name})
# print(", ".join(others))

X_others = np.empty((0, X_c.shape[1]), dtype=int)
y_others = np.empty(0, dtype=int)

for o in others:
    print(o)
    X_o, y_o = utils.get_cancer_data(o)
    X_others = np.append(X_others, X_o, axis=0)
    y_others = np.append(y_others, y_o)

LAML
STAD
PRAD
SKCM
CESC
HNSC
CHOL
UVM
TGCT
OV
COAD
THCA
MESO
ACC
UCS
LGG
DLBC
GBM
KIRC
LUSC
BLCA
LIHC
PCPG
UCEC
KIRP
SARC
KICH
READ
ESCA
THYM
PAAD
LUAD


### Tumor alone

In [19]:
def tumor_alone_model(input_size):
    """ A super-simple NN for the single tumor classification
    """
    model = Sequential()
    model.add(Dense(100, input_shape=(input_size,), activation='relu'))
    model.add(Dense(20, activation='relu'))
    model.add(Dense(1, activation="sigmoid"))
    return model

In [22]:
cvscores_c, histories_c = utils.cross_validation(X=X_c, y=y_c, preprocess=preprocess, seed=seed,
                                    create_model=tumor_alone_model, get_measures=utils.get_measures)
cvscores_c.mean().to_frame().T.drop("split", axis=1)

f1-score  0.98      precision 0.96      recall    1.00      accuracy  1.00      split     0.00      
f1-score  0.98      precision 0.96      recall    1.00      accuracy  1.00      split     1.00      
f1-score  0.85      precision 1.00      recall    0.74      accuracy  0.98      split     2.00      
f1-score  0.98      precision 1.00      recall    0.96      accuracy  1.00      split     3.00      
f1-score  0.96      precision 0.92      recall    1.00      accuracy  0.99      split     4.00      


Unnamed: 0,accuracy,f1-score,precision,recall
0,0.99097,0.948349,0.966667,0.93913


In [31]:
utils.report(cvscores_c, writer=writer, sheet_name="{}_alone".format(cancer_name))

### Others alone

In [32]:
def others_alone_model(input_size):
    h1 = 500
    h2 = 200
    h3 = 100
    h4 = 50
    out = 1

    model = Sequential()
    model.add(Dense(h1, input_shape=(input_size, ), activation="relu"))
    model.add(Dense(h2, activation="relu"))
    model.add(Dense(h3, activation="relu"))
    model.add(Dense(h4, activation="relu"))
    model.add(Dense(out, activation="sigmoid"))
    return model

In [33]:
cvscores_others, histories_others = utils.cross_validation(X=X_others, y=y_others, preprocess=preprocess, 
                                                           seed=seed, create_model=others_alone_model, 
                                                           get_measures=utils.get_measures)
cvscores_others.mean().to_frame().T.drop("split", axis=1)

f1-score  0.82      precision 0.99      recall    0.70      accuracy  0.98      split     0.00      
f1-score  0.95      precision 0.96      recall    0.94      accuracy  0.99      split     1.00      
f1-score  0.92      precision 0.98      recall    0.86      accuracy  0.99      split     2.00      
f1-score  0.94      precision 0.94      recall    0.95      accuracy  0.99      split     3.00      
f1-score  0.94      precision 0.90      recall    0.98      accuracy  0.99      split     4.00      


Unnamed: 0,accuracy,f1-score,precision,recall
0,0.989288,0.913641,0.951934,0.887698


In [34]:
utils.report(cvscores_others, writer=writer, sheet_name="{}_others".format(cancer_name))

### Transfer learning

In [35]:
def create_other_network(input_size):
    h1 = 500
    h2 = 200
    h3 = 100
    h4 = 50
    out = 1
    
    model = Sequential()
    model.add(Dense(h1, input_shape=(input_size, ), activation="relu", name='h1'))
    model.add(Dense(h2, activation="relu", name='h2'))
    model.add(Dense(h3, activation="relu", name='h3'))
    model.add(Dense(h4, activation="relu", name='h4'))
    model.add(Dense(out, activation="sigmoid", name='out'))
    
    encoder = Model(inputs=model.input, outputs=model.get_layer("h3").output)
    
    return model, encoder

In [36]:
def create_additional_network(input_size):
    h1 = 50
    h2 = 10
    out = 1
    
    model = Sequential()
    model.add(Dense(h1, input_shape=(input_size, ), activation="relu", name='h1'))
    model.add(Dense(h2, activation="relu", name='h2'))
    model.add(Dense(out, activation="sigmoid", name='out'))
    return model

In [37]:
def transfer_learning(X, y, train, test, preprocess, validation_split, seed, X_other, y_other):
#     print(X.shape, y.shape)
#     print(X_other.shape, y.shape)
#     print("Splitting of X_c")
    # Splitting the single tumor dataset
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    
    # get the validation set in a stratified fashion from the training set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_split,
                                                      random_state=seed, stratify=y_train)
#     print("Scaling of X_c")
    # preprocess training set and get features and scaler
    X_train, scaler, sel_features = preprocess(X_train)
    # transform testing set
    X_test = scaler.fit_transform(X_test[:, sel_features])
    # transform validation set
    X_val = scaler.fit_transform(X_val[:, sel_features])
    
#     print("Scaling and selection on X_other")
    # for the other set we use a brand new scaler but the same features
    other_scaler = MinMaxScaler()
    X_other = other_scaler.fit_transform(X_other[:, sel_features])
    # splitting other set in training and validation (no test...useless)
    X_other_train, X_other_val, \
    y_other_train, y_other_val = train_test_split(X_other, y_other, test_size=validation_split,
                                                  random_state=seed, stratify=y_other)
    
#     print("Fitting the OTHER model")
    # create and fit the OTHER model
    other_model, encoder = create_other_network(input_size=X_other_train.shape[1])
    other_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=['accuracy'])
    other_model.fit(X_other_train, y_other_train,
                    epochs=100, batch_size=60,
                    verbose=0, validation_data=(X_other_val, y_other_val),
                    callbacks=[utils.get_early_stopping_condition()])
    
#     print("Encoding X_c")
    # embedding of data
    X_train_code = encoder.predict(X_train)
    X_val_code = encoder.predict(X_val)
    X_test_code = encoder.predict(X_test)
    
#     print(X_train_code.shape)
#     print(X_val_code.shape)
#     print(X_test_code.shape)
    
    return X_train_code, X_val_code, X_test_code, y_train, y_val, y_test

In [38]:
cvscores_tl, histories_tl = utils.cross_validation(X=X_c, y=y_c, preprocess=preprocess, seed=seed,
                                                   create_model=create_additional_network, 
                                                   get_measures=utils.get_measures, 
                                                   data_preparation=transfer_learning, 
                                                   X_other=X_others, y_other=y_others)

f1-score  0.94      precision 0.88      recall    1.00      accuracy  0.99      split     0.00      
f1-score  0.90      precision 0.82      recall    1.00      accuracy  0.98      split     1.00      
f1-score  0.96      precision 0.96      recall    0.96      accuracy  0.99      split     2.00      
f1-score  0.94      precision 0.88      recall    1.00      accuracy  0.99      split     3.00      
f1-score  0.96      precision 0.92      recall    1.00      accuracy  0.99      split     4.00      


In [39]:
cvscores_tl.mean().to_frame().T.drop("split", axis=1)

Unnamed: 0,accuracy,f1-score,precision,recall
0,0.987691,0.938511,0.89277,0.991304


In [40]:
utils.report(cvscores_others, writer=writer, sheet_name="{}_TL".format(cancer_name))

In [42]:
writer.save()