In [1]:
%load_ext autoreload
%autoreload 1

import pandas as pd
import numpy as np
import sys
sys.path.append("../utils/")
%aimport utils

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler


from keras import backend as K
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Dense, Input, Conv1D, Flatten
from tensorflow import set_random_seed

import os
import random as rn

seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
set_random_seed(seed)
rn.seed(seed)

# maximum number of cores
n_cores = 20

K.set_session(K.tf.Session(config=K.tf.ConfigProto(
    intra_op_parallelism_threads=n_cores,
    inter_op_parallelism_threads=n_cores
)))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Data

In [2]:
cancer_name = 'KIDNEY'
X_c, y_c = utils.get_cancer_data(cancer_name)
print("Cancer: {}".format(cancer_name))
print("\t#samples: {}".format(X_c.shape[0]))
print("\t#genes: {}".format(X_c.shape[1]))
print("\t#TUMORS: {}\t#NORMAL: {}".format(y_c[y_c == utils.TUMOR].shape[0], y_c[y_c == utils.NORMAL].shape[0]))

Cancer: KIDNEY
	#samples: 1020
	#genes: 20530
	#TUMORS: 891	#NORMAL: 129


## Feature selection

In [3]:
def get_filtered_features(X):
    return X.std(0).argsort()[::-1][:5000] # nothing happens

def preprocess(X):
    scaler = MinMaxScaler()
    return utils.pre_process(X, get_filtered_features, scaler)

## Model creation

In [4]:
def create_conv_model(input_size):
    global stride
    print(stride)
    model = Sequential()
    model.add(Conv1D(filters=5, kernel_size=(stride), input_shape=(input_size, 1), 
                     activation='relu', strides=stride))
    model.add(Flatten())
    model.add(Dense(units=200, activation="relu"))
    model.add(Dense(units=50, activation="relu"))
    model.add(Dense(units=1, activation="sigmoid", name='output'))
    return model

In [5]:
def split_training_default_1(X, y, train, test, preprocess, validation_split, seed):
    global n_neighbors, dm
    
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]

    # get the validation set in a stratified fashion from the training set
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_split,
                                                      random_state=seed, stratify=y_train)
    print("Train - Test")
    # preprocess training set and get features and scaler
    X_train, scaler, sel_features = preprocess(X_train)
    print("Training scaled")
    # transform testing set
    X_test = scaler.fit_transform(X_test[:, sel_features])
    print("Test scaled")
    # transform validation set
    X_val = scaler.fit_transform(X_val[:, sel_features])
    print("Val scaled")
    oversampler = RandomOverSampler(random_state=seed)
    
    # oversampling
    X_train, y_train =oversampler.fit_sample(X_train, y_train)
    print("Train - oversampled")
    X_val, y_val = oversampler.fit_sample(X_val, y_val)
    print("Val - oversampled")
#     print(X_train.shape)
#     print(X_val.shape)

    print("Filtering the distance matrix")
    sel_dm = dm[sel_features, :][:, sel_features]
    sel_neighbors = np.argsort(sel_dm, axis=1)
    conv_idxs = np.append(np.arange(sel_neighbors.shape[0]).reshape(-1, 1), sel_neighbors[:, :n_neighbors], axis=1).flatten()
    
    X_train = X_train[:, conv_idxs]
    X_test = X_test[:, conv_idxs]
    X_val = X_val[:, conv_idxs]
    
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], -1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], -1)
    X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], -1)
    
    print("Reshaped")
    print(X_train.shape)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

## Distance matrix

In [6]:
dm = np.load("/home/nanni/Data/TCGA/CIBB/ontological_distance_matrix.npy")
np.fill_diagonal(dm, np.inf)

### Neighbors

In [7]:
neighbors = np.argsort(dm, axis=1)

### Feature disposition

In [8]:
n_neighbors = 4
stride = n_neighbors + 1

In [9]:
conv_idxs = np.append(np.arange(neighbors.shape[0]).reshape(-1, 1), neighbors[:, :n_neighbors], axis=1).flatten()

In [10]:
X_c_conv = X_c[:, conv_idxs]

## Cross validation

In [11]:
cvscores_c, histories_c = utils.cross_validation(X=X_c, y=y_c, 
                                                 preprocess=preprocess, 
                                                 seed=seed, 
                                                 data_preparation=split_training_default_1,
                                                 create_model=create_conv_model, 
                                                 get_measures=utils.get_measures)
cvscores_c.mean().to_frame().T.drop("split", axis=1)

Train - Test
Training scaled
Test scaled
Val scaled
Train - oversampled
Val - oversampled
Filtering the distance matrix
Reshaped
(1068, 25000, 1)
5
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     0.00      
Train - Test
Training scaled
Test scaled
Val scaled
Train - oversampled
Val - oversampled
Filtering the distance matrix
Reshaped
(1070, 25000, 1)
5
f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     1.00      
Train - Test
Training scaled
Test scaled
Val scaled
Train - oversampled
Val - oversampled
Filtering the distance matrix
Reshaped
(1070, 25000, 1)
5
f1-score  0.98      precision 1.00      recall    0.96      accuracy  1.00      split     2.00      
Train - Test
Training scaled
Test scaled
Val scaled
Train - oversampled
Val - oversampled
Filtering the distance matrix
Reshaped
(1070, 25000, 1)
5
f1-score  0.98      pr

Unnamed: 0,accuracy,f1-score,precision,recall
0,0.998039,0.992305,0.992593,0.992308
