In [1]:
%load_ext autoreload
%autoreload 1
import time
import pandas as pd
import numpy as np
import random as rn
from tqdm import tqdm
import os

import sys

sys.path.append("../utils/")
%aimport utils

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


# Single seed shared by every source of randomness configured in this notebook.
seed = 42

# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it
# here does not change string hashing in the already-running kernel; only
# processes started afterwards inherit the value.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed NumPy's global generator.
np.random.seed(seed)

# The stdlib generator is imported as `rn` but was never seeded; seed it as
# well so code relying on `random` is reproducible too.
rn.seed(seed)

# Integer class labels compared against the target vector in later cells.
TUMOR = 0
NORMAL = 1

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def get_filtered_features(X):
    """Rank the columns of ``X`` from highest to lowest standard deviation.

    No column is actually dropped: the full ordering of column indices is
    returned, most variable column first.

    Parameters
    ----------
    X : array-like exposing ``.std(axis)``
        Data whose axis 0 is reduced when computing the spread of each column.

    Returns
    -------
    Column indices ordered by decreasing standard deviation.
    """
    spread_per_column = X.std(0)
    ascending_order = spread_per_column.argsort()
    # Reverse the ascending argsort to get the most variable columns first.
    return ascending_order[::-1]

def preprocess(X):
    """Run ``utils.pre_process`` on ``X`` with a new ``MinMaxScaler``.

    A fresh scaler is created on every call and handed to the helper together
    with ``get_filtered_features`` as the feature-ranking callback.

    Returns whatever ``utils.pre_process`` returns; the caller in this
    notebook unpacks it as ``(X_transformed, scaler, selected_features)``.
    NOTE(review): the helper's exact contract lives in ``utils`` - confirm there.
    """
    return utils.pre_process(X, get_filtered_features, MinMaxScaler())

In [3]:
# Cohort to analyse; passed as-is to the project loader.
cancer_name = "BLCA"

# Feature matrix and label vector for the selected cohort.
X_c, y_c = utils.get_cancer_data(cancer_name)

# Disabled class re-balancing step (RandomUnderSampler would have to be
# imported before re-enabling it):
# sampler = RandomUnderSampler()
# X_c, y_c = sampler.fit_sample(X_c, y_c)

# Summarise the data set: size, dimensionality and per-class sample counts.
print("\n".join([
    "Cancer: {}".format(cancer_name),
    "\t#samples: {}".format(X_c.shape[0]),
    "\t#genes: {}".format(X_c.shape[1]),
    "\t#TUMORS: {}\t#NORMAL: {}".format((y_c == TUMOR).sum(), (y_c == NORMAL).sum()),
]))

Cancer: BLCA
	#samples: 426
	#genes: 20530
	#TUMORS: 407	#NORMAL: 19


In [4]:
# Cross-validation layout: number of reshuffled repetitions, and number of
# stratified folds inside each repetition.
n_randomizations, n_folds = 5, 5

In [5]:
# Evaluate on the cohort loaded earlier, under the short names used below.
X = X_c
y = y_c

# Collects one dict of metrics per evaluated fold.
cvscores = list()

def get_measures(y_true, y_pred):
    """Score a set of predictions against the ground truth.

    Each scikit-learn scorer is called with its default settings. For binary
    targets those defaults treat label 1 as the positive class, which is
    ``NORMAL`` in this notebook.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``'f1-score'``, ``'precision'``, ``'recall'`` and ``'accuracy'``
        (in that order) mapped to the corresponding score.
    """
    scorers = (
        ('f1-score', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('accuracy', accuracy_score),
    )
    return {label: scorer(y_true=y_true, y_pred=y_pred) for label, scorer in scorers}


def split_training(X, y, train, test, preprocess, seed):
    """Build the train/test partitions for one cross-validation fold.

    The preprocessing is fitted on the training rows only; the scaler and
    feature selection it returns are then applied to the test rows, so no
    test-set statistics reach the fitted transform.

    Parameters
    ----------
    X, y : indexable by ``train`` / ``test``
        Full feature matrix and label vector.
    train, test : index arrays
        Row indices of the training and testing partitions.
    preprocess : callable
        Called with the training rows; must return
        ``(X_train_transformed, scaler, selected_features)`` where ``scaler``
        exposes ``transform``.
    seed : any
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature matrices
        transformed.
    """
    train_features, test_features = X[train], X[test]
    train_labels, test_labels = y[train], y[test]

    # Fit on the training partition; keep the fitted scaler and chosen columns.
    train_features, fitted_scaler, kept_columns = preprocess(train_features)

    # Apply the same column selection and scaling to the held-out rows.
    test_features = fitted_scaler.transform(test_features[:, kept_columns])

    return train_features, test_features, train_labels, test_labels


# Repeated stratified k-fold cross-validation: each repetition reshuffles the
# data with its own random_state (seed + r); every fold trains a fresh model.
for r in range(n_randomizations):
    kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed + r)
    for i_split, (train, test) in enumerate(kfold.split(X, y)):
        # Preprocessing is fitted inside the fold, on the training rows only.
        X_train, X_test, y_train, y_test = split_training(
            X, y, train, test, preprocess, seed
        )

        # Other classifiers left disabled here: KNeighborsClassifier(),
        # RandomForestClassifier().
        model = LinearSVC(random_state=seed, class_weight="balanced")
        model.fit(X_train, y_train)

        # Score the held-out fold and tag the record with its position.
        y_pred = model.predict(X_test)
        measures = get_measures(y_true=y_test, y_pred=y_pred)
        measures.update(split=i_split, random_set=r)
        print("".join("{:<10}{:<10.2f}".format(k, v) for k, v in measures.items()))
        cvscores.append(measures)

# Turn the per-fold records into a table (one row per fold).
cvscores = pd.DataFrame.from_dict(cvscores)

f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     0.00      random_set0.00      
f1-score  0.86      precision 1.00      recall    0.75      accuracy  0.99      split     1.00      random_set0.00      
f1-score  0.67      precision 1.00      recall    0.50      accuracy  0.98      split     2.00      random_set0.00      
f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     3.00      random_set0.00      
f1-score  0.86      precision 0.75      recall    1.00      accuracy  0.99      split     4.00      random_set0.00      
f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     0.00      random_set1.00      
f1-score  1.00      precision 1.00      recall    1.00      accuracy  1.00      split     1.00      random_set1.00      
f1-score  0.86      precision 1.00      recall    0.75      accuracy  0.99      split     2.00      random_set1.00      
f1-score  1.00      precision 1.

In [6]:
# Average every metric over all folds and repetitions, shown as a one-row
# table; the bookkeeping columns are removed before averaging.
(
    cvscores
    .drop(["split", "random_set"], axis=1)
    .mean()
    .to_frame()
    .T
)

Unnamed: 0,accuracy,f1-score,precision,recall
0,0.992016,0.899937,0.948667,0.876667
