In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, matthews_corrcoef
from imblearn.over_sampling import RandomOverSampler

from functools import reduce

In [3]:
def load_geneset(name):
    """Load one expression dataset and append its outcome labels as a final row.

    Reads ``<name>.csv`` and renames its first column to ``'gene'``. Reads
    ``<name>_y.txt`` as whitespace-separated integers (the file may span
    several lines). The labels are appended as one extra row whose ``'gene'``
    value is ``'outcome'``, with one label per remaining column in column order.

    Parameters
    ----------
    name : str
        Path prefix shared by the ``.csv`` and ``_y.txt`` files.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus the trailing ``'outcome'`` row.

    Raises
    ------
    ValueError
        If a label token is not an integer, or if the number of labels does
        not equal the number of non-gene columns.
    """
    gse = pd.read_csv(name + '.csv')
    gse = gse.rename(columns={gse.columns[0]: 'gene'})

    with open(name + '_y.txt') as file:
        # str.split() with no argument splits on any run of whitespace, so
        # blank lines, trailing newlines and repeated spaces yield no empty
        # tokens (which int() would reject).
        y = [int(token) for token in file.read().split()]

    # zip() would silently truncate on a length mismatch and leave samples
    # unlabelled or labels unused, so fail loudly instead.
    n_samples = len(gse.columns) - 1
    if len(y) != n_samples:
        raise ValueError(
            "%s_y.txt holds %d labels but %s.csv has %d sample columns"
            % (name, len(y), name, n_samples)
        )

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and also works on older pandas versions.
    outcome_row = pd.DataFrame([dict(zip(gse.columns, ['outcome'] + y))])
    gse = pd.concat([gse, outcome_row], ignore_index=True)
    return gse

In [4]:
# Load the three datasets in a fixed order; each frame carries its labels as a
# trailing 'outcome' row (see load_geneset).
gse120396, gse120649, gse131179 = (
    load_geneset(prefix) for prefix in ('gse120396', 'gse120649', 'gse131179')
)

In [40]:
# Join the three datasets on their shared 'gene' column. pd.merge defaults to
# an inner join, so only 'gene' values present in every dataset are kept.
gse = reduce(
    lambda left, right: left.merge(right, on='gene'),
    [gse120396, gse120649, gse131179],
)

# The row whose 'gene' value is 'outcome' holds the per-sample labels rather
# than expression values; pull it out before reshaping.
y = gse.loc[gse['gene'] == 'outcome']

# Drop the label row, then transpose so each row is one original column
# (a sample) and each column is one 'gene' value.
gse = gse.drop(index=y.index).set_index('gene').T

# Keep only the first occurrence of any repeated column label.
# (A PCA projection to 100 components was sketched here but is disabled.)
X = gse.loc[:, ~gse.columns.duplicated()]

# Collapse the label frame (expected to be a single row) into a Series
# indexed by the original sample column names.
y = y.drop(columns='gene').squeeze()

# Column labels of each source frame, keyed 0..2 in load order; used to trace
# a sample back to the dataset it came from.
mapping = {
    position: frame.columns.values
    for position, frame in enumerate((gse120396, gse120649, gse131179))
}
rows = X.index

def which_source(x):
    """Return the numeric identifier of the dataset whose columns contain ``x``.

    The module-level ``mapping`` entries are checked in the order 0, 1, 2 and
    the first match wins. Returns ``None`` when ``x`` is in none of them.
    """
    dataset_ids = ((0, 120396), (1, 120649), (2, 131179))
    for key, dataset_id in dataset_ids:
        if x in mapping[key]:
            return dataset_id
    return None
    
# Tag every sample (row of X) with the identifier of the dataset it came from.
c = [which_source(sample) for sample in rows]

# Stratification key: outcome label concatenated with source dataset, so the
# split balances both at once.
strat = [str(a) + str(b) for a, b in zip(y, c)]

# Fixed seed: without random_state the held-out test set changes on every run,
# so no downstream number could be reproduced.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=strat, random_state=SPLIT_SEED
)

In [41]:
from itertools import permutations
        
class Result:
    """Container pairing ground-truth labels with the predictions made for them."""

    def __init__(self, y_truth, y_pred):
        # Both arguments are stored exactly as given (no copy, no validation).
        self.y_truth, self.y_pred = y_truth, y_pred

# Candidate values for each hyperparameter.
max_depth = list(range(1, 11))                        # 1, 2, ..., 10
gamma = [half / 2 for half in range(0, 41, 5)]        # 0.0, 2.5, ..., 20.0
alpha = [tenth / 10 for tenth in range(0, 11)]        # 0.0, 0.1, ..., 1.0

# Full Cartesian grid as (max_depth, gamma, alpha) tuples; max_depth varies
# slowest and alpha fastest.
hyperparam_perms = [
    (depth, g, a)
    for depth in max_depth
    for g in gamma
    for a in alpha
]

In [42]:
from IPython.display import display, clear_output

# Maps each (max_depth, gamma, alpha) tuple to a Result holding the test labels
# and the predicted positive-class probabilities for X_test.
final_results = {}

# Fixed seed so the train/validation split and the oversampling are reproducible.
SEARCH_SEED = 42

# Split and rebalance once, outside the loop. Neither step depends on the
# hyperparameters, and redrawing them per iteration would have every
# configuration trained and early-stopped on different data, confounding the
# comparison with split noise.
X_train_t, X_val, y_train_t, y_val = train_test_split(
    X_train, y_train, test_size=0.1, stratify=y_train, random_state=SEARCH_SEED
)

# Oversample the minority class up to a 1:1 ratio on the training part only;
# the validation part is left untouched.
sm = RandomOverSampler(sampling_strategy=1.0, random_state=SEARCH_SEED)
X_train_t_res, y_train_t_res = sm.fit_resample(X_train_t, y_train_t)

n_sets = len(hyperparam_perms)
for i, hyperparam_set in enumerate(hyperparam_perms):
    # Overwrite the previous progress line instead of flooding the output.
    clear_output(wait=True)
    display(str(i+1) + '/' + str(n_sets))

    # Unpack into loop-local names. Reusing `max_depth`, `gamma` and `alpha`
    # here would overwrite the module-level grid lists with scalars.
    depth_i, gamma_i, alpha_i = hyperparam_set

    model = XGBClassifier(objective='binary:logistic', max_depth=depth_i, gamma=gamma_i, alpha=alpha_i)

    # NOTE(review): newer xgboost releases expect early_stopping_rounds and
    # eval_metric on the constructor rather than fit() - confirm against the
    # installed version before upgrading.
    model.fit(X_train_t_res, y_train_t_res, verbose=False, early_stopping_rounds=10, eval_metric='auc', eval_set=[(X_val, y_val)])

    # NOTE(review): predictions are stored for the held-out test set for every
    # configuration; if the best configuration is later chosen from these, the
    # test score is optimistically biased - confirm how they are consumed.
    final_results[hyperparam_set] = Result(y_test, model.predict_proba(X_test)[:,1])

'990/990'

In [43]:
# Flatten the search results into one row per hyperparameter combination.
# Rows are collected in a plain list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the whole frame on every iteration (quadratic).
# The resulting index is 0..n-1 rather than all zeros; the 'records' JSON
# export does not include the index, so the exported file is unaffected.
result_columns = ['max_depth', 'gamma', 'alpha', 'truth', 'pred']
result_rows = [
    [key[0], key[1], key[2], value.y_truth.tolist(), value.y_pred]
    for key, value in final_results.items()
]
df = pd.DataFrame(result_rows, columns=result_columns)

In [44]:
# Persist the results table as a JSON array with one object per row of df.
df.to_json(path_or_buf='hyperparam_results.json', orient='records')