# Loading Clinical Data

In [1]:
from data import load_data

clinical, _, _, treatments, outcome = load_data()

clinical.head()

Unnamed: 0_level_0,cmmc,ecog_ps,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet,wbc_x10_10_9_l,...,m_protein,first_line_transplant,cell_markers_cd117,cell_markers_cd13,cell_markers_cd138,cell_markers_cd38,race_asian,race_black_african_american,race_other,race_white
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MMRF1021,,1.0,0.0,4.9,0.0,88.4,1.0,2.4,216.0,5.2,...,3.05,1,0,1,0,0,0,0,0,1
MMRF1024,,1.0,11.0,6.0,0.0,123.76,2.0,2.3,188.0,4.3,...,2.6,0,1,0,0,0,0,0,0,1
MMRF1029,,1.0,0.0,8.4,0.0,106.08,1.0,2.6,219.0,4.0,...,1.8,0,1,0,0,0,0,0,0,1
MMRF1030,,1.0,15.4,9.6,0.0,55.692,1.0,2.5,215.0,4.7,...,3.55,1,1,0,0,0,0,0,0,1
MMRF1031,,,18.3,10.1,0.0,81.328,1.0,10.29,385.0,12.4,...,1.52,0,1,0,0,0,0,0,0,1


In [2]:
for c in clinical.columns:
    print(c, clinical[c].dtype)

cmmc float64
ecog_ps float64
percent_aneuploid float64
percent_plama_cells_bone_marrow float64
percent_plama_cells_peripherical_blood float64
creatinine float64
iss float64
absolute_neutrophil float64
platelet float64
wbc_x10_10_9_l float64
bun float64
glucose float64
total_protein float64
albumin float64
beta_2_microglobulin float64
calcium float64
hemoglobin float64
ldh float64
age float64
gender int64
lga float64
lgg float64
lgl_kappa float64
lgl_lambda float64
lgm float64
m_protein float64
first_line_transplant int64
cell_markers_cd117 uint8
cell_markers_cd13 uint8
cell_markers_cd138 uint8
cell_markers_cd38 uint8
race_asian uint8
race_black_african_american uint8
race_other uint8
race_white uint8


In [18]:
from evaluation import optimize_threshold, classification_metrics
from constants import N_FOLDS, RANDOM_STATE

from sklearn.metrics import roc_auc_score, log_loss, confusion_matrix
from sklearn.model_selection import StratifiedKFold

from optimization import LightGBMOptimizer

from lightgbm import LGBMModel

import pandas as pd
import pickle
import time
import os

result = {c: [] for c in ['experiment', 'marker', 'train_auc', 'valid_auc', 
                          'train_loss', 'valid_loss', 'execution_time', 'threshold']}

for t in clinical.columns:
    
    print('*********************************************************************************')
    print(t)
    print('*********************************************************************************\n')
    
    y = outcome.dropna()
    
    x = clinical[[t]].dropna().join(treatments, how='inner')
    
    x = x.join(y)[x.columns]
    y = x.join(y)[y.columns[0]]
    
    opt_kf = StratifiedKFold(N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    
    if y.sum() < 10 or (1 - y).sum() < 10:
        print('Ignoring {} marker once it has less than five treatment senstive patients associted.')
    
    for experiment, (opt_train, opt_valid) in enumerate(opt_kf.split(x, y)):
        
        initial_time = time.time()
        
        #################################################################################################
        # Train and Valid Split
        #################################################################################################
        
        x_train, y_train = x.iloc[opt_train,:], y.iloc[opt_train]
        
        x_valid, y_valid = x.iloc[opt_valid,:], y.iloc[opt_valid]
        
        #################################################################################################
        # Hyper parameters optimization
        #################################################################################################        
        
        training_default_params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'n_estimators': 100,
            'objective': 'binary',
            'is_unbalance': False, 
            'extra_trees': True,
            'max_depth': 4,
            'learning_rate': 0.1,
            'min_split_gain': 0.0001,
            'min_child_weight': 0.0001,
            'random_state': RANDOM_STATE}
        
        optimizer_params = {
            'n_folds': 2, 
            'n_calls': 50, 
            'shuffle': True, 
            'early_stopping_rounds': 1,
            'fixed_parameters': training_default_params, 
            'random_state': RANDOM_STATE, 
            'use_gpu': False}
        
        optimizer = LightGBMOptimizer(**optimizer_params)
        
        params = optimizer.optimize(x_train, y_train)

        params = {**params, **training_default_params}
        
        skf = StratifiedKFold(3, shuffle=True, random_state=RANDOM_STATE)

        models = []
        
        for train_index, valid_index in skf.split(x_train, y_train):

            xx_train, yy_train = x_train.values[train_index, :], y_train.values[train_index]
            xx_valid, yy_valid = x_train.values[valid_index, :], y_train.values[valid_index]

            gbm = LGBMModel(**params)

            gbm.fit(xx_train, 
                    yy_train,
                    eval_set=[(xx_valid, yy_valid)],
                    early_stopping_rounds=1,
                    verbose=False)
            
            models.append(gbm)
        
        #################################################################################################
        # Predicting
        #################################################################################################        
        
        y_hat_train, y_hat_valid = None, None
        
        for model in models:
            
            y_ = model.predict(x_train)
            y_hat_train = y_ if y_hat_train is None else y_ + y_hat_train
            
            y_ = model.predict(x_valid)
            y_hat_valid = y_ if y_hat_valid is None else y_ + y_hat_valid
        
        y_hat_train /= len(models)
        y_hat_valid /= len(models)
        
        #################################################################################################
        # Analysing Performance
        #################################################################################################   
        
        # Computing AUC
        train_auc = roc_auc_score(y_train, y_hat_train)
        valid_auc = roc_auc_score(y_valid, y_hat_valid)
        
        # Computing logLoss
        train_loss = log_loss(y_train, y_hat_train)
        valid_loss = log_loss(y_valid, y_hat_valid)
        
        # Compute optimized threshold
        opt_threshold = optimize_threshold(y_train, y_hat_train)

        if opt_threshold is None:
            opt_threshold = np.mean(y_train)
        
        # compute confusion matrix
        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= opt_threshold) for y in y_]).ravel()

        classification_results = classification_metrics(tn, fp, fn, tp)
        
        # add results to data frame (dict for now)
        for k in classification_results:
            if k not in result:
                result[k] = []
            result[k].append(classification_results[k])
        
        result['experiment'].append(experiment)
        result['marker'].append(t)
        result['train_auc'].append(train_auc)
        result['valid_auc'].append(valid_auc)
        result['train_loss'].append(train_loss)
        result['valid_loss'].append(valid_loss)
        result['execution_time'].append(time.time() - initial_time)
        result['threshold'].append(opt_threshold)
        
        log_message = 'Experiment #{}: '.format(experiment) + 'Train AUC: {}'.format(train_auc) + ' '
        log_message += 'Valid AUC: {}'.format(valid_auc)
        
        print(log_message)
    
    print('')
   
result = pd.DataFrame(result)

result.to_csv('output/gene/metrics_clinical_only.csv', sep=',', index=False)

result.head()

*********************************************************************************
cmmc
*********************************************************************************

Experiment #0: Train AUC: 0.6251167133520075 Valid AUC: 0.5120689655172413
Experiment #1: Train AUC: 0.6421994884910486 Valid AUC: 0.5428571428571428
Experiment #2: Train AUC: 0.6238084166472914 Valid AUC: 0.45000000000000007
Experiment #3: Train AUC: 0.6488723552662172 Valid AUC: 0.4696428571428572
Experiment #4: Train AUC: 0.6125784701232271 Valid AUC: 0.842857142857143
Experiment #5: Train AUC: 0.6491865061126942 Valid AUC: 0.4742063492063493
Experiment #6: Train AUC: 0.6221849434690688 Valid AUC: 0.75
Experiment #7: Train AUC: 0.6453028771026749 Valid AUC: 0.6924603174603174
Experiment #8: Train AUC: 0.6552992002941447 Valid AUC: 0.40674603174603174
Experiment #9: Train AUC: 0.608534791800717 Valid AUC: 0.6587301587301588

*********************************************************************************
ecog_ps
***

Unnamed: 0,experiment,marker,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
0,0,cmmc,0.625117,0.512069,0.545428,0.573595,21.451243,0.251666,0.74359,0.5,0.3,0.896552
1,1,cmmc,0.642199,0.542857,0.536624,0.566508,22.181003,0.256543,0.421053,0.25,0.6,0.357143
2,2,cmmc,0.623808,0.45,0.543624,0.565358,22.201291,0.234144,0.736842,0.5,0.2,0.928571
3,3,cmmc,0.648872,0.469643,0.532694,0.589842,23.670052,0.248062,0.526316,0.3,0.6,0.5
4,4,cmmc,0.612578,0.842857,0.548261,0.521423,21.362892,0.264713,0.842105,0.75,0.6,0.928571


In [21]:
result.groupby('marker').mean()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
marker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
absolute_neutrophil,4.5,0.58529,0.541534,0.539001,0.544222,20.126997,0.234407,0.424406,0.24436,0.733824,0.32963
age,4.5,0.605942,0.562001,0.536214,0.543213,20.393987,0.232819,0.462878,0.335695,0.703268,0.389026
albumin,4.5,0.609315,0.579936,0.533329,0.538743,20.168885,0.236793,0.513149,0.313623,0.598529,0.487542
beta_2_microglobulin,4.5,0.628372,0.577573,0.546628,0.551963,20.573004,0.243988,0.485031,0.270905,0.649583,0.431353
bun,4.5,0.641997,0.609779,0.542008,0.549879,21.736295,0.237918,0.492857,0.280284,0.65,0.440476
calcium,4.5,0.578878,0.557695,0.539838,0.541441,19.98727,0.228363,0.372902,0.244421,0.841176,0.228249
cell_markers_cd117,4.5,0.585132,0.543136,0.540114,0.547788,20.068154,0.223151,0.348206,0.244954,0.859477,0.189935
cell_markers_cd13,4.5,0.579079,0.544781,0.537039,0.546268,20.220546,0.21774,0.361905,0.245229,0.835948,0.215065
cell_markers_cd138,4.5,0.585422,0.536547,0.539267,0.54597,20.183146,0.22518,0.385459,0.257296,0.853595,0.240714
cell_markers_cd38,4.5,0.579776,0.54569,0.536527,0.544487,20.536801,0.216621,0.367384,0.261343,0.918301,0.197045


In [22]:
result.groupby('marker').std()

Unnamed: 0_level_0,experiment,train_auc,valid_auc,train_loss,valid_loss,execution_time,threshold,accuracy,precision,sensitivity,specificity
marker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
absolute_neutrophil,3.02765,0.023021,0.085133,0.003903,0.017299,0.786957,0.006923,0.101815,0.06411,0.302198,0.195358
age,3.02765,0.009224,0.090923,0.002403,0.009998,0.391031,0.012983,0.121719,0.234899,0.285315,0.240093
albumin,3.02765,0.034952,0.047948,0.004691,0.005842,0.666975,0.012753,0.15686,0.257025,0.354693,0.302427
beta_2_microglobulin,3.02765,0.026504,0.050716,0.004443,0.009605,0.678987,0.015461,0.068511,0.03973,0.21635,0.137033
bun,3.02765,0.016713,0.084248,0.003069,0.015963,0.51554,0.015043,0.093055,0.064431,0.263953,0.190889
calcium,3.02765,0.01537,0.083027,0.003081,0.007935,0.378779,0.021263,0.071345,0.044365,0.251179,0.160534
cell_markers_cd117,3.02765,0.016988,0.075386,0.003134,0.012339,0.655031,0.017736,0.073527,0.039695,0.198514,0.129138
cell_markers_cd13,3.02765,0.010848,0.083276,0.003548,0.015057,0.545243,0.025736,0.052783,0.039133,0.187856,0.096706
cell_markers_cd138,3.02765,0.014198,0.090059,0.003493,0.010276,1.119183,0.014627,0.077437,0.018138,0.167095,0.144929
cell_markers_cd38,3.02765,0.009103,0.083402,0.002704,0.01633,0.534778,0.024334,0.036668,0.016722,0.068705,0.045127


In [23]:
result.groupby('marker').mean().mean()

experiment         4.500000
train_auc          0.602766
valid_auc          0.570970
train_loss         0.540318
valid_loss         0.547044
execution_time    20.492841
threshold          0.233048
accuracy           0.442351
precision          0.272282
sensitivity        0.738518
specificity        0.349309
dtype: float64