# Treatment Sensitivity Models based on Clinical Markers

In [1]:
from optimization import lightgbm_optimizer

import pandas as pd
import numpy as np

dataset = pd.read_csv('data/input.tsv', sep='\t', index_col='ID')

dataset = dataset.loc[~dataset['therapy_first_line_class'].isnull()]

therapy_class = pd.get_dummies(dataset['therapy_first_line_class'])
therapy = pd.get_dummies(dataset['therapy_first_line'])

del dataset['therapy_first_line']
del dataset['therapy_first_line_class']

# removing unused outcome markers
# response_best_response_first_line ! Warning: we are using that outcome marker
del dataset['response_days_to_disease_progression']
del dataset['response_days_to_first_response']
del dataset['response_best_response_and_days_to_first_therapy']

dataset = dataset.loc[dataset['response_best_response_first_line'].notnull(),:]

dataset.iloc[:8, :8]

Unnamed: 0_level_0,cmmc,ecog_ps,cell_markers,dna_index,lgh,lgl,percent_aneuploid,percent_plama_cells_bone_marrow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1007,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgG,Kappa,0.0,0.6
MMRF1011,,PS 1 (Restricted in physically strenuous activ...,CD138,,,,0.0,0.9
MMRF1013,,PS 1 (Restricted in physically strenuous activ...,CD117,,,,0.0,1.3
MMRF1014,,PS 0 (Fully Active),CD117,,IgA,Kappa,0.0,1.4
MMRF1016,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgG,Lambda,0.0,2.0
MMRF1017,,PS 1 (Restricted in physically strenuous activ...,CD138,1.25,IgG,Lambda,6.9,2.1
MMRF1018,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgA,Kappa,0.0,2.1
MMRF1024,,PS 1 (Restricted in physically strenuous activ...,CD117,1.15,IgG,Kappa,11.0,6.0


In [2]:
to_delete = []

for c in therapy.columns:
    if therapy[c].sum() < 10:
        to_delete.append(c)

for c in to_delete:
    del therapy[c]

display(therapy.head())

print('{} rows X {} columns'.format(*therapy.shape))

Unnamed: 0_level_0,Bor,Bor-Cyc-Dex,Bor-Dex,Bor-Len-Dex,Len-Dex
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MMRF1007,0,0,0,0,0
MMRF1011,0,0,1,0,0
MMRF1013,0,0,0,1,0
MMRF1014,0,0,1,0,0
MMRF1016,0,1,0,0,0


1082 rows X 5 columns


In [3]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime

from evaluation import optimize_threshold, classification_metrics

import lightgbm as lgb
import numpy as np
import pickle

import warnings
warnings.filterwarnings('ignore')

result = None

N_FOLDS, RANDOM_STATE = 10, 13

kfold = StratifiedKFold(N_FOLDS, random_state=RANDOM_STATE)

for ic, col in enumerate(dataset.columns[:-1]):
    
    print(col)
    
    #########################################################################################
    # Dataset Preparation
    #########################################################################################
    
    all_ = None
    
    dat = dataset[[dataset.columns[-1], col]].copy().join(therapy.dropna(), how='inner')
    
    # transform categorical features into dummies
    for column in dat:

        if dat[column].dtype == 'object':

            values = pd.get_dummies(dat[column])

            values.columns = [column + '_' + str(col) for col in values.columns]
            
        else:
            values = dat[column].fillna(0)

        all_ = values if all_ is None else pd.concat([all_, values], axis=1)
    
    # format feature name
    col = col.lower().replace(' ', '').strip()
    
    # join all data
    all_ = all_.loc[all_.iloc[:,0].notna(),:]
    
    # split independent and dependent variables
    x, y = all_.values[:,1:], all_.values[:, 0]
    
    # x = np.delete(x, [i for i, v in enumerate(x) if np.sum(v) < N_FOLDS], axis=1) 
    
    # join treatments and dependent variable values to stratify fold based on these information
    r = np.concatenate([x[:, 1:], y.reshape([-1,1])], axis=1)
    r = np.apply_along_axis(lambda x: int(''.join([str(int(a)) for a in x])), axis=1, arr=r)
    
    # for each fold    
    for fold, (train_index, valid_index) in enumerate(kfold.split(x, r)):

        #########################################################################################
        # Dataset Pre-processing
        #########################################################################################
        
        s = MinMaxScaler()

        x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel().reshape([-1, 1])

        x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel().reshape([-1, 1])

        lgb_train = lgb.Dataset(x_train, y_train)

        #########################################################################################
        # Baysian Optimization
        #########################################################################################

        file_name = 'output/brfl/optimization_lgbm_{}_{}_fold_{}.pkl'.format(
            fold, N_FOLDS, col)
        
        optimization_n_folds, optimization_n_calls = 2, 10
        
        opt = lightgbm_optimizer(x_train, y_train, 
                                 nfolds=optimization_n_folds, n_calls=optimization_n_calls, 
                                 random_state=RANDOM_STATE).x;

        params = {
            'learning_rate': opt[0],
            'num_leaves': opt[1],
            'max_depth': opt[2],
            'scale_pos_weight': opt[3],
            'min_child_weight': opt[4],
            'colsample_bytree': opt[5],
            'min_split_gain': opt[6],
            'min_child_samples': opt[7],
            'subsample': opt[8],
            'bin_construct_sample_cnt': opt[9],

            'objective':'binary',
            'metric':'auc',
            'is_unbalance':False,
            'nthread':24,          
            'verbose': -1,
            'device': 'gpu',
            'gpu_platform_id': 1,
            'gpu_device_id': 0,
            'random_state': RANDOM_STATE}

        #########################################################################################
        # Light GBM Train
        #########################################################################################
        
        model_name = 'output/brfl/classifier_{}_of_{}_fold_with_{}.lgbm'.format(
            fold, N_FOLDS, col)
        
        lgb_train = lgb.Dataset(x_train, y_train[:,0])
        lgb_valid = lgb.Dataset(x_valid, y_valid[:,0])

        gbm = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=100000, 
                        early_stopping_rounds=200, verbose_eval=False)
        
        with open(model_name, 'wb') as file:
            pickle.dump(gbm, file)
        
        #########################################################################################
        # Light GBM Inference
        #########################################################################################
        
        y_ = gbm.predict(x_valid)

        #########################################################################################
        # Performance Analysis
        #########################################################################################
        
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = classification_metrics(tn, fp, fn, tp)
        
        row['auc'] = auc
        
        row['feature'] = col
        
        row['fold'] = fold + 1
        
        print(row)
        
        row = pd.Series(row).to_frame().T
        
        result = pd.DataFrame(row) if result is None else pd.concat([result, row], axis=0)
        
result

cmmc
{'accuracy': 0.4953271028037383, 'precision': 0.30434782608695654, 'sensitivity': 0.7777777777777778, 'specificity': 0.4, 'auc': 0.6055555555555556, 'feature': 'cmmc', 'fold': 1}
{'accuracy': 0.7142857142857143, 'precision': 0.3333333333333333, 'sensitivity': 0.2, 'specificity': 0.875, 'auc': 0.6252500000000001, 'feature': 'cmmc', 'fold': 2}
{'accuracy': 0.4380952380952381, 'precision': 0.2571428571428571, 'sensitivity': 0.72, 'specificity': 0.35, 'auc': 0.604, 'feature': 'cmmc', 'fold': 3}
{'accuracy': 0.23300970873786409, 'precision': 0.23300970873786409, 'sensitivity': 1.0, 'specificity': 0.0, 'auc': 0.6131329113924051, 'feature': 'cmmc', 'fold': 4}
{'accuracy': 0.2376237623762376, 'precision': 0.2376237623762376, 'sensitivity': 1.0, 'specificity': 0.0, 'auc': 0.5898268398268398, 'feature': 'cmmc', 'fold': 5}
{'accuracy': 0.4158415841584158, 'precision': 0.2535211267605634, 'sensitivity': 0.75, 'specificity': 0.3116883116883117, 'auc': 0.6458333333333333, 'feature': 'cmmc', 'fo

Unnamed: 0,accuracy,precision,sensitivity,specificity,auc,feature,fold
0,0.495327,0.304348,0.777778,0.4,0.605556,cmmc,1
0,0.714286,0.333333,0.2,0.875,0.62525,cmmc,2
0,0.438095,0.257143,0.72,0.35,0.604,cmmc,3
0,0.23301,0.23301,1,0,0.613133,cmmc,4
0,0.237624,0.237624,1,0,0.589827,cmmc,5
0,0.415842,0.253521,0.75,0.311688,0.645833,cmmc,6
0,0.434343,0.246154,0.695652,0.355263,0.574371,cmmc,7
0,0.234694,0.234694,1,0,0.502029,cmmc,8
0,0.680412,0.277778,0.217391,0.824324,0.645417,cmmc,9
0,0.412371,0.263889,0.826087,0.283784,0.523502,cmmc,10


In [10]:
result[~result['feature'].isin(fish)]

Unnamed: 0,accuracy,precision,sensitivity,specificity,auc,feature,fold
0,0.495327,0.304348,0.777778,0.400000,0.605556,cmmc,1.0
0,0.714286,0.333333,0.200000,0.875000,0.625250,cmmc,2.0
0,0.438095,0.257143,0.720000,0.350000,0.604000,cmmc,3.0
0,0.233010,0.233010,1.000000,0.000000,0.613133,cmmc,4.0
0,0.237624,0.237624,1.000000,0.000000,0.589827,cmmc,5.0
0,0.415842,0.253521,0.750000,0.311688,0.645833,cmmc,6.0
0,0.434343,0.246154,0.695652,0.355263,0.574371,cmmc,7.0
0,0.234694,0.234694,1.000000,0.000000,0.502029,cmmc,8.0
0,0.680412,0.277778,0.217391,0.824324,0.645417,cmmc,9.0
0,0.412371,0.263889,0.826087,0.283784,0.523502,cmmc,10.0


In [11]:
for c in result.columns:
    if c != 'feature':
        result[c] = result[c].astype(float)

fish = ['11p15', '13q14', '13q34', '15q15', '17p13', '19q13', '1q21', '20q13',
        '21q22', '3q21', '5q31', '7q22', '9q33', 't_11_14_ccnd1', 't_12_14_ccnd2', 't_14_16_maf', 't_14_20_mafb',
        't_4_14_whsc1', 't_6_14_ccnd3', 't_8_14_mafa', 't_8_14_myc', 'hyperdiploid_flag']

result[~result['feature'].isin(fish)].to_csv('output/brfl/clinical_markers.csv', sep=',', index=False)

result.groupby('feature').mean().drop(fish).drop(columns='fold')

Unnamed: 0_level_0,accuracy,precision,sensitivity,specificity,auc
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
absolute_neutrophil,0.567957,0.277491,0.547024,0.575042,0.616225
age,0.465652,0.280557,0.778354,0.368695,0.594655
albumin,0.522518,0.271504,0.599474,0.498881,0.566746
beta_2_microglobulin,0.587297,0.317723,0.571181,0.592194,0.651628
bun,0.529012,0.293554,0.606279,0.505435,0.628566
calcium,0.371363,0.258387,0.849159,0.221583,0.567113
cell_markers,0.379573,0.250666,0.761888,0.264632,0.604371
cmmc,0.4296,0.264149,0.718691,0.340006,0.592892
creatinine,0.645198,0.323727,0.428705,0.713033,0.638889
crp,0.691602,0.374471,0.40584,0.780168,0.652427
