# Loading Clinical Data

In [1]:
from optimization import lightgbm_optimizer

import pandas as pd
import numpy as np

dataset = pd.read_csv('data/input.tsv', sep='\t', index_col='ID')
dataset = dataset.loc[~dataset.index.duplicated(keep='first')]

dataset = dataset.loc[~dataset['therapy_first_line_class'].isnull()]

therapy_class = pd.get_dummies(dataset['therapy_first_line_class'])
therapy = pd.get_dummies(dataset['therapy_first_line'])

del dataset['therapy_first_line']
del dataset['therapy_first_line_class']

# removing unused outcome markers
# response_best_response_first_line ! Warning: we are using that outcome marker
del dataset['response_days_to_disease_progression']
del dataset['response_days_to_first_response']
del dataset['response_best_response_and_days_to_first_therapy']

dataset = dataset.loc[dataset['response_best_response_first_line'].notnull(),:]

display(dataset.iloc[:8, :8])
print(dataset.shape)

Unnamed: 0_level_0,cmmc,ecog_ps,cell_markers,dna_index,lgh,lgl,percent_aneuploid,percent_plama_cells_bone_marrow
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMRF1007,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgG,Kappa,0.0,0.6
MMRF1011,,PS 1 (Restricted in physically strenuous activ...,CD138,,,,0.0,0.9
MMRF1013,,PS 1 (Restricted in physically strenuous activ...,CD117,,,,0.0,1.3
MMRF1014,,PS 0 (Fully Active),CD117,,IgA,Kappa,0.0,1.4
MMRF1016,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgG,Lambda,0.0,2.0
MMRF1017,,PS 1 (Restricted in physically strenuous activ...,CD138,1.25,IgG,Lambda,6.9,2.1
MMRF1018,,PS 1 (Restricted in physically strenuous activ...,CD117,,IgA,Kappa,0.0,2.1
MMRF1024,,PS 1 (Restricted in physically strenuous activ...,CD117,1.15,IgG,Kappa,11.0,6.0


(1013, 57)


In [2]:
to_delete = []

for c in therapy.columns:
    if therapy[c].sum() < 10:
        to_delete.append(c)

for c in to_delete:
    del therapy[c]

display(therapy.head())

print('{} rows X {} columns'.format(*therapy.shape))

Unnamed: 0_level_0,Bor,Bor-Cyc-Dex,Bor-Dex,Bor-Len-Dex,Len-Dex
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MMRF1007,0,0,0,0,0
MMRF1011,0,0,1,0,0
MMRF1013,0,0,0,1,0
MMRF1014,0,0,1,0,0
MMRF1016,0,1,0,0,0


1082 rows X 5 columns


# Loading Gene Expressions (FPKM)

In [3]:
# loading gene fpkm
gene_fpkm = pd.read_csv('data/gene_fpkm.txt', sep='\t', index_col='GENE_ID')

# removing data not collected at the first trail
for col in gene_fpkm.columns:
    if '_1_' not in col:
        del gene_fpkm[col]

# transpose matrix, delete patients and gene with all nan, and replace remainder missing by zero
gene_fpkm = gene_fpkm.T.dropna(how='all', axis=0).dropna(how='all', axis=1).fillna(0)

# replace id column name
gene_fpkm.index.name = 'ID'

# normalize index value transforming mmrf ids to integers
gene_fpkm.index = [int(col.split('_')[1]) for col in gene_fpkm.index]

# removing genes with zero sum
gene_fpkm = gene_fpkm[list(gene_fpkm.sum(axis=0).index[(gene_fpkm.sum(axis=0) > 0).tolist()])]
gene_fpkm.index = ['MMRF' + str(m) for m in gene_fpkm.index]

gene_fpkm = gene_fpkm.loc[~gene_fpkm.index.duplicated(keep='first')]

gene_fpkm.shape

(769, 49872)

# Creating Models

In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
from correlation import select_genes
from evaluation import optimize_threshold, classification_metrics

import lightgbm as lgb
import numpy as np
import pickle

import warnings
warnings.filterwarnings('ignore')

result = None

N_FOLDS, RANDOM_STATE = 10, 13

kfold = StratifiedKFold(N_FOLDS, random_state=RANDOM_STATE)

#########################################################################################
# Dataset Preparation
#########################################################################################

all_ = None

# join only response (column -1) and therapy columns
dat = dataset[[dataset.columns[-1]]].join(therapy.dropna(), how='inner')

# transform categorical features into dummies
for column in dat:

    if dat[column].dtype == 'object':
        values = pd.get_dummies(dat[column])
        values.columns = [column + '_' + str(col) for col in values.columns]
    else:
        values = dat[column].fillna(0)

    all_ = values if all_ is None else pd.concat([all_, values], axis=1)

# join all data
all_ = all_.loc[all_.iloc[:,0].notna(),:]

all_ = all_.join(gene_fpkm, how='inner')[all_.columns]
gene_fpkm = all_.join(gene_fpkm, how='inner').iloc[:, len(all_.columns):]

# split independent and dependent variables
x, y = all_.values[:,1:], all_.values[:, 0]

# join treatments and dependent variable values to stratify fold based on these information
r = np.concatenate([x[:, 1:(therapy.shape[1] + 1)], y.reshape([-1,1])], axis=1)
r = np.apply_along_axis(lambda x: int(''.join([str(int(a)) for a in x])), axis=1, arr=r)

for fold, (train_index, valid_index) in enumerate(kfold.split(x, r)):
    
    y_train = all_.iloc[train_index, 0].values.ravel().reshape([-1, 1])
    y_valid = all_.iloc[valid_index,0].values.ravel().reshape([-1, 1])
    
    selected_genes = select_genes(gene_fpkm.iloc[train_index, :], y_train[:, 0], threshold=.005)[0]

    for ic, col in enumerate(dataset.columns[:-1]):
        
        print(col)
        
        #########################################################################################
        # Dataset Pre-processing
        #########################################################################################

        col_object = dataset[[col]] if dataset[col].dtype != 'object' else pd.get_dummies(dataset[col])
        
        x_train = all_.iloc[train_index, 1:].join(
            col_object, how='inner').join(gene_fpkm.iloc[train_index, :][selected_genes]).values
        x_valid = all_.iloc[valid_index, 1:].join(
            col_object, how='inner').join(gene_fpkm.iloc[valid_index, :][selected_genes]).values

        # format feature name
        col = col.lower().replace(' ', '').strip()

        lgb_train = lgb.Dataset(x_train, y_train)

        #########################################################################################
        # Baysian Optimization
        #########################################################################################

        file_name = 'output/brfl/optimization_lgbm_{}_{}_fold_{}.pkl'.format(
            fold, N_FOLDS, col)
        
        optimization_n_folds, optimization_n_calls = 2, 10
        
        opt = lightgbm_optimizer(x_train, y_train, 
                                 nfolds=optimization_n_folds, n_calls=optimization_n_calls, 
                                 random_state=RANDOM_STATE).x;

        params = {
            'learning_rate': opt[0],
            'num_leaves': opt[1],
            'max_depth': opt[2],
            'scale_pos_weight': opt[3],
            'min_child_weight': opt[4],
            'colsample_bytree': opt[5],
            'min_split_gain': opt[6],
            'min_child_samples': opt[7],
            'subsample': opt[8],
            'bin_construct_sample_cnt': opt[9],

            'objective':'binary',
            'metric':'auc',
            'is_unbalance':False,
            'nthread':24,          
            'verbose': -1,
            'device': 'gpu',
            'gpu_platform_id': 1,
            'gpu_device_id': 0,
            'random_state': RANDOM_STATE}

        #########################################################################################
        # Light GBM Train
        #########################################################################################
        
        model_name = 'output/brfl/classifier_{}_of_{}_fold_with_{}.lgbm'.format(
            fold, N_FOLDS, col)
        
        lgb_train = lgb.Dataset(x_train, y_train[:,0])
        lgb_valid = lgb.Dataset(x_valid, y_valid[:,0])

        gbm = lgb.train(params, lgb_train, valid_sets=lgb_valid, num_boost_round=100000, 
                        early_stopping_rounds=200, verbose_eval=False)
        
        with open(model_name, 'wb') as file:
            pickle.dump(gbm, file)
        
        #########################################################################################
        # Light GBM Inference
        #########################################################################################
        
        y_ = gbm.predict(x_valid)

        #########################################################################################
        # Performance Analysis
        #########################################################################################
        
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        if t is None:
            t = np.mean(y_train)
        
        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = classification_metrics(tn, fp, fn, tp)
        
        row['auc'] = auc
        
        row['feature'] = col
        
        row['fold'] = fold + 1
        
        print(row)
        
        row = pd.Series(row).to_frame().T
        
        result = pd.DataFrame(row) if result is None else pd.concat([result, row], axis=0)
        
result

cmmc
{'accuracy': 0.589041095890411, 'precision': 0.34285714285714286, 'sensitivity': 0.631578947368421, 'specificity': 0.5740740740740741, 'auc': 0.6500974658869395, 'feature': 'cmmc', 'fold': 1}
ecog_ps
{'accuracy': 0.6986301369863014, 'precision': 0.43478260869565216, 'sensitivity': 0.5263157894736842, 'specificity': 0.7592592592592593, 'auc': 0.6686159844054581, 'feature': 'ecog_ps', 'fold': 1}
cell_markers
{'accuracy': 0.6164383561643836, 'precision': 0.37142857142857144, 'sensitivity': 0.6842105263157895, 'specificity': 0.5925925925925926, 'auc': 0.6364522417153996, 'feature': 'cell_markers', 'fold': 1}
dna_index
{'accuracy': 0.726027397260274, 'precision': 0.4666666666666667, 'sensitivity': 0.3684210526315789, 'specificity': 0.8518518518518519, 'auc': 0.6491228070175439, 'feature': 'dna_index', 'fold': 1}
lgh
{'accuracy': 0.6164383561643836, 'precision': 0.37142857142857144, 'sensitivity': 0.6842105263157895, 'specificity': 0.5925925925925926, 'auc': 0.6354775828460039, 'feature

Unnamed: 0,accuracy,precision,sensitivity,specificity,auc,feature,fold
0,0.589041,0.342857,0.631579,0.574074,0.650097,cmmc,1
0,0.69863,0.434783,0.526316,0.759259,0.668616,ecog_ps,1
0,0.616438,0.371429,0.684211,0.592593,0.636452,cell_markers,1
0,0.726027,0.466667,0.368421,0.851852,0.649123,dna_index,1
0,0.616438,0.371429,0.684211,0.592593,0.635478,lgh,1
0,0.60274,0.361111,0.684211,0.574074,0.635478,lgl,1
0,0.643836,0.393939,0.684211,0.62963,0.663743,percent_aneuploid,1
0,0.630137,0.382353,0.684211,0.611111,0.677388,percent_plama_cells_bone_marrow,1
0,0.726027,0.466667,0.368421,0.851852,0.649123,percent_plama_cells_peripherical_blood,1
0,0.561644,0.341463,0.736842,0.5,0.672515,creatinine,1


In [7]:
for c in result.columns:
    if c != 'feature':
        result[c] = result[c].astype(float)

fish = ['11p15', '13q14', '13q34', '15q15', '17p13', '19q13', '1q21', '20q13',
        '21q22', '3q21', '5q31', '7q22', '9q33', 't_11_14_ccnd1', 't_12_14_ccnd2', 't_14_16_maf', 't_14_20_mafb',
        't_4_14_whsc1', 't_6_14_ccnd3', 't_8_14_mafa', 't_8_14_myc', 'hyperdiploid_flag']

result[~result['feature'].isin(fish)].to_csv('output/brfl/clinical_and_gene_markers.csv', sep=',', index=False)

result.groupby('feature').mean().drop(fish).drop(columns='fold')

Unnamed: 0_level_0,accuracy,precision,sensitivity,specificity,auc
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
absolute_neutrophil,0.598887,0.332384,0.534161,0.622705,0.619441
age,0.627543,0.31326,0.503069,0.667014,0.618195
albumin,0.652508,0.34096,0.428434,0.725467,0.613412
beta_2_microglobulin,0.582437,0.316347,0.539823,0.600973,0.624502
bun,0.623946,0.310889,0.498712,0.662965,0.625662
calcium,0.630695,0.329611,0.506475,0.669322,0.622514
cell_markers,0.6156,0.302934,0.433704,0.67178,0.59734
cmmc,0.61005,0.307306,0.514834,0.640074,0.619076
creatinine,0.605635,0.303036,0.52536,0.63059,0.618752
crp,0.601706,0.286762,0.464613,0.644455,0.605362
