# 1. Load and Format Data

This block aims at loading and formatting our iss and fish variables. Our id corresponde to the MMRF identifier.

In [2]:
import pandas as pd
import qgrid

# grid options
grid_options = {'forceFitColumns': False}

# loading iss and fish variables
iss_fish_vars = pd.read_csv('data/iss_fish_therapy_response.csv', sep='\t')

# all column name to upper case
iss_fish_vars.columns = [col.upper() for col in iss_fish_vars.columns]

# Renaming ID columns
iss_fish_vars = iss_fish_vars.rename(columns={'MMRF': 'ID'})

# Renaming ISS to Stage
iss_fish_vars = iss_fish_vars.rename(columns={'ISS': 'STAGE'})

# transforming mmrf ids to integers
iss_fish_vars['ID'] = iss_fish_vars['ID'].str.replace('MMRF', '').astype(int)

# setting index
iss_fish_vars = iss_fish_vars.set_index('ID')

# stage string to int
iss_fish_vars['STAGE'] = iss_fish_vars['STAGE'].map({'I': 1, 'II': 2, 'III': 3})

# count initial sample
initial_sample = iss_fish_vars.shape[0]

# drop invalid patients
iss_fish_vars = iss_fish_vars.dropna(subset=['BEST-RESPONSE-FIRSTLINE', 'FIRST-LINE-THERAPY'])

# count sample after dropping patients without therapy or response
therapy_and_response_sample = iss_fish_vars.shape

# removing unused variables
for col in ['DAYS-TO-OVERALL-SURVIVAL', 'DAYS-TO-PROGRESSION', 
            'PATIENT-FIRSTRESPONSE', 'FIRST-LINE-THERAPY-CLASS', 'FIRST-LINE-STARTING-TREATMENT-REGIMEN']:
    del iss_fish_vars[col]

qgrid.show_grid(iss_fish_vars, grid_options=grid_options)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': False, 'defa…

# 2. Format and Group the Response Variable

We group our response variable, a description of the patient's theray response, based on clinical protocols described by doctors specialized in Oncology.

In [3]:
%matplotlib inline

groups = {1: (['SCR'], ['CR', 'VGPR', 'PR', 'SD', 'PD']),
          2: (['SCR', 'CR'], ['VGPR', 'PR', 'SD', 'PD']),
          3: (['SCR', 'CR', 'VGPR'], ['PR', 'SD', 'PD']),
          4: (['SCR', 'CR', 'VGPR', 'PR'], ['SD', 'PD']),
          5: (['SCR', 'CR', 'VGPR', 'PR', 'SD'], ['PD']),
          6: (['SCR'], ['CR', 'VGPR'], ['PR', 'SD', 'PD']),
          7: (['SCR'], ['CR', 'VGPR', 'PR'], ['SD', 'PD'])}

selected_group = 2

# split response variable and drop it from iss and fish variables
if 'BEST-RESPONSE-FIRSTLINE' in  iss_fish_vars.columns:
    
    response_var = iss_fish_vars['BEST-RESPONSE-FIRSTLINE']
    
    del iss_fish_vars['BEST-RESPONSE-FIRSTLINE'] 

    response_var = response_var.apply(lambda x: 1 if x in groups[selected_group][0] else 0)

# plot count per class
print('Count per class')
    
for i, j in pd.DataFrame(response_var).groupby(by='BEST-RESPONSE-FIRSTLINE').apply(lambda x: len(x)).iteritems():
    print('{}: {}'.format(i, j))


Count per class
0: 566
1: 175


# 3. Load and Join Gene Expressions

In [4]:
# loading gene counts
gene_fpkm = pd.read_csv('data/gene_fpkm.txt', sep='\t', index_col='GENE_ID')

# removing data not collected at the first trail
for col in gene_fpkm.columns:
    if '_1_' not in col:
        del gene_fpkm[col]

# transpose matrix, delete patients and gene with all nan, and replace remainder missing by zero
gene_fpkm = gene_fpkm.T.dropna(how='all', axis=0).dropna(how='all', axis=1).fillna(0)

# replace id column name
gene_fpkm.index.name = 'ID'

# normalize index value transforming mmrf ids to integers
gene_fpkm.index = [int(col.split('_')[1]) for col in gene_fpkm.index]

# selected class
gene_details = pd.read_csv('data/gene_details.tsv', sep='\t')

gene_selected_class = pd.read_csv('data/gene_selected_class.tsv', sep='\t')

gene_selected_class = gene_details.merge(gene_selected_class, on='gene_biotype').set_index('ensembl_gene_id')

gene_selected_class = [gen for gen in gene_selected_class.index if gen in gene_fpkm.columns]

gene_fpkm = gene_fpkm[gene_selected_class]

# removing genes with zero sum
gene_fpkm = gene_fpkm[list(gene_fpkm.sum(axis=0).index[(gene_fpkm.sum(axis=0) > 0).tolist()])]

gene_fpkm.shape

(779, 26069)

In [5]:
gene_fpkm.iloc[:10,:10]

GENE_ID,ENSG00000000003,ENSG00000000005,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167
2438,0.645325,0.0,32.1035,7.6691,7.0602,0.719675,15.3955,50.1926,11.3271,18.6944
1786,3.51915,0.0,29.3108,4.96608,1.50599,0.705555,1.28339,22.9092,7.74184,12.5095
1332,6.87306,0.0,41.2908,8.25159,4.64106,0.927851,0.072551,46.1312,13.6712,18.6234
2562,5.4415,0.0,27.8386,8.23226,2.71763,0.415857,0.064742,14.3873,15.6407,17.7199
1797,0.0,0.0,36.8206,7.72755,3.28094,0.220043,0.0,17.0595,17.2859,13.6723
1861,0.404031,0.0,41.0652,5.0433,1.54744,49.9608,1.26737,9.27582,8.13927,15.7142
1823,3.68256,0.0,60.5635,2.57942,1.71766,0.5457,0.165141,38.004,9.23173,15.483
2018,0.795386,0.0,29.0764,4.87317,2.25814,0.311901,0.685952,14.9834,11.3156,13.9395
2268,2.71476,0.491686,31.3695,2.17873,2.26555,172.304,1.23557,21.4735,11.2765,10.5836
2570,0.124405,0.0,39.5649,1.60298,1.19724,0.925557,0.222275,17.3529,5.64112,6.85792


# 4. Pairwise Linear Correlation


In [6]:
import pickle as pkl

file_path = 'data/output/selected_genes_g2.pkl'.format(col.split('_')[-1].lower())
    
with open(file_path, 'rb') as file:
    selected_genes = pkl.load(file)
    
gene_fpkm_selected = gene_fpkm[[gen for gen in selected_genes if gen in gene_fpkm.columns]]

gene_fpkm_selected.shape

(779, 1711)

# 5. Helpfull Functions

In [8]:
def optimize_threshold(y_true, y_):

    t, max_metric = None, -np.inf

    for i in np.arange(0.00, max(y_), 0.01):

        y_hat = np.copy(y_)

        filter__ = y_hat >= i

        y_hat[filter__], y_hat[~filter__] = 1, 0

        tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()

        sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

        specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

        ks = abs(sensitivity + specificity - 1.)
        
        auc = roc_auc_score(y_true, y_hat)
        
        metric = ks

        if metric > max_metric and metric is not np.inf:

            max_metric = metric

            t = i

    return t

In [9]:
if 'FIRST-LINE-THERAPY' in iss_fish_vars.columns:
    
    therapy = pd.get_dummies(iss_fish_vars['FIRST-LINE-THERAPY'])
    
    del iss_fish_vars['FIRST-LINE-THERAPY']
    
baseline_dataset = pd.DataFrame(response_var).join(therapy, how='inner').join(gene_fpkm_selected, how='inner')

baseline_dataset.shape

(495, 1726)

In [10]:
for col in therapy:
    print(col)
    

Bor
Bor-Cyc-Dex
Bor-Cyc-Dex+Bor-Dex
Bor-Dex
Bor-Dex+Bor
Bor-Dex+Bor-Cyc-Dex
Bor-Dex+Bor-Len-Dex
Bor-Dex+Bor-Len-Dex+Len
Bor-Len-Dex
Bor-Len-Dex+Bor-Dex
Bor-Len-Dex+Len
Len
Len-Dex
Len-Dex+Bor-Len-Dex


# CLINICAL OR Fish Only

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result = None

for additional_column in iss_fish_vars:

    values = iss_fish_vars[additional_column]
    
    if values.dtype == 'object':
        values = pd.get_dummies(values)
    else:
        values = values.fillna(0)
    
    all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(values, how='inner').fillna(0).values
    
    x = all_[:,1:]
    y = all_[:,0]
    
    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        x_train, y_train = x[train_index,:], y[train_index].ravel()

        x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': -1,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t
        
        row['addc'] = additional_column

        result = row if result is None else pd.concat([result, row])
        
        break

del result['Fold']
        
result.set_index('addc')

Unnamed: 0_level_0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
addc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
STAGE,0.741228,0.613333,0.365854,0.833333,0.54386,0.377193,2.733333,0.24
DEL13Q14,0.666667,0.56,0.333333,0.833333,0.473684,0.307018,3.0,0.21
DEL13Q34,0.666667,0.56,0.333333,0.833333,0.473684,0.307018,3.0,0.22
DEL17P13,0.705653,0.666667,0.347826,0.444444,0.736842,0.181287,2.875,0.24
AGE,0.685185,0.626667,0.34375,0.611111,0.631579,0.24269,2.909091,0.26
RACE,0.679825,0.56,0.333333,0.833333,0.473684,0.307018,3.0,0.21
GAIN1Q21,0.724659,0.64,0.371429,0.722222,0.614035,0.336257,2.692308,0.23
T11-14_CCND1,0.684211,0.613333,0.342857,0.666667,0.596491,0.263158,2.916667,0.23
T12-14_CCND2,0.725634,0.586667,0.348837,0.833333,0.508772,0.342105,2.866667,0.22
T14-16_MAF,0.726121,0.546667,0.326087,0.833333,0.45614,0.289474,3.066667,0.21


# Gene Expressions Only

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

model_id = datetime.now().strftime('%Y%m%d%H%M%S')

result, y_hat, y_true, index = None, [], [], []

all_ = baseline_dataset.values

x = all_[:,1:]
y = all_[:,0]


for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    y_ = [int(y >= t) for y in y_]
    
    tn, fp, fn, tp = confusion_matrix(y_valid, y_).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    y_hat  += list(y_)

    y_true += list(y_valid)

    index  += list(valid_index)
    
    break

del result['Fold']
result.index = ['baseline']
result

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
baseline,0.627193,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16


# CLINICAL + GENE - AGE

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars[['STAGE', 'RACE', 'DPRT']]:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']
result

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
all,0.690058,0.613333,0.351351,0.722222,0.578947,0.30117,2.846154,0.26


# CLINICAL + GENE

In [14]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars[['STAGE', 'RACE', 'AGE', 'DPRT']]:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']

result

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
all,0.688109,0.653333,0.357143,0.555556,0.684211,0.239766,2.8,0.26


# CLINICAL AND FISH

In [15]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = pd.DataFrame(response_var).join(therapy, how='inner').join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

    break

del result['Fold']
        
result.index = ['all']
result

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
all,0.661793,0.613333,0.322581,0.555556,0.631579,0.187135,3.1,0.26


# GENE + (CLINICAL OR FISH Only)

In [16]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result = None

for additional_column in iss_fish_vars:

    values = iss_fish_vars[additional_column]
    
    if values.dtype == 'object':
        values = pd.get_dummies(values)
    else:
        values = values.fillna(0)
    
    all_ = baseline_dataset.join(values, how='inner').fillna(0).values
    
    x = all_[:,1:]
    y = all_[:,0]
    
    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        x_train, y_train = x[train_index,:], y[train_index].ravel()

        x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': -1,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t
        
        row['addc'] = additional_column

        result = row if result is None else pd.concat([result, row])
        
        break

del result['Fold']
        
result.set_index('addc')

Unnamed: 0_level_0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
addc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
STAGE,0.642544,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.14
DEL13Q14,0.622807,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
DEL13Q34,0.622807,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
DEL17P13,0.622807,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
AGE,0.638158,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
RACE,0.622807,0.64,0.3125,0.416667,0.710526,0.127193,3.2,0.15
GAIN1Q21,0.622807,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
T11-14_CCND1,0.622807,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.16
T12-14_CCND2,0.642544,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.14
T14-16_MAF,0.642544,0.66,0.333333,0.416667,0.736842,0.153509,3.0,0.14


# GENE + CLINICAL + FISH

In [25]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)

    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])

del result['Fold']

result

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
0,0.710526,0.62,0.315789,0.5,0.657895,0.157895,3.166667,0.14
0,0.504386,0.54,0.238095,0.416667,0.578947,0.004386,4.2,0.13
0,0.627193,0.58,0.304348,0.583333,0.578947,0.162281,3.285714,0.14
0,0.774123,0.72,0.454545,0.833333,0.684211,0.517544,2.2,0.14
0,0.760965,0.7,0.428571,0.75,0.684211,0.434211,2.333333,0.14
0,0.730263,0.66,0.368421,0.583333,0.684211,0.267544,2.714286,0.13
0,0.72488,0.77551,0.5,0.545455,0.842105,0.38756,2.0,0.13
0,0.901914,0.816327,0.555556,0.909091,0.789474,0.698565,1.8,0.12
0,0.629187,0.591837,0.263158,0.454545,0.631579,0.086124,3.8,0.14
0,0.751843,0.625,0.347826,0.727273,0.594595,0.321867,2.875,0.13


In [38]:
a = pd.concat([result.mean(axis=0), result.std(axis=0)], axis=1)
a.columns = ['mean', 'std']
a.T

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
mean,0.711528,0.662867,0.377631,0.630303,0.672617,0.303798,2.8375,0.134
std,0.106231,0.088925,0.104288,0.16593,0.087128,0.212273,0.785073,0.006992


# GENE + CLINICAL + FISH - AGE

In [44]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, detailed_result, addc = None, None, None

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x, y = all_[:,1:], all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    
    x_train, y_train = x[train_index,:], y[train_index].ravel()
    
    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)
    
    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])
    
    
    #
    #
    #
    scores = pd.DataFrame({'fold': i + 1, 'y': y_valid, 'y_hat': y_, 'y_opt': [int(y >= t) for y in y_]})
    
    detailed_result = scores if detailed_result is None else pd.concat([detailed_result, scores])
    
detailed_result.to_csv('data/output/best_model_estimations.csv', sep=',', index=False)

result = result.set_index('Fold')

result

Unnamed: 0_level_0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.699561,0.64,0.3125,0.416667,0.710526,0.127193,3.2,0.14
2,0.532895,0.58,0.285714,0.5,0.605263,0.105263,3.5,0.13
3,0.620614,0.62,0.347826,0.666667,0.605263,0.27193,2.875,0.14
4,0.778509,0.72,0.454545,0.833333,0.684211,0.517544,2.2,0.14
5,0.767544,0.74,0.473684,0.75,0.736842,0.486842,2.111111,0.14
6,0.732456,0.68,0.4,0.666667,0.684211,0.350877,2.5,0.13
7,0.729665,0.693878,0.375,0.545455,0.736842,0.282297,2.666667,0.11
8,0.901914,0.816327,0.555556,0.909091,0.789474,0.698565,1.8,0.11
9,0.631579,0.653061,0.35,0.636364,0.657895,0.294258,2.857143,0.14
10,0.719902,0.604167,0.333333,0.727273,0.567568,0.29484,3.0,0.13


In [47]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

h = .02  # step size in the mesh

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]


kfold = StratifiedKFold(10, random_state=13)

result, detailed_result, addc = {}, None, None

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x, y = all_[:,1:], all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    
    x_train, y_train = x[train_index,:], y[train_index].ravel()
    
    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    for name, clf in zip(names, classifiers):
    
        if name not in result:
            result[name] = []
    
        clf.fit(x_train, y_train)
        
        y_ = clf.predict(x_valid)
        
        auc = roc_auc_score(y_valid, y_)
        
        result[name].append(auc)

result = pd.DataFrame(result)

result = result.unstack().reset_index()

result.columns = ['algorithm', 'fold', 'auc']

result.to_csv('data/output/baseline.csv', sep=',', index=False)

result



Unnamed: 0,Nearest Neighbors,Linear SVM,RBF SVM,Gaussian Process,Decision Tree,Random Forest,Neural Net,AdaBoost,Naive Bayes,QDA
0,0.517544,0.480263,0.5,0.5,0.60307,0.5,0.671053,0.614035,0.701754,0.473684
1,0.54386,0.482456,0.5,0.5,0.475877,0.5,0.535088,0.502193,0.473684,0.557018
2,0.627193,0.546053,0.5,0.5,0.436404,0.5,0.486842,0.475877,0.607456,0.460526
3,0.436404,0.464912,0.5,0.5,0.557018,0.5,0.574561,0.530702,0.730263,0.473684
4,0.421053,0.561404,0.5,0.5,0.47807,0.5,0.684211,0.640351,0.717105,0.473684
5,0.434211,0.684211,0.5,0.5,0.438596,0.5,0.5,0.530702,0.45614,0.486842
6,0.511962,0.400718,0.5,0.5,0.538278,0.5,0.5311,0.505981,0.543062,0.532297
7,0.492823,0.447368,0.5,0.5,0.706938,0.5,0.505981,0.577751,0.666268,0.519139
8,0.667464,0.595694,0.5,0.5,0.425837,0.5,0.342105,0.485646,0.562201,0.473684
9,0.477887,0.691646,0.5,0.5,0.70516,0.5,0.62285,0.605651,0.529484,0.486486


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

result, detailed_result, addc = None, None, None

for column in iss_fish_vars.drop(['AGE'], axis=1):

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):
    
    x_train, y_train = x[train_index,:], y[train_index].ravel()
    
    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    y_ = gbm.predict(x_valid)
    
    #
    #
    #
    auc = roc_auc_score(y_valid, y_)

    t = optimize_threshold(y_train, gbm.predict(x_train))

    tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    row['Threshold'] = t

    result = row if result is None else pd.concat([result, row])
    
    
    #
    #
    #
    scores = pd.DataFrame({'fold': i + 1, 'y': y_valid, 'y_hat': y_, 'y_opt': [int(y >= t) for y in y_]})
    
    detailed_result = scores if detailed_result is None else pd.concat([detailed_result, scores])
    
detailed_result.to_csv('data/output/best_model_estimations.csv', sep=',', index=False)

result = result.set_index('Fold')

result

In [40]:
a = pd.concat([result.mean(axis=0), result.std(axis=0)], axis=1)
a.columns = ['mean', 'std']
a.T

Unnamed: 0,Fold,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
mean,5.5,0.711464,0.674743,0.388816,0.665152,0.677809,0.342961,2.670992,0.131
std,3.02765,0.100785,0.071007,0.08324,0.150196,0.069655,0.18079,0.524384,0.011972


# 5. Simulation

In [42]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

simulation, addc = None, None

for column in iss_fish_vars:

    if iss_fish_vars[column].dtype == 'object':
        
        values = pd.get_dummies(iss_fish_vars[column])
        
        values.columns = [column + '_' + col for col in values.columns]
    else:
        values = iss_fish_vars[column]

    addc = values if addc is None else pd.concat([addc, values], axis=1)

addc.index = iss_fish_vars.index

all_ = baseline_dataset.join(addc, how='inner').fillna(0).values

x = all_[:,1:]
y = all_[:,0]

for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

    x_train, y_train = x[train_index,:], y[train_index].ravel()

    x_valid, y_valid = x[valid_index,:], y[valid_index].ravel()

    lgb_train = lgb.Dataset(x_train, y_train)

    params = {'boosting_type': 'gbdt', 
              'objective': 'binary',
              'num_class': 1,
              'metric': 'logloss',
              'learning_rate': 0.01, 
              'num_leaves': 31, 
              'max_depth': -1,  
              'min_child_samples': 20, 
              'max_bin': 255,  
              'subsample': 0.8, 
              'subsample_freq': 0,  
              'colsample_bytree': 0.3,  
              'min_child_weight': 5, 
              'subsample_for_bin': 200000,
              'min_split_gain': 0, 
              'reg_alpha': 0, 
              'reg_lambda': 0, 
              'nthread': 6, 
              'verbose': 0}

    gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

    row = therapy.iloc[valid_index,:].copy()
    
    row['THERAPY'] = therapy.iloc[valid_index,:].idxmax(axis=1)
    
    for lll in range(0, 14):
        
        for kkk in range(0, 14):
            x_valid[:,kkk] = int(kkk == lll)
        y_ = gbm.predict(x_valid)
        
        row.iloc[:,lll] = y_
        
    row['NEW_THERAPY'] = row.iloc[:,0:14].idxmax(axis=1)
    
    row['y'] = y_valid
        
    
    #
    #
    #
    #auc = roc_auc_score(y_valid, y_)

    #t = optimize_threshold(y_train, gbm.predict(x_train))

    #tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

    #row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

    #row['Threshold'] = t

    simulation = row if simulation is None else pd.concat([simulation, row])

#del result['Fold']
        
#result.index = ['all']

simulation

Unnamed: 0_level_0,Bor,Bor-Cyc-Dex,Bor-Cyc-Dex+Bor-Dex,Bor-Dex,Bor-Dex+Bor,Bor-Dex+Bor-Cyc-Dex,Bor-Dex+Bor-Len-Dex,Bor-Dex+Bor-Len-Dex+Len,Bor-Len-Dex,Bor-Len-Dex+Bor-Dex,Bor-Len-Dex+Len,Len,Len-Dex,Len-Dex+Bor-Len-Dex,THERAPY,NEW_THERAPY,y
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1011,0.113978,0.113978,0.113978,0.113978,0.113978,0.113978,0.113978,0.113978,0.113978,0.113978,0.141249,0.113978,0.113978,0.113978,Bor-Dex,Bor-Len-Dex+Len,0.0
1013,0.020392,0.020392,0.020392,0.020392,0.020392,0.020392,0.020392,0.020392,0.020392,0.020392,0.023802,0.020392,0.020392,0.020392,Bor-Len-Dex+Len,Bor-Len-Dex+Len,0.0
1014,0.017657,0.017657,0.017657,0.017657,0.017657,0.017657,0.017657,0.017657,0.017657,0.017657,0.021661,0.017657,0.017657,0.017657,Bor-Dex+Bor-Len-Dex,Bor-Len-Dex+Len,1.0
1016,0.237484,0.237484,0.237484,0.237484,0.237484,0.237484,0.237484,0.237484,0.237484,0.237484,0.284937,0.237484,0.237484,0.237484,Bor-Cyc-Dex,Bor-Len-Dex+Len,0.0
1017,0.015928,0.015928,0.015928,0.015928,0.015928,0.015928,0.015928,0.015928,0.015928,0.015928,0.021186,0.015928,0.015928,0.015928,Len-Dex,Bor-Len-Dex+Len,0.0
1018,0.061270,0.061270,0.061270,0.061270,0.061270,0.061270,0.061270,0.061270,0.061270,0.061270,0.067686,0.061270,0.061270,0.061270,Len-Dex+Bor-Len-Dex,Bor-Len-Dex+Len,0.0
1020,0.057091,0.057091,0.057091,0.057091,0.057091,0.057091,0.057091,0.057091,0.057091,0.057091,0.072519,0.057091,0.057091,0.057091,Bor-Len-Dex,Bor-Len-Dex+Len,0.0
1021,0.027595,0.027595,0.027595,0.027595,0.027595,0.027595,0.027595,0.027595,0.027595,0.027595,0.032956,0.027595,0.027595,0.027595,Bor-Len-Dex,Bor-Len-Dex+Len,0.0
1029,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.009013,0.010706,0.009013,0.009013,0.009013,Bor-Len-Dex,Bor-Len-Dex+Len,0.0
1030,0.220922,0.220922,0.220922,0.220922,0.220922,0.220922,0.220922,0.220922,0.220922,0.220922,0.272409,0.220922,0.220922,0.220922,Bor-Len-Dex,Bor-Len-Dex+Len,0.0


In [21]:
x = round((simulation['THERAPY'] != simulation['NEW_THERAPY']).sum() / simulation.shape[0] * 100, 2)

print('{}%'.format(x))

88.89%


In [22]:
simulation.to_csv('data/output/simulation.csv', sep=',', index=True)