In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset = pd.read_csv('data/input.tsv', sep='\t', index_col='ID')
del dataset['days_to_disease_progression']

dataset = dataset.loc[~dataset['therapy_first_line_class'].isnull()]

therapy_class = dataset['therapy_first_line_class']
therapy = dataset['therapy_first_line']
del dataset['therapy_first_line']

del dataset['best_response_first_line']
del dataset['therapy_first_line_class']
del dataset['days_to_disease_progression_class']


dataset = dataset[~dataset['best_response_first_line_class'].isnull()]

dataset.iloc[:10, :10]

Unnamed: 0_level_0,best_response_first_line_class,lgh,lgl,percent_aneuploid,percent_plama_cells_bone_marrow,percent_plama_cells_peripherical_blood,creatinine,iss,absolute_neutrophil,platelet
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
MMRF1011,0.0,Not Recorded,Not Recorded,0.0,0.9,0.0,123.76,3,8.28,170.0
MMRF1013,1.0,Unknown,Unknown,0.0,1.3,0.2,186.524,3,4.33,245.0
MMRF1016,0.0,IgG,Lambda,0.0,2.0,0.0,86.632,1,5.8,177.0
MMRF1017,0.0,IgG,Lambda,6.9,2.1,0.0,79.56,1,3.69,191.0
MMRF1018,0.0,IgA,Kappa,0.0,2.1,0.0,133.484,3,5.6,271.0
MMRF1029,0.0,Unknown,Kappa,0.0,8.4,0.0,106.08,1,2.6,219.0
MMRF1030,1.0,IgG,Kappa,15.4,9.6,0.0,55.692,1,2.5,215.0
MMRF1031,0.0,IgA,Unknown,18.3,10.1,0.0,81.328,1,10.29,385.0
MMRF1032,0.0,IgG,Lambda,20.7,11.1,0.0,70.72,2,1.3,166.0
MMRF1033,0.0,IgG,Kappa,18.5,12.0,0.0,79.56,1,3.99,307.0


In [3]:
len(therapy_class)

711

In [4]:
def generate_metric(t, auc, tn, fp, fn, tp, title='THERAPY'):
    
    sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

    precision =  (tp / float(tp + fp)) if tp + fp > 0 else 1

    specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

    ks = abs(sensitivity + specificity - 1.)

    ifp = (float(tp + fp) / tp) if tp > 0 else -np.inf

    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    row = pd.DataFrame({title: [t], 'AUC': auc, 'Overall Accuracy': accuracy, 
                        'Precision': precision, 'Sensitivity': sensitivity, 'Specificity': specificity,
                        'KS': ks, 'IFP': ifp})
    
    return row

In [5]:
def optimize_threshold(y_true, y_):

    t, max_metric = None, -np.inf

    for i in np.arange(0.00, max(y_), 0.01):

        y_hat = np.copy(y_)

        filter__ = y_hat >= i

        y_hat[filter__], y_hat[~filter__] = 1, 0

        tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()

        sensitivity = (tp / float(tp + fn)) if tp + fn > 0 else 1

        specificity = (tn / float(tn + fp)) if tn + fp > 0 else 1

        ks = abs(sensitivity + specificity - 1.)
        
        auc = roc_auc_score(y_true, y_hat)
        
        metric = ks

        if metric > max_metric and metric is not np.inf:

            max_metric = metric

            t = i

    return t

In [6]:
from scipy.special import erfinv

class GaussRankScaler():

    def __init__( self ):
        self.epsilon = 0.001
        self.lower = -1 + self.epsilon
        self.upper = 1 - self.epsilon
        self.range = self.upper - self.lower
        self.mean = None

    def fit_transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )
        
        self.mean = np.mean(X, axis=0)

        return transformed - self.mean
    
    def transform( self, X ):

        i = np.argsort( X, axis = 0 )
        j = np.argsort( i, axis = 0 )

        assert ( j.min() == 0 ).all()
        assert ( j.max() == len( j ) - 1 ).all()

        j_range = len( j ) - 1
        self.divider = j_range / self.range

        transformed = j / self.divider
        transformed = transformed - self.upper
        transformed = erfinv( transformed )

        return transformed - self.mean

In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from datetime import datetime
import lightgbm as lgb
import numpy as np

kfold = StratifiedKFold(10, random_state=13)

index = []

result, detailed_result = None, None

for ic, col in enumerate(dataset.columns[1:]):
    
    all_ = None
    
    dat = dataset[[dataset.columns[0], col]].copy().join(therapy.dropna(), how='inner')
    
    for column in dat:

        if dat[column].dtype == 'object':

            values = pd.get_dummies(dat[column])

            values.columns = [column + '_' + str(col) for col in values.columns]
        else:
            values = dat[column].fillna(0)

        all_ = values if all_ is None else pd.concat([all_, values], axis=1)
        
    x = all_.values[:,1:]

    y = all_.values[:,0]

    for i, (train_index, valid_index) in enumerate(kfold.split(x, y)):

        s = GaussRankScaler()

        x_train, y_train = s.fit_transform(x[train_index,:]), y[train_index].ravel()

        x_valid, y_valid = s.transform(x[valid_index,:]), y[valid_index].ravel()

        lgb_train = lgb.Dataset(x_train, y_train)

        params = {'boosting_type': 'gbdt', 
                  'objective': 'binary',
                  'num_class': 1,
                  'metric': 'logloss',
                  'learning_rate': 0.01, 
                  'num_leaves': 31, 
                  'max_depth': 4,  
                  'min_child_samples': 20, 
                  'max_bin': 255,  
                  'subsample': 0.8, 
                  'subsample_freq': 0,  
                  'colsample_bytree': 0.3,  
                  'min_child_weight': 5, 
                  'subsample_for_bin': 200000,
                  'min_split_gain': 0, 
                  'reg_alpha': 0, 
                  'reg_lambda': 0, 
                  'nthread': 6, 
                  'verbose': 0}

        gbm = lgb.train(params, lgb_train, num_boost_round=1000)    

        y_ = gbm.predict(x_valid)

        #
        #
        #
        auc = roc_auc_score(y_valid, y_)

        t = optimize_threshold(y_train, gbm.predict(x_train))

        tn, fp, fn, tp = confusion_matrix(y_valid, [int(y >= t) for y in y_]).ravel()

        row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')

        row['Threshold'] = t

        result = row if result is None else pd.concat([result, row])
        
        detailed_row = pd.DataFrame({'fold': i + 1, 'variable': col, 'y_opt': [int(y >= t) for y in y_], 
                                     'y_hat': [max(min(1, yy), 0) for yy in y_ * .5 / t], 'y': list(y_valid), 
                                     'class': list(therapy_class[valid_index])}, index=list(dataset.index[valid_index]))
        
        detailed_result = detailed_row if detailed_result is None else pd.concat([detailed_row, detailed_result])
        
        index.append(col)
        
result.index = index

result

Unnamed: 0,Fold,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
lgh,1,0.657452,0.691176,0.368421,0.437500,0.769231,0.206731,2.714286,0.31
lgh,2,0.530048,0.588235,0.227273,0.312500,0.673077,0.014423,4.400000,0.28
lgh,3,0.567308,0.676471,0.285714,0.250000,0.807692,0.057692,3.500000,0.31
lgh,4,0.727163,0.705882,0.388889,0.437500,0.788462,0.225962,2.571429,0.26
lgh,5,0.741026,0.611940,0.310345,0.600000,0.615385,0.215385,3.222222,0.24
lgh,6,0.602564,0.641791,0.304348,0.466667,0.692308,0.158974,3.285714,0.26
lgh,7,0.641830,0.590909,0.269231,0.466667,0.627451,0.094118,3.714286,0.25
lgh,8,0.750327,0.727273,0.421053,0.533333,0.784314,0.317647,2.375000,0.29
lgh,9,0.745098,0.727273,0.421053,0.533333,0.784314,0.317647,2.375000,0.28
lgh,10,0.584314,0.666667,0.266667,0.266667,0.784314,0.050980,3.750000,0.33


In [20]:
result_mean = result.groupby(level=0).mean()

del result_mean['Fold']

result_mean.to_csv('output/nogen_overall.tsv', sep='\t')

result_mean

Unnamed: 0,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP,Threshold
11p15,0.627174,0.654005,0.313909,0.456667,0.713424,0.227097,3.892381,0.266
13q14,0.656573,0.640612,0.330349,0.527083,0.67451,0.211209,3.270534,0.263
13q34,0.611277,0.653782,0.308152,0.422917,0.722888,0.169082,3.59,0.269
15q15,0.631243,0.630163,0.308233,0.48625,0.672738,0.158988,3.318611,0.253
17p13,0.617959,0.640368,0.312376,0.487917,0.685935,0.186352,3.347186,0.262
19q13,0.609507,0.629917,0.296198,0.454167,0.682202,0.150158,3.520714,0.267
1q21,0.61506,0.623078,0.291459,0.429167,0.680845,0.122915,3.701746,0.269
20q13,0.640413,0.677865,0.360981,0.50125,0.730694,0.231944,2.860238,0.273
21q22,0.619023,0.62865,0.296928,0.449583,0.682466,0.132049,3.437698,0.266
3q21,0.631009,0.658639,0.332717,0.4675,0.715498,0.188767,3.191468,0.271


In [15]:
def compute_all(x):
    
    a = None
    
    try:
        a = roc_auc_score(x['y_opt'], x['y'])
    except:
        a = np.inf
    
    try:
        tn, fp, fn, tp = confusion_matrix(x['y'], x['y_opt']).ravel()
    except:
        tn, fp, fn, tp = np.inf,np.inf,np.inf,np.inf

    result = {'auc': a, 'tn': tn, 'fp': fp, 'fn': fn, 'tp': tp}
    
    row = generate_metric(i + 1, auc, tn, fp, fn, tp, title='Fold')
    
    for c in row:
        result[c] = row[c][0]
    
    return pd.Series(result)    
    
detailed = detailed_result.groupby(['class', 'variable']).apply(compute_all)

detailed

Unnamed: 0_level_0,Unnamed: 1_level_0,auc,tn,fp,fn,tp,Fold,AUC,Overall Accuracy,Precision,Sensitivity,Specificity,KS,IFP
class,variable,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Bortezomib-based,11p15,0.605382,75.0,34.0,13.0,19.0,10.0,0.568627,0.666667,0.358491,0.593750,0.688073,0.281823,2.789474
Bortezomib-based,13q14,0.612051,74.0,35.0,12.0,20.0,10.0,0.568627,0.666667,0.363636,0.625000,0.678899,0.303899,2.750000
Bortezomib-based,13q34,0.582826,80.0,29.0,17.0,15.0,10.0,0.568627,0.673759,0.340909,0.468750,0.733945,0.202695,2.933333
Bortezomib-based,15q15,0.580645,77.0,32.0,16.0,16.0,10.0,0.568627,0.659574,0.333333,0.500000,0.706422,0.206422,3.000000
Bortezomib-based,17p13,0.614052,77.0,32.0,13.0,19.0,10.0,0.568627,0.680851,0.372549,0.593750,0.706422,0.300172,2.684211
Bortezomib-based,19q13,0.601213,74.0,35.0,13.0,19.0,10.0,0.568627,0.659574,0.351852,0.593750,0.678899,0.272649,2.842105
Bortezomib-based,1q21,0.569149,77.0,32.0,17.0,15.0,10.0,0.568627,0.652482,0.319149,0.468750,0.706422,0.175172,3.133333
Bortezomib-based,20q13,0.638101,82.0,27.0,13.0,19.0,10.0,0.568627,0.716312,0.413043,0.593750,0.752294,0.346044,2.421053
Bortezomib-based,21q22,0.591948,77.0,32.0,15.0,17.0,10.0,0.568627,0.666667,0.346939,0.531250,0.706422,0.237672,2.882353
Bortezomib-based,3q21,0.618571,78.0,31.0,13.0,19.0,10.0,0.568627,0.687943,0.380000,0.593750,0.715596,0.309346,2.631579


In [19]:
detailed = detailed.reset_index().set_index('variable')

for class_ in detailed['class'].unique():
    
    current_class = detailed.loc[detailed['class'] == class_].iloc[:,1:]
    
    current_class.to_csv('output/nogen_{}.tsv'.format(class_.lower().replace(' ', '_').replace('/', '_')))