# Classification

In [144]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate

import sys

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
from hmeasure import h_score
try:
  from catboost import CatBoostClassifier
except:
  !pip install catboost
  from catboost import CatBoostClassifier

import time
from datetime import timedelta

In [145]:
# A successful import of `google.colab` is used as the signal that the
# notebook is running on Colab. Only ImportError is caught, so that unrelated
# failures (or a keyboard interrupt) are not silently read as "not on Colab".
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)
    # Directory on the Google Drive that holds the repository
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the repository root is taken to be the parent of the
    # current working directory.
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [146]:
# The trailing "" makes os.path.join end the path with a separator, so the
# result can be used as a prefix in plain string concatenation.
path_to_data = os.path.join(path_to_repo, "data","")
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\


In [147]:
# Folder of the processed datasets. The trailing "" keeps a final separator
# because file names are appended to this path by string concatenation.
path_to_processed = os.path.join(path_to_data,"processed","")
os.makedirs(path_to_processed, exist_ok=True) # we create the directory if it does not exist
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\processed\


In [148]:
# Folder of the fitted models and of the pickled result dictionaries. The
# trailing "" keeps a final separator because file names are appended to this
# path by string concatenation.
path_to_models = os.path.join(path_to_data,"models","")
os.makedirs(path_to_models, exist_ok=True) # we create the directory if it does not exist
print(path_to_models)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\models\


In [149]:
# PARAMETERS

session_seed = 10 # set seed for our session
include_val = False # set to True if we want to also create a validation set
tune_models = False # set to True if we want to perform parameter tuning

# Suffixes that are appended to the names of the files read and written below
seed_tag = f'_{session_seed}'
tune_tag = '_tuned' if tune_models else ''

# Seed both global generators: `random` only covers the standard library,
# while scikit-learn objects created without a random_state draw from NumPy's
# global generator.
random.seed(session_seed)
np.random.seed(session_seed)

## Train the Models

In [150]:
def load_datasets(method, include_val = True, target = False):
    """
    Load the train, test and (optionally) validation sets stored as feather files.

    Every file is read from
    `{path_to_processed}{prefix}{split}_{method}{seed_tag}`, where `split` is
    'train', 'test' or 'val' and `prefix` is 'y_' for the targets and empty
    otherwise. `path_to_processed` and `seed_tag` are module-level variables.

    method: string for the processing method we want to load
    include_val: if we want to load also the validation set, default True
    target: if we are importing our target variables, default False

    Returns the tuple (train, test, val); val is an empty list when
    include_val is false.
    """
    # keep the flag and the file-name prefix in separate variables
    prefix = 'y_' if target else ''

    def _read(split):
        # one feather file per split
        return pd.read_feather(f'{path_to_processed}{prefix}{split}_{method}{seed_tag}')

    train = _read('train')
    test = _read('test')
    val = _read('val') if include_val else []
    return train, test, val

In [151]:
# Load the targets: target = True selects the 'y_'-prefixed files and the
# method part of the file name is left empty. y_val is an empty list when
# include_val is False.
y_train, y_test, y_val = load_datasets(method = '', include_val = include_val, target = True)

In [152]:
# initialize a dictionary for the results of all the models
# (filled by the training loop as {model_name: {method: {score_name: value}}})
final_train = {}
final_val = {}
final_test = {}

In [153]:
# Classifiers to compare, keyed by the short name used in file names and tables.
# Every model is seeded with session_seed so that results saved under different
# seeds really come from differently seeded models.
model_dict = {
    # default penalty is l2, we do lasso: the elastic-net penalty with
    # l1_ratio = 1 is equivalent to penalty = 'l1', and unlike 'l1' it lets the
    # 'l1_ratio' grid take effect when the model is tuned
    'log_reg': LogisticRegression(solver = "saga", penalty = 'elasticnet', l1_ratio = 1.0,
                                  random_state = session_seed, n_jobs = -1),
    'dec_tree': DecisionTreeClassifier(random_state = session_seed),
    #'bag_tree': BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 10, random_state = session_seed, n_jobs = -1),
    'rand_for': RandomForestClassifier(random_state = session_seed, n_jobs = -1),
    'gboost': GradientBoostingClassifier(random_state = session_seed),
    'lightgbm': lgb.LGBMClassifier(random_state = session_seed, n_jobs = -1, deterministic = True),
    'catboost': CatBoostClassifier(random_seed = session_seed),
}

In [154]:
# Search spaces sampled by the randomized search, one dictionary per model.

# Fractions in (0, 1]: 0 is not a valid value for any of the sub-sampling
# parameters that use this grid
frac_grid = list(np.linspace(0.1, 1, num = 10))

# PARAMETERS FOR LOGISTIC REGRESSION -------
param_en = {'C': np.logspace(-3, 4, 10), 'l1_ratio':np.linspace(0,1,11) }

# PARAMETERS FOR DECISION TREE -------------
# Number of features to consider at every split
# ('auto' is no longer accepted by recent scikit-learn releases; None uses all the features)
max_features = ['sqrt', 'log2', None]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

param_dec = {'max_features': max_features,
             'max_depth': max_depth,
             'min_samples_split': min_samples_split,
             'min_samples_leaf': min_samples_leaf}

# PARAMETERS FOR RANDOM FOREST -------
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]

# Maximum number of samples per tree
max_sampl = list(np.arange(0.01,1,0.2))
max_sampl.append(None)
# Create the random grid
param_rf = {'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'max_samples': max_sampl}

# PARAMETERS FOR GRADIENT BOOSTING --------

# strictly positive, log-spaced learning rates (a learning rate of 0 is invalid)
learn_rate = list(np.logspace(-3, 0, num = 10))

param_gb = {'n_estimators': n_estimators,
            'learning_rate': learn_rate,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            # the estimator's parameter is called 'subsample' (singular)
            'subsample': frac_grid}

# PARAMETERS FOR LIGHTGBM -----------
param_lgb = {# LightGBM encodes "no depth limit" as -1 rather than None
             'max_depth': [-1 if depth is None else depth for depth in max_depth],
             'min_data_in_leaf': min_samples_leaf,
             'num_iterations': n_estimators,
             'learning_rate': learn_rate,
             'colsample_bytree': frac_grid,
             'subsample': frac_grid,
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# PARAMETERS FOR CATBOOSTING ------

param_cat = {'n_estimators': n_estimators,
             'learning_rate': learn_rate,
             'rsm': frac_grid,
             # CatBoost only supports depths up to 16
             'depth': list(range(4, 11)),
             'l2_leaf_reg': [1, 2, 3, 4, 5, 7, 9, 15, 20]}

In [155]:
# Search space of every model, under the same keys as model_dict
# (there is no entry for the commented-out 'bag_tree' model).
param_dictionary = dict(
    log_reg = param_en,
    dec_tree = param_dec,
    rand_for = param_rf,
    gboost = param_gb,
    lightgbm = param_lgb,
    catboost = param_cat,
)

In [156]:
# Processing methods to evaluate: each name selects the dataset files that
# load_datasets reads and is part of the file name of the fitted model.
method_list = ['frequency', 'onehot','tf_idf', 'svd', 'lda']

In [157]:
def get_main_scores(estimator, df, y):
    """
    Function to get the main scores of a fitted classifier on a dataset

    estimator: fitted classifier exposing predict_proba
    df: features to score
    y: true labels; must provide .to_numpy() (e.g. a pandas Series)

    Returns a dictionary with the keys
      'ROC'    - area under the ROC curve
      'PrRc'   - area under the precision-recall curve
      'HScore' - H-measure
    All three are computed from the predicted probability of the positive
    class (second column of predict_proba).
    """
    probas = estimator.predict_proba(df)[:,1]
    precision, recall, _ = metrics.precision_recall_curve(y, probas)
    return {
        'ROC': roc_auc_score(y, probas),
        'PrRc': metrics.auc(recall, precision),
        # the H-measure is defined on scores: feeding it the hard labels of
        # predict() would evaluate a single threshold instead of the ranking
        'HScore': h_score(y.to_numpy(), probas),
    }

In [158]:
# For every model and every processing method: load the fitted model from disk
# or fit (optionally tune) and save it, then score it on each dataset.
for model_name, model in model_dict.items():
    print(model_name)
    train_res = {}
    val_res = {}
    test_res = {}
    for method in method_list:
        print(method)
        train, test, val = load_datasets(method, include_val = include_val) # we load the dataset we want to use
        model_path = f'{path_to_models}_{model_name}_{method}{tune_tag}{seed_tag}'
        start_time = time.monotonic()

        # Try the cached model first. A missing file is the normal "not trained
        # yet" case; any other loading problem is reported before retraining,
        # and interrupts are no longer swallowed.
        estimator = None
        try:
            with open(model_path, 'rb') as file:
                estimator = dill.load(file)
            print('Model already trained')
        except FileNotFoundError:
            pass
        except Exception as load_error:
            print(f'Could not load {model_path} ({load_error!r}), training again')
            estimator = None

        # Training happens outside the except clause, so that an error raised
        # while fitting is not reported as part of a failed load.
        if estimator is None:
            if tune_models:
                # if we want to perform parameter tuning we use randomsearchCV;
                # random_state makes the sampled candidates reproducible
                gridsearch = RandomizedSearchCV(model, param_dictionary[model_name], cv=5, n_jobs=-1,
                                                scoring=['accuracy','roc_auc'], refit = 'roc_auc',
                                                random_state = session_seed)
                gridsearch.fit(train, y_train.target) # we fit our model
                estimator = gridsearch.best_estimator_
            else:
                # fit on the same target column that is used for scoring below
                model.fit(train, y_train.target) # we fit our model
                estimator = model
            print('Model successfully trained')
            with open(model_path, 'wb') as file: # and save the fitted model
                dill.dump(estimator, file)
            print('Model saved')

        end_time = time.monotonic()
        print(timedelta(seconds=end_time - start_time))

        results_train = get_main_scores(estimator, train, y_train.target)
        results_test = get_main_scores(estimator, test, y_test.target)
        train_res[method] = results_train
        test_res[method] = results_test
        print('ROC Training Set: {}'.format(results_train['ROC']))
        print('ROC Test Set: {}'.format(results_test['ROC']))
        if include_val:
            results_val = get_main_scores(estimator, val, y_val.target)
            val_res[method] = results_val
            print('ROC Validation Set: {}'.format(results_val['ROC']))
    # finally we add the results of this model to our dictionaries
    final_train[model_name] = train_res
    final_val[model_name] = val_res
    final_test[model_name] = test_res

log_reg
frequency
Model already trained
0:00:00
ROC Training Set: 0.8371474567365131
ROC Test Set: 0.6202382836343906
onehot
Model already trained
0:00:00
ROC Training Set: 0.9956003253143796
ROC Test Set: 0.6238056636078798
tf_idf
Model already trained
0:00:00
ROC Training Set: 0.7514657849604172
ROC Test Set: 0.7021246949710185
svd
Model already trained
0:00:00
ROC Training Set: 0.6260359511465914
ROC Test Set: 0.6083181039919867
lda
Model already trained
0:00:00
ROC Training Set: 0.6928253090483725
ROC Test Set: 0.6852068614392154
dec_tree
frequency
Model already trained
0:00:00
ROC Training Set: 1.0
ROC Test Set: 0.5517807602321103
onehot
Model already trained
0:00:00
ROC Training Set: 1.0
ROC Test Set: 0.5364791362252856
tf_idf
Model already trained
0:00:00
ROC Training Set: 1.0
ROC Test Set: 0.5464123885712809
svd
Model already trained
0:00:00
ROC Training Set: 1.0
ROC Test Set: 0.5133684473269333
lda
Model already trained
0:00:00
ROC Training Set: 1.0
ROC Test Set: 0.53589180116

In [159]:
# Load the results saved by previous sessions (a dictionary keyed by seed for
# each of train / test / val), or start from empty dictionaries on the first run.
# NOTE: pickle.load can execute arbitrary code, so only files written by this
# notebook should ever be placed in path_to_models.
try:
    with open(f'{path_to_models}train_results{tune_tag}.pkl', 'rb') as file:
        results_train = pickle.load(file)
    with open(f'{path_to_models}test_results{tune_tag}.pkl', 'rb') as file:
        results_test = pickle.load(file)
    if include_val:
        with open(f'{path_to_models}val_results{tune_tag}.pkl', 'rb') as file:
            results_val = pickle.load(file)
    else:
        # always define results_val: it is indexed later even when no
        # validation set is used
        results_val = {}
except FileNotFoundError:
    # Nothing saved yet. Other errors (e.g. a corrupted file) are raised on
    # purpose: resetting here would overwrite the results accumulated so far.
    results_train = {}
    results_val = {}
    results_test = {}

In [160]:
# File this session's results under its seed; average_score later averages
# over all the seeds stored in these dictionaries.
results_train[session_seed] = final_train
results_val[session_seed] = final_val
results_test[session_seed] = final_test

In [161]:
# Write the result dictionaries back to disk, in the order train, test and,
# only when a validation set is used, val.
pickle_jobs = [
    (f'{path_to_models}train_results{tune_tag}.pkl', results_train),
    (f'{path_to_models}test_results{tune_tag}.pkl', results_test),
]
if include_val == True:
    pickle_jobs.append((f'{path_to_models}val_results{tune_tag}.pkl', results_val))

for pickle_path, payload in pickle_jobs:
    with open(pickle_path, 'wb') as file:
        pickle.dump(payload, file)

## Display Results

In [166]:
def average_score(dict_res, model, method, score):
    """
    Average one score over all the runs stored in dict_res.

    dict_res: dictionary whose values are nested as [model][method][score]
    model, method, score: keys selecting the value to average
    Returns the sum of the selected values divided by the number of runs.
    """
    selected = (run[model][method][score] for run in dict_res.values())
    return sum(selected) / len(dict_res)

In [180]:
# Print, for every score, one table per dataset with the models as rows and the
# processing methods as columns; each cell is the average over the saved seeds.
# The tables use their own names so that the dictionaries final_train,
# final_test and final_val (and train_res, test_res, val_res) are not
# overwritten with lists: re-running the cells that store and save the
# results after this one would otherwise pickle the display tables.
for score in ['ROC', 'PrRc', 'HScore']:
    train_table = []
    test_table = []
    val_table = []
    for model_name in model_dict.keys():
        # the first cell of every row is the model name
        train_row = [model_name]
        test_row = [model_name]
        val_row = [model_name]
        for method in method_list:
            train_row.append(average_score(results_train, model_name, method, score))
            test_row.append(average_score(results_test, model_name, method, score))
            if include_val:
                val_row.append(average_score(results_val, model_name, method, score))
        train_table.append(train_row)
        test_table.append(test_row)
        if include_val:
            val_table.append(val_row)
    print("-"*65)
    print(f'\n{score}\n')
    print('\n Train Results \n')
    print(tabulate(train_table, headers = method_list))
    print("-"*65)
    print('\n Test Results \n')
    print(tabulate(test_table, headers = method_list))
    if include_val:
        print('\n Val Results \n')
        print(tabulate(val_table, headers = method_list))
        print("-"*65)

-----------------------------------------------------------------

ROC


 Train Results 

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.845747  0.995599  0.749759  0.628829  0.692477
dec_tree     1         1         1         1         1
rand_for     1         1         1         1         1
gboost       0.771362  0.854002  0.836283  0.793691  0.777784
lightgbm     0.995582  0.996128  0.999639  0.996557  0.979271
catboost     0.913173  0.987107  0.978186  0.994491  0.961889
-----------------------------------------------------------------

 Test Results 

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.623613  0.605154  0.6981    0.601681  0.672712
dec_tree     0.547977  0.531834  0.538137  0.519704  0.53476
rand_for     0.666673  0.666363  0.631988  0.628142  0.652268
gboost       0.682979  0.682127  0.6