# Classification

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate

import sys

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
try:
  from catboost import CatBoostClassifier
except:
  !pip install catboost
  from catboost import CatBoostClassifier

import time
from datetime import timedelta

In [2]:
# Detect whether the notebook runs on Google Colab: importing `google.colab`
# only succeeds there. Only a failed import is treated as "not on Colab", so
# any other error is no longer silently hidden.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)
    # Repository folder on the mounted Google Drive
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the repository root is the parent of the current working directory
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission


In [3]:
# Data folder of the repository; the empty last component makes the path end with a separator
path_to_data = os.path.join(path_to_repo, "data", "")
print(path_to_data)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\


In [4]:
# Folder with the processed datasets (path ends with a separator)
path_to_processed = os.path.join(path_to_data, "processed", "")
# we create the directory if it does not exist
os.makedirs(path_to_processed, exist_ok=True)
print(path_to_processed)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\processed\


In [5]:
# Folder where fitted models and result files are stored (path ends with a separator)
path_to_models = os.path.join(path_to_data, "models", "")
# we create the directory if it does not exist
os.makedirs(path_to_models, exist_ok=True)
print(path_to_models)

C:\Users\luca9\Documents\MIMIC-III Text Mining\mimim_iii_readmission\data\models\


In [6]:
# PARAMETERS

session_seed = 42 # set seed for our session
include_val = False # set to True if we also want to load and score a validation set
tune_models = True # set to True if we want to perform parameter tuning

# Suffix added to the saved-model file names, so tuned and untuned models are stored separately
tune_tag = '_tuned' if tune_models else ''

# Seed both global generators: scikit-learn falls back on NumPy's global state
# whenever an estimator or a search is created with random_state=None.
random.seed(session_seed)
np.random.seed(session_seed)

## Train the Models

In [7]:
def load_datasets(method, include_val = True, target = False):
    """
    Load the train, test and (optionally) validation feather files of a processing method.

    Files are read from the module-level ``path_to_processed`` folder and are named
    ``{prefix}train_{method}``, ``{prefix}test_{method}`` and ``{prefix}val_{method}``,
    where ``prefix`` is ``'y_'`` when ``target`` is set and empty otherwise.

    method: string for the processing method we want to load
    include_val: if we want to load also the validation set, default True
    target: if we are importing our target variables, default False

    Returns the tuple (train, test, val); val is an empty list when include_val is not set.
    """
    prefix = 'y_' if target else ''

    def read_split(split):
        # os.path.join works whether or not path_to_processed ends with a separator
        return pd.read_feather(os.path.join(path_to_processed, f'{prefix}{split}_{method}'))

    train = read_split('train')
    test = read_split('test')
    val = read_split('val') if include_val else []
    return train, test, val

In [8]:
# The targets are stored under an empty method name (files y_train_, y_test_ and y_val_)
y_train, y_test, y_val = load_datasets(
    method = '',
    include_val = include_val,
    target = True,
)

In [9]:
# One results dictionary per data split; the training loop fills them with
# a list of ROC AUC scores for every model name
train_roc, val_roc, test_roc = {}, {}, {}

In [10]:
# Candidate classifiers, keyed by the short name used in file names and result tables.
# Every model takes its seed from session_seed so a single parameter controls the session.
model_dict = {
    # Elastic-net logistic regression. scikit-learn requires l1_ratio with this penalty;
    # 1.0 is a pure L1 (lasso) penalty and is only used when the model is fitted without
    # tuning, because the parameter search sets its own l1_ratio.
    'log_reg': LogisticRegression(solver = "saga", penalty = 'elasticnet', l1_ratio = 1.0,
                                  random_state = session_seed, n_jobs = -1),
    # 'dec_tree': DecisionTreeClassifier(random_state = session_seed),
    # 'bag_tree': BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 10, random_state = session_seed, n_jobs = -1),
    'rand_for': RandomForestClassifier(random_state = session_seed, n_jobs = -1),
    # 'gboost': GradientBoostingClassifier(random_state = session_seed),
    'lightgbm': lgb.LGBMClassifier(random_state = session_seed, n_jobs = -1, deterministic = True),
    # 'catboost': CatBoostClassifier(random_seed = session_seed),
}

In [27]:
# PARAMETERS FOR LOGISTIC REGRESSION -------
# C is sampled on a log scale; l1_ratio goes from 0 (pure L2) to 1 (pure L1)
param_en = {'C': np.logspace(-3, 4, 10), 'l1_ratio': np.linspace(0, 1, 11)}

# PARAMETERS FOR RANDOM FOREST -------
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 500, num = 5)]
# Number of features to consider at every split.
# 'auto' was only an alias of 'sqrt' for classifiers and is rejected by scikit-learn >= 1.3,
# so it is replaced by a real alternative.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = no limit)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Maximum share of the samples drawn for each tree (None = use all of them)
max_sampl = list(np.arange(0.01, 1, 0.2))
max_sampl.append(None)
# Create the random grid
param_rf = {'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf,
            'max_samples': max_sampl}

# PARAMETERS FOR LIGHTGBM -----------
# learning_rate, colsample_bytree and subsample must be strictly positive for LightGBM,
# so these grids start at 0.1 instead of 0.
positive_fractions = [float(x) for x in np.linspace(0.1, 1, num = 10)]
# The keys are the scikit-learn wrapper names: min_child_samples is min_data_in_leaf
# and n_estimators is num_iterations. LightGBM spells "no depth limit" as -1, not None.
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0 - confirm
# whether bagging is meant to be enabled.
param_lgb = {'max_depth': [-1 if depth is None else depth for depth in max_depth],
             'min_child_samples': min_samples_leaf,
             'n_estimators': n_estimators,
             'learning_rate': list(positive_fractions),
             'colsample_bytree': list(positive_fractions),
             'subsample': list(positive_fractions),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [29]:
# Search space of every model, stored under the same keys as model_dict
param_dictionary = dict(
    log_reg = param_en,
    rand_for = param_rf,
    lightgbm = param_lgb,
)

In [30]:
# Processing methods whose datasets are evaluated; also the column order of the result tables
method_list = [
    'frequency',
    'onehot',
    'tf_idf',
    'svd',
    'lda',
]

In [31]:
# Fit (or reload) every model on every processed dataset and collect the ROC AUC scores
for model_name, model in model_dict.items():
    print(model_name)
    # initialize lists with the results
    results_train = []
    results_val = []
    results_test = []
    for method in method_list:
        print(method)
        train, test, val = load_datasets(method, include_val = include_val) # we load the dataset we want to use
        model_file = f'{path_to_models}_{model_name}_{method}{tune_tag}'
        start_time = time.monotonic()
        # Reuse a previously saved model when one can be loaded. Any loading problem
        # (missing or unreadable file) falls back to training, but KeyboardInterrupt
        # is no longer swallowed as it was with a bare `except:`.
        # NOTE: dill.load can execute arbitrary code - only load model files you created yourself.
        try:
            with open(model_file, 'rb') as file:
                estimator = dill.load(file)
        except Exception:
            estimator = None
        if estimator is not None:
            print('Model already trained')
        else:
            if tune_models:
                # if we want to perform parameter tuning we use RandomizedSearchCV;
                # random_state makes the sampled parameter combinations reproducible
                search = RandomizedSearchCV(model, param_dictionary[model_name], cv=5, n_jobs=-1,
                                            scoring=['accuracy','roc_auc'], refit = 'roc_auc',
                                            random_state = session_seed)
                # fit on the same 1-d target column that is used for scoring below
                search.fit(train, y_train.target)
                estimator = search.best_estimator_
            else:
                model.fit(train, y_train.target) # we fit our model
                estimator = model
            print('Model successfully trained')
            with open(model_file, 'wb') as file: # and save the fitted model
                dill.dump(estimator, file)
            print('Model saved')
        end_time = time.monotonic()
        print(timedelta(seconds=end_time - start_time))
        # ROC AUC from the predicted probability of the second class (column 1)
        roc_train = roc_auc_score(y_train.target, estimator.predict_proba(train)[:, 1])
        roc_test = roc_auc_score(y_test.target, estimator.predict_proba(test)[:, 1])
        results_train.append(roc_train) # append the ROC score
        results_test.append(roc_test)
        print('ROC Training Set: {}'.format(roc_train))
        print('ROC Test Set: {}'.format(roc_test))
        if include_val:
            roc_val = roc_auc_score(y_val.target, estimator.predict_proba(val)[:, 1])
            results_val.append(roc_val)
            print('ROC Validation Set: {}'.format(roc_val))
    # finally we add the result lists to our dictionary
    train_roc[model_name] = results_train
    val_roc[model_name] = results_val
    test_roc[model_name] = results_test

log_reg
frequency
Model already trained
0:00:00.015000
ROC Training Set: 0.8065971893423174
ROC Test Set: 0.6117438407099326
onehot
Model already trained
0:00:00
ROC Training Set: 0.9979198703261057
ROC Test Set: 0.6076135787437305
tf_idf
Model already trained
0:00:00
ROC Training Set: 0.7866722311232908
ROC Test Set: 0.7134400071605972
svd
Model already trained
0:00:00
ROC Training Set: 0.6268982267530675
ROC Test Set: 0.5930676930189254
lda
Model already trained
0:00:00
ROC Training Set: 0.6904609240913326
ROC Test Set: 0.6800855580628924
rand_for
frequency
Model already trained
0:00:00.110000
ROC Training Set: 0.9834809468559619
ROC Test Set: 0.7271595636649523
onehot
Model already trained
0:00:00.125000
ROC Training Set: 0.9979467858202722
ROC Test Set: 0.7173834107603659
tf_idf
Model already trained
0:00:00.109000
ROC Training Set: 0.9996056620747604
ROC Test Set: 0.6677644403453512
svd
Model already trained
0:00:00.047000
ROC Training Set: 0.873592793702074
ROC Test Set: 0.688391

In [32]:
# Then we save all our results.
# Train and test scores are always computed, so they are always saved; the original
# guard was on the wrong file and skipped the test scores whenever include_val was False.
with open(f'{path_to_models}train_results.pkl', 'wb') as file:
    pickle.dump(train_roc, file)
with open(f'{path_to_models}test_results.pkl', 'wb') as file:
    pickle.dump(test_roc, file)
# Validation scores only exist when a validation set was loaded
if include_val:
    with open(f'{path_to_models}val_results.pkl', 'wb') as file:
        pickle.dump(val_roc, file)

## Display Results

In [33]:
def get_final_res_list(dict):
    """
    Turn a {row label: list of values} mapping into the list of rows used by tabulate.

    Each row starts with the key, followed by the values stored under that key
    in their original order.
    """
    return [[label, *scores] for label, scores in dict.items()]

In [34]:
# ROC AUC on the training set: one row per model, one column per method
train_results = get_final_res_list(train_roc)
print(
    tabulate(train_results, headers = method_list)
)

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.806597  0.99792   0.786672  0.626898  0.690461
rand_for     0.983481  0.997947  0.999606  0.873593  0.999965
lightgbm     0.98078   0.906672  1         0.767814  0.873133


In [35]:
# ROC AUC on the validation set, shown only when a validation set is in use
if include_val:
    val_results = get_final_res_list(val_roc)
    print(
        tabulate(val_results, headers = method_list)
    )

In [36]:
# ROC AUC on the test set: one row per model, one column per method
test_results = get_final_res_list(test_roc)
print(
    tabulate(test_results, headers = method_list)
)

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.611744  0.607614  0.71344   0.593068  0.680086
rand_for     0.72716   0.717383  0.667764  0.688391  0.695488
lightgbm     0.689711  0.702099  0.681877  0.697932  0.683949
