# Classification

In [19]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate

import sys

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
try:
  from catboost import CatBoostClassifier
except:
  !pip install catboost
  from catboost import CatBoostClassifier

import time
from datetime import timedelta

In [20]:
# Detect Google Colab: the `google.colab` package can only be imported there.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a failed import means "not on Colab"; any other error (or a
    # keyboard interrupt) is no longer silently swallowed.
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")
    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)
    # Use the project directory on the Google Drive as repository root
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the repository root is the parent of the working directory
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission


In [21]:
# Data folder of the repository; joining with an empty last component
# makes the path end with a separator so it can be used as a prefix.
path_to_data = os.path.join(os.path.join(path_to_repo, "data"), "")
print(path_to_data)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/


In [22]:
# Folder with the processed datasets (trailing separator included);
# it is created when missing and left untouched when it already exists.
path_to_processed = os.path.join(os.path.join(path_to_data, "processed"), "")
os.makedirs(name=path_to_processed, exist_ok=True)
print(path_to_processed)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/processed/


In [23]:
# Folder with the saved models and result files (trailing separator included);
# it is created when missing and left untouched when it already exists.
path_to_models = os.path.join(os.path.join(path_to_data, "models"), "")
os.makedirs(name=path_to_models, exist_ok=True)
print(path_to_models)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/models/


In [24]:
# PARAMETERS

# seed used for this session
session_seed = 42
# switch to True to also load and score a validation set
include_val = False

# seed Python's built-in random number generator
random.seed(a=session_seed)

## Train the Models

In [25]:
def load_datasets(method, include_val = True, target = False):
    """
    Load the train, test and (optionally) validation feather files of one processing method.

    Files are read from the module-level `path_to_processed` prefix and are named
    `{train|test|val}_{method}`, preceded by `y_` when `target` is set.

    method: string for the processing method we want to load
    include_val: if we want to load also the validation set, default True
    target: if we are importing our target variables (the `y_` files), default False

    Returns the tuple (train, test, val); val is an empty list when
    include_val is false, so callers can always unpack three values.
    """
    # keep the `target` flag untouched and build the file prefix separately
    prefix = 'y_' if target else ''
    base = f'{path_to_processed}{prefix}'
    # load it back
    train = pd.read_feather(f'{base}train_{method}')
    test = pd.read_feather(f'{base}test_{method}')
    val = pd.read_feather(f'{base}val_{method}') if include_val else []
    return train, test, val

In [26]:
# Target variables: their files use an empty method name (e.g. `y_train_`)
y_train, y_test, y_val = load_datasets('', include_val, True)

In [27]:
# one independent dictionary per data split, filled with the ROC scores of every model
train_roc, val_roc, test_roc = {}, {}, {}

In [28]:
# Models to train, keyed by the short name used in file names and result tables.
# Every model takes its seed from `session_seed`, so changing the seed in the
# parameters cell affects all of them.
model_dict = {
    # default penalty is l2, we do lasso
    'log_reg': LogisticRegression(solver = "saga", penalty = 'l1', random_state = session_seed, n_jobs = -1),
    #'dec_tree': DecisionTreeClassifier(random_state = session_seed),
    #'bag_tree': BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 10, random_state = session_seed, n_jobs = -1),
    'rand_for': RandomForestClassifier(random_state = session_seed, n_jobs = -1),
    #'gboost': GradientBoostingClassifier(random_state = session_seed),
    'lightgbm': lgb.LGBMClassifier(random_state = session_seed, n_jobs = -1, deterministic = True),
    #'catboost': CatBoostClassifier(random_seed = session_seed),
}

In [29]:
# processing methods to evaluate, in the order used for the result tables
method_list = [
    'frequency',
    'onehot',
    'tf_idf',
    'svd',
    'lda',
]

In [30]:
# Train (or reload) every model on every processing method and collect the ROC AUC scores.
for model_name, model in model_dict.items():
    print(model_name)
    # initialize lists with the results
    results_train = []
    results_val = []
    results_test = []
    for method in method_list:
        print(method)
        train, test, val = load_datasets(method, include_val = include_val) # we load the dataset we want to use
        model_path = f'{path_to_models}_{model_name}_{method}' # one saved model per (model, method) pair
        start_time = time.monotonic()
        try:
            # NOTE(security): dill.load can execute arbitrary code, only load model files written by this notebook
            with open(model_path, 'rb') as file:
                fitted_model = dill.load(file)
            print('Model already trained')
        except Exception as load_error:
            # `Exception` instead of a bare except: KeyboardInterrupt / SystemExit still stop the run
            if not isinstance(load_error, FileNotFoundError):
                print(f'Could not load saved model ({load_error!r}), training it again')
            # Always fit the estimator configured in model_dict. The loaded model is kept in its
            # own variable, so a model unpickled for a previous method is never the one refitted.
            # We fit on the same 1-D `target` column that is used for scoring below.
            model.fit(train, y_train.target)
            fitted_model = model
            print('Model successfully trained')
            with open(model_path, 'wb') as file: # and save the fitted model
                dill.dump(fitted_model, file)
            print('Model saved')
        end_time = time.monotonic()
        print(timedelta(seconds=end_time - start_time))
        # probability of the positive class (second column) is what the ROC AUC is computed on
        roc_train = roc_auc_score(y_train.target, fitted_model.predict_proba(train)[:, 1])
        roc_test = roc_auc_score(y_test.target, fitted_model.predict_proba(test)[:, 1])
        results_train.append(roc_train) # append the ROC score
        results_test.append(roc_test)
        print('ROC Training Set: {}'.format(roc_train))
        print('ROC Test Set: {}'.format(roc_test))
        if include_val:
            roc_val = roc_auc_score(y_val.target, fitted_model.predict_proba(val)[:, 1])
            results_val.append(roc_val)
            print('ROC Validation Set: {}'.format(roc_val))
    # finally we add the result lists to our dictionary
    train_roc[model_name] = results_train
    val_roc[model_name] = results_val
    test_roc[model_name] = results_test

log_reg
frequency
Model already trained
0:00:00.012054
ROC Training Set: 0.8390933835663675
ROC Test Set: 0.6114780865887829
onehot
Model already trained
0:00:00.015086
ROC Training Set: 0.9953941941261862
ROC Test Set: 0.6112127938463157
tf_idf
Model already trained
0:00:00.005361
ROC Training Set: 0.749163847407937
ROC Test Set: 0.7065949930262612
svd
Model already trained
0:00:00.003367
ROC Training Set: 0.6267888357296673
ROC Test Set: 0.5930386261619245
lda
Model successfully trained
Model saved
0:00:01.758939
ROC Training Set: 0.6836896024523876
ROC Test Set: 0.6809031210883738
rand_for
frequency
Model already trained
0:00:00.206843
ROC Training Set: 1.0
ROC Test Set: 0.6933003662885361
onehot
Model already trained
0:00:00.097878
ROC Training Set: 1.0
ROC Test Set: 0.7035766536850084
tf_idf
Model already trained
0:00:00.104357
ROC Training Set: 1.0
ROC Test Set: 0.6391734954556507
svd
Model already trained
0:00:00.063075
ROC Training Set: 1.0
ROC Test Set: 0.6463061791985576
lda


In [32]:
# Then we save all our results.
# Train and test scores always exist, so they are always written; the
# validation scores are only written when a validation set was used.
with open(f'{path_to_models}train_results.pkl', 'wb') as file:
    pickle.dump(train_roc, file)
with open(f'{path_to_models}test_results.pkl', 'wb') as file:
    pickle.dump(test_roc, file)
if include_val:
    with open(f'{path_to_models}val_results.pkl', 'wb') as file:
        pickle.dump(val_roc, file)

## Display Results

In [33]:
def get_final_res_list(dict):
    """
    Turn a results dictionary into the list of rows expected by the tabulate function:
    one list per entry, made of the key followed by each of its values.
    """
    return [[name, *scores] for name, scores in dict.items()]

In [34]:
# ROC scores on the training set: one row per model, one column per processing method
train_results = get_final_res_list(train_roc)
print(
    tabulate(train_results, headers = method_list)
)

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.839093  0.995394  0.749164  0.626789  0.68369
rand_for     1         1         1         1         1
lightgbm     0.995699  0.996217  0.999692  0.997682  0.981339


In [35]:
# ROC scores on the validation set, shown only when a validation set is in use
if include_val:
    val_results = get_final_res_list(val_roc)
    print(
        tabulate(val_results, headers = method_list)
    )

In [36]:
# ROC scores on the test set: one row per model, one column per processing method
test_results = get_final_res_list(test_roc)
print(
    tabulate(test_results, headers = method_list)
)

            frequency    onehot    tf_idf       svd       lda
--------  -----------  --------  --------  --------  --------
log_reg      0.611478  0.611213  0.706595  0.593039  0.680903
rand_for     0.6933    0.703577  0.639173  0.646306  0.672264
lightgbm     0.689916  0.689288  0.694925  0.648108  0.682114
