# First Classification (rework)

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
import psycopg2
import matplotlib.pyplot as plt
import random
import dill
import pickle

from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb
try:
  from catboost import CatBoostClassifier
except:
  !pip install catboost
  from catboost import CatBoostClassifier

try:
  # feature selection through BorutaPy
  from boruta import BorutaPy
except:
  !pip install boruta
  from boruta import BorutaPy
import time
from datetime import timedelta

%matplotlib inline

  """)


Collecting catboost
  Downloading catboost-1.0.4-cp37-none-manylinux1_x86_64.whl (76.1 MB)
[K     |████████████████████████████████| 76.1 MB 2.5 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.4
Collecting boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.3 MB/s 
Installing collected packages: boruta
Successfully installed boruta-0.3


In [2]:
# PARAMETERS

icu_stays = True # set to TRUE if we want to have only ICU stays
med_7 = False # set to false if we want to avoid using Med7 preprocessing

if med_7 == False: 
    tag_med7 = '_nomed7'
else:
    tag_med7 = ''

if icu_stays == True:
    tag_icu = '_icu'
    icu_folder = 'icu_only'
    title_tag = 'Only ICU Hospitalization'
else:
    tag_icu = ''
    icu_folder = 'all_hosp'
    title_tag = 'All Hospitalization'

In [3]:
try:
  from google.colab import drive
  IN_COLAB=True
except:
  IN_COLAB=False

if IN_COLAB:
  print("We're running Colab")

if IN_COLAB:  
  # Mount the Google Drive at mount
  mount='/content/gdrive'
  print("Colab: mounting Google drive on ", mount)
  # connect your colab with the drive
  drive.mount(mount)

 # Switch to the directory on the Google Drive that you want to use
  import os
  path_to_repo = mount + "/My Drive/MIMIC-III Text Mining"

else:
  path_to_repo = os.path.join(os.path.dirname(os.getcwd()))

  
print(path_to_repo)

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive
/content/gdrive/My Drive/MIMIC-III Text Mining


In [4]:
path_to_data = os.path.join(path_to_repo,"Readmission","data", icu_folder,"")
print(path_to_data)

/content/gdrive/My Drive/MIMIC-III Text Mining/Readmission/data/icu_only/


In [5]:
path_to_processed = os.path.join(path_to_data,"processed","")
print(path_to_processed)

/content/gdrive/My Drive/MIMIC-III Text Mining/Readmission/data/icu_only/processed/


In [6]:
path_to_models = os.path.join(path_to_data,"models","")
os.makedirs(path_to_models, exist_ok=True) # we create the directory if it does not exist
print(path_to_models)

/content/gdrive/My Drive/MIMIC-III Text Mining/Readmission/data/icu_only/models/


In [7]:
path_to_boruta = os.path.join(path_to_models,"boruta","")
os.makedirs(path_to_boruta, exist_ok=True) # we create the directory if it does not exist
print(path_to_models)

/content/gdrive/My Drive/MIMIC-III Text Mining/Readmission/data/icu_only/models/


In [8]:
# PARAMETERS

session_seed = 42 # set seed for our session
include_diag = True # set to True if we want to also process the diagnosis column
include_test = True # set to True if we want to also process the test set
feature_boruta = False # set to True if we want to select features with BorutaPy

random.seed(session_seed)

if include_diag == True: diag_tag = '_diag'
else: diag_tag = ''

In [9]:
def load_datasets(method, include_diag = True, include_test = True):
    """
    Function to load train, test and validation set based on the chosen method
    method: string for the processing method we want to load
    include_diag: if we want to load the dataframes with the diagnosis text, default True
    include_test: if we want to load also the test set, default True
    """
    global path_to_processed
    if include_diag == True: diag_tag = '_diag'
    else: diag_tag = ''
    # load it back
    train = pd.read_feather(f'{path_to_processed}train_{method}{diag_tag}{tag_med7}')
    val = pd.read_feather(f'{path_to_processed}val_{method}{diag_tag}{tag_med7}')
    if include_test == True:
        test = pd.read_feather(f'{path_to_processed}test_{method}{diag_tag}{tag_med7}')
    else: test = []
    return train, val, test

In [10]:
y_train = pd.read_feather(f'{path_to_processed}y_train{tag_med7}')
y_val = pd.read_feather(f'{path_to_processed}y_val{tag_med7}')
if include_test == True:
    y_test = pd.read_feather(f'{path_to_processed}y_test{tag_med7}')
print('Train composition:')  
print(y_train.value_counts())
print('\nVal composition:')  
print(y_val.value_counts())
if include_test == True:
    print('\nTest composition:')  
    print(y_test.value_counts())

Train composition:
target
0         22746
1          1236
dtype: int64

Val composition:
target
0         2528
1          137
dtype: int64

Test composition:
target
0         6319
1          343
dtype: int64


In [11]:
# initialize a dictionary for the results of all the models
train_roc = {}
val_roc = {}
test_roc = {}

# For Boruta
train_roc_boruta = {}
val_roc_boruta = {}
test_roc_boruta = {}

In [12]:
model_dict = {
    'log_reg': LogisticRegression(solver = "saga", penalty = 'l1', random_state = session_seed, n_jobs = -1) # default penalty is l2, we do lasso
    , 'dec_tree': DecisionTreeClassifier(random_state = session_seed)
    #, 'bag_tree': BaggingClassifier(base_estimator = DecisionTreeClassifier(), n_estimators = 10, random_state = session_seed, n_jobs = -1)
    , 'rand_for': RandomForestClassifier(random_state = session_seed, n_jobs = -1)
    , 'gboost': GradientBoostingClassifier(random_state = session_seed)
    , 'lightgbm': lgb.LGBMClassifier(random_state = 42, n_jobs = -1, deterministic = True)
    , 'catboost': CatBoostClassifier(random_seed = 42)
}

In [13]:
method_list = ['frequency', 'one_hot','tf_idf', 'word2vec', 'GloVe', 'W2V_Med', 'Bio_W2V']

In [None]:
for model_name, model in model_dict.items(): 
    print(model_name)
    # initialize lists with the results
    results_train = []
    results_val = []
    results_test = []
    results_train_boruta = []
    results_val_boruta = []
    results_test_boruta = []
    for method in method_list:
        print(method)
        train, val, test = load_datasets(method, include_diag = include_diag, include_test = include_test) # we load the dataset we want to use
        start_time = time.monotonic()
        try:
            with open(f'{path_to_models}_{model_name}_{method}{diag_tag}{tag_med7}', 'rb') as file:
                model = dill.load(file)
            print('Model already trained')
            trained = True
        except:
            trained = False
            model.fit(train, y_train) # we fit our model
            print('Model successfully trained')
        end_time = time.monotonic()
        print(timedelta(seconds=end_time - start_time))
        results_train.append(roc_auc_score(y_train.target, model.predict_proba(train)[:, 1])) # append the ROC score
        results_val.append(roc_auc_score(y_val.target, model.predict_proba(val)[:, 1]))
        if include_test == True:
            results_test.append(roc_auc_score(y_test.target, model.predict_proba(test)[:, 1]))
        if trained == False:
          with open(f'{path_to_models}_{model_name}_{method}{diag_tag}{tag_med7}', 'wb') as file: # and save the fitted model
            dill.dump(model, file)
            print('Model saved')
        # BORUTA -----------------------------------------------------------------------------------------
        if feature_boruta == True and model_name != 'log_reg' and model_name != 'dec_tree':
          start_time = time.monotonic()
          try:
            with open(f'{path_to_boruta}_{model_name}_{method}{diag_tag}{tag_med7}_feat_selector', 'rb') as file:
                feat_selector = dill.load(file)
            print('Boruta Model already trained')
          except:
            feat_selector = BorutaPy(model, random_state = session_seed)
            # find all relevant features - 5 features should be selected
            feat_selector.fit(np.array(train), np.array(y_train))
            print('Boruta Model trained')
            with open(f'{path_to_boruta}_{model_name}_{method}{diag_tag}{tag_med7}_feat_selector', 'wb') as file: # and save the fitted model
              dill.dump(feat_selector, file)
              print('Model saved')
          # call transform() on X to filter it down to selected features
          train_filtered = feat_selector.transform(np.array(train))
          print('Number of features selected: {}'.format(train_filtered.shape[1]))
          val_filtered = feat_selector.transform(np.array(val))
          if include_test == True:
            test_filtered = feat_selector.transform(np.array(test))
          end_time = time.monotonic()
          print(timedelta(seconds=end_time - start_time))
          # FIT MODEL WITH FEATURE SELECTION -------------------------
          start_time = time.monotonic()
          try:
              with open(f'{path_to_boruta}_{model_name}_{method}{diag_tag}{tag_med7}_boruta', 'rb') as file:
                  model = dill.load(file)
              print('Reduced Model already trained')
              trained = True
          except:
              trained = False
              model.fit(train, y_train) # we fit our model
              print('Reduced Model successfully trained')
          end_time = time.monotonic()
          print(timedelta(seconds=end_time - start_time))
          results_train_boruta.append(roc_auc_score(y_train.target, model.predict_proba(train)[:, 1])) # append the ROC score
          results_val_boruta.append(roc_auc_score(y_val.target, model.predict_proba(val)[:, 1]))
          if include_test == True:
              results_test_boruta.append(roc_auc_score(y_test.target, model.predict_proba(test)[:, 1]))
          if trained == False:
            with open(f'{path_to_boruta}_{model_name}_{method}{diag_tag}{tag_med7}_boruta', 'wb') as file: # and save the fitted model
              dill.dump(model, file)
              print('Model saved')
    train_roc[model_name] = results_train # finally we add the result lists to our dictionary
    val_roc[model_name] = results_val
    test_roc[model_name] = results_test
    if feature_boruta == True:
      train_roc_boruta[model_name] = results_train_boruta
      val_roc_boruta[model_name] = results_val_boruta
      test_roc_boruta[model_name] = results_test_boruta

log_reg
frequency


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
  f"X has feature names, but {self.__class__.__name__} was fitted without"


Model already trained
0:00:00.262445


  f"X has feature names, but {self.__class__.__name__} was fitted without"
  f"X has feature names, but {self.__class__.__name__} was fitted without"


one_hot


In [None]:
# Then we save all our results
with open(f'{path_to_models}train_results{diag_tag}{tag_med7}.pkl', 'wb') as file:
    pickle.dump(train_roc, file)
with open(f'{path_to_models}val_results{diag_tag}{tag_med7}.pkl', 'wb') as file:
    pickle.dump(val_roc, file)
if include_test == True:
    with open(f'{path_to_models}test_results{diag_tag}{tag_med7}.pkl', 'wb') as file:
        pickle.dump(test_roc, file)

In [None]:
def get_final_res_list(dict):
  """
  Function to transform our results to a list of list usable by the tabulate function
  """
  results = []
  for key, values in dict.items():
    new_res = [[key], values]
    flat_list = [item for sublist in new_res for item in sublist]
    results.append(flat_list)
  return results

In [None]:
from tabulate import tabulate
train_results = get_final_res_list(train_roc)
print(tabulate(train_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
val_results = get_final_res_list(val_roc)
print(tabulate(val_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
if include_test == True:
    test_results = get_final_res_list(test_roc)
    print(tabulate(test_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

# Results

In [None]:
# Then we save all our results
with open(f'{path_to_models}train_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
    train_roc = pickle.load(file)
with open(f'{path_to_models}val_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
    val_roc = pickle.load(file)
if include_test == True:
    with open(f'{path_to_models}test_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
        test_roc = pickle.load(file)

In [None]:
train_results = get_final_res_list(train_roc)
print(title_tag)
print(tabulate(train_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
val_results = get_final_res_list(val_roc)
print(title_tag)
print(tabulate(val_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
if include_test == True:
    test_results = get_final_res_list(test_roc)
    print(title_tag)
    print(tabulate(test_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
if icu_stays == True:
    other_folder = 'all_hosp'
    title_tag = 'All Hospitalization'
else:
    title_tag = 'Only ICU Hospitalization'
    other_folder = 'icu_only'

path_to_other = os.path.join(path_to_repo,"Readmission","data", other_folder, "models","")
print(path_to_other)

C:\Users\luca9\Documents\MIMIC-III Text Mining\Readmission\data\all_hosp\models\


In [None]:
# Then we save all our results
with open(f'{path_to_other}train_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
    train_roc = pickle.load(file)
with open(f'{path_to_other}val_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
    val_roc = pickle.load(file)
if include_test == True:
    with open(f'{path_to_other}test_results{diag_tag}{tag_med7}.pkl', 'rb') as file:
        test_roc = pickle.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\luca9\\Documents\\MIMIC-III Text Mining\\Readmission\\data\\all_hosp\\models\\train_results_nomed7.pkl'

In [None]:
train_results = get_final_res_list(train_roc)
print(title_tag)
print(tabulate(train_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
val_results = get_final_res_list(val_roc)
print(title_tag)
print(tabulate(val_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))

In [None]:
if include_test == True:
    test_results = get_final_res_list(test_roc)
    print(title_tag)
    print(tabulate(test_results, headers = ['Frequency','One Hot', 'TF-IDF', 'Word2Vec', 'GloVe', 'W2V_Med', 'Bio_W2V']))