# LOS Text Only

## Setup

In [1]:
# First install package from terminal:
!pip install -U pip
!pip install -U setuptools wheel
!pip install autogluon  # autogluon==0.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.1.2-py3-none-any.whl (2.1 MB)
[K     |████████████████████████████████| 2.1 MB 14.5 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting setuptools
  Downloading setuptools-63.1.0-py3-none-any.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: setuptools
  Attempting uninstall: setuptools
    Found existing installation: setuptools 57.4.0
    Uninstalling setuptools-57.4.0:
      Successfully uninstalled setuptools-57.4.0
[31mERROR: pip's dependency resolver does no

In [1]:
# Import all required modules and configure pandas display / warnings.
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Lift pandas' display limits so frames are shown in full.
# NOTE(review): with max_rows=None, displaying a large frame prints every row.
pd.set_option('display.max_rows', None)  # no cap on the number of rows shown
pd.set_option('display.max_columns', None)  # no cap on the number of columns shown
pd.set_option('display.width', None)  # let pandas auto-detect the display width
pd.set_option('display.max_colwidth', None)  # never truncate cell contents (long note text)

# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation warnings from sklearn / autogluon.
import warnings
warnings.filterwarnings('ignore')

# Data splitting, evaluation metrics and the text model used further down.
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import cohen_kappa_score
from sklearn.inspection import permutation_importance
from autogluon.text import TextPredictor

In [2]:
# Detect whether we are running on Google Colab: the import below only
# succeeds there. Catch ImportError specifically so that unrelated failures
# are not silently interpreted as "not on Colab".
try:
  from google.colab import drive
  IN_COLAB = True
except ImportError:
  IN_COLAB = False

if IN_COLAB:
  print("We're running Colab")
  # Mount the Google Drive at mount
  mount = '/content/gdrive'
  print("Colab: mounting Google drive on ", mount)
  # connect your colab with the drive
  drive.mount(mount)
  # Project directory on the mounted Google Drive
  path_to_repo = mount + "/MyDrive/MIMIC-III Text Mining/LOS_FINAL/"
else:
  # Local run: the repository root is the first line of repo_info.txt
  with open("repo_info.txt", "r") as repo_info:
    # readline() keeps the trailing newline; strip it, otherwise every path
    # derived below would contain a line break in the middle.
    path_to_repo = repo_info.readline().strip()

print(path_to_repo)

# Sub-folders derived from the repository root.
# path_to_repo is expected to end with a path separator.
path_to_data = f"{path_to_repo}data/"
path_to_raw = f"{path_to_data}raw/"
path_to_processed = f"{path_to_data}processed/"
path_to_lda = f"{path_to_data}lda/"
path_to_icd = f"{path_to_data}icd_codes/"
path_to_models = f"{path_to_repo}models/"
path_to_results = f"{path_to_repo}results/"

/Users/ADORNI/Dropbox (BFI)/Luca_Data_df_mixed/


## Import the Dataset, Train and Evaluate the Models

In [None]:
# Model Parameters: name of the target column and the evaluation metric
label, metric = 'los_cat', 'roc_auc'

In [None]:
# additional classification metrics
def perf_evaluator(y_test, y_pred, y_pred_proba):
    """Compute Cohen's kappa and the area under the precision-recall curve.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted class labels, passed to ``cohen_kappa_score``.
    y_pred_proba : predicted scores, passed to ``precision_recall_curve``.

    Returns
    -------
    tuple
        ``(kappa, prc_auc)``.
    """
    agreement = cohen_kappa_score(y_test, y_pred)
    # the thresholds returned by the curve are not needed for the area
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, y_pred_proba)
    return agreement, auc(pr_recall, pr_precision)

In [24]:
# Iterate over our main methods of vectorization.
# Each entry maps a run name to the (lemmatize, spacy) flags that select
# which preprocessed dataset file is loaded.
vect_dict = {'stemming': (False, False),
             'spacy': (True, True)}

# Create the output folder up front so that a missing directory cannot make
# the final to_excel() fail after a long training run.
results_dir = f'{path_to_results}text_only/'
os.makedirs(results_dir, exist_ok=True)

for key, value in vect_dict.items():
    print(key)
    # PARAMETERS

    lemmatize = value[0]  # set to false if we want to do stemming
    spacy = value[1]
    # File-name suffix describing the lemmatization variant
    if not lemmatize:
        lemma_tag = ""
    elif spacy:
        lemma_tag = "_lemma_spacy"
    else:
        lemma_tag = "_lemma"

    preprocessing = True  # set to true if we want to clean and perform some preprocessing
    preproc_heavier = True  # set to True if we want a heavier preprocessing
    # File-name suffix describing the preprocessing variant
    preproc_tag_2 = '_heavier' if preproc_heavier else ''
    preproc_tag = f'_preproc{preproc_tag_2}' if preprocessing else preproc_tag_2

    df = pd.read_feather(f'{path_to_processed}df_los{preproc_tag}{lemma_tag}')
    # keep only the target column and the note text
    df = df[[label, 'text']]
    print('Dataframe Loaded')
    # split the data into training and test (stratified on the target, fixed seed)
    train, test = train_test_split(df, train_size=0.80, stratify=df[label], random_state=42)

    # location of the fitted text model for this variant
    save_path = f'{path_to_models}text{preproc_tag}{lemma_tag}'
    os.makedirs(save_path, exist_ok=True)

    # Reuse a previously trained model when one can be loaded; otherwise train.
    # `except Exception` keeps the best-effort fallback but no longer swallows
    # KeyboardInterrupt / SystemExit, and the reason for the fallback is shown.
    try:
        predictor = TextPredictor.load(save_path)
        print("Model Loaded")
    except Exception as load_error:
        print(f"Could not load a saved model from {save_path} ({load_error!r})")
        print("Training Model")
        predictor = TextPredictor(label=label, eval_metric=metric, path=save_path)
        predictor.fit(
            train_data=train,
            hyperparameters={
                "model.hf_text.checkpoint_name": "emilyalsentzer/Bio_ClinicalBERT",
            },
        )
    # evaluate performance on the test set
    per_tab = predictor.evaluate(test, metrics=['roc_auc', 'f1', 'acc'])
    print(f"\nTest set performance:\n{per_tab}")
    # save the class and probability predictions
    y_pred = predictor.predict(test)
    # NOTE(review): takes the second probability column, i.e. assumes a binary
    # target whose positive class is in column 1 - confirm.
    y_pred_proba = predictor.predict_proba(test).iloc[:, 1]
    perf = perf_evaluator(test[label], y_pred, y_pred_proba)
    perf_dict = {"Cohen's Kappa": perf[0], "PRC AUC": perf[1]}
    print(f"\nPerformance metrics:\n{perf_dict}")
    perf_dict.update(per_tab)

    # save performances (one spreadsheet per vectorization method)
    df_perf = pd.DataFrame.from_dict(perf_dict, orient='index', columns=['performances'])
    df_perf.to_excel(f'{results_dir}df_perf_{key}.xlsx')


Without Preprocessing
