# Classification - BERT

In [1]:
# Install/upgrade the packaging tools and AutoGluon from inside the notebook
# (each line is a shell escape run through `!`, not a terminal step).
# NOTE(review): autogluon is installed unpinned although the trailing comment
# mentions 0.4.1 - confirm which version the stored results were produced with.
!pip install -U pip
!pip install -U setuptools wheel
!pip install autogluon  # autogluon==0.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [2]:
# Import libraries
import numpy as np
import pandas as pd
import os
import random
import dill
import pickle
from tabulate import tabulate

import sys

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn import metrics

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import auc
from sklearn.metrics import cohen_kappa_score
from sklearn.inspection import permutation_importance
from autogluon.text import TextPredictor
import torch
from transformers import BertTokenizerFast, BertModel, BertConfig, BertForPreTraining, BertForSequenceClassification, pipeline
import time
from datetime import timedelta

In [3]:
# Detect whether the notebook runs on Google Colab: the import below only
# succeeds when the `google.colab` package is available.
try:
    from google.colab import drive
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; a bare `except:` would also
    # hide unrelated failures (and KeyboardInterrupt).
    IN_COLAB = False

if IN_COLAB:
    print("We're running Colab")

    # Mount the Google Drive at mount
    mount = '/content/gdrive'
    print("Colab: mounting Google drive on ", mount)
    # connect your colab with the drive
    drive.mount(mount)

    # Repository folder on the mounted Google Drive
    path_to_repo = mount + "/My Drive/MIMIC-III Text Mining/mimim_iii_readmission"
else:
    # Outside Colab the repository root is taken to be the parent of the
    # current working directory.
    path_to_repo = os.path.dirname(os.getcwd())

print(path_to_repo)

We're running Colab
Colab: mounting Google drive on  /content/gdrive
Mounted at /content/gdrive
/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission


In [4]:
# PARAMETERS
# --- split sizes ---
test_proportion = 0.2
val_proportion = 0.1

# --- session switches ---
session_seed = 42      # seed for our session
include_val = False    # True -> also create a validation set
tune_models = True     # True -> perform parameter tuning
halving = True         # True -> perform halving tune

# --- dataset / preprocessing variant ---
icu_stays = True       # True -> keep only ICU stays
lemmatize = True       # False -> stemming instead of lemmatization
heavier_proc = True    # True -> heavier processing
spacy = True
med_7 = False          # True -> use our Med7 preprocessing
feat_select = False    # True -> use Lasso as a feature selection method
expanded_def = True    # True -> consider future readmissions and avoid using CMS

random.seed(session_seed)

# --- tags derived from the switches (used to build file and folder names) ---
if not lemmatize:
    lemma_tag = ""
elif spacy:
    lemma_tag = "_lemma_spacy"
else:
    lemma_tag = "_lemma"

heavier_tag = '_heavier' if heavier_proc else ''
seed_tag = '_' + str(session_seed)
tune_tag = ('_tuned_halv' if halving else '_tuned') if tune_models else ''
med_tag = "_med7" if med_7 else ''
feat_tag = "_featselect" if feat_select else ''

# --- data sub-folder for the selected cohort ---
if not icu_stays:
    icu_folder = 'all_hosp'
elif expanded_def:
    icu_folder = 'expanded'
else:
    icu_folder = 'icu_only'

In [5]:
# Data folder of the selected cohort; the empty last component makes the
# resulting path end with a separator.
path_to_data = os.path.join(
    path_to_repo,
    "data",
    icu_folder,
    "",
)
print(path_to_data)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/expanded/


In [6]:
# Folder for processed data (path ends with a separator because of the empty
# last component).
path_to_processed = os.path.join(path_to_data, "processed", "")
# exist_ok=True: no error if the folder is already there
os.makedirs(name=path_to_processed, exist_ok=True)
print(path_to_processed)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/expanded/processed/


In [7]:
# Folder where fitted models are stored (path ends with a separator because of
# the empty last component).
path_to_models = os.path.join(path_to_data, "models", "")
# exist_ok=True: no error if the folder is already there
os.makedirs(name=path_to_models, exist_ok=True)
print(path_to_models)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/expanded/models/


## Train/Test split

In [8]:
# Read the cleaned dataset whose file name matches the preprocessing tags
# chosen in the parameters cell.
df = pd.read_feather(
    os.path.join(path_to_data, "df_cleaned" + lemma_tag + med_tag + heavier_tag)
)

In [9]:
# Keep only the label column and the cleaned text column.
df = df.loc[:, ['target', 'clean']]

In [10]:
# Hold out the test set, stratified on the target column
train, test = train_test_split(
    df,
    test_size=test_proportion,
    random_state=session_seed,
    stratify=df['target'],
)

if include_val:
    # Further split the training data into train and validation
    train, val = train_test_split(
        train,
        test_size=val_proportion,
        random_state=session_seed,
        stratify=train['target'],
    )
else:
    # Empty string is the placeholder for "no validation set"
    val = ''

## Train the Models

In [11]:
# Model parameters: name of the label column and the evaluation metric
# handed to the predictor.
label, metric = 'target', 'roc_auc'

In [12]:
# compute other metrics
def perf_evaluator(y_test, y_pred, y_pred_proba):
    """Compute Cohen's kappa and the area under the precision-recall curve.

    Nothing is printed or plotted; the two values are returned to the caller.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted class labels, compared with ``y_test`` for the kappa score.
    y_pred_proba : array-like
        Predicted scores passed to ``precision_recall_curve``.

    Returns
    -------
    tuple
        ``(kappa, prc_auc)``.
    """
    kappa = cohen_kappa_score(y_test, y_pred)
    # The thresholds returned as third element are not needed for the AUC.
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    # Area under the curve with recall on the x axis and precision on the y axis.
    prc_auc = auc(recall, precision)
    return kappa, prc_auc

In [16]:
# Folder where the text predictor is saved to and reloaded from.
# NOTE(review): lemma_tag appears twice in the folder name; it is kept as-is so
# that models saved under the existing name are still found.
save_path = path_to_models + f'text{seed_tag}{lemma_tag}{med_tag}{heavier_tag}{lemma_tag}'
os.makedirs(name=save_path, exist_ok=True)
print(save_path)

/content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/expanded/models/text_42_lemma_spacy_heavier_lemma_spacy


In [17]:
# Reuse a predictor previously saved at save_path; if loading fails, train a
# new one and store it there.
try:
    predictor = TextPredictor.load(save_path)
    print("Model Loaded")
except Exception as load_error:
    # `Exception` (instead of a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the cell does not kick off training.
    # The reason for the failed load is reported rather than silently dropped.
    print(f"Could not load a saved model from {save_path}: {load_error!r}")
    print("Training Model")
    predictor = TextPredictor(label=label, eval_metric=metric, path=save_path)
    # NOTE(review): no seed is passed to fit(); the captured log shows
    # "Global seed set to 123", not session_seed - confirm this is intended.
    predictor.fit(
        train_data=train,
        hyperparameters={
            # Text backbone checkpoint used for fine-tuning
            "model.hf_text.checkpoint_name": "emilyalsentzer/Bio_ClinicalBERT",
        },
    )

INFO:pytorch_lightning.utilities.seed:Global seed set to 123


Training Model


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading /usr/local/lib/python3.8/dist-packages/autogluon/multimodal/data/templates.zip from https://automl-mm-bench.s3.amazonaws.com/few_shot/templates.zip...


INFO:pytorch_lightning.trainer.connectors.accelerator_connector:Auto select gpus: [0]
INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name              | Type                         | Params
-------------------------------------------------------------------
0 | model             | HFAutoModelForTextPrediction | 108 M 
1 | validation_metric | AUROC                        | 0     
2 | loss_func         | CrossEntropyLoss             | 0     
-------------------------------------------------

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

INFO:automm:Models and intermediate outputs are saved to /content/gdrive/My Drive/MIMIC-III Text Mining/mimim_iii_readmission/data/expanded/models/text_42_lemma_spacy_heavier_lemma_spacy 


In [None]:
# evaluate performance on the test set
per_tab = predictor.evaluate(test, metrics=['roc_auc','f1','acc'])
print(f"\nTest set performance:\n{per_tab}")

# save the class and probability predictions
y_pred = predictor.predict(test)
# second column of predict_proba - presumably the positive class; verify
y_pred_proba = predictor.predict_proba(test).iloc[:, 1]

# Use the configured label column: the data frame was reduced to the columns
# 'target' and 'clean', so indexing it with 'los_cat' raised a KeyError.
perf = perf_evaluator(test[label], y_pred, y_pred_proba)
perf_dict = {"Cohen's Kappa": perf[0], "PRC AUC": perf[1]}
print(f"\nPerformance metrics:\n{perf_dict}")
# merge in the metrics returned by predictor.evaluate
perf_dict.update(per_tab)

# save performances next to the model, reusing save_path instead of rebuilding
# the same folder name by hand
df_perf = pd.DataFrame.from_dict(perf_dict, orient='index', columns=['performances'])
df_perf.to_excel(os.path.join(save_path, 'df_perf.xlsx'))