In [33]:
# Reload edited modules (e.g. the local model_util helper) automatically before each cell runs
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# BEFORE YOU RUN THIS NOTEBOOK

## Make sure to read the README and run the process_data.py script according to its instructions

# Set up

In [34]:
# Standard data libraries
import pandas as pd
import numpy as np
# Sklearn models to compare
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Custom utility script for easy model evaluation
from model_util import eval_sklearn_model

In [35]:
# Seed shared by every experiment for reproducibility: models are trained the
# exact same way and train/test are split the exact same way on every run.
RANDOM_SEED = 0

# Iteration cap for logistic regression, set high so the solver can converge.
MAX_ITER = 10_000

## Helper function

In [36]:
# Computes the evaluation report of a logistic regression and a random forest model on the given data
def eval_models(feature_matrix,target_col,random_seed=RANDOM_SEED):
    """Evaluate a logistic regression and a random forest on the same data.

    Each model is scored through ``eval_sklearn_model`` and its report is
    printed before both reports are returned.

    Args:
        feature_matrix: Feature array forwarded unchanged to ``eval_sklearn_model``.
        target_col: Label array forwarded unchanged to ``eval_sklearn_model``.
        random_seed: Seed used to construct both classifiers and forwarded to
            ``eval_sklearn_model`` (presumably it controls the train/test split
            there -- confirm in model_util). Defaults to ``RANDOM_SEED``.

    Returns:
        Tuple ``(lr_clf_report, rf_clf_report)`` holding the two reports exactly
        as returned by ``eval_sklearn_model``.
    """
    print("Logistic Regression Performance:")
    # Use the caller-supplied seed (not the global constant) so that overriding
    # `random_seed` affects model construction as well as the evaluation helper.
    lr_clf = LogisticRegression(random_state=random_seed,max_iter=MAX_ITER,class_weight='balanced')
    lr_clf_report = eval_sklearn_model(lr_clf,feature_matrix,target_col,random_seed)
    print(lr_clf_report)
    print()
    print("Random Forest Performance:")
    # NOTE(review): unlike the logistic regression above, the random forest is
    # trained without class_weight='balanced'; keep that in mind when comparing.
    rf_clf = RandomForestClassifier(random_state=random_seed)
    rf_clf_report = eval_sklearn_model(rf_clf,feature_matrix,target_col,random_seed)
    print(rf_clf_report)

    return lr_clf_report,rf_clf_report

## Extract baseline (demographic) features

In [37]:
# Demographic (baseline) features; the CSV is presumably produced by process_data.py
baseline_df = pd.read_csv("processed_data/baseline_features.csv")

# The label column, and every column that is neither the identifier, the
# DIAGNOSIS column nor the label, which together form the feature set.
baseline_target_column_name = "DIED"
baseline_feature_column_names = [
    col
    for col in baseline_df.columns
    if col != "SUBJECT_ID" and col != "DIAGNOSIS" and col != "DIED"
]

baseline_target_col = baseline_df[baseline_target_column_name].to_numpy()
baseline_feature_matrix = baseline_df[baseline_feature_column_names].to_numpy()

# Model 0 (Dummy Classifiers):  Always predict 1 or always predict 0

We first construct two dummy models, one that always predicts 1 and one that always predicts 0, so that we have baselines to compare against.

In [38]:
# Constant-prediction baselines: one always outputs class 0, the other class 1.
dummy_clf_0, dummy_clf_1 = (
    DummyClassifier(strategy='constant', constant=label, random_state=RANDOM_SEED)
    for label in (0, 1)
)

# Score both dummies on the baseline features/labels, class 0 first and then class 1.
dummy_0_report, dummy_1_report = (
    eval_sklearn_model(clf, baseline_feature_matrix, baseline_target_col, random_state=RANDOM_SEED)
    for clf in (dummy_clf_0, dummy_clf_1)
)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


As we may expect, due to the highly unbalanced nature of the dataset, the dummy model that always predicts the majority class (0) has very good micro-averaged metrics but poor macro-averaged metrics.

In [39]:
# Emit both dummy reports in one call, separated by a blank line.
print(
    "Dummy Classifer (Always Predict 0) Performance: ",
    dummy_0_report,
    "",
    "Dummy Classifer (Always Predict 1) Performance: ",
    dummy_1_report,
    sep="\n",
)

Dummy Classifer (Always Predict 0) Performance: 
{'binary': {'f1': 0.0, 'precision': 0.0, 'recall': 0.0, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}, 'micro': {'f1': 0.8522756492671638, 'precision': 0.8522756492671638, 'recall': 0.8522756492671638, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}, 'macro': {'f1': 0.4601235510515722, 'precision': 0.4261378246335819, 'recall': 0.5, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}}

Dummy Classifer (Always Predict 1) Performance: 
{'binary': {'f1': 0.25742130614988235, 'precision': 0.1477243507328362, 'recall': 1.0, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}, 'micro': {'f1': 0.1477243507328362, 'precision': 0.1477243507328362, 'recall': 0.1477243507328362, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}, 'macro': {'f1': 0.12871065307494117, 'precision': 0.0738621753664181, 'recall': 0.5, 'AUPRC': 0.1477243507328362, 'AUROC': 0.5}}


# Model 1: Logistic Regression (LR) and Random Forest (RF) using Demographic Info (baseline features) Only

We will compare the performance of logistic regression (LR) and random forest (RF) on mortality prediction using only demographic features. While neither model outperforms the always-predict-0 dummy model on micro-averaged metrics, we can see there is still signal in the data, as both models achieve > 0.6 area under the ROC curve. From this experiment it seems LR is a better model than RF, as it outperforms RF on the macro-averaged metrics.

In [40]:
# LR and RF on the demographic features only (default seed)
baseline_lr_report, baseline_rf_report = eval_models(
    feature_matrix=baseline_feature_matrix,
    target_col=baseline_target_col,
)

Logistic Regression Performance:
{'binary': {'f1': 0.317582640438045, 'precision': 0.20703331570597566, 'recall': 0.6814621409921671, 'AUPRC': 0.23256219216327212, 'AUROC': 0.6641100547072685}, 'micro': {'f1': 0.567369503728465, 'precision': 0.567369503728465, 'recall': 0.567369503728465, 'AUPRC': 0.23256219216327212, 'AUROC': 0.6641100547072685}, 'macro': {'f1': 0.500438379042552, 'precision': 0.5577208620571921, 'recall': 0.6145280232793088, 'AUPRC': 0.23256219216327212, 'AUROC': 0.6641100547072685}}

Random Forest Performance:
{'binary': {'f1': 0.025256511444356748, 'precision': 0.13559322033898305, 'recall': 0.01392515230635335, 'AUPRC': 0.19790689062574157, 'AUROC': 0.6255298703996116}, 'micro': {'f1': 0.8412188223193623, 'precision': 0.8412188223193623, 'recall': 0.8412188223193623, 'AUPRC': 0.19790689062574157, 'AUROC': 0.6255298703996116}, 'macro': {'f1': 0.46941319518610164, 'precision': 0.493840996592468, 'recall': 0.49926910805844144, 'AUPRC': 0.19790689062574157, 'AUROC': 0

# Model 2: LR and RF using diagnosis text embedding from DistilBERT

Next we will examine whether the text embeddings obtained by applying DistilBERT to each patient's diagnosis text contain any signal. Note that there is no leakage here: no training was done when the embeddings were produced, so it is safe to produce the text embeddings for the train and test data at the same time.

In [41]:
# DistilBERT embeddings of the diagnosis text, one CSV row per record
distil_bert_text_embedding_df = pd.read_csv("processed_data/distil_bert_text_embedding.csv")

# Label column, plus every column other than the identifier and the label as features
distil_bert_target_column_name = "DIED"
distil_bert_feature_column_names = [
    col
    for col in distil_bert_text_embedding_df.columns
    if col != "SUBJECT_ID" and col != "DIED"
]

In [42]:
# Pull the label vector and the embedding matrix out of the frame as NumPy arrays
distil_bert_target_col = (
    distil_bert_text_embedding_df[distil_bert_target_column_name].to_numpy()
)
distil_bert_feature_matrix = (
    distil_bert_text_embedding_df[distil_bert_feature_column_names].to_numpy()
)

We see a significant performance boost compared to using just the baseline features, which suggests that the diagnosis text contains additional information relevant to a patient's chance of survival.

In [43]:
# LR and RF on the DistilBERT text embeddings only (default seed)
distil_bert_lr_report, distil_bert_rf_report = eval_models(
    feature_matrix=distil_bert_feature_matrix,
    target_col=distil_bert_target_col,
)

Logistic Regression Performance:
{'binary': {'f1': 0.3610261237938338, 'precision': 0.24741935483870967, 'recall': 0.6675369886858138, 'AUPRC': 0.29732463283473315, 'AUROC': 0.7192813022821763}, 'micro': {'f1': 0.650938544613011, 'precision': 0.650938544613011, 'recall': 0.650938544613011, 'AUPRC': 0.29732463283473315, 'AUROC': 0.7192813022821763}, 'macro': {'f1': 0.5604546909762482, 'precision': 0.5828802631397482, 'recall': 0.6577992682152858, 'AUPRC': 0.29732463283473315, 'AUROC': 0.7192813022821763}}

Random Forest Performance:
{'binary': {'f1': 0.0846805234795997, 'precision': 0.36666666666666664, 'recall': 0.047867711053089644, 'AUPRC': 0.27362694699480156, 'AUROC': 0.6979210607819296}, 'micro': {'f1': 0.847132939058884, 'precision': 0.847132939058884, 'recall': 0.847132939058884, 'AUPRC': 0.27362694699480156, 'AUROC': 0.6979210607819296}, 'macro': {'f1': 0.500641447122419, 'precision': 0.6116238419856669, 'recall': 0.5167683705363503, 'AUPRC': 0.27362694699480156, 'AUROC': 0.697

# Model 3: LR and RF using diagnosis text embedding from BlueBERT

BlueBERT is a BERT-based transformer pre-trained on PubMed and other biomedical text data. We will examine whether text embeddings produced by such a domain-adapted transformer give LR and RF better performance than text embeddings produced by the general-purpose DistilBERT.

In [44]:
# BlueBERT embeddings of the diagnosis text, one CSV row per record
blue_bert_text_embedding_df = pd.read_csv("processed_data/blue_bert_text_embedding.csv")

# Label column, plus every column other than the identifier and the label as features
blue_bert_target_column_name = "DIED"
blue_bert_feature_column_names = [
    col
    for col in blue_bert_text_embedding_df.columns
    if col != "SUBJECT_ID" and col != "DIED"
]

In [45]:
# Pull the label vector and the embedding matrix out of the frame as NumPy arrays
blue_bert_target_col = (
    blue_bert_text_embedding_df[blue_bert_target_column_name].to_numpy()
)
blue_bert_feature_matrix = (
    blue_bert_text_embedding_df[blue_bert_feature_column_names].to_numpy()
)

We see a similar but not necessarily better performance when compared to using text embeddings produced by DistilBERT.

In [46]:
# LR and RF on the BlueBERT text embeddings only (default seed)
blue_bert_lr_report, blue_bert_rf_report = eval_models(
    feature_matrix=blue_bert_feature_matrix,
    target_col=blue_bert_target_col,
)

Logistic Regression Performance:
{'binary': {'f1': 0.36212086131048854, 'precision': 0.2466876971608833, 'recall': 0.68059181897302, 'AUPRC': 0.29253613124268774, 'AUROC': 0.7184465598779317}, 'micro': {'f1': 0.6457958344047313, 'precision': 0.6457958344047313, 'recall': 0.6457958344047313, 'AUPRC': 0.29253613124268774, 'AUROC': 0.7184465598779317}, 'macro': {'f1': 0.5584743311624971, 'precision': 0.5835217999693305, 'recall': 0.6601782446803552, 'AUPRC': 0.29253613124268774, 'AUROC': 0.7184465598779317}}

Random Forest Performance:
{'binary': {'f1': 0.08314087759815242, 'precision': 0.36, 'recall': 0.04699738903394256, 'AUPRC': 0.2797361442788494, 'AUROC': 0.7055071992265438}, 'micro': {'f1': 0.8468758035484699, 'precision': 0.84687580354847, 'recall': 0.84687580354847, 'AUPRC': 0.2797361442788494, 'AUROC': 0.7055071992265438}, 'macro': {'f1': 0.49980148319831874, 'precision': 0.6082249606712113, 'recall': 0.5162577833689851, 'AUPRC': 0.2797361442788494, 'AUROC': 0.7055071992265438}}


# Model 4: LR and RF using features from baseline + DistilBERT

Next we combine the baseline features and the text embeddings to see if this would lead to increased performance or if the text embedding features overshadow the performance of the baseline features.

In [47]:
# Join demographic features with DistilBERT embeddings on the shared identifier.
# Both frames carry a DIED column, so the merge suffixes them as DIED_x (from
# baseline_df) and DIED_y (from the embedding frame).
baseline_distil_bert_df = baseline_df.merge(
    distil_bert_text_embedding_df,
    how='inner',
    on="SUBJECT_ID",
)

# Use the baseline copy of the label; drop the identifier, the DIAGNOSIS column
# and every DIED* column from the features.
baseline_distil_bert_target_column_name = "DIED_x"
baseline_distil_bert_feature_column_names = [
    col
    for col in baseline_distil_bert_df.columns
    if col != "SUBJECT_ID" and col != "DIAGNOSIS" and "DIED" not in col
]

In [48]:
# Label vector and combined (baseline + DistilBERT) feature matrix as NumPy arrays
baseline_distil_target_col = (
    baseline_distil_bert_df[baseline_distil_bert_target_column_name].to_numpy()
)
baseline_distil_feature_matrix = (
    baseline_distil_bert_df[baseline_distil_bert_feature_column_names].to_numpy()
)

We observe an improvement over both using the baseline features alone and using the text embeddings alone. This suggests that the text data complements the baseline features to a certain degree. We also note that this improvement is only observed in LR but not in RF.

In [49]:
# LR and RF on baseline features combined with DistilBERT embeddings (default seed)
baseline_distil_lr_report, baseline_distil_rf_report = eval_models(
    feature_matrix=baseline_distil_feature_matrix,
    target_col=baseline_distil_target_col,
)

Logistic Regression Performance:
{'binary': {'f1': 0.3889573576534385, 'precision': 0.2713204951856946, 'recall': 0.6866840731070496, 'AUPRC': 0.32890150128693285, 'AUROC': 0.7487755557804993}, 'micro': {'f1': 0.6812805348418617, 'precision': 0.6812805348418617, 'recall': 0.6812805348418617, 'AUPRC': 0.32890150128693285, 'AUROC': 0.7487755557804993}, 'macro': {'f1': 0.5866866969152487, 'precision': 0.598699261966564, 'recall': 0.6835140081932894, 'AUPRC': 0.32890150128693285, 'AUROC': 0.7487755557804993}}

Random Forest Performance:
{'binary': {'f1': 0.18084473527662104, 'precision': 0.2857142857142857, 'recall': 0.13228894691035684, 'AUPRC': 0.2501866384085179, 'AUROC': 0.6755888787314122}, 'micro': {'f1': 0.8229622010799691, 'precision': 0.8229622010799691, 'recall': 0.8229622010799691, 'AUPRC': 0.2501866384085179, 'AUROC': 0.6755888787314122}, 'macro': {'f1': 0.5408007460166889, 'precision': 0.574060565435117, 'recall': 0.5374825334944, 'AUPRC': 0.2501866384085179, 'AUROC': 0.675588

# Model 5: LR and RF using features from baseline + BlueBERT

We perform a similar experiment as above except using BlueBERT's text embeddings. 

In [50]:
# Join demographic features with BlueBERT embeddings on the shared identifier.
# Both frames carry a DIED column, so the merge suffixes them as DIED_x (from
# baseline_df) and DIED_y (from the embedding frame).
baseline_blue_bert_df = baseline_df.merge(
    blue_bert_text_embedding_df,
    how='inner',
    on="SUBJECT_ID",
)

# Use the baseline copy of the label; drop the identifier, the DIAGNOSIS column
# and every DIED* column from the features.
baseline_blue_bert_target_column_name = "DIED_x"
baseline_blue_bert_feature_column_names = [
    col
    for col in baseline_blue_bert_df.columns
    if col != "SUBJECT_ID" and col != "DIAGNOSIS" and "DIED" not in col
]

In [51]:
# Label vector and combined (baseline + BlueBERT) feature matrix as NumPy arrays
baseline_blue_target_col = (
    baseline_blue_bert_df[baseline_blue_bert_target_column_name].to_numpy()
)
baseline_blue_feature_matrix = (
    baseline_blue_bert_df[baseline_blue_bert_feature_column_names].to_numpy()
)

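Combining the baseline features with BlueBERT embeddings gives results very similar to combining them with DistilBERT embeddings.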

In [52]:
# LR and RF on baseline features combined with BlueBERT embeddings (default seed)
baseline_blue_lr_report, baseline_blue_rf_report = eval_models(
    feature_matrix=baseline_blue_feature_matrix,
    target_col=baseline_blue_target_col,
)

Logistic Regression Performance:
{'binary': {'f1': 0.3889573576534385, 'precision': 0.2713204951856946, 'recall': 0.6866840731070496, 'AUPRC': 0.3201713445958627, 'AUROC': 0.7487017051038104}, 'micro': {'f1': 0.6812805348418617, 'precision': 0.6812805348418617, 'recall': 0.6812805348418617, 'AUPRC': 0.3201713445958627, 'AUROC': 0.7487017051038104}, 'macro': {'f1': 0.5866866969152487, 'precision': 0.598699261966564, 'recall': 0.6835140081932894, 'AUPRC': 0.3201713445958627, 'AUROC': 0.7487017051038104}}

Random Forest Performance:
{'binary': {'f1': 0.17671641791044776, 'precision': 0.2813688212927757, 'recall': 0.1288076588337685, 'AUPRC': 0.2514841805225723, 'AUROC': 0.6785698202678029}, 'micro': {'f1': 0.8227050655695551, 'precision': 0.8227050655695551, 'recall': 0.8227050655695551, 'AUPRC': 0.2514841805225723, 'AUROC': 0.6785698202678029}, 'macro': {'f1': 0.5386859951377756, 'precision': 0.5716689666309438, 'recall': 0.5358927417716889, 'AUPRC': 0.2514841805225723, 'AUROC': 0.678569

# Final Performance Table

Now we define a helper function that groups all these scores together in a dataframe 

In [53]:
def convert_reports_to_df(model_name_list,report_list):
    """Flatten nested metric reports into one DataFrame row per model.

    Each report is a two-level mapping ``{group: {metric: value}}``. The first
    report defines the column layout: a leading ``"model"`` column followed by
    one ``"{group}_{metric}"`` column per entry, in the first report's order.

    Args:
        model_name_list: Display name for each report, in the same order.
        report_list: Non-empty list of nested report mappings.

    Returns:
        pandas.DataFrame with one row per (model name, report) pair.

    Raises:
        AssertionError: If ``report_list`` is empty or the two lists differ in length.
        ValueError: If a report's (group, metric) pairs differ from the first report's.
    """
    assert len(report_list) > 0 and len(model_name_list) == len(report_list)

    # The first report fixes both the column names and the order values are read in.
    layout = [(key, metric) for key in report_list[0] for metric in report_list[0][key]]
    expected_pairs = set(layout)
    metric_names = ["model"] + [f"{key}_{metric}" for key, metric in layout]

    rows = []
    for model_name, report in zip(model_name_list, report_list):
        found_pairs = {(key, metric) for key in report for metric in report[key]}
        if found_pairs != expected_pairs:
            raise ValueError(
                f"report for {model_name!r} does not have the same metrics as the first report"
            )
        # Look values up by name rather than by the report's own iteration order,
        # so a report with differently ordered keys still lands in the right columns.
        rows.append([model_name] + [report[key][metric] for key, metric in layout])

    to_return_df = pd.DataFrame(columns=metric_names,data=rows)
    return to_return_df

In [54]:
# One table per model family, each built from parallel (name, report) lists.
dummy_model_names = ["Dummy (Always Predict 0)", "Dummy (Always Predict 1)"]
dummy_reports = [dummy_0_report, dummy_1_report]
dummy_report_df = convert_reports_to_df(dummy_model_names, dummy_reports)

lr_model_names = [
    "LR + Baseline",
    "LR + DistilBERT",
    "LR + BlueBERT",
    "LR + Baseline + DistilBERT",
    "LR + Baseline + BlueBERT",
]
lr_reports = [
    baseline_lr_report,
    distil_bert_lr_report,
    blue_bert_lr_report,
    baseline_distil_lr_report,
    baseline_blue_lr_report,
]
lr_report_df = convert_reports_to_df(lr_model_names, lr_reports)

rf_model_names = [
    "RF + Baseline",
    "RF + DistilBERT",
    "RF + BlueBERT",
    "RF + Baseline + DistilBERT",
    "RF + Baseline + BlueBERT",
]
rf_reports = [
    baseline_rf_report,
    distil_bert_rf_report,
    blue_bert_rf_report,
    baseline_distil_rf_report,
    baseline_blue_rf_report,
]
rf_report_df = convert_reports_to_df(rf_model_names, rf_reports)

# Stack dummies, then LR, then RF, and renumber the rows from 0.
final_report_df = pd.concat(
    [dummy_report_df, lr_report_df, rf_report_df],
    ignore_index=True,
)

In [55]:
# Bare last expression so the notebook renders the combined performance table
final_report_df

Unnamed: 0,model,binary_f1,binary_precision,binary_recall,binary_AUPRC,binary_AUROC,micro_f1,micro_precision,micro_recall,micro_AUPRC,micro_AUROC,macro_f1,macro_precision,macro_recall,macro_AUPRC,macro_AUROC
0,Dummy (Always Predict 0),0.0,0.0,0.0,0.147724,0.5,0.852276,0.852276,0.852276,0.147724,0.5,0.460124,0.426138,0.5,0.147724,0.5
1,Dummy (Always Predict 1),0.257421,0.147724,1.0,0.147724,0.5,0.147724,0.147724,0.147724,0.147724,0.5,0.128711,0.073862,0.5,0.147724,0.5
2,LR + Baseline,0.317583,0.207033,0.681462,0.232562,0.66411,0.56737,0.56737,0.56737,0.232562,0.66411,0.500438,0.557721,0.614528,0.232562,0.66411
3,LR + DistilBERT,0.361026,0.247419,0.667537,0.297325,0.719281,0.650939,0.650939,0.650939,0.297325,0.719281,0.560455,0.58288,0.657799,0.297325,0.719281
4,LR + BlueBERT,0.362121,0.246688,0.680592,0.292536,0.718447,0.645796,0.645796,0.645796,0.292536,0.718447,0.558474,0.583522,0.660178,0.292536,0.718447
5,LR + Baseline + DistilBERT,0.388957,0.27132,0.686684,0.328902,0.748776,0.681281,0.681281,0.681281,0.328902,0.748776,0.586687,0.598699,0.683514,0.328902,0.748776
6,LR + Baseline + BlueBERT,0.388957,0.27132,0.686684,0.320171,0.748702,0.681281,0.681281,0.681281,0.320171,0.748702,0.586687,0.598699,0.683514,0.320171,0.748702
7,RF + Baseline,0.025257,0.135593,0.013925,0.197907,0.62553,0.841219,0.841219,0.841219,0.197907,0.62553,0.469413,0.493841,0.499269,0.197907,0.62553
8,RF + DistilBERT,0.084681,0.366667,0.047868,0.273627,0.697921,0.847133,0.847133,0.847133,0.273627,0.697921,0.500641,0.611624,0.516768,0.273627,0.697921
9,RF + BlueBERT,0.083141,0.36,0.046997,0.279736,0.705507,0.846876,0.846876,0.846876,0.279736,0.705507,0.499801,0.608225,0.516258,0.279736,0.705507


In [56]:
# Persist the combined table to the working directory, without the row index
final_report_df.to_csv(
    path_or_buf='lr_rf_report_df.csv',
    index=False,
)