In [None]:
import numpy as np
import pandas as pd
from fastai import *
from fastai.text import *

### Load the data

In [None]:
# Root data folder; later cells read the 'train', 'valid', 'unsup' and 'test' sub-folders from it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = "/home/littlefield/MIMIC-NLP/readmission-prediction/data/"
# Batch size used for both DataBunches and for scaling the classifier learning rates.
bs = 64

In [None]:
# Language-model DataBunch: text files under `path`, restricted to the train/valid/unsup
# folders, split by folder name and labelled for language modelling.
# NOTE(review): split_by_folder() is called with its defaults — confirm that files under
# 'unsup' actually end up in the training split rather than being dropped.
clinical_lm = (TextList.from_folder(path)
               .filter_by_folder(include=['train', 'valid', 'unsup'])
               .split_by_folder()
               .label_for_lm().databunch(bs=bs))

In [None]:
# Classifier DataBunch: built with the language model's vocabulary (presumably so token ids
# match the encoder loaded into the classifier later); split by folder name, with labels
# taken from the 'neg'/'pos' folder names.
clinical_data = (TextList.from_folder(path, vocab=clinical_lm.vocab)
                .split_by_folder()
                .label_from_folder(classes=['neg', 'pos'])
                .databunch(bs=bs, num_workers=1))

In [None]:
# Persist both processed DataBunches so that load_data() in the next cell (and in later
# sessions) finds the export files; without this a fresh run fails on the load.
clinical_lm.save('lm_export.pkl')
clinical_data.save('clas_export.pkl')

In [None]:
# Reload the exported DataBunches from `path` with the configured batch size
# (the two export files must already exist on disk).
clinical_lm = load_data(path, 'lm_export.pkl', bs=bs)
clinical_data = load_data(path, 'clas_export.pkl', bs=bs)

### 1. Build a language model using the clinical notes

In [None]:
# Build language model

In [None]:
# Language-model learner over the clinical notes: AWD_LSTM architecture, drop_mult=0.3,
# converted with to_fp16().
learn = language_model_learner(clinical_lm, AWD_LSTM, drop_mult=0.3).to_fp16()

In [None]:
# Learning-rate search for the language model; the result is plotted in the next cell.
learn.lr_find()

In [None]:
# Plot what the recorder captured during the lr_find() run above.
learn.recorder.plot()

In [None]:
# First language-model stage (before learn.unfreeze() is called): 10 one-cycle epochs at 1e-3.
# Slow — intended to be run once; the weights are saved in the next cell.
learn.fit_one_cycle(10, 1e-3)

In [None]:
# Checkpoint the language model after the first training stage.
learn.save("clinical_lm-step1")

In [None]:
# Unfreeze the model before the second training stage.
learn.unfreeze()

In [None]:
# Repeat the learning-rate search now that the model has been unfrozen, and plot it.
learn.lr_find()
learn.recorder.plot()

In [None]:
# Second language-model stage on the unfrozen model: 6 one-cycle epochs with the learning
# rate given as slice(1e-4, 1e-2). Slow — intended to be run once; saved in the cells below.
learn.fit_one_cycle(6, slice(1e-4,1e-2))

In [None]:
# Checkpoint the language model after the second training stage.
learn.save('clinical_lm-step2')

In [None]:
# Save the encoder separately; the classifier section loads it by this name via load_encoder().
learn.save_encoder("clinical_lm-step2_encoder")

### 2. Build a classifier to predict readmission

In [None]:
# Readmission classifier on the labelled notes, reporting accuracy, AUROC, precision and
# recall, converted with to_fp16(). Its encoder is then initialised from the one saved
# at the end of the language-model section.
learn = text_classifier_learner(
    clinical_data,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, AUROC(), Precision(), Recall()],
).to_fp16()
learn.load_encoder('clinical_lm-step2_encoder')

In [None]:
# Base learning rate 1e-2, scaled by batch size relative to 48; shown as the cell output.
lr = 1e-2 * (bs / 48)
lr

In [None]:
# First classifier stage: 6 one-cycle epochs at the batch-size-scaled lr, with pct_start=0.15.
learn.fit_one_cycle(6, lr, pct_start=0.15)

In [None]:
import sklearn.metrics as metrics
def score_positive_class(inp, preds, thresh=0.5):
    """Summarise binary-classification quality for the positive class.

    Parameters
    ----------
    inp : array-like
        Ground-truth labels.
    preds : array-like of float
        Scores for the positive class, in the same order as `inp`.
    thresh : float, default 0.5
        A score strictly greater than `thresh` counts as a positive prediction.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns "auc", "f1", "precision" and "recall".
        AUC is computed from the raw scores; the other three from the thresholded
        predictions.
    """
    y_true = np.asarray(inp)
    y_score = np.asarray(preds)
    # Vectorised thresholding instead of a Python-level loop over every element.
    y_pred = (y_score > thresh).astype(int)

    return pd.DataFrame(
        {
            "auc": metrics.roc_auc_score(y_true, y_score),
            "f1": metrics.f1_score(y_true, y_pred),
            "precision": metrics.precision_score(y_true, y_pred),
            "recall": metrics.recall_score(y_true, y_pred),
        },
        index=[0],
    )

In [None]:
# Predictions on the training data (DatasetType.Fix).
# ordered=True: text learners otherwise return predictions in the sampler's (length-sorted)
# order, which would not line up with train_ds.y.items when scoring.
train_preds = learn.get_preds(ds_type=DatasetType.Fix, ordered=True)

In [None]:
# Score the training predictions; column 1 is the score for class index 1
# ('pos', given classes=['neg', 'pos']).
# NOTE(review): assumes train_preds is in the same order as train_ds.y.items — confirm.
score_positive_class(learn.data.train_ds.y.items, train_preds[0][:, 1])

In [None]:
# Predictions on the validation set.
# ordered=True: text learners otherwise return predictions in the sampler's (length-sorted)
# order, which would not line up with valid_ds.y.items when scoring.
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)

In [None]:
# Score the validation predictions; column 1 is the score for class index 1
# ('pos', given classes=['neg', 'pos']).
# NOTE(review): assumes preds is in the same order as valid_ds.y.items — confirm.
score_positive_class(learn.data.valid_ds.y.items, preds[0][:, 1])

In [None]:
# Attach the files under the 'test' folder to the learner's data as a test set.
test = (TextList.from_folder(path).filter_by_folder("test"))
learn.data.add_test(test)
# ordered=True: text learners otherwise return predictions in the sampler's (length-sorted)
# order rather than dataset order, so they would not line up with externally built labels.
t_preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
# Test labels as stored in a CSV, read relative to the notebook's working directory.
output = pd.read_csv("./data/test_lbls.csv")

In [None]:
# One 0/1 label per test file, in the order fast.ai read the files:
# 1 when the file's path contains "pos", otherwise 0.
test_labels = [1 if "pos" in str(item) else 0 for item in test.items]

In [None]:
# Score against test_labels, which follow the order fast.ai read the test files; the CSV's
# row order is not guaranteed to match that order. Same call as the final test scoring.
score_positive_class(np.array(test_labels), t_preds[0][:, 1])

In [None]:
# Checkpoint the stage-1 classifier so the load in the next cell restores these weights
# rather than failing or picking up a stale file from an earlier session.
learn.save("clinical_classifier_step1")

In [None]:
# Restore the classifier checkpoint named 'clinical_classifier_step1' (must exist on disk).
learn.load("clinical_classifier_step1")

In [None]:
# Same base learning rate as the first stage (1e-02), scaled by batch size relative to 48.
lr = 1e-02 * (bs / 48)

In [None]:
# Second classifier stage: 4 more one-cycle epochs at the scaled learning rate.
learn.fit_one_cycle(4, lr)

In [None]:
# Training-set scores after stage 2.
# ordered=True keeps predictions in dataset order so they line up with train_ds.y.items;
# text learners otherwise return them in the sampler's (length-sorted) order.
train_preds = learn.get_preds(ds_type=DatasetType.Fix, ordered=True)
score_positive_class(learn.data.train_ds.y.items, train_preds[0][:, 1])

In [None]:
# Validation-set scores after stage 2.
# ordered=True keeps predictions in dataset order so they line up with valid_ds.y.items;
# text learners otherwise return them in the sampler's (length-sorted) order.
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
score_positive_class(learn.data.valid_ds.y.items, preds[0][:, 1])

In [None]:
# Test-set scores after stage 2.
# ordered=True keeps predictions in dataset order (text learners otherwise return them in
# the sampler's length-sorted order); test_labels follow the order fast.ai read the test
# files, whereas the CSV's row order is not guaranteed to match.
t_preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)
score_positive_class(np.array(test_labels), t_preds[0][:, 1])

In [None]:
# Checkpoint the stage-2 classifier so the load in the next cell restores these weights
# rather than failing or picking up a stale file from an earlier session.
learn.save("clinical_classifier_step2")

In [None]:
# Restore the classifier checkpoint named 'clinical_classifier_step2' (must exist on disk).
learn.load("clinical_classifier_step2")

In [None]:
# Unfreeze the classifier and run a learning-rate search for the next training stage;
# the plot is requested with suggestion=True.
learn.unfreeze()
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# Starting value 1.32e-06 (NOTE(review): presumably read off the lr_find plot above — confirm),
# scaled by batch size relative to 48 as in the earlier stages; shown as the cell output.
lr = 1.32e-06 * (bs / 48)
lr

In [None]:
# Third classifier stage: 4 one-cycle epochs on the unfrozen model at the scaled learning rate.
learn.fit_one_cycle(4, lr)

In [None]:
# learn.save("clinical_classifier_step3")

In [None]:
# Training-set scores after stage 3.
# ordered=True keeps predictions in dataset order so they line up with train_ds.y.items;
# text learners otherwise return them in the sampler's (length-sorted) order.
train_preds = learn.get_preds(ds_type=DatasetType.Fix, ordered=True)
score_positive_class(learn.data.train_ds.y.items, train_preds[0][:, 1])

In [None]:
# Validation-set scores after stage 3.
# ordered=True keeps predictions in dataset order so they line up with valid_ds.y.items;
# text learners otherwise return them in the sampler's (length-sorted) order.
preds = learn.get_preds(ds_type=DatasetType.Valid, ordered=True)
score_positive_class(learn.data.valid_ds.y.items, preds[0][:, 1])

In [None]:
# Test-set predictions after stage 3.
# ordered=True keeps them in dataset order so they line up with labels built from
# test.items; text learners otherwise return them in the sampler's (length-sorted) order.
t_preds = learn.get_preds(ds_type=DatasetType.Test, ordered=True)

In [None]:
# Final test-set scores, using the labels built in the order fast.ai read the test files.
# NOTE(review): assumes t_preds follows the same order as test.items — confirm.
score_positive_class(np.array(test_labels), t_preds[0][:, 1])