In [None]:
import os
from collections import Counter

import numpy as np
import pandas as pd

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Integer code for each supported resampling strategy name.
# NOTE(review): code 2 is skipped and nothing in the visible cells reads this
# mapping — confirm whether it is still needed.
sampling_options = dict(
    SUBSAMPLE=0,
    SMOTE=1,
    UNDER=3,
    OVER=4,
    UNDER_OVER=5,
    NONE=6,
)

## Load the Data

In [None]:
# Directory prefix that the table CSV files are read from.
mimic_data_path = "/home/littlefield/mimic-data/mimiciii/1.4/"


def get_mimic_dataset(table_name):
    """Read ``<mimic_data_path><table_name>.csv`` into a DataFrame.

    Parameters
    ----------
    table_name : str
        File name without the ``.csv`` extension, e.g. ``"ADMISSIONS"``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` (after printing a message) when the
        CSV file does not exist.
    """
    csv_path = mimic_data_path + table_name + ".csv"
    try:
        table = pd.read_csv(csv_path, low_memory=False)
    except FileNotFoundError:
        print("Unable to load data table", table_name, "from", csv_path)
        return None
    return table

In [None]:
# Load the NOTEEVENTS table; `notes` is None if the CSV file was not found.
notes = get_mimic_dataset("NOTEEVENTS")

In [None]:
# Preview only the first rows: the table holds free-text clinical notes, so
# rendering the whole frame bloats the notebook and exposes more note text
# than is needed for a sanity check.
notes.head()

In [None]:
# Load the ADMISSIONS table; `admissions` is None if the CSV file was not found.
admissions = get_mimic_dataset("ADMISSIONS")

## Preprocessing

#### Convert Dates

In [None]:
# Parse the admission, discharge and death timestamps. Values that do not
# match the expected format are coerced to NaT instead of raising.
for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):
    admissions[time_col] = pd.to_datetime(
        admissions[time_col],
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )

#### Find Next Unplanned Admission Event

In [None]:
# Order rows by subject, then by admission time, and renumber the index so
# that consecutive rows of one subject are consecutive admissions.
admissions = (
    admissions
    .sort_values(['SUBJECT_ID', 'ADMITTIME'])
    .reset_index(drop=True)
)

In [None]:
# Shift within each subject so that a row never picks up the admission of a
# different patient; the last admission of a subject gets NaT / NaN.
by_subject = admissions.groupby('SUBJECT_ID')
next_admit_time = by_subject['ADMITTIME'].shift(-1)
next_admit_type = by_subject['ADMISSION_TYPE'].shift(-1)

# time and type of the subject's following admission
admissions['NEXT_ADMITTIME'] = next_admit_time
admissions['NEXT_ADMISSION_TYPE'] = next_admit_type

#### Filter Out Elective Admissions and Back Fill NAs

In [None]:
# Blank out the "next admission" fields wherever the following admission is
# elective, so that planned readmissions are not counted; the later back
# fill then pulls in the next non-elective admission instead.
rows = admissions.NEXT_ADMISSION_TYPE == 'ELECTIVE'
admissions.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
admissions.loc[rows,'NEXT_ADMISSION_TYPE'] = np.nan

In [None]:
# sort by subject_ID and admission date
# it is safer to sort right before the fill in case something changed the order above
admissions = admissions.sort_values(['SUBJECT_ID','ADMITTIME'])

# Back fill within each subject: rows blanked because their next admission was
# elective take the values of the subject's next remaining admission.
# GroupBy.bfill() replaces fillna(method='bfill'), which is deprecated in
# recent pandas releases.
next_admission_cols = ['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']
admissions[next_admission_cols] = admissions.groupby('SUBJECT_ID')[next_admission_cols].bfill()

#### Calculate Number of Days till Next Admission

In [None]:
# Time from this discharge to NEXT_ADMITTIME, expressed in (fractional) days.
# Rows without a next admission yield NaN.
SECONDS_PER_DAY = 24*60*60
gap_to_next_admit = admissions.NEXT_ADMITTIME - admissions.DISCHTIME
admissions['DAYS_NEXT_ADMIT'] = gap_to_next_admit.dt.total_seconds()/SECONDS_PER_DAY

#### Filter to Use Discharge Notes Only

In [None]:
# Keep only the notes whose CATEGORY is exactly 'Discharge summary'.
is_discharge_summary = notes['CATEGORY'] == 'Discharge summary'
notes_dis = notes.loc[is_discharge_summary]

In [None]:
# Take the last discharge-summary row (in current row order) for every
# (subject, admission) pair, then verify each admission appears only once.
last_note_per_admission = notes_dis.groupby(['SUBJECT_ID', 'HADM_ID']).nth(-1)
notes_dis_last = last_note_per_admission.reset_index()

n_duplicate_admissions = notes_dis_last.duplicated(['HADM_ID']).sum()
assert n_duplicate_admissions == 0, 'Multiple discharge summaries per admission'

#### Merge ADMISSIONS and NOTEEVENTS Tables

In [None]:
# Left-join the discharge summary text onto the admissions so that admissions
# without a note are kept (their TEXT is NaN).
admission_cols = [
    'SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
    'DAYS_NEXT_ADMIT', 'NEXT_ADMITTIME', 'ADMISSION_TYPE', 'DEATHTIME',
]
note_cols = ['SUBJECT_ID', 'HADM_ID', 'TEXT']

adm_notes = admissions[admission_cols].merge(
    notes_dis_last[note_cols],
    how='left',
    on=['SUBJECT_ID', 'HADM_ID'],
)

# a left join must not add rows when the right side is unique per key
assert len(adm_notes) == len(admissions), 'Number of rows increased'

#### Calculate Fraction of Admissions Missing Discharge Notes

In [None]:
# Share of admissions that have no discharge summary text.
n_missing_text = adm_notes['TEXT'].isnull().sum()
n_missing_text / len(adm_notes)

In [None]:
# Same missing-text share, broken down by admission type.
by_admission_type = adm_notes.groupby('ADMISSION_TYPE')
missing_per_type = by_admission_type.apply(lambda grp: grp.TEXT.isnull().sum())
missing_per_type/by_admission_type.size()

In [None]:
# Drop NEWBORN admissions; copy so later column assignments do not touch adm_notes.
not_newborn = adm_notes['ADMISSION_TYPE'] != 'NEWBORN'
adm_notes_clean = adm_notes.loc[not_newborn].copy()

#### Create Output Label: Patients who are readmitted within 30 days

In [None]:
# Label is 1 when the next admission starts less than 30 days after discharge.
# Rows with a missing DAYS_NEXT_ADMIT compare as False and therefore get 0.
readmitted_within_30d = adm_notes_clean['DAYS_NEXT_ADMIT'] < 30
adm_notes_clean['OUTPUT_LABEL'] = readmitted_within_30d.astype('int')

In [None]:
# Report the class balance of the labelled admissions.
labels = adm_notes_clean.OUTPUT_LABEL
n_positive = (labels == 1).sum()
n_negative = (labels == 0).sum()

print('Number of positive samples:', n_positive)
print('Number of negative samples:', n_negative)
print('Total samples:', len(adm_notes_clean))

#### Generate Training/Validation/Test Sets

In [None]:
# Preview only the first rows instead of rendering the whole frame, which
# contains the full discharge-summary text of every admission.
adm_notes_clean.head()

In [None]:
# Shuffle every row (fixed seed) and renumber the index.
adm_notes_clean = (
    adm_notes_clean
    .sample(n = len(adm_notes_clean), random_state = 42)
    .reset_index(drop = True)
)

# Keep only admissions with no recorded death time.
no_death_recorded = adm_notes_clean.DEATHTIME.isnull()
adm_notes_clean = adm_notes_clean[no_death_recorded]

# Hold out 30% of the rows, then split that hold-out evenly into test and
# validation; index labels are unique, so drop() removes exactly those rows.
valid_test = adm_notes_clean.sample(frac = 0.30, random_state = 42)
test = valid_test.sample(frac = 0.5, random_state = 42)
valid = valid_test.drop(test.index)

# Everything that was not held out is training data.
train = adm_notes_clean.drop(valid_test.index)

# Fraction of positive labels in each split.
test_prevalence = test.OUTPUT_LABEL.sum()/ len(test)
valid_prevalence = valid.OUTPUT_LABEL.sum()/ len(valid)
train_prevalence = train.OUTPUT_LABEL.sum()/ len(train)

print('Test prevalence(n = %d):'%len(test), test_prevalence)
print('Valid prevalence(n = %d):'%len(valid), valid_prevalence)
print('Train all prevalence(n = %d):'%len(train), train_prevalence)
print('all samples (n = %d)'%len(adm_notes_clean))

In [None]:
def fix_imbalance(train, method="SUBSAMPLE"):
    """Rebalance the training set with the named strategy and return the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Training rows; must provide ``TEXT`` and ``OUTPUT_LABEL`` columns.
    method : str
        One of ``"SUBSAMPLE"``, ``"UNDER"``, ``"OVER"``, ``"UNDER_OVER"``,
        ``"NONE"`` or ``"SMOTE"`` (not implemented).

    Returns
    -------
    pandas.DataFrame
        ``"NONE"`` returns ``train`` itself, ``"SUBSAMPLE"`` returns whatever
        ``subsample`` produces (all columns of ``train``); the remaining
        strategies return a frame with only ``TEXT`` and ``OUTPUT_LABEL``.

    Raises
    ------
    NotImplementedError
        For ``"SMOTE"``: no SMOTE helper exists in this notebook.
    ValueError
        For an unrecognised ``method``.
    """
    if method == "NONE":
        return train
    if method == "SUBSAMPLE":
        return subsample(train)
    if method == "SMOTE":
        raise NotImplementedError(
            "SMOTE is not implemented: it needs numeric features, not raw note text"
        )
    if method == "UNDER_OVER":
        return under_over_sample(train.TEXT, train.OUTPUT_LABEL)

    # undersample/oversample take (X, y) and hand back an (X_res, y_res) pair.
    if method == "UNDER":
        X_res, y_res = undersample(train.TEXT, train.OUTPUT_LABEL)
    elif method == "OVER":
        X_res, y_res = oversample(train.TEXT, train.OUTPUT_LABEL)
    else:
        raise ValueError("Unknown sampling method: %r" % (method,))

    # X_res is a single-column 2-D array; flatten it back into a TEXT column.
    return pd.DataFrame({
        "TEXT": np.asarray(X_res).reshape(-1),
        "OUTPUT_LABEL": np.asarray(y_res),
    })

In [None]:
def subsample(train, pos_class=1, random_state=42):
    """Balance ``train`` by randomly dropping rows of the non-positive class.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with an ``OUTPUT_LABEL`` column.
    pos_class : int
        Label value treated as the positive class.
    random_state : int
        Seed used both for picking the negatives and for the final shuffle.

    Returns
    -------
    pandas.DataFrame
        All positive rows plus an equally sized random sample of the other
        rows, shuffled, with a fresh 0..n-1 index. If there are fewer
        negatives than positives, all negatives are kept.
    """
    # split the training data into positive and negative
    rows_pos = train.OUTPUT_LABEL == pos_class
    train_pos = train.loc[rows_pos]
    train_neg = train.loc[~rows_pos]

    # Cap the sample size: sampling more rows than exist without replacement
    # raises ValueError.
    n_neg = min(len(train_pos), len(train_neg))

    # merge the balanced data
    whole_df = pd.concat([train_pos, train_neg.sample(n = n_neg, random_state = random_state)],axis = 0)

    # shuffle the order of training samples
    train_sub = whole_df.sample(n = len(whole_df), random_state = random_state).reset_index(drop = True)

    return train_sub

In [None]:
def undersample(X, y, samp_rate=0.5, random_state=42):
    """Randomly under-sample with imbalanced-learn's ``RandomUnderSampler``.

    Parameters
    ----------
    X : array-like
        One value per sample; it is reshaped into a single-column 2-D array.
    y : array-like
        Class label per sample.
    samp_rate : float
        Passed to the sampler as ``sampling_strategy``.
    random_state : int
        Seed for the sampler (previously ignored in favour of a fixed 42).

    Returns
    -------
    tuple
        ``(X_res, y_res)`` exactly as returned by ``fit_resample``.
    """
    rus = RandomUnderSampler(random_state=random_state, sampling_strategy=samp_rate)
    X_res, y_res = rus.fit_resample(np.array(X).reshape(-1, 1), y)
    return X_res, y_res

In [None]:
def oversample(X, y, samp_rate=0.5, random_state=42):
    """Randomly over-sample with imbalanced-learn's ``RandomOverSampler``.

    Parameters
    ----------
    X : array-like
        One value per sample; it is reshaped into a single-column 2-D array.
    y : array-like
        Class label per sample.
    samp_rate : float
        Passed to the sampler as ``sampling_strategy``.
    random_state : int
        Seed for the sampler (previously ignored in favour of a fixed 42).

    Returns
    -------
    tuple
        ``(X_res, y_res)`` exactly as returned by ``fit_resample``.
    """
    ros = RandomOverSampler(random_state=random_state, sampling_strategy=samp_rate)
    X_res, y_res = ros.fit_resample(np.array(X).reshape(-1, 1), y)
    return X_res, y_res

In [None]:
def under_over_sample(X, y, under_samp_rate=0.15, over_samp_rate=0.75, random_state=42):
    """Under-sample, then over-sample, and return the result as a DataFrame.

    Parameters
    ----------
    X : array-like
        One value per sample; it is reshaped into a single-column 2-D array.
    y : array-like
        Class label per sample.
    under_samp_rate : float
        ``sampling_strategy`` for the ``RandomUnderSampler`` step.
    over_samp_rate : float
        ``sampling_strategy`` for the ``RandomOverSampler`` step.
    random_state : int
        Seed shared by both samplers.

    Returns
    -------
    pandas.DataFrame
        Columns ``TEXT`` and ``OUTPUT_LABEL``; missing values are replaced
        with an empty string.
    """
    features = np.array(X).reshape(-1, 1)

    # The under-sampling step runs first, then the over-sampling step.
    resampler = Pipeline(steps = [
        ('under', RandomUnderSampler(sampling_strategy=under_samp_rate, random_state=random_state)),
        ('over', RandomOverSampler(sampling_strategy=over_samp_rate, random_state=random_state)),
    ])
    X_res, y_res = resampler.fit_resample(features, y)

    # squeeze() turns the single feature column back into a 1-D array
    resampled = pd.DataFrame(data = {"TEXT": X_res.squeeze(), "OUTPUT_LABEL": y_res})
    return resampled.fillna("")

In [None]:
# Rebalance the training split only (default under/over rates); the result
# keeps just the TEXT and OUTPUT_LABEL columns.
train_sampled = under_over_sample(train.TEXT, train.OUTPUT_LABEL)

In [None]:
# Class counts in the resampled training set.
Counter(train_sampled.OUTPUT_LABEL)

#### Prevalence is low, so the training set was resampled above; check the resulting prevalence

In [None]:
# Prevalence of the positive class in the resampled training set.
# (`data` was never defined in this notebook; `train_sampled` is the frame
# produced by the resampling cell.)
print('Train prevalence (n = %d):'%len(train_sampled), train_sampled.OUTPUT_LABEL.sum()/ len(train_sampled))

#### Preprocess Notes: Replace new lines and carriage returns with spaces, and replace NaNs with ' '

In [None]:
def preprocess_text(df):
    """Normalise the ``TEXT`` column of ``df`` in place and return ``df``.

    Missing values become a single space, and every newline or carriage
    return character is replaced by a space.
    """
    df.TEXT = (
        df.TEXT
        .fillna(' ')
        .str.replace('\n', ' ')
        .str.replace('\r', ' ')
    )
    return df

In [None]:
# Clean the note text of all three splits (train first, then valid, then test).
train_sampled, valid, test = (
    preprocess_text(split_frame)
    for split_frame in (train_sampled, valid, test)
)

#### Save training, valid, and test sets

In [None]:
# Output directory for the exported CSV files.
data_pth = "/home/littlefield/MIMIC-NLP/readmission-prediction/data/"
# makedirs also creates missing parent directories and, with exist_ok=True,
# is a no-op when the directory is already there (no separate exists() check).
os.makedirs(data_pth, exist_ok=True)

In [None]:
from clinical_note_utils import extract_notes

In [None]:
# extract_notes(data_pth, "train", train_sub[train_sub.OUTPUT_LABEL == 0], ds_type="train", label="neg")
# extract_notes(data_pth, "train", train_sub[train_sub.OUTPUT_LABEL == 1], ds_type="train", label="pos")
# extract_notes(data_pth, "valid", valid[valid.OUTPUT_LABEL == 0], ds_type="valid", label="neg")
# extract_notes(data_pth, "valid", valid[valid.OUTPUT_LABEL == 1], ds_type="valid", label="pos")
# extract_notes(data_pth, "test", test[test.OUTPUT_LABEL == 0], ds_type="test", label="neg")
# extract_notes(data_pth, "test", test[test.OUTPUT_LABEL == 1], ds_type="test", label="pos")
# extract_notes(data_pth, "unsup", not_used, ds_type="unsup", label="None")

In [None]:
# Write the text and label of every split to its own CSV file, without the index.
export_cols = ["TEXT", "OUTPUT_LABEL"]
for split_frame, csv_name in (
    (train_sampled, "train_under-oversampled.csv"),
    (valid, "valid.csv"),
    (test, "test.csv"),
):
    split_frame[export_cols].to_csv(data_pth + csv_name, index=False)