In [None]:
import numpy as np
import os
import pandas as pd

## Load the Data

In [None]:
mimic_data_path = "/home/littlefield/mimic-data/mimiciii/1.4/"
def get_mimic_dataset(table_name, data_path=None):
    """Load one MIMIC-III table from its CSV file into a DataFrame.

    Parameters
    ----------
    table_name : str
        Table name without the ".csv" extension, e.g. "ADMISSIONS".
    data_path : str, optional
        Directory that holds the CSV files. When omitted, the module-level
        ``mimic_data_path`` is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame or None
        The loaded table, or None when the CSV file does not exist (a message
        naming the missing file is printed in that case).
    """
    if data_path is None:
        data_path = mimic_data_path
    # os.path.join works whether or not the directory ends with a separator
    csv_path = os.path.join(data_path, table_name + ".csv")
    try:
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Unable to load data table", table_name, "from", csv_path)
        return None

In [None]:
# Load the NOTEEVENTS table; get_mimic_dataset returns None (after printing a
# message) if the CSV file is not found, so `notes` may be None here.
notes = get_mimic_dataset("NOTEEVENTS")

In [None]:
# Load the ADMISSIONS table; get_mimic_dataset returns None (after printing a
# message) if the CSV file is not found, so `admissions` may be None here.
admissions = get_mimic_dataset("ADMISSIONS")

## Preprocessing

#### Convert Dates

In [None]:
# Parse the admission, discharge and death timestamps into datetimes.
# Values that do not match the expected layout are coerced to NaT.
for time_col in ('ADMITTIME', 'DISCHTIME', 'DEATHTIME'):
    admissions[time_col] = pd.to_datetime(admissions[time_col], format = '%Y-%m-%d %H:%M:%S', errors = 'coerce')

#### Find Next Unplanned Admission Event

In [None]:
# Order admissions chronologically within each subject, then renumber the rows 0..n-1
admissions = admissions.sort_values(by = ['SUBJECT_ID','ADMITTIME']).reset_index(drop = True)

In [None]:
# Shift within each subject so every row sees that subject's following admission.
# Grouping keeps the shift from pulling values across subject boundaries.
by_subject = admissions.groupby('SUBJECT_ID')

# time of the following admission
admissions['NEXT_ADMITTIME'] = by_subject['ADMITTIME'].shift(-1)

# type of the following admission
admissions['NEXT_ADMISSION_TYPE'] = by_subject['ADMISSION_TYPE'].shift(-1)

#### Filter Out Elective Admissions and Back Fill NAs

In [None]:
# Rows whose following admission is ELECTIVE: blank out the next-admission
# fields so these admissions are not treated as readmission events.
rows = admissions.NEXT_ADMISSION_TYPE == 'ELECTIVE'
admissions.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
# np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0
admissions.loc[rows,'NEXT_ADMISSION_TYPE'] = np.nan

In [None]:
# sort by subject_ID and admission date
# it is safer to sort right before the fill in case something changed the order above
admissions = admissions.sort_values(['SUBJECT_ID','ADMITTIME'])
# Back fill within each subject: a row whose next-admission fields were blanked
# picks up the values of the nearest later row of the same subject that still has them.
# GroupBy.bfill() replaces the deprecated groupby(...).fillna(method='bfill').
next_cols = ['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']
admissions[next_cols] = admissions.groupby('SUBJECT_ID')[next_cols].bfill()

#### Calculate Number of Days till Next Admission

In [None]:
# Gap between this discharge and the next admission, in fractional days
admissions['DAYS_NEXT_ADMIT'] = admissions['NEXT_ADMITTIME'].sub(admissions['DISCHTIME']).dt.total_seconds() / (24 * 60 * 60)

#### Filter to Use Discharge Notes Only

In [None]:
# Keep only the rows whose CATEGORY is exactly 'Discharge summary'
notes_dis = notes[notes['CATEGORY'].eq('Discharge summary')]

In [None]:
# Take the last discharge summary (in current row order) of each subject/admission pair
notes_dis_last = notes_dis.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)
notes_dis_last = notes_dis_last.reset_index()
# Each admission must now appear at most once
assert not notes_dis_last.duplicated(['HADM_ID']).any(), 'Multiple discharge summaries per admission'

#### Merge ADMISSIONS and NOTEEVENTS Tables

In [None]:
# Attach the discharge-summary text to each admission. The left join keeps
# every admission, and TEXT is NaN where no summary was found.
adm_notes = admissions[['SUBJECT_ID','HADM_ID','ADMITTIME','DISCHTIME','DAYS_NEXT_ADMIT','NEXT_ADMITTIME','ADMISSION_TYPE','DEATHTIME']].merge(
    notes_dis_last[['SUBJECT_ID','HADM_ID','TEXT']],
    on = ['SUBJECT_ID','HADM_ID'],
    how = 'left',
)
# The join must not change the number of admission rows
assert len(adm_notes) == len(admissions), 'Number of rows increased'

#### Calculate Fraction of Admissions Missing Discharge Notes

In [None]:
# Share of admissions that have no discharge-summary text
adm_notes['TEXT'].isna().sum() / adm_notes.shape[0]

In [None]:
# Share of admissions without discharge-summary text, per admission type.
# The mean of the boolean "is missing" flag within each group equals
# (missing count) / (group size), so one vectorised grouped mean replaces the
# earlier groupby.apply + second groupby. The result Series is named 'TEXT'.
adm_notes['TEXT'].isnull().groupby(adm_notes['ADMISSION_TYPE']).mean()

In [None]:
# Drop NEWBORN admissions, and copy so later column assignments do not touch adm_notes
adm_notes_clean = adm_notes[adm_notes['ADMISSION_TYPE'].ne('NEWBORN')].copy()

#### Create Output Label: Patients who are readmitted within 30 days

In [None]:
# 1 when the next admission falls within 30 days of discharge, else 0.
# Rows with no next admission (NaN gap) compare False and get 0.
adm_notes_clean['OUTPUT_LABEL'] = adm_notes_clean['DAYS_NEXT_ADMIT'].lt(30).astype('int')

In [None]:
# Report the class balance of OUTPUT_LABEL
print('Number of positive samples:', adm_notes_clean['OUTPUT_LABEL'].eq(1).sum())
print('Number of negative samples:', adm_notes_clean['OUTPUT_LABEL'].eq(0).sum())
print('Total samples:', adm_notes_clean.shape[0])

#### Generate Training/Validation/Test Sets

In [None]:
# Shuffle every row with a fixed seed and renumber the rows 0..n-1
adm_notes_clean = adm_notes_clean.sample(frac = 1, random_state = 42).reset_index(drop = True)

# Hold out 30% of the rows for validation + test
valid_test = adm_notes_clean.sample(frac = 0.30, random_state = 42)

# Everything that was not held out is training data
train = adm_notes_clean.drop(valid_test.index)

# Half of the held-out rows become the test set, the remainder is validation
test = valid_test.sample(frac = 0.5, random_state = 42)
valid = valid_test.drop(test.index)

# Positive-label rate of each split
print('Test prevalence(n = %d):'%len(test), test['OUTPUT_LABEL'].sum() / len(test))
print('Valid prevalence(n = %d):'%len(valid), valid['OUTPUT_LABEL'].sum() / len(valid))
print('Train all prevalence(n = %d):'%len(train), train['OUTPUT_LABEL'].sum() / len(train))
print('all samples (n = %d)'%len(adm_notes_clean))

#### Prevalence is low, subsample negatives in training set

In [None]:
# Separate the training rows by label
rows_pos = train['OUTPUT_LABEL'].eq(1)
train_pos = train[rows_pos]
train_neg = train[~rows_pos]

# Keep every positive row and draw an equally sized random sample of negatives
train_sub = pd.concat(
    [train_pos, train_neg.sample(n = train_pos.shape[0], random_state = 42)],
    axis = 0,
)

# Shuffle the balanced set and renumber its rows
train_sub = train_sub.sample(frac = 1, random_state = 42).reset_index(drop = True)

print('Train prevalence (n = %d):'%len(train_sub), train_sub['OUTPUT_LABEL'].sum() / len(train_sub))

#### Preprocess Notes: Replace NaNs with a single space, and replace new lines and carriage returns with spaces

In [None]:
def preprocess_text(df):
    """Clean the TEXT column of ``df``.

    Missing values become a single space, and every newline and
    carriage-return character is replaced by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TEXT`` column of strings (NaN allowed).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object: the TEXT column is overwritten in place and
        the frame is returned for convenience.
    """
    text = df['TEXT'].fillna(' ')
    # regex=False forces literal replacement, so behaviour does not depend on
    # the pandas version's default for `regex`
    text = text.str.replace('\n', ' ', regex=False)
    text = text.str.replace('\r', ' ', regex=False)
    df['TEXT'] = text
    return df

In [None]:
# Clean the TEXT column of every split, in the order train, train_sub, valid, test
train, train_sub, valid, test = (
    preprocess_text(split) for split in (train, train_sub, valid, test)
)

#### Save training, valid, and test sets

In [None]:
data_pth = "/home/littlefield/MIMIC-NLP/readmission-prediction/data"
# makedirs also creates any missing parent directories; exist_ok=True makes it
# a no-op when the directory already exists, with no separate exists() check.
os.makedirs(data_pth, exist_ok=True)

In [None]:
# Write each split to its own CSV file under data_pth (the index is written as the first column)
for frame, file_name in (
    (train, "/train_complete.csv"),
    (train_sub, "/train_subsample.csv"),
    (valid, "/valid.csv"),
    (test, "/test.csv"),
):
    frame.to_csv(data_pth + file_name)

In [None]:
# Mark which split each row belongs to (this adds a column to train_sub and valid)
train_sub["is_valid"] = False
valid["is_valid"] = True

# Stack the balanced training rows on top of the validation rows.
# pd.concat replaces the earlier outer merge on every shared column, which
# produced the same rows but had to join on the full note text and re-sorted
# them by key; rows now stay in train-then-valid order.
fast_form = pd.concat([train_sub, valid], ignore_index=True)
fast_form.to_csv(data_pth + "/train_valid_fastai.csv")