In [None]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import sqlite3 # library for working with sqlite database
# Shared SQLite connection; every pd.read_sql call in this notebook passes `conn`.
conn = sqlite3.connect("./data/data.db") # Create a connection to the on-disk database

In [None]:
# Show the schema entry of every table stored in the database
pd.read_sql(sql="SELECT * FROM sqlite_master where type='table'", con=conn)

In [None]:
# Peek at the first ten rows of the admissions table
pd.read_sql(sql="""SELECT * FROM admissions LIMIT 10""", con=conn)


In [None]:
# Load the complete admissions and patients tables into DataFrames
adm_df = pd.read_sql(sql="SELECT * FROM admissions", con=conn)
pat_df = pd.read_sql(sql="SELECT * FROM patients", con=conn)
# Number of rows in the patients table
len(pat_df)

In [None]:
# First five patient rows
pat_df.head(n=5)

In [None]:
# Load every admission, order the rows by patient, and keep only the
# patient id, admission id and HOSPITAL_EXPIRE_FLAG columns.
all_admissions = pd.read_sql(sql="""SELECT * FROM admissions""", con=conn)
print(all_admissions.shape)
all_admissions = all_admissions.sort_values(by='SUBJECT_ID')
info_columns = ['SUBJECT_ID', 'HADM_ID', 'HOSPITAL_EXPIRE_FLAG']
all_admissions_info = all_admissions[info_columns]
all_admissions_info.iloc[5:20]

In [None]:
# Cohort definition: patients who survived their first admission.
#   - patients with exactly one admission are kept only if HOSPITAL_EXPIRE_FLAG == 0
#   - patients with more than one admission are all kept
# The result, `relevant_patients`, is a one-column frame of SUBJECT_IDs.

one_admission = pd.read_sql("""SELECT SUBJECT_ID, HOSPITAL_EXPIRE_FLAG FROM admissions
GROUP BY SUBJECT_ID
HAVING COUNT(SUBJECT_ID) = 1
""", conn)
print("Number of people admitted once: " + str(len(one_admission)))

# Single-admission patients whose only admission has HOSPITAL_EXPIRE_FLAG == 1
expired_mask = one_admission['HOSPITAL_EXPIRE_FLAG'] == 1
one_admission_deaths = one_admission.loc[expired_mask, ['SUBJECT_ID']]
print("People who died on their first admission:")
print(len(one_admission_deaths))
print(one_admission_deaths.head())

# Single-admission patients whose only admission has HOSPITAL_EXPIRE_FLAG == 0
survived_mask = one_admission['HOSPITAL_EXPIRE_FLAG'] == 0
one_admission_survivors = one_admission.loc[survived_mask, ['SUBJECT_ID']]
print("People who only had one admission but survived it: " + str(len(one_admission_survivors)))
print(one_admission_survivors.iloc[0:15])

# Patients that appear in more than one admission row
mult_admissions = pd.read_sql("""SELECT SUBJECT_ID FROM admissions
GROUP BY SUBJECT_ID
HAVING COUNT(SUBJECT_ID) > 1
ORDER BY CAST(SUBJECT_ID AS UNSIGNED) ASC
""", conn)
print("People who had multiple admissions: " + str(len(mult_admissions)))
print(mult_admissions.head())

# Stack both groups, tidy the column names and order by patient id
relevant_patients = (
    pd.concat([one_admission_survivors, mult_admissions], axis=0)
    .rename(columns=str.strip)
    .sort_values('SUBJECT_ID')
)
print("Patients who survived their first visit: " + str(len(relevant_patients)))
relevant_patients.iloc[0:20]

In [None]:
# Restrict the admissions info to the cohort: one row per admission of each relevant patient
all_admissions_for_relevant_patients = relevant_patients.merge(
    all_admissions_info, how='inner', on='SUBJECT_ID'
)
print(len(all_admissions_for_relevant_patients))
all_admissions_for_relevant_patients.iloc[4:20]

In [None]:
# Raw feature tables, one query per source table.
# Each query joins against `patients` on SUBJECT_ID and orders rows by patient id.

# Admission and discharge timestamps per admission
admissions_info = pd.read_sql("""SELECT adm.SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME
                                 FROM admissions adm
                                 INNER JOIN patients pat
                                 ON adm.SUBJECT_ID = pat.SUBJECT_ID
                                 ORDER BY pat.SUBJECT_ID ASC
                                 """, conn)
# Parse both timestamp columns so they can be subtracted later
for time_col in ('DISCHTIME', 'ADMITTIME'):
    admissions_info[time_col] = pd.to_datetime(admissions_info[time_col])

# ICU stays: out time, last care unit and LOS per stay
all_icustays = pd.read_sql("""SELECT icu.SUBJECT_ID, HADM_ID, OUTTIME, LAST_CAREUNIT, LOS
                              FROM icustays icu
                              INNER JOIN patients pat
                              ON icu.SUBJECT_ID = pat.SUBJECT_ID
                              ORDER BY pat.SUBJECT_ID ASC
                              """, conn)

# ICD-9 diagnosis codes per admission
diagnoses_icd_feats = pd.read_sql("""SELECT diag.SUBJECT_ID, HADM_ID, ICD9_CODE
                                  FROM diagnoses_icd diag
                                  INNER JOIN patients pat
                                  ON diag.SUBJECT_ID = pat.SUBJECT_ID
                                  ORDER BY pat.SUBJECT_ID ASC
                                  """, conn)

# DRG severity / mortality per admission. The LEFT JOIN keeps every drgcodes row,
# including rows without a matching patient record.
drgcodes_feats = pd.read_sql("""SELECT drg.SUBJECT_ID, HADM_ID, DRG_SEVERITY, DRG_MORTALITY 
                             FROM drgcodes drg
                             LEFT JOIN patients pat
                             ON drg.SUBJECT_ID = pat.SUBJECT_ID
                             ORDER BY pat.SUBJECT_ID ASC
                             """, conn)

In [None]:
# Per-admission features: hospital length of stay, reduced to each patient's
# FIRST admission, plus a list of admissions that had more than one ICU stay.
admissions_feats= pd.read_sql("""SELECT adm.SUBJECT_ID, HADM_ID, ADMITTIME, DISCHTIME
                                 FROM admissions adm
                                 INNER JOIN patients pat
                                 ON adm.SUBJECT_ID = pat.SUBJECT_ID
                                 ORDER BY pat.SUBJECT_ID ASC
                                 """, conn)

# 'HOS_LOS' is the length of stay in HOSPITAL, in seconds.
# Use .dt.total_seconds(): .dt.seconds only returns the seconds *component*
# of the timedelta (0..86399) and silently drops whole days.
# The timestamps are taken from this frame's own columns rather than from a
# separately queried frame, so the result does not depend on row alignment.
admit_time = pd.to_datetime(admissions_feats['ADMITTIME'])
disch_time = pd.to_datetime(admissions_feats['DISCHTIME'])
admissions_feats['HOS_LOS'] = (disch_time - admit_time).dt.total_seconds()

# Only keep information from the FIRST (earliest ADMITTIME) admission of each patient.
# sort_values returns a new object, so the ordering has to be applied explicitly
# before de-duplicating; a stable sort keeps ties in query order.
chronological_labels = admit_time.sort_values(kind='stable').index
admissions_feats = admissions_feats.loc[chronological_labels]
admissions_feats = admissions_feats.drop_duplicates(subset='SUBJECT_ID', keep='first')
# Restore the original (patient-ordered) row order; the original row labels are kept.
admissions_feats = admissions_feats.sort_index()
admissions_feats = admissions_feats[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'HOS_LOS']]

# Diagnostic only: admissions (HADM_ID) that have more than one ICU stay
multiple_icustays = pd.read_sql("""SELECT icu.SUBJECT_ID, HADM_ID, LAST_CAREUNIT, LOS
                                    FROM icustays icu
                                    INNER JOIN patients pat
                                    ON icu.SUBJECT_ID = pat.SUBJECT_ID
                                    GROUP BY HADM_ID
                                    HAVING COUNT(HADM_ID) > 1
                                    ORDER BY icu.SUBJECT_ID ASC
                                    """, conn)
print(multiple_icustays.shape)
multiple_icustays.head()

In [None]:
# Inspect a few ICU-stay rows by position
all_icustays.iloc[69:74]

In [None]:
# Keep one ICU stay per admission (HADM_ID): the row with the greatest OUTTIME.
# The result is then ordered by patient id.
all_icustays_sorted = all_icustays.sort_values(by='OUTTIME')
icustays_feats = (
    all_icustays_sorted
    .drop_duplicates(subset=['HADM_ID'], keep='last')
    .sort_values(by='SUBJECT_ID')
)
icustays_feats.iloc[67:71]

In [None]:
# Age (in days) at the admission kept in `admissions_feats`
pat_adm_df = pd.read_sql("""SELECT pat.SUBJECT_ID, pat.DOB, adm.ADMITTIME
                            FROM patients pat
                            INNER JOIN admissions adm
                            ON pat.SUBJECT_ID = adm.SUBJECT_ID
                            ORDER BY pat.SUBJECT_ID ASC""", conn)

# Look the date of birth up by SUBJECT_ID and subtract it from the ADMITTIME
# stored on the same admissions_feats row. Assigning a Series computed on
# pat_adm_df would align on row labels and therefore only be correct if two
# independent queries happened to return their rows in the same order.
dob_by_subject = (
    pat_adm_df
    .drop_duplicates(subset='SUBJECT_ID')
    .set_index('SUBJECT_ID')['DOB']
)
first_admit_time = pd.to_datetime(admissions_feats['ADMITTIME'])
birth_time = pd.to_datetime(admissions_feats['SUBJECT_ID'].map(dob_by_subject))
# NOTE(review): datetime64[ns] differences overflow beyond ~292 years; confirm
# the DOB values in this database stay within that range of ADMITTIME.
admissions_feats['age'] = (first_admit_time - birth_time).dt.days

In [None]:
# Size and a positional slice of the admissions timestamps
print(admissions_info.shape)
admissions_info.iloc[3:17]

In [None]:
# Row count and a positional slice of the per-patient admission features
print(len(admissions_feats))
admissions_feats.iloc[4:15]

In [None]:
# Size and first ten rows of the ICU stays
print(all_icustays.shape)
all_icustays.iloc[0:10]

In [None]:
# Size and first five rows of the diagnosis codes
print(diagnoses_icd_feats.shape)
diagnoses_icd_feats.head(n=5)

In [None]:
# Size and a positional slice of the DRG codes
print(drgcodes_feats.shape)
drgcodes_feats.iloc[15:25]

In [None]:
# Attach the ICU columns to every patient's admission features.
# Left join on HADM_ID: admissions without an ICU row keep missing ICU values.
icu_columns = icustays_feats[['HADM_ID', 'LAST_CAREUNIT', 'LOS']]
features_for_all_mat = admissions_feats.merge(icu_columns, how='left', on=['HADM_ID'])
print(features_for_all_mat.shape)

# Restrict to the cohort: left join from relevant_patients keeps exactly
# the RELEVANT patients and pulls in their feature columns.
features_mat = relevant_patients.merge(features_for_all_mat, how='left', on=['SUBJECT_ID'])
print(features_mat.shape)
features_mat.iloc[4:20]

In [None]:
# Truth labels (READM): was the patient readmitted within the readmission
# window after the discharge of their FIRST admission?
#   - one label per relevant patient (patients who survived their first admission)
#   - patients with a single admission are labelled 0
#   - for patients with several admissions, the gap is measured from the first
#     discharge to the second admission, in whole days
READMISSION_WINDOW_DAYS = 30

subj_adm = all_admissions_for_relevant_patients["SUBJECT_ID"].value_counts()
subj_adm = subj_adm.sort_index()

# Patients with more than one admission are the only readmission candidates
mult_adm_subj = subj_adm.index[subj_adm > 1]
subj_adm_idx = list(subj_adm.index)

# Order each candidate's admissions chronologically. admissions_info is only
# ordered by SUBJECT_ID, so "first" and "second" must not be read off the raw
# row order.
multi_adm = admissions_info[admissions_info["SUBJECT_ID"].isin(mult_adm_subj)].copy()
multi_adm["ADMITTIME"] = pd.to_datetime(multi_adm["ADMITTIME"])
multi_adm["DISCHTIME"] = pd.to_datetime(multi_adm["DISCHTIME"])
multi_adm = multi_adm.sort_values(["SUBJECT_ID", "ADMITTIME"])
# 0 for a patient's first admission, 1 for the second, ...
visit_rank = multi_adm.groupby("SUBJECT_ID").cumcount()

# Time between first discharge and second admission, indexed by SUBJECT_ID
first_disc_time = multi_adm.loc[visit_rank == 0].set_index("SUBJECT_ID")["DISCHTIME"]
second_adm_time = multi_adm.loc[visit_rank == 1].set_index("SUBJECT_ID")["ADMITTIME"]
days_to_readm = (second_adm_time - first_disc_time).dt.days

# Missing gaps (patient has fewer than two rows in admissions_info, or a
# missing timestamp) compare as False and therefore stay labelled 0.
readmitted_subjects = days_to_readm.index[days_to_readm <= READMISSION_WINDOW_DAYS]
truth = subj_adm.index.isin(readmitted_subjects).astype(int)

truth_df = pd.DataFrame({'SUBJECT_ID': subj_adm_idx, 'READM': truth})
print(truth_df.head())
print(truth_df.shape)

In [None]:
# Now we have the truth labels, delete the column ADMITTIME from the features_mat.
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone would otherwise raise KeyError.
features_mat = features_mat.drop(columns='ADMITTIME', errors='ignore')

In [None]:
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Design matrix: features joined with the READM truth label on SUBJECT_ID
design_mat = features_mat.merge(truth_df,how='inner',on="SUBJECT_ID")

# Feature columns are everything between the two leading columns and the
# trailing label column. Take an explicit copy: the columns are overwritten
# below, and writing into an iloc slice triggers SettingWithCopyWarning.
X = design_mat.iloc[:,2:-1].copy()

# Encode the care unit as integer codes; missing values become the string 'nan'
# and receive their own code.
LCU = design_mat["LAST_CAREUNIT"]
le = LabelEncoder() # categorical encoder
LCU = le.fit_transform(LCU.astype(str))
X['LAST_CAREUNIT'] = LCU
# Patients without an ICU row get an ICU length of stay of 0
X['LOS'] = pd.to_numeric(X['LOS']).fillna(0)

# normalize feature values: each column is divided by its L2 norm.
# normalize returns an (n, 1) array, so flatten it before assigning the column.
for i in list(X):
    X[i] = normalize(X[i].values.reshape(-1,1),axis=0).ravel()

# make train and test splits (the label is the last column of design_mat)
X_train, X_test, y_train, y_test = train_test_split(X, design_mat.iloc[:,-1],test_size = 0.33, random_state = 0)

print(X.shape)
X.head()

In [None]:
# Compare the sizes of the labelled design matrix and the feature matrix
for frame in (design_mat, features_mat):
    print(frame.shape)

### Classifiers

### Metrics for evaluating models

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split and score it by ROC AUC on the test split
clf = RandomForestClassifier(random_state=0)
clf=clf.fit(X_train,y_train)
# Hard 0/1 class predictions (kept for inspection)
y_pred = clf.predict(X_test)
# The ROC curve needs a continuous score per sample. Feeding it hard class
# predictions yields a single operating point and a degenerate AUC, so use the
# predicted probability of the positive class (label 1) instead.
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
AUC = metrics.auc(fpr, tpr)

In [None]:
# Display the area under the ROC curve computed in the previous cell
AUC