In [None]:
import sqlite3 # library for working with sqlite database
from contextlib import closing
from datetime import datetime as dt

import numpy as np
import pandas as pd

### GETTING TABLES

In [None]:
# Load the raw tables from the on-disk SQLite databases.
# `closing` guarantees both connections are released once the frames are in memory,
# even if one of the queries raises (a sqlite3 connection used directly as a context
# manager only manages the transaction; it does not close the connection).
with closing(sqlite3.connect("./data/lab_events.db")) as conn2, \
        closing(sqlite3.connect("./data/labevents_common.db")) as conn3:
    # Lab events are read from the `labevents_common` table. Disabled alternative:
    #lab_events_df = pd.read_sql("""SELECT * FROM lab_events""", conn2)
    lab_events_df = pd.read_sql("""SELECT * FROM labevents_common""", conn3)

    # Lab item dictionary; its ITEMID and LABEL columns are merged in during cleaning
    lab_items_df = pd.read_sql("""SELECT * FROM lab_items""", conn2)

In [None]:
# Coerce the id and measurement columns to numeric dtypes.
# HADM_ID is additionally cast to int so it can serve as an integer merge key.
lab_events_df["SUBJECT_ID"] = lab_events_df["SUBJECT_ID"].pipe(pd.to_numeric)
lab_events_df["HADM_ID"] = lab_events_df["HADM_ID"].pipe(pd.to_numeric).astype(int)
lab_events_df["VALUENUM"] = lab_events_df["VALUENUM"].pipe(pd.to_numeric)

### CLEAN DATA

In [None]:
# Clean the lab events in one pass:
#   1. drop rows whose VALUENUM is missing,
#   2. keep only non-negative VALUENUM readings,
#   3. attach the human-readable LABEL of each ITEMID from the lab item table.
lab_events_df = (
    lab_events_df
    .dropna(subset=["VALUENUM"])
    .loc[lambda frame: frame["VALUENUM"].ge(0)]
    .merge(lab_items_df[["ITEMID", "LABEL"]], on="ITEMID")
)

### TRUTH LABELS

In [None]:
# TRUTH.csv layout (one header row, comma separated):
#   column 1 -> SUBJECT_ID
#   column 2 -> TRUTH
#   column 3 -> HADM_ID
truth_array = np.loadtxt("TRUTH.csv", skiprows=1, delimiter=',')

# Wrap the raw array in a dataframe and cast every column from float to int
truth_df = pd.DataFrame(
    truth_array,
    columns=['SUBJECT_ID', 'TRUTH', 'HADM_ID'],
).astype({"SUBJECT_ID": int, "TRUTH": int, "HADM_ID": int})
#truth_df = truth_df.loc[truth_df["TRUTH"]==1]

### GET RELEVANT ADMISSIONS

In [None]:
# Keep only the lab events whose admission (HADM_ID) appears in the truth table.
# Both frames carry a SUBJECT_ID column, so the result holds SUBJECT_ID_x (lab events)
# and SUBJECT_ID_y (truth table).
relevant_lab_events_df = pd.merge(lab_events_df, truth_df, how='inner', on='HADM_ID')

### GET FEATURES

In [None]:
# Sorted unique measurement labels and subject ids among the relevant lab events.
# `.to_numpy()` replaces `.get_values()`, which was deprecated in pandas 0.25 and
# removed in pandas 1.0.
# SUBJECT_ID_x is the lab-events side of the HADM_ID merge with the truth table.
meas_type = np.unique(relevant_lab_events_df["LABEL"].to_numpy())
subjects = np.unique(relevant_lab_events_df["SUBJECT_ID_x"].to_numpy())

In [None]:
# Aggregate every measurement type per admission: one row per (HADM_ID, LABEL) pair.
# Despite the "AVG" column name, the statistic computed here is the median of VALUENUM.
avg_df = (
    relevant_lab_events_df
    .groupby(['HADM_ID', 'LABEL'], as_index=False)['VALUENUM']
    .median()
    .rename(index=str, columns={"VALUENUM": "AVG"})
)

In [None]:
# Build the design matrix: the truth table plus one "<LABEL>_AVG" column per
# measurement type in meas_type, holding that admission's aggregated value from avg_df.
# A single pivot followed by one merge replaces one merge per measurement type.
meas_wide_df = (
    avg_df
    .pivot(index="HADM_ID", columns="LABEL", values="AVG")
    .reindex(columns=meas_type)          # same column set and order as meas_type
    .add_suffix("_AVG")
    .rename_axis(None, axis="columns")   # drop the leftover "LABEL" columns name
    .reset_index()
)

# Left merge keeps every row of truth_df; admissions without a given measurement get NaN
design_mat_df = truth_df.merge(meas_wide_df, how='left', on='HADM_ID')


#### Missing Values

In [None]:
# For every "<LABEL>_AVG" feature add a companion "<LABEL>_AVG_missing" column:
# 1 where the feature is NaN for that row, 0 otherwise.
for meas in meas_type:
    avg_col = meas + "_AVG"
    missing_col = meas + "_AVG_missing"
    design_mat_df[missing_col] = design_mat_df[avg_col].isna().astype(int)


In [None]:
# Median imputation: replace each remaining NaN with the median of its own column.
# NOTE(review): the medians are computed over all rows, i.e. before the train/test split.
column_medians = design_mat_df.median()
design_mat_df = design_mat_df.fillna(value=column_medians)

In [None]:
#design_mat_df.iloc[:,3:len(design_mat_df.columns)].isna().sum().sort_values()

In [None]:
# Feature matrix: every column after the first three (SUBJECT_ID, TRUTH, HADM_ID).
# `.to_numpy()` replaces `.get_values()`, which was removed in pandas 1.0.
X = design_mat_df.iloc[:, 3:].to_numpy()

# Target vector
y = design_mat_df["TRUTH"].to_numpy()

In [None]:
import random

# Dedicated, seeded generator so the downsampling and shuffle below are reproducible,
# matching the fixed random_state=0 used for the train/test split and the classifier.
balance_rng = random.Random(0)

# Get negative and positive examples
X0 = X[y == 0, :]
y0 = y[y == 0]
X1 = X[y == 1, :]
y1 = y[y == 1]

# Downsample the negatives to the number of positives (without replacement).
# Assumes there are at least as many negatives as positives; otherwise sample() raises
# ValueError.
rand_idx = balance_rng.sample(range(len(y0)), len(y1))
X0_subset = X0[rand_idx, :]
y0_subset = y0[rand_idx]

# Recreate X and y as a balanced data set
X = np.concatenate((X0_subset, X1))
y = np.concatenate((y0_subset, y1))

# Shuffle rows so the classes are interleaved (same permutation for X and y)
rand_idx2 = balance_rng.sample(range(len(y)), len(y))
X = X[rand_idx2, :]
y = y[rand_idx2]

### CLASSIFICATION

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
from sklearn.preprocessing import normalize

# Normalization is currently disabled: both calls below are commented out.
# NOTE: the intent is to normalize based on the training data, but as written the
# train and test sets would each be normalized independently. This needs to change
# before the calls are re-enabled.
#X_train = normalize(X_train)
#X_test = normalize(X_test)

In [None]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split (fixed seed for repeatability)
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

# Hard class predictions on the held-out split
y_pred = clf.predict(X_test)

# The ROC curve must be built from continuous scores, not hard 0/1 predictions:
# with hard labels the curve has a single operating point and the "AUC" reduces to
# balanced accuracy. Use the predicted probability of the positive class (label 1).
positive_col = list(clf.classes_).index(1)
y_score = clf.predict_proba(X_test)[:, positive_col]

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
AUC = metrics.auc(fpr, tpr)

In [None]:
# Show the area under the ROC curve as the cell output
AUC