In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import sqlite3 # library for working with sqlite database
# Open one connection per on-disk SQLite database used by this notebook.
data_db_path = "./data/data.db"
lab_events_db_path = "./data/lab_events.db"
conn = sqlite3.connect(data_db_path)
conn2 = sqlite3.connect(lab_events_db_path)  # queried below for lab_events / lab_items

### GETTING TABLES

In [2]:
# List every table registered in the database behind conn2.
pd.read_sql(
    sql="SELECT * FROM sqlite_master where type='table'",
    con=conn2,
)

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,table,lab_events,lab_events,2,"CREATE TABLE lab_events(\n ""ROW_ID"" TEXT,\n ..."
1,table,lab_items,lab_items,7592,"CREATE TABLE lab_items(\n ""ROW_ID"" TEXT,\n ""..."


In [3]:
# Pull the whole lab_events table into a DataFrame, then preview the first rows.
lab_events_df = pd.read_sql(sql="SELECT * FROM lab_events", con=conn2)
lab_events_df.head(n=5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,516,3,145834,50809,10/20/01 19:14,140.0,140.0,mg/dL,abnormal
1,585,3,145834,50809,10/20/01 20:04,265.0,265.0,mg/dL,abnormal
2,593,3,145834,50825,10/20/01 20:04,36.7,36.7,,
3,599,3,145834,50809,10/20/01 21:51,267.0,267.0,mg/dL,abnormal
4,605,3,145834,50817,10/20/01 21:51,99.0,99.0,,


In [4]:
# Pull the whole lab_items table (ITEMID -> LABEL dictionary) and preview it.
lab_items_df = pd.read_sql(sql="SELECT * FROM lab_items", con=conn2)
lab_items_df.head(n=5)

Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,550,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,


In [5]:
# The table schema declares columns as TEXT, so parse the ones used
# numerically below into numeric dtypes.
for numeric_col in ("SUBJECT_ID", "HADM_ID", "VALUENUM"):
    lab_events_df[numeric_col] = pd.to_numeric(lab_events_df[numeric_col])

# HADM_ID is used as an integer join key later on.
# NOTE(review): astype(int) raises if any HADM_ID is missing -- confirm the table has none.
lab_events_df["HADM_ID"] = lab_events_df["HADM_ID"].astype(int)


### CLEAN DATA

In [6]:
# Preview the first five lab events.
lab_events_df.head(n=5)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG
0,516,3,145834,50809,10/20/01 19:14,140.0,140.0,mg/dL,abnormal
1,585,3,145834,50809,10/20/01 20:04,265.0,265.0,mg/dL,abnormal
2,593,3,145834,50825,10/20/01 20:04,36.7,36.7,,
3,599,3,145834,50809,10/20/01 21:51,267.0,267.0,mg/dL,abnormal
4,605,3,145834,50817,10/20/01 21:51,99.0,99.0,,


In [7]:
# Keep only rows with a usable measurement: VALUENUM present and non-negative.
has_value = lab_events_df["VALUENUM"].notna()
is_non_negative = lab_events_df["VALUENUM"] >= 0
lab_events_df = lab_events_df.loc[has_value & is_non_negative, :]

# Attach the human-readable LABEL for each ITEMID. This is an inner join, so
# events whose ITEMID does not appear in lab_items_df are dropped.
lab_events_df = lab_events_df.merge(lab_items_df[['ITEMID', 'LABEL']], on='ITEMID')

### TRUTH LABELS

In [8]:
# TRUTH.csv columns, in order: SUBJECT_ID, TRUTH, HADM_ID (the header row is skipped).
truth_columns = ['SUBJECT_ID', 'TRUTH', 'HADM_ID']
truth_array = np.loadtxt("TRUTH.csv", delimiter=',', skiprows=1)

# Wrap the raw array in a DataFrame and cast all three columns to int at once.
truth_df = pd.DataFrame(truth_array, columns=truth_columns).astype(int)

### GET RELEVANT ADMISSIONS

In [9]:
# Restrict the lab events to admissions that have a truth label (inner join on
# HADM_ID). SUBJECT_ID exists on both sides, so the result carries it as
# SUBJECT_ID_x (from the lab events) and SUBJECT_ID_y (from the truth table).
relevant_lab_events_df = pd.merge(lab_events_df, truth_df, on='HADM_ID')

### GET FEATURES

In [10]:
# Sorted distinct measurement labels and subject ids among the relevant events.
# `Series.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `.values` yields the same ndarray here and exists in every pandas version.
meas_type = np.unique(relevant_lab_events_df["LABEL"].values)
subjects = np.unique(relevant_lab_events_df["SUBJECT_ID_x"].values)

In [11]:
# Mean VALUENUM per (admission, measurement label) pair; the averaged column is
# exposed as AVG. The rename also stringifies the row index labels.
avg_df = (
    relevant_lab_events_df
    .groupby(['HADM_ID', 'LABEL'], as_index=False)['VALUENUM']
    .mean()
    .rename(index=str, columns={"VALUENUM": "AVG"})
)

In [12]:
# Preview the first five per-admission averages.
avg_df.head(n=5)

Unnamed: 0,HADM_ID,LABEL,AVG
0,100003,Oxygen Saturation,95.0
1,100006,Oxygen Saturation,67.0
2,100006,Temperature,36.2
3,100007,Temperature,37.8
4,100009,Glucose,132.833333


In [13]:
# Build the design matrix: start from the truth table and left-join one
# "<LABEL>_AVG" column per measurement type, keyed on HADM_ID.
design_mat_df = truth_df
for meas in meas_type:
    # Per-admission average of the current measurement, under its own column name
    rename_as = meas + "_AVG"
    meas_avg_df = (
        avg_df
        .loc[avg_df["LABEL"] == meas, ["HADM_ID", "AVG"]]
        .rename(index=str, columns={"AVG": rename_as})
    )

    # Left join keeps every labelled admission, even without this measurement
    design_mat_df = design_mat_df.merge(meas_avg_df, how='left', on='HADM_ID')

# Admissions lacking a measurement get 0 for that feature
design_mat_df = design_mat_df.fillna(0)

In [14]:
# Feature matrix X: every column after the first three (SUBJECT_ID, TRUTH,
# HADM_ID), i.e. the per-measurement averages. Target y: the TRUTH label.
# `get_values()` was deprecated in pandas 0.25 and removed in 1.0; `.values`
# returns the same ndarray and exists in every pandas version.
X = design_mat_df.iloc[:, 3:].values
y = design_mat_df["TRUTH"].values

### CLASSIFICATION

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the admissions for testing; the seed is fixed so the split is reproducible.
# NOTE(review): the split is not stratified, so a rare positive class can end up
# entirely in one side -- consider stratify=y after checking the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [16]:
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest on the training split (fixed seed for reproducibility).
clf = RandomForestClassifier(random_state=0)
clf = clf.fit(X_train, y_train)

# Hard class predictions, kept for any later use of y_pred.
y_pred = clf.predict(X_test)

# A ROC curve needs a continuous score per sample. Feeding it hard 0/1
# predictions collapses the curve to a single operating point, so the AUC no
# longer reflects how well the model ranks samples. Use the predicted
# probability of the positive class (label 1) instead.
classes = list(clf.classes_)
if 1 in classes:
    y_score = clf.predict_proba(X_test)[:, classes.index(1)]
else:
    # The training split had no positive example, so predict_proba has no
    # column for class 1; its probability is zero for every sample.
    y_score = np.zeros(len(X_test))

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
AUC = metrics.auc(fpr, tpr)

  from numpy.core.umath_tests import inner1d


In [19]:
# Sum of the test labels; assuming TRUTH is coded 0/1, this is the number of
# positive examples in the test split.
y_test.sum()

0