In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime as dt
import sqlite3 # library for working with sqlite database

### GETTING TABLES

In [2]:
# Create connection
conn = sqlite3.connect("./data/labevents_balanced.db")
conn2 = sqlite3.connect("./data/lab_events.db") # keep this

# Get dataframe
lab_events_df = pd.read_sql("""SELECT * FROM labevents""", conn)
lab_items_df = pd.read_sql("""SELECT * FROM lab_items""", conn2)

### CREATE DATAFRAMES

In [3]:
# Convert to numeric data types
lab_events_df["SUBJECT_ID"] = pd.to_numeric(lab_events_df["SUBJECT_ID"])
lab_events_df["HADM_ID"] = pd.to_numeric(lab_events_df["HADM_ID"]).astype(int)
lab_events_df["VALUENUM"] = pd.to_numeric(lab_events_df["VALUENUM"])

# Merge with labels for laboratory measurements
lab_events_df = lab_events_df.merge(lab_items_df[['ITEMID','LABEL']], left_on='ITEMID', right_on='ITEMID')

lab_events_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL
0,12019,30,104557,50800,2172-10-14 16:06:00,ART,,,,SPECIMEN TYPE
1,34727,71,111944,50800,2164-02-03 21:07:00,ART,,,,SPECIMEN TYPE
2,34766,71,111944,50800,2164-02-03 23:14:00,ART,,,,SPECIMEN TYPE
3,34772,71,111944,50800,2164-02-04 02:30:00,ART,,,,SPECIMEN TYPE
4,32967,68,170467,50800,2173-12-15 12:39:00,ART,,,,SPECIMEN TYPE


### TRUTH LABELS

In [4]:
# Load in information about truth labels and corresponding admissions
# Column 1: SUBJECT_ID
# Column 2: TRUTH
# Column 3: HADM_ID
truth_array = np.loadtxt("csv/TRUTH.csv", delimiter=',', skiprows=1)

# Create a dataframe with truth values
truth_df = pd.DataFrame(truth_array, columns=['SUBJECT_ID', 'TRUTH', 'HADM_ID'])

# Convert to ints
truth_df["SUBJECT_ID"] = truth_df["SUBJECT_ID"].astype(int)
truth_df["TRUTH"] = truth_df["TRUTH"].astype(int)
truth_df["HADM_ID"] = truth_df["HADM_ID"].astype(int)

### GET RELEVANT ADMISSIONS

In [5]:
# Only interested in lab events corresponding to the relevant admissions, i.e. those where
# patient did not die during the first visit
relevant_lab_events_df = lab_events_df.merge(truth_df, on='HADM_ID')

# Get relevant truth labels
hadm_df = pd.DataFrame(lab_events_df["HADM_ID"].unique(), columns={'HADM_ID'})

# Get updated truth_df
relevant_truth_df = hadm_df.merge(truth_df, how='inner', on='HADM_ID')

### GET FEATURES

In [6]:
# Create binary variable to represent whether measurement is abnormal
relevant_lab_events_df['ABNORMAL'] = 0
relevant_lab_events_df.loc[relevant_lab_events_df['FLAG']=='abnormal', 'ABNORMAL'] = 1

# Get proportion of measurements that are abnormal
abnormal_df = relevant_lab_events_df.groupby(['HADM_ID', 'LABEL'], as_index = False)['ABNORMAL'].mean()
#abnormal_df = abnormal_df.rename(index=str, columns={"ABNORMAL": "AVG"})

# Get list of measurement names
meas_type = np.unique(lab_events_df["LABEL"].get_values())

# Initially set design matrix equal to truth dataframe
design_mat_df = relevant_truth_df

# For all measurement types, get the proportion of abnormal values
for meas in meas_type:
    # Get avg of current measurement
    curr_df = abnormal_df.loc[abnormal_df["LABEL"] == meas, ["HADM_ID", "ABNORMAL"]]
    
    # Rename
    rename_as = meas + "_ABNORMAL"
    curr_df = curr_df.rename(index=str, columns={"ABNORMAL": rename_as})
    
    # Merge
    design_mat_df = design_mat_df.merge(curr_df, how='left', on='HADM_ID')

### SELECT COMMON FEATURES

In [7]:
# Select features that have at least 50% non-missing values
non_missing = design_mat_df.iloc[:, 3:len(design_mat_df.columns)].count()/len(design_mat_df)
selected = non_missing[non_missing > 0.5].index

# Extract truth labels
truth = design_mat_df.TRUTH

# Recreate design matrix using selected features
design_mat_df = design_mat_df.loc[:, selected]
design_mat_df["TRUTH"] = truth

### DEAL WITH MISSING VALUES

In [8]:
# Fill with zeros
design_mat_df = design_mat_df.fillna(0)

# Get X and y values
X = design_mat_df.iloc[:, 3:len(design_mat_df.columns)].get_values()
y = design_mat_df.TRUTH.get_values()

### VISUALIZATION

In [9]:
# Visualize in 2D using TSNE
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X)

# Plot
# plt.scatter(X_tsne[y==0, 0], X_tsne[y==0, 1])
# plt.scatter(X_tsne[y==1, 0], X_tsne[y==1, 1])
# plt.show()

### CLASSIFICATION

In [10]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# make train and test splits
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size = 0.33, random_state = 0)

# Run classifier
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
y_pred = svm.decision_function(X_test)

### EVALUATION

In [11]:
from sklearn import metrics

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
AUC = metrics.auc(fpr, tpr)
print(AUC)

1.0
