In [None]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime as dt
import sqlite3 # library for working with sqlite database

### GETTING TABLES

In [None]:
# Lab events: open the first database and read the whole `labevents` table
conn = sqlite3.connect("./data/labevents_balanced.db")
lab_events_df = pd.read_sql("SELECT * FROM labevents", conn)

# Lab item definitions: the second database supplies the `lab_items` table
conn2 = sqlite3.connect("./data/lab_events.db") # keep this
lab_items_df = pd.read_sql("SELECT * FROM lab_items", conn2)

### CREATE DATAFRAMES

In [None]:
# Coerce the identifier and measurement-value columns to numeric dtypes
for numeric_col in ("SUBJECT_ID", "HADM_ID", "VALUENUM"):
    lab_events_df[numeric_col] = pd.to_numeric(lab_events_df[numeric_col])

# Admission ids are additionally cast to plain ints
lab_events_df["HADM_ID"] = lab_events_df["HADM_ID"].astype(int)

# Attach the human-readable LABEL of each laboratory item (inner join on ITEMID)
lab_events_df = lab_events_df.merge(lab_items_df[['ITEMID', 'LABEL']], on='ITEMID')

# Preview the merged frame
lab_events_df.head()

### TRUTH LABELS

In [None]:
# Read the truth labels and their admissions; the header row is skipped.
# File columns, in order: SUBJECT_ID, TRUTH, HADM_ID
truth_array = np.loadtxt("csv/TRUTH.csv", delimiter=',', skiprows=1)

# Wrap the array in a dataframe and cast all three columns to ints in one step
truth_df = pd.DataFrame(
    truth_array,
    columns=['SUBJECT_ID', 'TRUTH', 'HADM_ID'],
).astype(int)

### GET RELEVANT ADMISSIONS

In [None]:
# Only interested in lab events corresponding to the relevant admissions, i.e. those where
# patient did not die during the first visit.
# Inner join on HADM_ID keeps only lab events whose admission has a truth label.
# NOTE: both frames carry SUBJECT_ID, so the result holds SUBJECT_ID_x / SUBJECT_ID_y.
relevant_lab_events_df = lab_events_df.merge(truth_df, on='HADM_ID')

# One row per distinct admission that has lab events.
# Column labels are passed as a list: a set is unordered and newer pandas
# releases refuse a set for `columns`.
hadm_df = pd.DataFrame(lab_events_df["HADM_ID"].unique(), columns=['HADM_ID'])

# Truth labels restricted to admissions that actually have lab events
relevant_truth_df = hadm_df.merge(truth_df, how='inner', on='HADM_ID')

### GET FEATURES

In [None]:
# Create binary variable to represent whether measurement is abnormal
relevant_lab_events_df['ABNORMAL'] = 0
relevant_lab_events_df.loc[relevant_lab_events_df['FLAG'] == 'abnormal', 'ABNORMAL'] = 1

# Proportion of abnormal measurements per (admission, measurement type):
# the mean of a 0/1 indicator is the fraction of ones.
abnormal_df = relevant_lab_events_df.groupby(['HADM_ID', 'LABEL'], as_index=False)['ABNORMAL'].mean()

# Sorted, de-duplicated list of measurement names.
# `get_values()` was deprecated in pandas 0.25 and removed in 1.0; `to_numpy()`
# is the supported way to obtain the underlying ndarray.
meas_type = np.unique(lab_events_df["LABEL"].to_numpy())

# Initially set design matrix equal to truth dataframe
design_mat_df = relevant_truth_df

# For all measurement types, add one column holding the proportion of abnormal values
for meas in meas_type:
    # Rows of the current measurement only
    curr_df = abnormal_df.loc[abnormal_df["LABEL"] == meas, ["HADM_ID", "ABNORMAL"]]

    # Give the column a measurement-specific name so merges do not collide
    rename_as = meas + "_ABNORMAL"
    curr_df = curr_df.rename(columns={"ABNORMAL": rename_as})

    # Left join: admissions without this measurement get NaN in the new column
    design_mat_df = design_mat_df.merge(curr_df, how='left', on='HADM_ID')

# Get X and y values.
# The first three columns are the ones inherited from relevant_truth_df
# (identifiers and the label); everything after them is a feature.
X = design_mat_df.iloc[:, 3:].to_numpy()
y = design_mat_df["TRUTH"].to_numpy()

### SELECT COMMON FEATURES

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Keep only features whose share of non-missing values in the training set
# is strictly greater than the threshold (50%)
threshold = 0.5
prop_non_missing = (~np.isnan(X_train)).sum(axis=0) / X_train.shape[0]
keep_mask = prop_non_missing > threshold

# Apply the same column selection to both splits
X_train = X_train[:, keep_mask]
X_test = X_test[:, keep_mask]

# Remaining missing values are replaced by 0
np.putmask(X_train, np.isnan(X_train), 0)
np.putmask(X_test, np.isnan(X_test), 0)

### DEAL WITH MISSING VALUES

In [None]:
# Overwrite any NaN entries with 0, in place, for both splits
for split in (X_train, X_test):
    split[np.isnan(split)] = 0

### VISUALIZATION

In [None]:
# Visualize in 2D using TSNE
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X_train)

# Plot
# plt.scatter(X_tsne[y_train==0, 0], X_tsne[y_train==0, 1])
# plt.scatter(X_tsne[y_train==1, 0], X_tsne[y_train==1, 1])
# plt.show()

### CLASSIFICATION

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier on the training split (fit returns the estimator itself)
svm = SVC(gamma='auto').fit(X_train, y_train)

# Continuous decision scores for the held-out rows, used for ROC analysis
y_pred = svm.decision_function(X_test)

### EVALUATION

In [None]:
from sklearn import metrics

# ROC curve of the decision scores against the true labels (class 1 is positive)
fpr, tpr, thresholds = metrics.roc_curve(
    y_true=y_test,
    y_score=y_pred,
    pos_label=1,
)

# Area under that curve
AUC = metrics.auc(x=fpr, y=tpr)
print(AUC)