In [1]:
# Import libraries
import pandas as pd
import numpy as np
from datetime import datetime as dt
import sqlite3 # library for working with sqlite database

### GETTING TABLES

In [2]:
# Open both SQLite databases: the balanced lab-event sample and the
# database that holds the lab-item dictionary.
conn = sqlite3.connect("./data/labevents_balanced.db")
conn2 = sqlite3.connect("./data/lab_events.db") # keep this

# Read each table fully into a DataFrame. pandas does not close a
# connection it is handed, so release both explicitly, even if one of
# the reads fails, instead of leaving the database files open for the
# lifetime of the kernel.
try:
    lab_events_df = pd.read_sql("""SELECT * FROM labevents""", conn)
    lab_items_df = pd.read_sql("""SELECT * FROM lab_items""", conn2)
finally:
    conn.close()
    conn2.close()

### CREATE DATAFRAMES

In [3]:
# Parse the id and measurement columns as numbers
for numeric_col in ("SUBJECT_ID", "HADM_ID", "VALUENUM"):
    lab_events_df[numeric_col] = pd.to_numeric(lab_events_df[numeric_col])

# Admission ids are additionally forced to a plain integer dtype
lab_events_df["HADM_ID"] = lab_events_df["HADM_ID"].astype(int)

# Attach the human-readable LABEL of every lab item via its ITEMID
# (default inner join: events whose ITEMID has no entry in lab_items_df are dropped)
lab_events_df = pd.merge(lab_events_df, lab_items_df[['ITEMID', 'LABEL']], on='ITEMID')

lab_events_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM,FLAG,LABEL
0,12019,30,104557,50800,2172-10-14 16:06:00,ART,,,,SPECIMEN TYPE
1,34727,71,111944,50800,2164-02-03 21:07:00,ART,,,,SPECIMEN TYPE
2,34766,71,111944,50800,2164-02-03 23:14:00,ART,,,,SPECIMEN TYPE
3,34772,71,111944,50800,2164-02-04 02:30:00,ART,,,,SPECIMEN TYPE
4,32967,68,170467,50800,2173-12-15 12:39:00,ART,,,,SPECIMEN TYPE


### TRUTH LABELS

In [4]:
# Read the truth labels and the admissions they belong to.
# The CSV has one header row (skipped) followed by three numeric columns,
# in this order: SUBJECT_ID, TRUTH, HADM_ID
truth_array = np.loadtxt("csv/TRUTH.csv", delimiter=',', skiprows=1)

# Wrap the raw array in a dataframe and cast all three columns to int in one step
truth_df = pd.DataFrame(
    truth_array,
    columns=['SUBJECT_ID', 'TRUTH', 'HADM_ID'],
).astype(int)

### GET RELEVANT ADMISSIONS

In [5]:
# Only interested in lab events corresponding to the relevant admissions, i.e. those where
# patient did not die during the first visit. The default inner join keeps only
# lab events whose HADM_ID appears in truth_df.
relevant_lab_events_df = lab_events_df.merge(truth_df, on='HADM_ID')

# One row per admission that has at least one lab event.
# NOTE: `columns` must be a list; passing a set (`{'HADM_ID'}`) is rejected by
# recent pandas versions and has no defined order.
hadm_df = pd.DataFrame(lab_events_df["HADM_ID"].unique(), columns=['HADM_ID'])

# Truth labels restricted to admissions that actually have lab events
relevant_truth_df = hadm_df.merge(truth_df, how='inner', on='HADM_ID')

### GET FEATURES

In [6]:
# Binary indicator: 1 when the measurement is flagged 'abnormal', else 0
# (a missing FLAG compares unequal and therefore maps to 0)
relevant_lab_events_df['ABNORMAL'] = (relevant_lab_events_df['FLAG'] == 'abnormal').astype(int)

# Proportion of abnormal measurements per (admission, lab item)
abnormal_df = relevant_lab_events_df.groupby(['HADM_ID', 'ITEMID'], as_index=False)['ABNORMAL'].mean()

# Sorted, de-duplicated list of lab item ids.
# `.get_values()` was removed in pandas 1.0; `.to_numpy()` is its replacement.
meas_type = np.unique(lab_events_df["ITEMID"].to_numpy())

# Start the design matrix from the truth dataframe (HADM_ID, SUBJECT_ID, TRUTH);
# each merge below returns a new frame, so relevant_truth_df itself is not modified
design_mat_df = relevant_truth_df

# Add one "<ITEMID>_ABNORMAL" column per lab item
for meas in meas_type:
    # Proportion of abnormal values of this lab item, per admission
    curr_df = abnormal_df.loc[abnormal_df["ITEMID"] == meas, ["HADM_ID", "ABNORMAL"]]

    # Name the feature after its lab item; the f-string also works when
    # ITEMID is stored as a number rather than a string
    rename_as = f"{meas}_ABNORMAL"
    curr_df = curr_df.rename(columns={"ABNORMAL": rename_as})

    # Left join: admissions without this lab item get NaN for the feature
    design_mat_df = design_mat_df.merge(curr_df, how='left', on='HADM_ID')

# Get X and y values; the first 3 columns are the truth-dataframe columns,
# everything after them is a feature
X = design_mat_df.iloc[:, 3:].to_numpy()
y = design_mat_df.TRUTH.to_numpy()

### SELECT FEATURES

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the admissions for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Keep only features observed (non-NaN) in more than 50% of the training rows
threshold = 0.5
prop_non_missing = np.count_nonzero(~np.isnan(X_train), axis=0) / X_train.shape[0]

# The selection is decided on the training set and applied unchanged to both sets
feature_mask = prop_non_missing > threshold
X_train = X_train[:, feature_mask]
X_test = X_test[:, feature_mask]

In [8]:
# Names of the feature columns (everything after the 3 truth-dataframe columns),
# restricted to the features that passed the non-missing threshold.
# NOTE: `columns` must be a list; passing a set (`{"ITEMID"}`) is rejected by
# recent pandas versions.
all_columns = design_mat_df.iloc[:, 3:].columns
selected_features_df = pd.DataFrame(all_columns[prop_non_missing > threshold], columns=["ITEMID"])

# Strip _ABNORMAL from the name to recover the lab item id
selected_features_df["ITEMID"] = selected_features_df["ITEMID"].replace(to_replace="_ABNORMAL", value="", regex=True)

# Get labels and fluid from lab_items_df
selected_features_df = selected_features_df.merge(lab_items_df.loc[:, ["LABEL", "FLUID", "ITEMID"]], how='inner', on='ITEMID')
selected_features_df

Unnamed: 0,ITEMID,LABEL,FLUID
0,50802,Base Excess,Blood
1,50804,Calculated Total CO2,Blood
2,50813,Lactate,Blood
3,50818,pCO2,Blood
4,50820,pH,Blood
5,50821,pO2,Blood
6,50822,"Potassium, Whole Blood",Blood
7,50861,Alanine Aminotransferase (ALT),Blood
8,50862,Albumin,Blood
9,50863,Alkaline Phosphatase,Blood


In [9]:
# Build a 0/1 indicator matrix marking every entry that is still NaN,
# so the missingness of a feature is itself available as a feature
missing_train = np.isnan(X_train).astype(int)
missing_test = np.isnan(X_test).astype(int)

# Append the indicator columns to the right of the existing feature columns
X_train = np.hstack((X_train, missing_train))
X_test = np.hstack((X_test, missing_test))

### DEAL WITH MISSING VALUES

In [10]:
# Overwrite every remaining NaN with 0, in place
np.putmask(X_train, np.isnan(X_train), 0)
np.putmask(X_test, np.isnan(X_test), 0)

### VISUALIZATION

In [11]:
# Visualize in 2D using TSNE
# import matplotlib.pyplot as plt
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X_train)

# Plot
# plt.scatter(X_tsne[y_train==0, 0], X_tsne[y_train==0, 1])
# plt.scatter(X_tsne[y_train==1, 0], X_tsne[y_train==1, 1])
# plt.show()

### PARAMETER SEARCHING

#### Set up cross-validation

In [13]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC

# 10 stratified folds for cross-validation
cv = StratifiedKFold(n_splits=10)

# Number of columns in the training matrix (features plus missing-value indicators)
nfeatures = X_train.shape[1]

#### Set up parameter grid

In [22]:
# Parameter grid.
# np.arange with a non-integer step can return an inconsistent number of
# elements because of floating-point rounding, so the grids are built with
# np.linspace and an explicit point count instead:
#   C     : 0.5, 1.0, ..., 3.0                 (6 values)
#   gamma : 0.5, 1.0, 1.5, 2.0, each / nfeatures (4 values)
params_grid = {
    'C': np.linspace(0.5, 3.0, num=6),
    'gamma': np.linspace(0.5, 2.0, num=4) / nfeatures
}

# Exhaustive search over the grid, scored by cross-validated ROC AUC
bst_grid = GridSearchCV(estimator = SVC(),
                       param_grid = params_grid,
                       cv = cv,
                       scoring = 'roc_auc')

bst_grid.fit(X_train, y_train)

bst_grid.best_params_

{'C': 1.5, 'gamma': 0.017857142857142856}

#### Classify

In [40]:
from sklearn.metrics import roc_auc_score

# Score the held-out set with the best SVM found by the grid search.
# y_pred holds decision-function scores, not class labels.
best_svm = bst_grid.best_estimator_
y_pred = best_svm.decision_function(X_test)

# Area under the ROC curve on the test set
roc_auc_score(y_test, y_pred)

0.6626372296658086

### CLASSIFICATION

In [32]:
#from sklearn.svm import SVC

# Run classifier
#svm = SVC(C=1.5, gamma=0.017857142857142856)
#svm.fit(X_train, y_train)
#y_pred = svm.decision_function(X_test)

### EVALUATION

In [33]:
#from sklearn import metrics

#fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=1)
#AUC = metrics.auc(fpr, tpr)
#print(AUC)

0.6626372296658086
