In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from gensim.models import Word2Vec

# Load and clean data

In [None]:
# Load the raw admissions and diagnosis tables.
# ICD9_CODE is read explicitly as text: ICD-9 codes are identifiers, not numbers
# (leading zeros are significant and some codes start with a letter), and letting
# pandas infer the dtype can yield a column that mixes ints and strings.
# Missing codes are still read as NaN, so the null filter below keeps working.
admissions = pd.read_csv("ADMISSIONS.csv.gz")
diagnoses = pd.read_csv("DIAGNOSES_ICD.csv.gz", dtype={"ICD9_CODE": str})

In [None]:
# Keep only the diagnosis rows that actually carry an ICD-9 code
diagnoses = diagnoses.loc[diagnoses["ICD9_CODE"].notna()]

In [None]:
# Drop every admission whose ADMISSION_TYPE is 'NEWBORN'
admissions = admissions[admissions["ADMISSION_TYPE"] != "NEWBORN"]

In [None]:
# Keep only admissions whose HADM_ID appears at least once in the diagnosis table
admissions = admissions.loc[admissions["HADM_ID"].isin(diagnoses["HADM_ID"].unique())]

In [None]:
# Apply a little bit of feature engineering.
# `admissions` is the result of earlier row filters, so columns are replaced via
# .assign() (which returns a new frame) instead of item assignment, avoiding
# SettingWithCopyWarning.
#  - LANGUAGE is collapsed to ENGLISH / NOT ENGLISH. The already-recoded value
#    "ENGLISH" is accepted as well, so re-running this cell does not flip every
#    row to "NOT ENGLISH".
#    NOTE(review): missing LANGUAGE values end up as "NOT ENGLISH" - confirm this
#    is intended rather than keeping them as a separate category.
#  - ETHNICITY keeps only the part before the first ' - ' separator.
admissions = admissions.assign(
    LANGUAGE=np.where(
        admissions["LANGUAGE"].isin(["ENGL", "ENGLISH"]), "ENGLISH", "NOT ENGLISH"
    ),
    ETHNICITY=admissions["ETHNICITY"].str.split(" - ").str[0],
)

# Part 1: without diagnosis codes

In [None]:
# Categorical admission-level columns used as the initial feature set
features_init = [
    "ADMISSION_TYPE",
    "ADMISSION_LOCATION",
    "INSURANCE",
    "LANGUAGE",
    "RELIGION",
    "MARITAL_STATUS",
    "ETHNICITY",
]

In [None]:
# Expand each selected column into indicator columns; dummy_na=True adds an
# extra indicator per feature for missing values
X = pd.get_dummies(
    admissions.loc[:, features_init],
    prefix=features_init,
    dummy_na=True,
)

In [None]:
# Outcome label for every admission (same rows, same index as X)
y = admissions.loc[:, "HOSPITAL_EXPIRE_FLAG"]

In [None]:
# Random 70/30 train/test split with a fixed seed
np.random.seed(777)
train_perc = 0.7
train_size = round(train_perc * len(X))

# Draw the index labels of the training rows without replacement
train_idx = np.random.choice(X.index, size=train_size, replace=False)

# Boolean array, one entry per row of X: True marks a training row
train_mask = X.index.isin(train_idx)

X_train, X_test = X[train_mask], X[~train_mask]
y_train, y_test = y[train_mask], y[~train_mask]

In [None]:
# Fit a logistic regression (library defaults) on the training split
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Helper for drawing ROC curves
def plot_roc(y_prob, y_actual):
    """Plot the ROC curve of `y_prob` against `y_actual` and show the figure.

    Args:
        y_prob: Predicted scores, passed to `roc_curve` / `roc_auc_score`
            as the score argument.
        y_actual: True labels, passed as the ground-truth argument.

    Returns:
        None. The figure is displayed with `plt.show()`; the AUC (4 decimals)
        appears in the legend.
    """
    # Curve coordinates; the thresholds are not needed for the plot
    fpr, tpr, _ = roc_curve(y_actual, y_prob)

    # Area under the curve, reported in the legend label
    auc = roc_auc_score(y_actual, y_prob)

    plt.figure()
    # Model curve plus the dashed diagonal as a visual reference line
    plt.plot(fpr, tpr, color="blue", lw=2, label=f"ROC curve (area = {auc:.4f})")
    plt.plot([0, 1], [0, 1], color="gray", lw=2, linestyle="--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.xlim(0.0, 1.0)
    plt.ylim(0.0, 1.05)
    plt.legend(loc="lower right")
    plt.show()

In [None]:
# ROC curve on the training split (column 1 = probability of the positive class)
plot_roc(y_prob=model.predict_proba(X_train)[:, 1], y_actual=y_train)

In [None]:
# ROC curve on the held-out test split (column 1 = probability of the positive class)
plot_roc(y_prob=model.predict_proba(X_test)[:, 1], y_actual=y_test)

In [None]:
# Helper for ranking features by the magnitude of their coefficients
def feat_importance(model, data, coef_std=False):
    """Rank the columns of `data` by the absolute value of the model coefficients.

    Args:
        model: Fitted model exposing `coef_`; the first row `coef_[0]` is used.
        data: DataFrame whose columns correspond, in order, to the coefficients.
        coef_std: If True, multiply each coefficient by the standard deviation
            of its column in `data` before ranking.

    Returns:
        DataFrame with columns `feature`, `coef` and `abs_coef`, sorted by
        `abs_coef` in descending order (original row labels are kept).
    """
    ranking = pd.DataFrame({"feature": data.columns, "coef": model.coef_[0]})

    if coef_std:
        # Scale each coefficient by the spread of its feature
        ranking["coef"] = ranking["coef"].mul(data.std().values)

    ranking = ranking.assign(abs_coef=ranking["coef"].abs())
    return ranking.sort_values(by="abs_coef", ascending=False)

In [None]:
# Ten largest raw (unstandardized) coefficients
feat_importance(model=model, data=X_train, coef_std=False).head(n=10)

In [None]:
# Ten largest coefficients after scaling by each feature's standard deviation
feat_importance(model=model, data=X_train, coef_std=True).head(n=10)

# Part 2: with diagnosis codes

In [None]:
# Pre-process diagnosis data

# Group diagnoses by admission: one row per HADM_ID holding the list of its codes
diags_grouped = diagnoses.groupby("HADM_ID")["ICD9_CODE"].apply(list).reset_index()

# Split into train and test.
# groupby() returns rows sorted by HADM_ID, whereas X_train / X_test / y_train /
# y_test keep the row order of `admissions`. Filtering diags_grouped directly
# would therefore pair each admission's features and label with another
# admission's diagnoses once the frames are combined positionally.
# A left merge from the admissions' HADM_IDs preserves the admissions order, so
# row i of diags_train describes the same admission as row i of X_train.
diags_train = admissions.loc[train_mask, ["HADM_ID"]].merge(
    diags_grouped, on="HADM_ID", how="left", validate="many_to_one"
)
diags_test = admissions.loc[~train_mask, ["HADM_ID"]].merge(
    diags_grouped, on="HADM_ID", how="left", validate="many_to_one"
)

# Sanity checks: every admission found its diagnosis list
assert diags_train["ICD9_CODE"].notna().all(), "training admission without diagnoses"
assert diags_test["ICD9_CODE"].notna().all(), "test admission without diagnoses"

In [None]:
# Train Word2Vec model for diagnoses (using training data only)

EMBEDDING_LENGTH = 50
W2V_SEED = 777

# Each admission's list of codes is one "sentence" of the corpus
diags_corpus = diags_train["ICD9_CODE"].tolist()

# Train model.
# seed + workers=1 make training reproducible: with several worker threads the
# result varies from run to run because of thread scheduling.
# NOTE(review): window=40 is presumably meant to let all codes of an admission
# share one context window - confirm.
w2v = Word2Vec(
    diags_corpus,
    vector_size=EMBEDDING_LENGTH,
    window=40,
    seed=W2V_SEED,
    workers=1,
)

In [None]:
# Define function to get the average embedding vector for one list of codes
def get_avg_embedding(w2v, icd_list):
    """Return the scaled mean embedding of the codes in `icd_list`.

    Args:
        w2v: Trained embedding model; `w2v.wv` must support `code in w2v.wv`
            and `w2v.wv[code]`, and `w2v.vector_size` gives the vector length.
        icd_list: Iterable of codes for a single admission. Codes that are not
            in the model vocabulary are ignored.

    Returns:
        1-D array of length `w2v.vector_size`: the mean of the known codes'
        vectors divided by its largest absolute component, so values lie in
        [-1, 1]. A zero vector is returned when no code is in the vocabulary
        or when the mean vector is all zeros.
    """
    embeddings = [w2v.wv[code] for code in icd_list if code in w2v.wv]

    if not embeddings:
        # None of the codes is in the vocabulary
        return np.zeros(w2v.vector_size)

    avg_embedding = np.mean(embeddings, axis=0)

    # Scale to [-1, 1]; skip the division when the mean is all zeros, which
    # would otherwise produce NaNs (0 / 0)
    scale = np.abs(avg_embedding).max()
    if scale > 0:
        avg_embedding = avg_embedding / scale

    return avg_embedding

In [None]:
# One averaged embedding vector per training admission
diag_embeddings_train = [
    get_avg_embedding(w2v, icd_list) for icd_list in diags_train["ICD9_CODE"]
]

In [None]:
# Stack the vectors into a DataFrame with one column per embedding dimension
diag_embeddings_train = pd.DataFrame(
    np.vstack(diag_embeddings_train),
    columns=[f"diag_vec_{i}" for i in range(EMBEDDING_LENGTH)],
)

In [None]:
# Append the embedding columns to the one-hot training features
# (rows are matched by position, hence the index reset on X_train)
X_train_w_diag = pd.concat(
    [X_train.reset_index(drop=True), diag_embeddings_train],
    axis="columns",
)

In [None]:
# Repeat the embedding steps for the test split

# One averaged embedding vector per test admission
diag_embeddings_test = [
    get_avg_embedding(w2v, icd_list) for icd_list in diags_test["ICD9_CODE"]
]

# Stack the vectors into a DataFrame with one column per embedding dimension
diag_embeddings_test = pd.DataFrame(
    np.vstack(diag_embeddings_test),
    columns=[f"diag_vec_{i}" for i in range(EMBEDDING_LENGTH)],
)

# Append the embedding columns to the one-hot test features (matched by position)
X_test_w_diag = pd.concat(
    [X_test.reset_index(drop=True), diag_embeddings_test],
    axis="columns",
)

In [None]:
# Fit a fresh logistic regression on one-hot features plus diagnosis embeddings
model = LogisticRegression()
model.fit(X=X_train_w_diag, y=y_train)

In [None]:
# ROC curve on the training split, model with diagnosis embeddings
plot_roc(y_prob=model.predict_proba(X_train_w_diag)[:, 1], y_actual=y_train)

In [None]:
# ROC curve on the test split, model with diagnosis embeddings
plot_roc(y_prob=model.predict_proba(X_test_w_diag)[:, 1], y_actual=y_test)

In [None]:
# Ten largest raw (unstandardized) coefficients, model with diagnosis embeddings
feat_importance(model=model, data=X_train_w_diag, coef_std=False).head(n=10)

In [None]:
# Ten largest coefficients after scaling by each feature's standard deviation
feat_importance(model=model, data=X_train_w_diag, coef_std=True).head(n=10)

# Part 3: diagnosis codes only

In [None]:
# Fit a fresh logistic regression using the diagnosis embeddings as the only features
model = LogisticRegression()
model.fit(X=diag_embeddings_train, y=y_train)

In [None]:
# ROC curve on the training split, embeddings-only model
plot_roc(y_prob=model.predict_proba(diag_embeddings_train)[:, 1], y_actual=y_train)

In [None]:
# ROC curve on the test split, embeddings-only model
plot_roc(y_prob=model.predict_proba(diag_embeddings_test)[:, 1], y_actual=y_test)