In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sklearn.metrics as M

In [2]:
df = pd.read_json("data.json.gz").rename(columns={
    "los": "length_of_stay",
    "dob": "date_of_birth",
    "dod": "date_of_death",
    "admittime": "date_of_admission"
})
df["date_of_birth"] = pd.to_datetime(df["date_of_birth"]).dt.date
df["date_of_death"] = pd.to_datetime(df["date_of_death"]).dt.date
df["date_of_admission"] = pd.to_datetime(df["date_of_admission"]).dt.date
df["age"] = df.apply(lambda e: (e['date_of_admission'] - e['date_of_birth']).days/365, axis=1)
df = df[df.seq_num == 1]
le = preprocessing.LabelEncoder()
le.fit(df.gender)
df.gender = le.transform(df.gender)

In [3]:
df[["length_of_stay", "gender", "age", "icd9_code"]].head()

Unnamed: 0,length_of_stay,gender,age,icd9_code
0,0.0044,1,84.09863,53240
1,1.7219,1,84.09863,53240
22,3.5894,1,80.024658,845
31,1.0389,1,77.035616,4414
37,7.618,1,83.928767,4413


In [4]:
X = df[["length_of_stay", "gender", "age"]].values
y = df[["icd9_code"]].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
ohe = OneHotEncoder(handle_unknown="ignore")
y_train_onehot = ohe.fit_transform(y_train)
clf_onehot = OneVsRestClassifier(xgb.XGBClassifier())
clf_onehot.fit(X_train, y_train_onehot)
f1_one_hot_encoding = M.f1_score(ohe.transform(y_test), clf_onehot.predict(X_test), average="weighted")
f"one hot f1 score = {f1_one_hot_encoding:.2f}"

In [9]:
from icdcodex import icd2vec, hierarchy

In [11]:
embedder = icd2vec.Icd2Vec(num_embedding_dimensions=128, window=3, num_walks=200)
embedder.fit(hierarchy.icd10hierarchy("icd-10-cm.xml"))
y_train_continuous = embedder.transform(y_train)
clf = OneVsRestClassifier(xgb.XGBRegressor())
clf.fit(X_train, y_train_continuous)
y_pred = embedder.vec2code(clf.predict(X_test))
f1_continuous_encoding = M.f1_score(y_test, y_pred, average="weighted")
f"continuous f1 score = {f1_continuous_encoding:.2f}"

TypeError: unhashable type: 'numpy.ndarray'