In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import xgboost as xgb
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
import sklearn.metrics as M

from icdcodex import icd2vec, hierarchy

From the MIMIC-III BigQuery database, run:
```sql
SELECT
    i.seq_num, i.subject_id, i.icd9_code, j.los, k.gender, k.dob, k.dod, l.admittime
FROM `physionet-data.mimiciii_clinical.diagnoses_icd` as i
    INNER JOIN
        `physionet-data.mimiciii_clinical.icustays` as j
        ON i.hadm_id = j.hadm_id
    INNER JOIN
        `physionet-data.mimiciii_clinical.patients` as k
        ON i.subject_id = k.subject_id
    INNER JOIN
        `physionet-data.mimiciii_clinical.admissions` as l
        ON i.hadm_id = l.hadm_id
```

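Save the results as `data.csv` (the file name the loading cell below reads). Note that the preprocessing below also expects `curr_service`, `drg_severity`, and `drg_mortality` columns, which the query above does not select; extend the query so that the export contains them.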

In [2]:
# Load the exported query result and give the terse source column names
# readable ones.
df = pd.read_csv("data.csv").rename(columns={
    "los": "length_of_stay",
    "dob": "date_of_birth",
    "dod": "date_of_death",
    "admittime": "date_of_admission",
})

# Keep only the rows with seq_num == 1. Filtering first means the date parsing
# and the row-wise age computation below run on fewer rows, and `.copy()`
# yields an independent frame so the column assignments that follow do not
# trigger pandas' SettingWithCopyWarning.
df = df[df["seq_num"] == 1].copy()

# Reduce the timestamp columns to plain `datetime.date` objects.
for date_column in ("date_of_birth", "date_of_death", "date_of_admission"):
    df[date_column] = pd.to_datetime(df[date_column]).dt.date

# Age at admission in years (days / 365), computed row by row on Python dates.
# NOTE(review): presumably Python dates are used instead of vectorised
# Timestamp arithmetic to avoid timedelta overflow on very old birth dates;
# confirm before vectorising.
df["age"] = df.apply(
    lambda e: (e["date_of_admission"] - e["date_of_birth"]).days / 365, axis=1
)

# Integer-encode the categorical columns; the fitted encoders stay available
# under their original names so the codes can be mapped back to labels.
le = preprocessing.LabelEncoder()
df["gender"] = le.fit_transform(df["gender"])
drg_severity_le = preprocessing.LabelEncoder()
df["drg_severity"] = drg_severity_le.fit_transform(df["drg_severity"])
drg_mortality_le = preprocessing.LabelEncoder()
df["drg_mortality"] = drg_mortality_le.fit_transform(df["drg_mortality"])

# One-hot encode the service and append the indicator columns to `df`. The
# new columns are labelled with integers 0..k-1, aligned on df's index.
curr_service_le = OneHotEncoder()
one_hot_service = curr_service_le.fit_transform(df["curr_service"].values.reshape(-1, 1))
# `.toarray()` gives a regular ndarray rather than the np.matrix that
# `.todense()` returns.
one_hot_frame = pd.DataFrame(one_hot_service.toarray(), index=df.index)
df = df.merge(one_hot_frame, left_index=True, right_index=True)

In [3]:
# Disabled code, kept for reference: it would read the "icd10cmtoicd9gem.csv"
# mapping file, insert a dot after the third character of each ICD-10-CM code,
# build an ICD-9 -> ICD-10-CM dict, add an `icd10_code` column to `df`, and
# drop every row containing a missing value. Because it is commented out, the
# rest of the notebook works with `icd9_code` only.
# gem = pd.read_csv("icd10cmtoicd9gem.csv")
# gem.icd10cm = gem.icd10cm.apply(lambda s: "{}.{}".format(s[:3],s[3:]) if len(s) > 3 else s)
# _9 = gem.icd9cm
# _10 = gem.icd10cm
# m = dict(zip(_9, _10))
# df["icd10_code"] = df.icd9_code.map(m)
# df = df.dropna()

In [4]:
# Read the ICD-9 hierarchy from its JSON file. The call yields the hierarchy
# object `G` (queried through `G.nodes()` further down) together with
# `icd_codes`, both of which are later handed to the embedder.
icd9_hierarchy_file = "icd9Hierarchy.json"
G, icd_codes = hierarchy.icd9hierarchy(icd9_hierarchy_file)

In [5]:
# Preview the first five rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,seq_num,subject_id,icd9_code,length_of_stay,gender,date_of_birth,date_of_death,date_of_admission,curr_service,drg_severity,...,8,9,10,11,12,13,14,15,16,17
0,1,256,53240,0.0044,1,2086-07-31,NaT,2170-08-16,MED,181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,256,53240,0.0044,1,2086-07-31,NaT,2170-08-16,MED,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,256,53240,0.0044,1,2086-07-31,NaT,2170-08-16,MED,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,256,53240,1.7219,1,2086-07-31,NaT,2170-08-16,MED,314,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,256,53240,1.7219,1,2086-07-31,NaT,2170-08-16,MED,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Keep only the rows whose diagnosis code appears among the nodes of the
# hierarchy `G`; codes outside the hierarchy are dropped.
known_codes = list(G.nodes())
df = df[df["icd9_code"].isin(known_codes)]

In [8]:
# Model inputs: four scalar columns plus the integer-labelled one-hot columns
# 0..16.
# NOTE(review): the frame displayed earlier has one-hot columns 0..17, so
# column 17 is not selected here; confirm that omitting it is intentional.
features = ["length_of_stay", "gender", "age", "drg_severity", *range(17)]
df[features].head()

Unnamed: 0,length_of_stay,gender,age,drg_severity,0,1,2,3,4,5,...,7,8,9,10,11,12,13,14,15,16
0,0.0044,1,84.09863,181,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0044,1,84.09863,2,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0044,1,84.09863,2,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.7219,1,84.09863,314,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.7219,1,84.09863,2,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Feature matrix and target as NumPy arrays; selecting the target with a
# one-element column list keeps `y` two-dimensional (one column).
X = df[features].to_numpy()
y = df[["icd9_code"]].to_numpy()

# Hold out 20% of the rows for evaluation, with a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Baseline: predict the diagnosis code as a one-hot (indicator) target.
# Codes that appear only in the test split are encoded as all-zero rows
# because of handle_unknown="ignore".
ohe = OneHotEncoder(handle_unknown="ignore")
y_train_onehot = ohe.fit_transform(y_train)

# Seed the forest so the reported scores are reproducible across runs.
clf_onehot = RandomForestClassifier(random_state=42)
# `.toarray()` yields a plain ndarray; the np.matrix returned by `.todense()`
# is rejected by recent scikit-learn input validation.
clf_onehot.fit(X_train, y_train_onehot.toarray())

# Encode the test labels and predict once, then reuse both for each metric.
y_test_onehot = ohe.transform(y_test)
y_pred_onehot = clf_onehot.predict(X_test)

f1 = M.f1_score(y_test_onehot, y_pred_onehot, average="weighted")
acc = M.accuracy_score(y_test_onehot, y_pred_onehot)
f"accuracy = {acc:.2f}, f1 = {f1:.2f}"

  _warn_prf(


'accuracy = 0.53, f1 = 0.56'

In [11]:
# Fit a 64-dimensional Icd2Vec embedding on the ICD-9 hierarchy, then map
# every training label to its embedding vector so it can serve as a
# continuous regression target.
embedder = icd2vec.Icd2Vec(
    num_embedding_dimensions=64,
    walk_length=10,
    num_walks=200,
    workers=-1,
)
embedder.fit(icd_codes, G)

# `y_train` has a single column; flatten it to a 1-D sequence of codes.
train_codes = y_train.reshape(-1)
y_train_continuous = embedder.to_vec(train_codes)

In [12]:
# Regress onto the embedding vectors of the training codes, then translate
# each predicted vector back into a code with the embedder.
# The forest is seeded so the reported scores are reproducible across runs.
clf = RandomForestRegressor(random_state=42)
clf.fit(X_train, y_train_continuous)
y_pred = embedder.to_code(clf.predict(X_test))

# Score the decoded codes directly against the held-out labels.
acc = M.accuracy_score(y_test, y_pred)
f1 = M.f1_score(y_test, y_pred, average="weighted")
f"accuracy = {acc:.2f}, f1 = {f1:.2f}"

'accuracy = 0.47, f1 = 0.49'