#### Notes from phase 1

In [14]:
!pip install scikit-multilearn



In [1]:
# Data for each table was split into train and test
# To get the target labels, search and filter the diagnoses dataframes (saved) for the hadm_ids present in the train / test
# set

In [3]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os
from tqdm import tqdm

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [5]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

In [6]:
path = "C:/Project/Data/"

In [7]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

In [8]:
file = "hosp/diagnoses_icd.csv"
full_path = path + file

df_diagnoses = pd.read_csv(full_path)

In [9]:
df_diagnoses = df_diagnoses.drop(columns=['subject_id','seq_num','icd_version'])

In [10]:
one_hot_encoded = pd.get_dummies(df_diagnoses['icd_code'])

df_encoded = pd.concat([df_diagnoses[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [11]:
df_diagnoses = df_aggregated
df_diagnoses

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,29820177,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
273,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
file = "hosp/drgcodes.csv"
full_path = path + file

df_drgcodes = pd.read_csv(full_path)

In [13]:
df_drgcodes['hadm_id'].value_counts()

22187210    2
27505812    2
25926192    2
27089790    2
24490144    2
           ..
22539296    1
20385771    1
20199380    1
20973395    1
23559586    1
Name: hadm_id, Length: 233, dtype: int64

In [14]:
df_drgcodes.head()

Unnamed: 0,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,10004235,22187210,HCFA,864,FEVER,,
1,10026255,22059910,HCFA,180,RESPIRATORY NEOPLASMS W MCC,,
2,10032725,20611640,HCFA,54,NERVOUS SYSTEM NEOPLASMS W MCC,,
3,10005866,21636229,HCFA,393,OTHER DIGESTIVE SYSTEM DIAGNOSES W MCC,,
4,10008454,20291550,HCFA,956,"LIMB REATTACHMENT, HIP & FEMUR PROC FOR MULTIP...",,


In [15]:
df_drgcodes = df_drgcodes.drop(columns=['subject_id','drg_type','description','drg_severity','drg_mortality'])

In [16]:
one_hot_encoded = pd.get_dummies(df_drgcodes['drg_code'])

df_encoded = pd.concat([df_drgcodes[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [17]:
df_drgcodes = df_aggregated
df_drgcodes

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,29802992,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### microbiologyevents

In [20]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "microbio_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "microbio_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [21]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [22]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [23]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [24]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [25]:
y_train = merged_df
y_train

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1535,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1536,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1537,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1538,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [27]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [28]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [29]:
# Order by hadm_id and drop 

In [30]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [31]:
# Number of samples and labels

In [32]:
y_train.shape, y_test.shape

((1540, 240), (386, 240))

In [33]:
# Number of unique label classes

In [34]:
# Count the number of unique rows
num_unique_rows = y_train.drop_duplicates().shape[0]

print("Number of unique rows in y_train:", num_unique_rows)

num_unique_rows = y_test.drop_duplicates().shape[0]

print("Number of unique rows in y_test:", num_unique_rows)

Number of unique rows in y_train: 125
Number of unique rows in y_test: 81


In [35]:
# Number of features

In [36]:
X_train.shape[1]

277

There are several ways to measure a classifier’s generalization quality:

Hamming loss measures how well the classifier predicts each of the labels, averaged over samples, then over labels
accuracy score measures how well the classifier predicts label combinations, averaged over samples

jaccard similarity measures the proportion of predicted labels for a sample to its correct assignment, averaged over samples

precision measures how many samples with ,

recall measures how many samples ,

F1 score measures a weighted average of precision and recall, where both have the same impact on the score

In [391]:
from skmultilearn.adapt import MLARAM
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report

# MLARAM classifier
classifier = MLARAM()

# Train the classifier
classifier.fit(X_train.values, y_train.values)

# Predict on the test set
predictions = classifier.predict(X_test.values)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("MLARAM Accuracy:", accuracy)
# Classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

MLARAM Accuracy: 0.06476683937823834
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.08      0.31      0.12        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         9
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [118]:
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import LabelPowerset
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import make_multilabel_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# # Generate sample multilabel dataset
# X, y = make_multilabel_classification(n_samples=1000, n_features=20, n_classes=5, n_labels=2, random_state=42)

# # Split data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Multi-Label Stochastic Gradient Descent (ML-SGD) with Label Powerset
classifier = LabelPowerset(SGDClassifier())

# Train the classifier
classifier.fit(X_train, y_train)

# Predict on the test set
predictions = classifier.predict(X_test)



In [119]:
import sklearn.metrics as metrics

hamming_loss = metrics.hamming_loss(y_test, predictions)
print("Hamming loss:", hamming_loss)
# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
# Classification report
print("Classification Report:")
print(classification_report(y_test, predictions))

Hamming loss: 0.012022760646108663
Accuracy: 0.2511013215859031
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.50      0.07      0.12        15
           2       0.50      0.09      0.15        11
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Will take 50 minutes to train 

In [161]:
from skmultilearn.ensemble import RakelD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score, classification_report

# Initialize the base classifier (Random Forest)
base_classifier = RandomForestClassifier()

# Initialize the RAkEL classifier with the base classifier
classifier = RakelD(base_classifier, labelset_size=3)

for i in tqdm(range(100)):
# Train the classifier with both input features (X_train) and target labels (y_train)
    classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|████████████████████████████████████████████████████████████████████████████| 100/100 [16:39:46<00:00, 599.86s/it]


Accuracy: 0.3876651982378855
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.33      0.50        15
           2       1.00      0.36      0.53        11
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       1.00      0.20      0.33         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       1.00      0.33      0.50         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Multi-output decision tree

In [393]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Initialize the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=50)

# Initialize the multi-output classifier with the decision tree as the base estimator
multi_output_tree = MultiOutputClassifier(decision_tree, n_jobs=-1)


In [394]:
# Train the multi-output classifier
multi_output_tree.fit(X_train, y_train)

In [395]:

# Predict the outputs for the test set
y_pred = multi_output_tree.predict(X_test.values)

In [396]:
X_train.shape

(1540, 277)

In [397]:
y_train.shape

(1540, 240)

In [398]:
X_test.shape

(386, 277)

In [399]:
y_test.shape

(386, 240)

In [400]:
y_pred.shape

(386, 240)

In [401]:
y_test.shape

(386, 240)

In [405]:
# Hamming loss measures the fraction of labels that are incorrectly predicted, averaged over all samples

from sklearn.metrics import hamming_loss

# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test.values, y_pred_multilabel)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.007102763385146805


In [403]:
# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Now, both y_test.values and y_pred_multilabel should be in the multilabel format

In [406]:
# Evaluate the classifier
accuracy = accuracy_score(y_test.values, y_pred_multilabel)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test.values, y_pred_multilabel))

Accuracy: 0.4481865284974093
Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.60      0.69      0.64        13
           2       0.50      1.00      0.67         5
           3       0.00      0.00      0.00         0
           4       0.33      1.00      0.50         1
           5       0.00      0.00      0.00         0
           6       1.00      0.44      0.62         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.55      0.67      0.60         9
          14       1.00      0.50      0.67         2
          15       0.00      0.00      0.00         2
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Need to work out which label is which for write up

#### Random forest for multi-output

In [407]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Create a multi-output Random Forest classifier
forest_classifier = RandomForestClassifier(n_estimators=10)

# Train the classifier
forest_classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = forest_classifier.predict(X_test)

# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Now, both y_test.values and y_pred_multilabel should be in the multilabel format

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))


Accuracy: 0.2616580310880829
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.14      0.08      0.10        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       1.00      0.56      0.71         9
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Classifier Chain

In [409]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize a base classifier (Random Forest in this case)
base_classifier = RandomForestClassifier()

# Initialize the Classifier Chain classifier with the base classifier
classifier = ClassifierChain(classifier=base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)


In [414]:
y_pred = y_pred.toarray()

In [415]:
# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))

Accuracy: 0.3549222797927461
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.15      0.24        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       1.00      0.22      0.36         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       1.00      0.56      0.71         9
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [174]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize a base classifier (Random Forest in this case)
base_classifier = RandomForestClassifier()

# Initialize the Classifier Chain classifier with the base classifier
classifier = ClassifierChain(classifier=base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.44052863436123346
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.40      0.57        15
           2       0.83      0.45      0.59        11
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       1.00      0.20      0.33         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       1.00      0.33      0.50         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 	Probabilistic graphical models

In [None]:
# joint inference of multiple diagnoses (Bayesian Networks or Markov Random Fields)

In [417]:
import tensorflow as tf

# Define the Bernoulli Naive Bayes model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(277,)),      # Input layer with 277 features
    tf.keras.layers.Dense(240, activation='sigmoid')  # Output layer with 240 units (one for each label)
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = model.evaluate(X_test, y_pred)
print("Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: [0.2706722915172577, 1.0]


#### K nearest radius neighbours

In [177]:
from skmultilearn.adapt import MLkNN
from skmultilearn.problem_transform import LabelPowerset
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.neighbors import RadiusNeighborsClassifier

# Initialize the RadiusNeighborsClassifier
radius_classifier = RadiusNeighborsClassifier(radius=13) # Any smaller radius and no neighbours is found

# Wrap the RadiusNeighborsClassifier with LabelPowerset for multi-label classification
classifier = LabelPowerset(radius_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train, y_train)

In [179]:
# Predict labels for test data
y_pred = classifier.predict(X_test.values)

In [272]:
# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         9
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Adaboost for multi-output

In [180]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
# Initialize the base classifier (Gradient Boosting Classifier)
base_classifier = AdaBoostClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

# Predict labels for test data
y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.3964757709251101
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.60      0.75        15
           2       0.89      0.73      0.80        11
           3       0.00      0.00      0.00         0
           4       1.00      1.00      1.00         1
           5       0.00      0.00      0.00         0
           6       0.80      0.80      0.80         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.50      0.67      0.57         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [326]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import AdaBoostClassifier
# Initialize the base classifier (Gradient Boosting Classifier)
base_classifier = AdaBoostClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

# Predict labels for test data
y_pred = classifier.predict(X_test.values)

threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))


Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00        15
           2       0.25      0.36      0.30        11
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.14      0.80      0.24         5
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.01      1.00      0.03         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### XGBoost for multi-output

In [81]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

# Predict labels for test data
y_pred = classifier.predict(X_test.values)

threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))

### emar

In [479]:
X_train.shape

(27540, 650)

In [480]:
y_train.shape

(27540, 240)

In [58]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "emar_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "emar_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [59]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [60]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [61]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [62]:
y_train = merged_df

In [63]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [64]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [65]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [66]:
# Order by hadm_id and drop 

In [67]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [68]:
y_test

Unnamed: 0,14,20,21,22,23,24,25,26,27,39,...,950,951,952,956,957,981,982,983,987,988
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6874,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6858,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### XGBoost for multi-output

In [331]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

# Predict labels for test data
y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.3261090909090909
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.80      0.32      0.45       111
           2       0.73      0.26      0.38        74
           3       0.89      0.39      0.54        62
           4       0.00      0.00      0.00         0
           5       0.88      0.30      0.45       117
           6       0.84      0.25      0.39       122
           7       0.60      0.33      0.43        18
           8       0.83      0.21      0.33        24
           9       0.67      0.20      0.31        10
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.67      0.29      0.40         7
          13       0.94      0.27      0.42       115
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.67      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [332]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.005964242424242424


### Admissions

In [492]:
X_train.shape

(185, 44)

In [493]:
y_train.shape

(185, 240)

In [481]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "admissions_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "admissions_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [482]:
X_train.head()

Unnamed: 0,hadm_id,ed_duration,admission_type_AMBULATORY OBSERVATION,admission_type_DIRECT EMER.,admission_type_DIRECT OBSERVATION,admission_type_ELECTIVE,admission_type_EU OBSERVATION,admission_type_EW EMER.,admission_type_OBSERVATION ADMIT,admission_type_SURGICAL SAME DAY ADMISSION,...,race_HISPANIC/LATINO - PUERTO RICAN,race_HISPANIC/LATINO - SALVADORAN,race_OTHER,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - OTHER EUROPEAN
0,24256866,0 days 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,26924951,0 days 00:00:00,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
2,23403708,0 days 00:00:00,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
3,22733922,0 days 00:00:00,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24928679,0 days 10:00:00,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [483]:
X_train['ed_duration']= X_train['ed_duration'].apply(convert_to_days)
X_test['ed_duration']= X_test['ed_duration'].apply(convert_to_days)

In [484]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [485]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [486]:
y_train = merged_df

In [487]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [488]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [489]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [490]:
# Order by hadm_id and drop 

In [491]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [346]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

100%|██████████| 100/100 [12:08<00:00,  7.29s/it]


In [411]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:13<00:00,  7.14it/s]

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [412]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.008680555555555556


### hcpcsevents

In [508]:
X_train.shape

(48, 13)

In [509]:
y_train.shape

(48, 1472)

For some reason, none of the hadm_ids from this df appear in the drgcodes table so using diagnoses instead 

In [494]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "hcpcsevents_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "hcpcsevents_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [495]:
# Convert strings to integers
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)
X_test['days_since_admission'] = X_test['days_since_admission'].str.split().str[0].astype(int)

In [496]:
X_train = X_train.drop(columns=['admittime'])
X_test = X_test.drop(columns=['admittime'])

In [497]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_diagnoses[df_diagnoses['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_diagnoses[df_diagnoses['hadm_id'].isin(test_ids)]

In [498]:
# df_diagnoses[df_diagnoses['hadm_id']==29654498]

In [499]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [500]:
y_train = merged_df

In [501]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [502]:
y_train

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20282368,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20457729,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,20846853,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20900955,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,21039249,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,21039249,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,22228639,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,22380825,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,22416954,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9,22502504,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [503]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [504]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [505]:
# Order by hadm_id and drop 

In [506]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [507]:
# Need to reduce from 13 to 9

In [549]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 9

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
X_train = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[0.43713458 0.24268566 0.10206493 0.05400726 0.03833628 0.02801852
 0.02066158 0.01412175 0.01418972]

 Amount of original variance conserved: 0.9512202809282059


In [550]:
# Number of desired features (components)
n_components = 9

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_test)
X_test = svd.transform(X_test)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[5.01651929e-01 1.96707388e-01 1.09078544e-01 4.81326185e-02
 4.55111394e-02 4.61980609e-02 4.72757336e-02 5.44458714e-03
 1.09168733e-33]

 Amount of original variance conserved: 1.0


#### XGBoost for multi-output

Don't run it again it took ages

In [552]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train, y_train)

100%|██████████| 100/100 [1:00:23<00:00, 36.23s/it]


In [554]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [01:29<00:00,  1.12it/s]

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [555]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.008308946488294314


### procedures_icd

In [521]:
X_train.shape

(560, 355)

In [522]:
y_train.shape

(560, 240)

In [510]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "procedures_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "procedures_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [511]:
# Convert strings to integers
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)
X_test['days_since_admission'] = X_test['days_since_admission'].str.split().str[0].astype(int)

In [512]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [513]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [514]:
y_train = merged_df

In [515]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [516]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [517]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [518]:
# Order by hadm_id and drop 

In [519]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [520]:
# Need to reduce from 355 to 115

In [593]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 115

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
X_train = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[3.08921484e-01 4.71046748e-01 2.01995355e-01 6.70578329e-04
 5.97555236e-04 4.47598195e-04 3.85753827e-04 2.94080361e-04
 2.93762187e-04 2.93053791e-04 2.82650969e-04 2.61389982e-04
 2.59750760e-04 2.48404704e-04 1.96052418e-04 1.96063673e-04
 1.96051526e-04 1.92877046e-04 1.87573604e-04 1.67152925e-04
 1.63386178e-04 1.63382172e-04 1.63385596e-04 1.62659035e-04
 1.57535338e-04 1.50846914e-04 1.30709913e-04 1.30709021e-04
 1.30709021e-04 1.29670173e-04 1.27668155e-04 1.23094801e-04
 9.80306790e-05 9.80322453e-05 9.80298799e-05 9.80300821e-05
 9.80297568e-05 9.80314221e-05 9.80265969e-05 9.80301643e-05
 9.80304148e-05 9.80318414e-05 9.80303455e-05 9.80297629e-05
 9.80294097e-05 9.68615570e-05 9.05856878e-05 8.41064144e-05
 6.53509947e-05 6.53475964e-05 6.53495782e-05 6.53439287e-05
 6.53481022e-05 6.53479094e-05 6.53468326e-05 6.53463851e-05
 6.53459390e-05 6.53457045e-05 6.53446750e-05 6.53434222e-05
 6.53424966e-05 6.53420766e-05 6.53405857e-05 6.53390422e-

In [594]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 115

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_test)
X_test = svd.transform(X_test)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[3.04270466e-01 4.81443421e-01 1.96010480e-01 9.20171180e-04
 6.58187181e-04 5.40965866e-04 5.28767772e-04 4.13855721e-04
 4.13860369e-04 4.13797776e-04 4.13856704e-04 4.04664219e-04
 4.01329206e-04 3.65571274e-04 2.75893539e-04 2.75902385e-04
 2.75878103e-04 2.75904762e-04 2.63580951e-04 2.45864272e-04
 2.23776991e-04 1.37952664e-04 1.37950848e-04 1.37950779e-04
 1.37954348e-04 1.37949150e-04 1.37949335e-04 1.37953645e-04
 1.37941902e-04 1.37954000e-04 1.37954367e-04 1.37953813e-04
 1.37949317e-04 1.37950997e-04 1.37952842e-04 1.37951813e-04
 1.37952394e-04 1.37954312e-04 1.37953069e-04 1.37954365e-04
 1.37953611e-04 1.37952069e-04 1.37951003e-04 1.37949557e-04
 1.37952880e-04 1.37953435e-04 1.37954368e-04 1.37952196e-04
 1.37954350e-04 1.37952010e-04 1.37950181e-04 1.37947873e-04
 1.37952278e-04 1.37948086e-04 1.37954272e-04 1.37954101e-04
 1.37954196e-04 1.37953724e-04 1.37951219e-04 1.37952263e-04
 1.37945215e-04 1.37950783e-04 1.37953745e-04 1.37939509e-

#### XGBoost for multi-output

In [595]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train, y_train)

100%|██████████| 100/100 [51:30<00:00, 30.90s/it]


In [596]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:21<00:00,  4.68it/s]

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         1
          16       0.00      0.00      0.00 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [597]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.008302919708029197


### services

In [533]:
X_train.shape

(221, 25)

In [534]:
y_train.shape

(221, 240)

In [523]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "services_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "services_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [524]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [525]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [526]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [527]:
y_train = merged_df

In [528]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [529]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [530]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [531]:
# Order by hadm_id and drop 

In [532]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [608]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

100%|██████████| 100/100 [09:51<00:00,  5.91s/it]


In [609]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:18<00:00,  5.37it/s]

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [610]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.008705357142857143


### transfers

In [546]:
X_train.shape

(815, 38)

In [547]:
y_train.shape

(815, 240)

In [535]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "transfers_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "transfers_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [536]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [537]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [538]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [539]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [540]:
y_train = merged_df

In [541]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [542]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [543]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [544]:
# Order by hadm_id and drop 

In [545]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [431]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [623]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:19<00:00,  5.05it/s]

Accuracy: 0.005128205128205128
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00    


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [624]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.00935897435897436


### Alternatives

In [433]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Initialize the decision tree classifier
decision_tree = DecisionTreeClassifier(max_depth=50)

# Initialize the multi-output classifier with the decision tree as the base estimator
multi_output_tree = MultiOutputClassifier(decision_tree, n_jobs=-1)


In [434]:
# Train the multi-output classifier
multi_output_tree.fit(X_train, y_train)

In [435]:

# Predict the outputs for the test set
y_pred = multi_output_tree.predict(X_test.values)

In [437]:
# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Now, both y_test.values and y_pred_multilabel should be in the multilabel format

In [438]:
# Hamming loss measures the fraction of labels that are incorrectly predicted, averaged over all samples

from sklearn.metrics import hamming_loss

# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test.values, y_pred_multilabel)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.015876068376068374


In [439]:
# Evaluate the classifier
accuracy = accuracy_score(y_test.values, y_pred_multilabel)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test.values, y_pred_multilabel))

Accuracy: 0.010256410256410256
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.40      0.33      0.36         6
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [440]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize a base classifier (Random Forest in this case)
base_classifier = RandomForestClassifier()

# Initialize the Classifier Chain classifier with the base classifier
classifier = ClassifierChain(classifier=base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)


In [441]:
y_pred = y_pred.toarray()

In [442]:
# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))

Accuracy: 0.035897435897435895
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         6
           6       0.00      0.00      0.00         3
           7       0.00      0.00      0.00         2
           8       0.00      0.00      0.00         2
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         1
          15       0.00      0.00      0.00         1
          16       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [409]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize a base classifier (Random Forest in this case)
base_classifier = RandomForestClassifier()

# Initialize the Classifier Chain classifier with the base classifier
classifier = ClassifierChain(classifier=base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)


In [414]:
y_pred = y_pred.toarray()

In [415]:
# Example thresholding (adjust threshold as needed)
threshold = 0.5
y_pred_multilabel = np.where(y_pred > threshold, 1, 0)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred_multilabel)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred_multilabel))

Accuracy: 0.3549222797927461
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.50      0.15      0.24        13
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       1.00      0.22      0.36         9
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       1.00      0.56      0.71         9
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [448]:
X_train.shape

(815, 38)

In [449]:
y_train.shape

(815, 240)

### icustays

In [559]:
X_train.shape

(112, 20)

In [560]:
y_train.shape

(112, 240)

In [548]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "icustays_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "icustays_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [549]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [550]:
X_train

Unnamed: 0,hadm_id,icu_los,days_since_admission,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Neuro Intermediate,first_careunit_Neuro Stepdown,first_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),...,first_careunit_Trauma SICU (TSICU),last_careunit_Cardiac Vascular Intensive Care Unit (CVICU),last_careunit_Coronary Care Unit (CCU),last_careunit_Medical Intensive Care Unit (MICU),last_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),last_careunit_Neuro Intermediate,last_careunit_Neuro Stepdown,last_careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),last_careunit_Surgical Intensive Care Unit (SICU),last_careunit_Trauma SICU (TSICU)
0,20291550,4.983889,0.444861,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,22205327,6.327037,0.093056,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,22429197,9.362049,0.036806,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,25926192,2.280752,1.699178,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,20199380,3.677384,0.992396,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,25809882,2.525509,0.060417,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
108,29642388,1.446539,0.001319,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
109,27417763,0.798125,0.090382,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
110,22580999,1.083877,1.065671,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [551]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [552]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [553]:
y_train = merged_df

In [554]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [555]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [556]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [557]:
# Order by hadm_id and drop 

In [558]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [637]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

100%|██████████| 100/100 [09:03<00:00,  5.44s/it]


In [638]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:15<00:00,  6.42it/s]

Accuracy: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.50      0.50      0.50         2
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00 


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [639]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.008779761904761905


### RAKEL

In [708]:
from skmultilearn.ensemble import RakelD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import accuracy_score, classification_report

# Initialize the base classifier (Random Forest)
base_classifier = RandomForestClassifier()

# Initialize the RAkEL classifier with the base classifier
classifier = RakelD(base_classifier, labelset_size=3)

for i in tqdm(range(100)):
# Train the classifier with both input features (X_train) and target labels (y_train)
    classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [16:12<00:00,  9.73s/it]


Accuracy: 0.03571428571428571
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       1.00      1.00      1.00         2
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### outputevents

In [572]:
X_train.shape

(7489, 42)

In [573]:
y_train.shape

(7489, 240)

In [561]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "output_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "output_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [562]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [563]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [564]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [565]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [566]:
y_train = merged_df

In [567]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [568]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [569]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [570]:
# Order by hadm_id and drop 

In [571]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [664]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

100%|██████████| 100/100 [49:00<00:00, 29.40s/it]


In [667]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:54<00:00,  1.84it/s]

Accuracy: 0.17458622530699414
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.96      0.48      0.64       112
           2       0.87      0.42      0.57        97
           3       0.00      0.00      0.00        15
           4       0.00      0.00      0.00         2
           5       0.52      0.30      0.38        37
           6       0.74      0.35      0.47        49
           7       0.00      0.00      0.00         9
           8       0.33      0.20      0.25         5
           9       0.00      0.00      0.00         0
          10       1.00      0.18      0.31        11
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         0
          13       0.50      0.04      0.08        23
          14       0.00      0.00      0.00         5
          15       1.00      0.18      0.31        11
          16       0.00     


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [668]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.007677077771845524


### procedureevents

In [585]:
X_train.shape

(1174, 162)

In [586]:
y_train.shape

(1174, 240)

In [69]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "procedure_events_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "procedure_events_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [70]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [71]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [72]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [73]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [74]:
y_train = merged_df

In [75]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [76]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [77]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [78]:
# Order by hadm_id and drop 

In [79]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [80]:
y_test

Unnamed: 0,14,20,21,22,23,24,25,26,27,39,...,950,951,952,956,957,981,982,983,987,988
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### XGBoost for multi-output

In [53]:
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [54]:
y_pred = classifier.predict(X_test.values)

In [55]:
y_test

Unnamed: 0,14,20,21,22,23,24,25,26,27,39,...,950,951,952,956,957,981,982,983,987,988
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
290,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
292,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [651]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:28<00:00,  3.46it/s]

Accuracy: 0.7244897959183674
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.95      0.90      0.93        21
           2       0.80      0.80      0.80        10
           3       1.00      0.25      0.40         4
           4       0.00      0.00      0.00         1
           5       1.00      0.75      0.86         8
           6       1.00      1.00      1.00        13
           7       1.00      0.75      0.86         4
           8       1.00      1.00      1.00         1
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         2
          16       0.00      


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [682]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

100%|██████████| 100/100 [00:25<00:00,  3.97it/s]

Accuracy: 0.7244897959183674
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.95      0.90      0.93        21
           2       0.80      0.80      0.80        10
           3       1.00      0.25      0.40         4
           4       0.00      0.00      0.00         1
           5       1.00      0.75      0.86         8
           6       1.00      1.00      1.00        13
           7       1.00      0.75      0.86         4
           8       1.00      1.00      1.00         1
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         2
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          13       1.00      1.00      1.00         3
          14       1.00      1.00      1.00         1
          15       0.00      0.00      0.00         2
          16       0.00      


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [683]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

Hamming Loss: 0.0025085034013605442


## Took too long to train

### poe

In [597]:
X_train.shape

(35022, 1458)

In [598]:
y_train.shape

(35022, 240)

In [587]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "poe_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "poe_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [588]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [589]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [590]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [591]:
y_train = merged_df

In [592]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [593]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [594]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [595]:
# Order by hadm_id and drop 

In [596]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [566]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

  0%|          | 0/100 [00:00<?, ?it/s]Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x000001DADD82D150>>
Traceback (most recent call last):
  File "C:\Users\jenni\anaconda3\Lib\site-packages\xgboost\core.py", line 589, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 
  0%|          | 0/100 [02:53<?, ?it/s]


XGBoostError: [10:58:32] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-0b3782d1791676daf-1\xgboost\xgboost-ci-windows\src\data\iterative_dmatrix.cc:231: Check failed: accumulated_rows == Info().num_row_ (70044 vs. 35022) : 

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

###  prescriptions

In [610]:
X_train.shape

(14097, 4890)

In [611]:
y_train.shape

(14097, 240)

In [599]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "prescriptions_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "prescriptions_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [600]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [601]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [602]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [603]:
y_train = merged_df

In [604]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [605]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [606]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [607]:
# Order by hadm_id and drop 

In [608]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [609]:
# Need to reduce from 4890 to 2874 or less

In [578]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 2874

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
X_train = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[9.99925217e-01 4.51264065e-05 4.18328590e-06 ... 2.25190037e-36
 5.20660165e-36 2.40845222e-36]

 Amount of original variance conserved: 0.9999999999999991


In [579]:
# Number of desired features (components)
n_components = 2874

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_test)
X_test = svd.transform(X_test)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[9.99927933e-01 4.23148006e-05 4.19434751e-06 ... 7.03449370e-38
 1.14052715e-35 7.76366226e-38]

 Amount of original variance conserved: 0.9999999999999996


#### XGBoost for multi-output

In [581]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train, y_train)

  0%|          | 0/100 [02:45<?, ?it/s]


KeyboardInterrupt: 

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### pharmacy

In [622]:
X_train.shape

(11983, 766)

In [623]:
y_train.shape

(11983, 240)

In [612]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "pharmacy_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "pharmacy_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [613]:
X_train['medication_duration']= X_train['medication_duration'].apply(convert_to_days)
X_test['medication_duration']= X_test['medication_duration'].apply(convert_to_days)
# Convert strings to integers
X_train['verification_delay'] = X_train['verification_delay'].str.split().str[0].astype(int)
X_test['verification_delay'] = X_test['verification_delay'].str.split().str[0].astype(int)

In [614]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [615]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [616]:
y_train = merged_df

In [617]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [618]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [619]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [620]:
# Order by hadm_id and drop 

In [621]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [483]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

  0%|          | 0/100 [05:53<?, ?it/s]


KeyboardInterrupt: 

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### inputevents

In [635]:
X_train.shape

(16323, 221)

In [636]:
y_train.shape

(16323, 240)

In [624]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "input_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "input_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [625]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [626]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [627]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [628]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [629]:
y_train = merged_df

In [630]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [631]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [632]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [633]:
# Order by hadm_id and drop 

In [634]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [652]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

 58%|█████▊    | 58/100 [2:32:20<1:50:19, 157.60s/it]  


KeyboardInterrupt: 

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

## Not enough memory

### ingredientevents

In [None]:
X_train.shape

In [None]:
y_train.shape

In [637]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "ingredient_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "ingredient_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

MemoryError: Unable to allocate 1.18 GiB for an array with shape (7723, 20582) and data type int64

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

In [None]:
# Need to reduce from 7727 to 4116

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 4116

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
X_train = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 4116

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_test)
X_test = svd.transform(X_test)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### chartevents

In [666]:
# path = "C:/Users/jenni/OneDrive/Desktop/IP/"

# file = "chart_data_train.csv"
# full_path = path + file
# X_train = pd.read_csv(full_path)

# file = "chart_data_test.csv"
# full_path = path + file
# X_test = pd.read_csv(full_path)

In [None]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [323]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [324]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [325]:
y_train = merged_df

In [326]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [327]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [328]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [329]:
# Order by hadm_id and drop 

In [330]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### emar_detail

In [500]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "emar_detail_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "emar_detail_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

MemoryError: Unable to allocate 701. MiB for an array with shape (1594, 57614) and data type int64

In [665]:
# # Initialize an empty list to store the chunks
# chunks = []

# # Define the chunk size (number of rows to read at a time)
# chunksize = 1000  # Adjust as needed based on your system's memory

# file = "emar_detail_data_train.csv"
# full_path = path + file

# for chunk in pd.read_csv(full_path, chunksize=chunksize):
#     chunks.append(chunk)

# # Concatenate the chunks into a single DataFrame
# X_train = pd.concat(chunks, ignore_index=True)

# file = "emar_detail_data_test.csv"
# full_path = path + file

# for chunk in pd.read_csv(full_path, chunksize=chunksize):
#     chunks.append(chunk)

# # Concatenate the chunks into a single DataFrame
# X_test = pd.concat(chunks, ignore_index=True)

In [323]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [324]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [325]:
y_train = merged_df

In [326]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [327]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [328]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [329]:
# Order by hadm_id and drop 

In [330]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
for i in tqdm(range(100)):
    classifier.fit(X_train.values, y_train.values)

In [None]:
for i in tqdm(range(100)):
    y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Calculate Hamming Loss
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### labevents - can't allocate memory

In [121]:
# Can't allocate memory this way 

# path = "C:/Users/jenni/OneDrive/Desktop/IP/"

# file = "labevents_data_train.csv"
# full_path = path + file
# X_train = pd.read_csv(full_path)

# file = "labevents_data_test.csv"
# full_path = path + file
# X_test = pd.read_csv(full_path)

In [122]:
path = "C:/Project/Data/"

In [123]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

In [124]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [125]:
# Make a feature for days_since_admission using charttime - admittime

# Convert to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# Fill any non time values
df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [126]:
# Add storetime - charttime feature called delay

# Convert to datetime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# Fill any non time values
df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

In [127]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [128]:
# For flag make abnormal = 1 and fill Null with 0
df_labevents['flag'] = df_labevents['flag'].fillna(0)
df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [129]:
# For priority fill Null with N/A and then one hot encode
df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [130]:
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [131]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()
# Reduced from 107727 rows to 66660

In [132]:
df_labevents = df_labevents.drop(columns=['admittime'])

In [133]:
df_labevents.head()

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
0,29600294.0,15.4,15.4,10.5,15.5,0,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
1,29600294.0,3.35,3.35,4.6,6.1,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
2,29600294.0,49.7,49.7,35.1,46.3,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
3,29600294.0,20.3,20.3,4.0,10.0,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
4,29600294.0,31.1,31.1,32.0,37.0,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [134]:
data = df_labevents

# Split the dataset into training and testing sets
labevents_data_train, labevents_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", labevents_data_train.shape)
print("Testing set shape:", labevents_data_test.shape)

Training set shape: (53328, 11680)
Testing set shape: (13332, 11680)


In [135]:
data = labevents_data_train.drop(columns=['hadm_id'])
data = data.reset_index(drop=True)

In [136]:
labevents_data_train.head()

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
99267,26793610.0,19.0,19.0,22.0,32.0,1,0 days 04:36:00,0 days 02:53:00,0,1,...,0,0,0,0,0,0,0,0,0,0
106026,28998349.0,17.7,17.7,10.5,15.5,1,7 days 03:35:00,0 days 00:10:00,0,1,...,0,0,0,0,0,0,0,0,0,0
44791,22490490.0,0.0,134.0,70.0,100.0,1,5 days 15:25:00,0 days 02:06:00,0,1,...,0,0,0,0,0,0,0,0,0,0
83275,28258130.0,34.0,34.0,35.0,45.0,1,10 days 23:24:00,0 days 00:10:00,1,0,...,0,0,0,0,0,0,0,0,0,0
100330,22205327.0,2.4,2.4,1.6,2.6,0,9 days 09:55:00,0 days 01:58:00,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# # target = df_diagnoses (rows where hadm_id is in train data)
# # Make sure the order is the same

# # Extract the unique IDs from the column 
# train_ids = labevents_data_train['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_train = df_diagnoses[df_diagnoses['hadm_id'].isin(train_ids)]

# test_ids = labevents_data_test['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_test = df_diagnoses[df_diagnoses['hadm_id'].isin(test_ids)]

In [139]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = labevents_data_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = labevents_data_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [140]:
merged_df = pd.merge(y_train, labevents_data_train['hadm_id'], on='hadm_id')

In [141]:
y_train = merged_df
y_train

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52756,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52757,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52758,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52759,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
merged_df = pd.merge(y_test, labevents_data_test['hadm_id'], on='hadm_id')

y_test = merged_df
y_test

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13194,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13195,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13196,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13197,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
X_train = labevents_data_train
X_test = labevents_data_test

In [144]:
# X_train['delay']= X_train['delay'].astype(str)
# X_train['delay']= X_train['delay'].apply(convert_to_days)
# X_train['days_since_admission'] = X_train['days_since_admission'].astype(str)
# X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)

# X_train = X_train.drop(columns=['hadm_id'])
# X_train = X_train.reset_index(drop=True)

# y_train = y_train.drop(columns=['hadm_id'])
# y_train = y_train.reset_index(drop=True)

In [145]:
X_train

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
99267,26793610.0,19.0,19.0,22.0,32.0,1,0 days 04:36:00,0 days 02:53:00,0,1,...,0,0,0,0,0,0,0,0,0,0
106026,28998349.0,17.7,17.7,10.5,15.5,1,7 days 03:35:00,0 days 00:10:00,0,1,...,0,0,0,0,0,0,0,0,0,0
44791,22490490.0,0.0,134.0,70.0,100.0,1,5 days 15:25:00,0 days 02:06:00,0,1,...,0,0,0,0,0,0,0,0,0,0
83275,28258130.0,34.0,34.0,35.0,45.0,1,10 days 23:24:00,0 days 00:10:00,1,0,...,0,0,0,0,0,0,0,0,0,0
100330,22205327.0,2.4,2.4,1.6,2.6,0,9 days 09:55:00,0 days 01:58:00,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58478,28872262.0,3.9,3.9,3.3,5.1,0,8 days 18:53:00,0 days 01:00:00,0,1,...,0,0,0,0,0,0,0,0,0,0
9921,21476294.0,100.0,100.0,96.0,108.0,0,21 days 10:06:00,0 days 01:08:00,0,0,...,0,0,0,0,0,0,0,0,0,0
89565,26467376.0,68.5,68.5,9.4,12.5,1,0 days 14:02:00,0 days 01:28:00,0,1,...,0,0,0,0,0,0,0,0,0,0
1419,21476294.0,0.0,0.0,0.0,0.0,0,0 days 12:06:00,0 days 02:15:00,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
X_train['delay']= X_train['delay'].astype(str)
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_train['days_since_admission'] = X_train['days_since_admission'].astype(str)
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)

X_train = X_train.drop(columns=['hadm_id'])
X_train = X_train.reset_index(drop=True)

y_train = y_train.drop(columns=['hadm_id'])
y_train = y_train.reset_index(drop=True)

In [148]:
# sparse matrices to address memory issue

from scipy.sparse import csr_matrix

X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

#### Dimensionality reduction - not enough memory

In [39]:
# Need to reduce from 11681 to 10665

In [40]:
# data['delay']= data['delay'].astype(str)
# data['delay']= data['delay'].apply(convert_to_days)
# data['days_since_admission'] = data['days_since_admission'].astype(str)
# data['days_since_admission'] = data['days_since_admission'].str.split().str[0].astype(int)

In [41]:
# from sklearn.decomposition import TruncatedSVD

# # Number of desired features (components)
# n_components = 10665

# # Initialize Truncated SVD with the desired number of components
# svd = TruncatedSVD(n_components=n_components)

# # Fit the Truncated SVD model to the sparse matrix and transform the data
# svd.fit(data)
# data = svd.transform(data)

# # Get the explained variance ratio (how much variance is explained by each component)
# explained_variance_ratio = svd.explained_variance_ratio_

# # Print the transformed matrix and explained variance ratio
# # print("Transformed Matrix:")
# # print(transformed_matrix)
# print("\nExplained Variance Ratio:")
# print(explained_variance_ratio)

# print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [323]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [324]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [325]:
y_train = merged_df

In [326]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [327]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [328]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [329]:
# Order by hadm_id and drop 

In [330]:
y_train = y_train.sort_values(by='hadm_id')
y_train = y_train.drop(columns=['hadm_id'])

y_test = y_test.sort_values(by='hadm_id')
y_test = y_test.drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id')
X_train = X_train.drop(columns=['hadm_id'])

X_test = X_test.sort_values(by='hadm_id')
X_test = X_test.drop(columns=['hadm_id'])

#### Multi-output decision tree

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Initialize the decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42, max_depth=50)

# Initialize the multi-output classifier with the decision tree as the base estimator
multi_output_tree = MultiOutputClassifier(decision_tree, n_jobs=-1)


In [49]:
# Train the multi-output classifier
multi_output_tree.fit(X_train_sparse, y_train)

# Predict the outputs for the test set
y_pred = multi_output_tree.predict(X_test_sparse)

# # Evaluate the accuracy of the multi-output classifier
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [50]:
from sklearn.metrics import classification_report

# Assuming y_true and y_pred are your true and predicted labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.05      0.00      0.01       370
           1       0.00      0.00      0.00       112
           2       0.00      0.00      0.00       101
           3       0.00      0.00      0.00       137
           4       0.08      0.00      0.01       504
           5       0.00      0.00      0.00        85
           6       0.00      0.00      0.00       184
           7       0.00      0.00      0.00        36
           8       0.00      0.00      0.00       322
           9       0.00      0.00      0.00        27
          10       0.00      0.00      0.00       123
          11       0.00      0.00      0.00       352
          12       0.00      0.00      0.00        72
          13       0.00      0.00      0.00        30
          14       0.00      0.00      0.00        46
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00        32
          17       0.03    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# KNN

Training time is fine but testing would take like 16 hours to predict 10 samples

In [86]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Create a MultiOutputClassifier to handle multi-output classification
multi_output_knn = MultiOutputClassifier(knn_classifier)

for i in tqdm(range(10)):
    multi_output_knn.fit(X_train, y_train)


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:18<00:00,  1.85s/it]


In [681]:
for i in tqdm(range(1000)):
    y_pred = multi_output_knn.predict(X_test)