#### Notes from phase 1

In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os
from tqdm import tqdm

# Report the library versions this notebook was run with
print(f"Numpy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Scikit version: {sk.__version__}")

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report, hamming_loss
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler

In [3]:
def convert_to_days(duration_str):
    """Convert a duration string such as '22 days 20:55:00' into fractional days.

    Parameters
    ----------
    duration_str : str, None or float NaN
        Text of the form '<d> days hh:mm:ss'. The singular '1 day 02:00:00',
        an optional comma after the day part, fractional seconds and signed
        parts (e.g. '-1 days +23:00:00') are accepted as well. Missing values
        (None or NaN, e.g. empty CSV cells passed in by Series.apply) are
        passed through instead of raising.

    Returns
    -------
    float
        days + hours/24 + minutes/1440 + seconds/86400, or NaN for a missing input.

    Raises
    ------
    ValueError
        If the text does not match the '<d> day(s) hh:mm:ss' layout.
    """
    import re  # local import so the cell does not depend on any other cell

    # NaN is the only float that is not equal to itself
    if duration_str is None or (isinstance(duration_str, float) and duration_str != duration_str):
        return float('nan')

    match = re.fullmatch(
        r'\s*([+-]?\d+(?:\.\d+)?)\s+days?,?\s+([+-]?\d+):(\d+):(\d+(?:\.\d+)?)\s*',
        str(duration_str),
    )
    if match is None:
        raise ValueError(
            "Cannot parse duration {!r}; expected '<d> days hh:mm:ss'".format(duration_str)
        )

    # Groups are: days, hours, minutes, seconds
    days, hours, minutes, seconds = (float(part) for part in match.groups())
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

In [4]:
path = "C:/Project/Data/"

In [5]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

# Parse both timestamp columns (day/month/year hour:minute text) into datetimes
for time_col in ('dischtime', 'admittime'):
    df_admissions[time_col] = pd.to_datetime(df_admissions[time_col], format='%d/%m/%Y %H:%M')

# Lookup table: admission time for each hospital admission id
df_admittime = df_admissions[['hadm_id', 'admittime']].copy()

In [6]:
file = "hosp/diagnoses_icd.csv"
full_path = path + file

df_diagnoses = pd.read_csv(full_path)

In [7]:
df_diagnoses = df_diagnoses.drop(columns=['subject_id','seq_num','icd_version'])

In [8]:
# One indicator column per ICD code (uint8 so the values are 0/1 on every pandas version)
one_hot_encoded = pd.get_dummies(df_diagnoses['icd_code'], dtype=np.uint8)

df_encoded = pd.concat([df_diagnoses[['hadm_id']], one_hot_encoded], axis=1)

# Collapse to one row per admission. max() keeps every entry a 0/1 indicator even
# when an admission lists the same code on more than one row (sum() would give a count).
# Every admission has a value for every code column, so no NaN can appear here and
# no fillna is needed.
df_aggregated = df_encoded.groupby('hadm_id').max().reset_index()

In [9]:
df_diagnoses = df_aggregated
df_diagnoses

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,29820177,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
273,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
file = "hosp/drgcodes.csv"
full_path = path + file

df_drgcodes = pd.read_csv(full_path)

In [11]:
df_drgcodes['hadm_id'].value_counts()

22187210    2
27505812    2
25926192    2
27089790    2
24490144    2
           ..
22539296    1
20385771    1
20199380    1
20973395    1
23559586    1
Name: hadm_id, Length: 233, dtype: int64

In [12]:
df_drgcodes.head()

Unnamed: 0,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,10004235,22187210,HCFA,864,FEVER,,
1,10026255,22059910,HCFA,180,RESPIRATORY NEOPLASMS W MCC,,
2,10032725,20611640,HCFA,54,NERVOUS SYSTEM NEOPLASMS W MCC,,
3,10005866,21636229,HCFA,393,OTHER DIGESTIVE SYSTEM DIAGNOSES W MCC,,
4,10008454,20291550,HCFA,956,"LIMB REATTACHMENT, HIP & FEMUR PROC FOR MULTIP...",,


In [13]:
df_drgcodes = df_drgcodes.drop(columns=['subject_id','drg_type','description','drg_severity','drg_mortality'])

In [14]:
# One indicator column per DRG code (uint8 so the values are 0/1 on every pandas version)
one_hot_encoded = pd.get_dummies(df_drgcodes['drg_code'], dtype=np.uint8)

df_encoded = pd.concat([df_drgcodes[['hadm_id']], one_hot_encoded], axis=1)

# Collapse to one row per admission. Some admissions have two DRG rows; max() keeps
# every entry a 0/1 indicator even if both rows carry the same code (sum() would give 2),
# which is what a multi-label target needs.
# Every admission has a value for every code column, so no NaN can appear here and
# no fillna is needed.
df_aggregated = df_encoded.groupby('hadm_id').max().reset_index()

In [15]:
df_drgcodes = df_aggregated
df_drgcodes

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,29802992,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### microbiologyevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "microbio_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "microbio_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [None]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [None]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df
y_train

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [None]:
# Number of samples and labels

In [None]:
y_train.shape, y_test.shape

In [None]:
# Number of unique label classes

In [None]:
# Each distinct row of the label matrix is one combination of label indicators
num_unique_rows = len(y_train.drop_duplicates())
print("Number of unique rows in y_train:", num_unique_rows)

num_unique_rows = len(y_test.drop_duplicates())
print("Number of unique rows in y_test:", num_unique_rows)

In [None]:
# Number of features

In [None]:
X_train.shape[1]

There are several ways to measure a classifier’s generalization quality:

Hamming loss measures how well the classifier predicts each of the labels, averaged over samples, then over labels
accuracy score measures how well the classifier predicts label combinations, averaged over samples

jaccard similarity measures the proportion of predicted labels for a sample to its correct assignment, averaged over samples

precision measures how many of the labels predicted as positive are actually correct,

recall measures how many of the truly positive labels were correctly predicted,

F1 score measures a weighted average of precision and recall, where both have the same impact on the score

### emar - ignore (medication comes after diagnosis)

### outputevents

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "output_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "output_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [None]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (one copy of the base model is fitted per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train).
# A single call is enough: fit() retrains from scratch every time it is called,
# so repeating it only multiplies the run time.
classifier.fit(X_train.values, y_train.values)

In [None]:
# Predict once: a fitted model returns the same predictions on every call,
# so repeating predict() only costs time.
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Calculate Hamming Loss: the fraction of individual label entries that were
# predicted incorrectly (hamming_loss is provided by sklearn.metrics; lower is better)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### procedureevents

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "procedure_events_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "procedure_events_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (MultiOutputClassifier is provided by sklearn.multioutput; it fits one copy of the
# base model per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [None]:
# Predict once: a fitted model returns the same predictions on every call,
# so repeating predict() only costs time.
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Predict once: a fitted model returns the same predictions on every call,
# so repeating predict() only costs time.
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Calculate Hamming Loss: the fraction of individual label entries that were
# predicted incorrectly (hamming_loss is provided by sklearn.metrics; lower is better)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### poe

In [16]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "poe_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "poe_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [17]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [18]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [19]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [20]:
y_train = merged_df

In [21]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [22]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [23]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [24]:
# Order by hadm_id and drop 

In [25]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [27]:
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.multioutput import MultiOutputClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (it fits one copy of the base model per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [29]:
# Predict once: a fitted model returns the same predictions on every call,
# so repeating predict() only costs time.
y_pred = classifier.predict(X_test.values)



In [31]:
from sklearn.metrics import accuracy_score, classification_report
# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8722019186843307
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.94      0.97        34
           1       0.98      0.91      0.94       192
           2       0.99      0.82      0.90       218
           3       1.00      0.70      0.82        33
           4       1.00      0.72      0.84        65
           5       0.99      0.88      0.93       116
           6       1.00      0.97      0.99       136
           7       0.95      0.90      0.93        70
           8       0.95      0.79      0.86        24
           9       1.00      0.60      0.75        10
          10       1.00      1.00      1.00        18
          11       1.00      0.89      0.94        18
          12       1.00      1.00      1.00        30
          13       0.99      0.84      0.91       112
          14       0.96      0.79      0.87        29
          15       1.00      1.00      1.00        18
          16       0.90      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Calculate Hamming Loss: the fraction of individual label entries that were
# predicted incorrectly (hamming_loss is provided by sklearn.metrics; lower is better)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

###  prescriptions

In [45]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "prescriptions_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "prescriptions_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [46]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [47]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [48]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [49]:
y_train = merged_df

In [50]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [51]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [52]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [53]:
# Order by hadm_id and drop 

In [54]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [42]:
# Need to reduce from 4890 to 2874 or less

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 2874

# Initialize Truncated SVD with the desired number of components.
# random_state pins the randomized solver so re-running the cell gives the same projection.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the decomposition on the training features only, then project them
svd.fit(X_train)
# Keep the result as a DataFrame (same row index) so later cells that call
# X_train.values keep working; transform() on its own returns a bare ndarray.
X_train = pd.DataFrame(svd.transform(X_train), index=X_train.index)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
# Project the test features with the SVD already fitted on the training data.
# Fitting a second SVD on X_test would yield a different set of components, so the
# test columns would no longer correspond to the features the classifier is trained on.
# Keep the result as a DataFrame (same row index) so later cells that call
# X_test.values keep working.
X_test = pd.DataFrame(svd.transform(X_test), index=X_test.index)

# The explained variance ratio belongs to the fitted (training) decomposition
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### XGBoost for multi-output

In [55]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (MultiOutputClassifier is provided by sklearn.multioutput; it fits one copy of the
# base model per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [56]:
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.2602040816326531
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.42      0.53        19
           1       0.58      0.16      0.26        67
           2       0.75      0.26      0.38        70
           3       1.00      0.50      0.67         6
           4       1.00      0.19      0.32        21
           5       0.76      0.36      0.49        36
           6       0.81      0.46      0.58        57
           7       1.00      0.25      0.40        16
           8       0.62      0.50      0.55        16
           9       1.00      0.60      0.75         5
          10       0.60      0.38      0.46         8
          11       0.50      0.27      0.35        11
          12       0.00      0.00      0.00         6
          13       0.67      0.16      0.25        51
          14       0.60      0.14      0.23        21
          15       0.60      0.38      0.46         8
          16       0.69      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### pharmacy

In [58]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "pharmacy_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "pharmacy_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [59]:
X_train['medication_duration']= X_train['medication_duration'].apply(convert_to_days)
X_test['medication_duration']= X_test['medication_duration'].apply(convert_to_days)
# Convert strings to integers
X_train['verification_delay'] = X_train['verification_delay'].str.split().str[0].astype(int)
X_test['verification_delay'] = X_test['verification_delay'].str.split().str[0].astype(int)

In [60]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [61]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [62]:
y_train = merged_df

In [63]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [64]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [65]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [66]:
# Order by hadm_id and drop 

In [67]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (MultiOutputClassifier is provided by sklearn.multioutput; it fits one copy of the
# base model per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train)
classifier.fit(X_train.values, y_train.values)

In [None]:
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

### inputevents

In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "input_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "input_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (one copy of the base model is fitted per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train).
# A single call is enough: fit() retrains from scratch every time it is called,
# so repeating it only multiplies the run time.
classifier.fit(X_train.values, y_train.values)

In [None]:
# Predict once: a fitted model returns the same predictions on every call,
# so repeating predict() only costs time.
y_pred = classifier.predict(X_test.values)

# Evaluate performance (for multi-label targets a row counts as correct only if
# every label in it matches)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
# zero_division=0 reports 0 for labels that have no predicted/true samples without
# emitting an UndefinedMetricWarning for each of them.
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

In [None]:
# Calculate Hamming Loss: the fraction of individual label entries that were
# predicted incorrectly (hamming_loss is provided by sklearn.metrics; lower is better)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

## Not enough memory

### ingredientevents - ignore, not relevant to diagnosis

### chartevents

In [None]:
# path = "C:/Users/jenni/OneDrive/Desktop/IP/"

# file = "chart_data_train.csv"
# full_path = path + file
# X_train = pd.read_csv(full_path)

# file = "chart_data_test.csv"
# full_path = path + file
# X_test = pd.read_csv(full_path)

In [None]:
# NOTE(review): the cell that would load chart_data_train/test.csv is commented out, so
# X_train / X_test here are whatever an earlier section left in the kernel — confirm
# they actually contain a 'delay' column before running this.
# Convert the 'delay' duration strings to fractional days
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

In [None]:
# Target = the per-admission DRG-code indicator rows (df_drgcodes) for the admissions
# that appear in the feature data
# Make sure the order is the same

# Extract the unique IDs from the column
train_ids = X_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = X_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in test_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
merged_df = pd.merge(y_train, X_train['hadm_id'], on='hadm_id', how='inner')

In [None]:
y_train = merged_df

In [None]:
merged_df = pd.merge(y_test, X_test['hadm_id'], on='hadm_id', how='inner')
y_test = merged_df

In [None]:
valid_ids = y_train['hadm_id'].unique()
X_train = X_train[X_train['hadm_id'].isin(valid_ids)]

In [None]:
valid_ids = y_test['hadm_id'].unique()
X_test = X_test[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Order by hadm_id and drop 

In [None]:
# Put labels and features in the same hadm_id order, then discard the id column
# so only label / feature columns remain.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
y_test = y_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_test = X_test.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### XGBoost for multi-output

In [None]:
import xgboost as xgb
from catboost import CatBoostClassifier

# Default-parameter XGBoost classifier used as the per-label model
base_classifier = xgb.XGBClassifier()

# Initialize the multi-output classifier with the base classifier
# (one copy of the base model is fitted per column of y_train)
classifier = MultiOutputClassifier(base_classifier)

# Train the classifier with both input features (X_train) and target labels (y_train).
# A single call is enough: fit() retrains from scratch every time it is called,
# so repeating it only multiplies the run time.
classifier.fit(X_train.values, y_train.values)

In [None]:
# Predict once: the fitted model returns the same predictions on every call,
# so repeating predict() in a loop only wasted time.
y_pred = classifier.predict(X_test.values)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [None]:
# Hamming loss between the true and predicted label matrices
# (presumably sklearn.metrics.hamming_loss — imported elsewhere in the notebook)
hamming_loss_value = hamming_loss(y_test, y_pred)

print("Hamming Loss:", hamming_loss_value)

### emar_detail - ignore (medication comes after diagnosis)

### labevents - can't allocate memory

In [None]:
# Can't allocate memory this way 

# path = "C:/Users/jenni/OneDrive/Desktop/IP/"

# file = "labevents_data_train.csv"
# full_path = path + file
# X_train = pd.read_csv(full_path)

# file = "labevents_data_test.csv"
# full_path = path + file
# X_test = pd.read_csv(full_path)

In [None]:
# Root directory of the raw CSV files; relative file names below are appended to it
path = "C:/Project/Data/"

In [None]:
# Read the complete lab events table into memory
file = "hosp/labevents.csv"
full_path = f"{path}{file}"
df_labevents = pd.read_csv(full_path)

In [None]:
# Non-numeric (and missing) results become NaN under errors='coerce' and are then set to 0
df_labevents['value'] = df_labevents['value'].pipe(pd.to_numeric, errors='coerce').fillna(0)

In [None]:
# Build days_since_admission = charttime - admittime.
# The column holds a Timedelta at this stage, not yet a number of days.

# Parse charttime so it can be subtracted from admittime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Left join keeps every lab event and attaches its admission's admittime
df_labevents = pd.merge(df_labevents, df_admittime, on='hadm_id', how='left')

# Where either timestamp is missing the difference is NaT; use a zero-length
# duration for those rows instead
df_labevents['days_since_admission'] = (
    df_labevents['charttime'] - df_labevents['admittime']
).fillna(pd.Timedelta(0))

In [None]:
# Build delay = storetime - charttime (a Timedelta at this stage).

# Parse storetime so it can be compared with the already-parsed charttime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

# A missing timestamp gives NaT; use a zero-length duration for those rows
df_labevents['delay'] = (
    df_labevents['storetime'] - df_labevents['charttime']
).fillna(pd.Timedelta(0))

In [None]:
# Remove identifiers, free text and the raw timestamps (already folded into
# the two duration features)
df_labevents = df_labevents.drop(
    columns=[
        'labevent_id',
        'subject_id',
        'order_provider_id',
        'charttime',
        'storetime',
        'comments',
    ]
)

In [None]:
# Encode flag: missing -> 0, 'abnormal' -> 1
df_labevents['flag'] = df_labevents['flag'].fillna(0).replace('abnormal', 1)

In [None]:
# Missing priority becomes its own 'N/A' category, then the column is one-hot encoded
df_labevents = pd.get_dummies(
    df_labevents.assign(priority=df_labevents['priority'].fillna('N/A')),
    columns=['priority'],
)

In [None]:
# One-hot encode the remaining categorical columns.
# NOTE(review): specimen_id looks like an identifier; encoding it may add a
# column per distinct specimen and could be behind the memory problems noted
# for this section — confirm it is meant to be a feature.
df_labevents = pd.get_dummies(
    data=df_labevents,
    columns=['valueuom', 'specimen_id', 'itemid'],
)

In [None]:
# Keep only rows in which no column is null
# (recorded result: 107727 rows reduced to 66660)
df_labevents = df_labevents.dropna(axis=0, how='any')

In [None]:
# admittime was only needed to derive days_since_admission
df_labevents = df_labevents.drop('admittime', axis=1)

In [None]:
# Preview the first rows of the encoded lab events frame
df_labevents.head()

#### Split into train and test

In [None]:
data = df_labevents

# 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): the split is over individual lab-event rows, so events from one
# hadm_id can land in both sets — confirm that is acceptable for labels that
# are looked up per hadm_id.
labevents_data_train, labevents_data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

# Report the size of each partition
print("Training set shape:", labevents_data_train.shape)
print("Testing set shape:", labevents_data_test.shape)

In [None]:
# Training rows without the admission identifier, re-indexed from 0
data = labevents_data_train.drop(columns=['hadm_id']).reset_index(drop=True)

In [None]:
# Preview the training partition (still includes hadm_id)
labevents_data_train.head()

In [None]:
# # target = df_diagnoses (rows where hadm_id is in train data)
# # Make sure the order is the same

# # Extract the unique IDs from the column 
# train_ids = labevents_data_train['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_train = df_diagnoses[df_diagnoses['hadm_id'].isin(train_ids)]

# test_ids = labevents_data_test['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_test = df_diagnoses[df_diagnoses['hadm_id'].isin(test_ids)]

In [None]:
# Targets are taken from df_drgcodes: for each partition keep only the label
# rows whose hadm_id occurs in that partition.
train_ids = labevents_data_train['hadm_id'].unique()
test_ids = labevents_data_test['hadm_id'].unique()

# Label rows for admissions present in the training partition
y_train = df_drgcodes.loc[df_drgcodes['hadm_id'].isin(train_ids)]

# Label rows for admissions present in the test partition
y_test = df_drgcodes.loc[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
# Default (inner) join on hadm_id: each label row is repeated once per
# training row that carries the same hadm_id.
merged_df = y_train.merge(labevents_data_train['hadm_id'], on='hadm_id')

In [None]:
# The expanded label frame becomes the training target; display it for inspection
y_train = merged_df
y_train

In [None]:
# Same expansion for the test partition: one label row per matching test row
merged_df = y_test.merge(labevents_data_test['hadm_id'], on='hadm_id')

# Use the expanded frame as the test target and display it for inspection
y_test = merged_df
y_test

In [None]:
# X_train / X_test are additional names for the same partition objects (no copy is made)
X_train, X_test = labevents_data_train, labevents_data_test

In [None]:
# X_train['delay']= X_train['delay'].astype(str)
# X_train['delay']= X_train['delay'].apply(convert_to_days)
# X_train['days_since_admission'] = X_train['days_since_admission'].astype(str)
# X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)

# X_train = X_train.drop(columns=['hadm_id'])
# X_train = X_train.reset_index(drop=True)

# y_train = y_train.drop(columns=['hadm_id'])
# y_train = y_train.reset_index(drop=True)

In [None]:
# Display the training features for inspection
X_train

In [None]:
def _prepare_lab_features(frame):
    """Return a model-ready copy of a labevents feature frame.

    - 'delay' becomes a float number of days (fractional part kept).
    - 'days_since_admission' becomes the whole-day component only.
    - 'hadm_id' is removed and the index is reset to 0..n-1.

    pd.to_timedelta accepts both Timedelta values and duration strings such as
    '22 days 20:55:00', so no round trip through str is needed. The input frame
    is not modified, which avoids writing into the train/test partitions that
    X_train / X_test alias.
    """
    delay = pd.to_timedelta(frame['delay'])
    since_admission = pd.to_timedelta(frame['days_since_admission'])
    return (
        frame
        .assign(
            delay=delay.dt.total_seconds() / 86400,  # 86400 seconds per day
            days_since_admission=since_admission.dt.days,
        )
        .drop(columns=['hadm_id'])
        .reset_index(drop=True)
    )


# Apply identical preprocessing to both splits so they have the same numeric
# columns; previously only the training split was converted.
X_train = _prepare_lab_features(X_train)
X_test = _prepare_lab_features(X_test)

# Labels: drop the identifier and re-index so rows pair with the features by position
y_train = y_train.drop(columns=['hadm_id']).reset_index(drop=True)
y_test = y_test.drop(columns=['hadm_id']).reset_index(drop=True)

In [None]:
# Store the features as SciPy CSR sparse matrices to address the memory issue.
# NOTE(review): csr_matrix needs purely numeric input — confirm both frames
# have had their duration columns converted and hadm_id removed before this runs.

from scipy.sparse import csr_matrix

X_train_sparse, X_test_sparse = csr_matrix(X_train), csr_matrix(X_test)

#### Dimensionality reduction - not enough memory

In [None]:
# Need to reduce the number of features from 11681 to 10665

In [None]:
# data['delay']= data['delay'].astype(str)
# data['delay']= data['delay'].apply(convert_to_days)
# data['days_since_admission'] = data['days_since_admission'].astype(str)
# data['days_since_admission'] = data['days_since_admission'].str.split().str[0].astype(int)

In [None]:
# from sklearn.decomposition import TruncatedSVD

# # Number of desired features (components)
# n_components = 10665

# # Initialize Truncated SVD with the desired number of components
# svd = TruncatedSVD(n_components=n_components)

# # Fit the Truncated SVD model to the sparse matrix and transform the data
# svd.fit(data)
# data = svd.transform(data)

# # Get the explained variance ratio (how much variance is explained by each component)
# explained_variance_ratio = svd.explained_variance_ratio_

# # Print the transformed matrix and explained variance ratio
# # print("Transformed Matrix:")
# # print(transformed_matrix)
# print("\nExplained Variance Ratio:")
# print(explained_variance_ratio)

# print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
# Targets are taken from df_drgcodes: for each split keep only the label rows
# whose hadm_id occurs in that split's features.
# NOTE(review): this needs a 'hadm_id' column in X_train / X_test, but an
# earlier cell in this section drops it from X_train — confirm the intended
# execution order.
train_ids = X_train['hadm_id'].unique()
test_ids = X_test['hadm_id'].unique()

# Label rows for admissions present in the training features
y_train = df_drgcodes.loc[df_drgcodes['hadm_id'].isin(train_ids)]

# Label rows for admissions present in the test features
y_test = df_drgcodes.loc[df_drgcodes['hadm_id'].isin(test_ids)]

In [None]:
# Inner join on hadm_id: one copy of each label row per matching feature row
merged_df = y_train.merge(X_train['hadm_id'], how='inner', on='hadm_id')

In [None]:
# The expanded label frame becomes the training target
y_train = merged_df

In [None]:
# Test split: one copy of each label row per matching feature row, used as the test target
merged_df = y_test.merge(X_test['hadm_id'], how='inner', on='hadm_id')
y_test = merged_df

In [None]:
# Keep only training feature rows whose hadm_id appears among the labels
valid_ids = y_train['hadm_id'].unique()
X_train = X_train.loc[X_train['hadm_id'].isin(valid_ids)]

In [None]:
# Keep only test feature rows whose hadm_id appears among the labels
valid_ids = y_test['hadm_id'].unique()
X_test = X_test.loc[X_test['hadm_id'].isin(valid_ids)]

In [None]:
# Sort every frame by hadm_id, then drop the hadm_id column

In [None]:
# Bring labels and features into hadm_id order and strip the identifier.
# NOTE(review): X and y are matched by row position only — confirm each pair
# has the same number of rows per hadm_id.
y_train = y_train.sort_values('hadm_id').drop('hadm_id', axis=1)
y_test = y_test.sort_values('hadm_id').drop('hadm_id', axis=1)

X_train = X_train.sort_values('hadm_id').drop('hadm_id', axis=1)
X_test = X_test.sort_values('hadm_id').drop('hadm_id', axis=1)