### Training of diagnosis learners for ensemble - full pipeline

In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os
from tqdm import tqdm

# Report the versions of the core libraries this notebook was run with
for library_label, library_module in (("Numpy", np), ("Pandas", pd), ("Scikit", sk)):
    print("{} version: {}".format(library_label, library_module.__version__))

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [3]:
# Root folder of the dataset. Defaults to the original location but can be
# overridden with the PROJECT_DATA_DIR environment variable so the notebook
# is not tied to one machine.
path = os.environ.get("PROJECT_DATA_DIR", "C:/Project/Data/")
# Later cells build file names with plain `path + file`, so make sure the
# root always ends with a separator.
if not path.endswith(("/", "\\")):
    path += "/"

#### Global functions

In [4]:
def convert_to_days(duration_str):
    """Convert a pandas-style timedelta string into a fractional number of days.

    Accepts text of the form ``'<days> days <hh>:<mm>:<ss>'``, for example
    ``'22 days 20:55:00'``, ``'-1 days +23:00:00'`` or
    ``'0 days 00:00:00.500000'``. A days-only string such as ``'3 days'`` is
    also accepted; pandas can render whole-day timedeltas without the clock
    part, which previously made this function fail.

    Parameters
    ----------
    duration_str : str
        Duration text containing the token ``' days'``.

    Returns
    -------
    float
        days + hours/24 + minutes/1440 + seconds/86400.

    Raises
    ------
    ValueError
        If the text has no ``' days'`` token (e.g. ``'NaT'``) or a field is
        not numeric.
    IndexError
        If a clock part is present but has fewer than three ``:`` fields.
    """
    text = str(duration_str).strip()
    day_text, marker, clock_text = text.partition(' days')
    if not marker:
        raise ValueError("Unrecognised duration string: {!r}".format(duration_str))

    days = float(day_text)  # may be negative, e.g. '-1'
    clock_text = clock_text.strip()
    if not clock_text:
        # Days-only rendering: there is no hh:mm:ss part to add
        return days

    time_parts = clock_text.split(':')  # ['20', '55', '00']
    hours = float(time_parts[0])  # float() also accepts a leading '+'
    minutes = float(time_parts[1])
    seconds = float(time_parts[2])  # may carry a fractional part
    return days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))

In [5]:
# Load the admissions table
file = "hosp/admissions.csv"
full_path = f"{path}{file}"

df_admissions = pd.read_csv(full_path)

# Parse both timestamps; the format string expects day/month/year hour:minute
for time_col in ('dischtime', 'admittime'):
    df_admissions[time_col] = pd.to_datetime(df_admissions[time_col], format='%d/%m/%Y %H:%M')

# Small hadm_id -> admittime lookup, merged into event tables further down
df_admittime = df_admissions[['hadm_id', 'admittime']].copy()

In [6]:
# Load the ICD diagnosis table
file = "hosp/diagnoses_icd.csv"
full_path = f"{path}{file}"
df_diagnoses = pd.read_csv(full_path)

In [7]:
# Keep only hadm_id and icd_code
diagnosis_unused_columns = ['subject_id', 'seq_num', 'icd_version']
df_diagnoses = df_diagnoses.drop(columns=diagnosis_unused_columns)

In [8]:
# One indicator column per ICD code, one row per recorded diagnosis
one_hot_encoded = pd.get_dummies(df_diagnoses['icd_code'])

# Put the admission id next to the indicators so rows can be grouped
df_encoded = pd.concat([df_diagnoses[['hadm_id']], one_hot_encoded], axis='columns')

# Collapse to one row per admission; a code recorded more than once is counted
df_aggregated = (
    df_encoded
    .groupby('hadm_id')
    .sum()
    .reset_index()
)

# Defensive: replace any NaN left after aggregation with 0
df_aggregated = df_aggregated.fillna(0)

In [9]:
# Rebind df_diagnoses to the per-admission indicator table and display it
df_diagnoses = df_aggregated
df_diagnoses

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,29820177,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
273,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Load the DRG code table
file = "hosp/drgcodes.csv"
full_path = f"{path}{file}"
df_drgcodes = pd.read_csv(full_path)

In [11]:
# Count DRG rows per admission; the output shows some admissions have two rows
df_drgcodes['hadm_id'].value_counts()

22187210    2
27505812    2
25926192    2
27089790    2
24490144    2
           ..
22539296    1
20385771    1
20199380    1
20973395    1
23559586    1
Name: hadm_id, Length: 233, dtype: int64

In [12]:
# Preview the raw DRG rows before columns are dropped
df_drgcodes.head()

Unnamed: 0,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,10004235,22187210,HCFA,864,FEVER,,
1,10026255,22059910,HCFA,180,RESPIRATORY NEOPLASMS W MCC,,
2,10032725,20611640,HCFA,54,NERVOUS SYSTEM NEOPLASMS W MCC,,
3,10005866,21636229,HCFA,393,OTHER DIGESTIVE SYSTEM DIAGNOSES W MCC,,
4,10008454,20291550,HCFA,956,"LIMB REATTACHMENT, HIP & FEMUR PROC FOR MULTIP...",,


In [13]:
# Keep only hadm_id and drg_code
drg_unused_columns = ['subject_id', 'drg_type', 'description', 'drg_severity', 'drg_mortality']
df_drgcodes = df_drgcodes.drop(columns=drg_unused_columns)

In [14]:
# One indicator column per DRG code, one row per recorded DRG entry
one_hot_encoded = pd.get_dummies(df_drgcodes['drg_code'])

df_encoded = pd.concat([df_drgcodes[['hadm_id']], one_hot_encoded], axis=1)

# Collapse to one row per admission. An admission can have more than one DRG
# row, and drg_type was dropped, so the same code can repeat; summing would
# then give a label of 2. The learners below treat each column as a 0/1
# target, so cap the counts at 1 to keep the columns true indicators.
df_aggregated = df_encoded.groupby('hadm_id').sum().clip(upper=1).reset_index()

# Defensive: replace any NaN left after aggregation with 0
df_aggregated = df_aggregated.fillna(0)

In [15]:
# Rebind df_drgcodes to the per-admission indicator table (used as the training target below) and display it
df_drgcodes = df_aggregated
df_drgcodes

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,29802992,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Select 55 (20% of) admissions to use for evaluation

In [16]:
# Re-read the admissions table. This rebinds df_admissions to the raw CSV
# (timestamps unparsed); df_admittime built earlier is not affected.
file = "hosp/admissions.csv"
full_path = f"{path}{file}"
df_admissions = pd.read_csv(full_path)

In [17]:
# Hold out a fixed, seeded sample of 55 admission ids for evaluation
evaluation_admissions = (
    df_admissions['hadm_id']
    .sample(n=55, random_state=42)
    .tolist()
)

# Records belonging to these admissions are removed from every table before training
print(evaluation_admissions)

[27617929, 27553957, 20282368, 27296885, 24980601, 21133938, 25559382, 20611796, 28778757, 28723315, 28998349, 28676446, 29276678, 26842957, 21477991, 25922998, 26706939, 27993466, 28236161, 27259207, 20385771, 24540843, 20900955, 22413744, 27494880, 25103777, 21599196, 21540783, 22585261, 26275841, 22130791, 22490490, 25020332, 29279905, 29483621, 27167814, 25508812, 21607814, 20297618, 29974575, 24912093, 21255400, 29295881, 28829452, 24656677, 29858644, 23488445, 25970245, 22508257, 25742920, 25085565, 22228639, 27660781, 28335091, 27703517]


### Emar

In [18]:
# Load the emar table
file = "hosp/emar.csv"
full_path = f"{path}{file}"
df_emar = pd.read_csv(full_path)

# records for 65 different patients
# 181 unique admissions

In [19]:
# Split rows by whether their admission was held out for evaluation.
# NOTE(review): the split is by hadm_id, so one patient's other admissions
# may still be in the training set — confirm this is intended.
emar_in_evaluation = df_emar['hadm_id'].isin(evaluation_admissions)

df_emar_training = df_emar[~emar_in_evaluation]
df_emar_evaluation = df_emar[emar_in_evaluation]

In [20]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder first: to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_emar_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_emar_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [21]:
# From here on df_emar refers to the training rows only
df_emar = df_emar_training

In [22]:
# Renumber rows 0..n-1 after the evaluation rows were filtered out
df_emar = df_emar.reset_index(drop=True)

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [23]:
# Build the `delay` feature as charttime - scheduletime

# Parse both timestamp columns
for time_col in ('scheduletime', 'charttime'):
    df_emar[time_col] = pd.to_datetime(df_emar[time_col], format='%Y/%m/%d %H:%M')

# Rows where either timestamp is missing get a zero delay
emar_delay = df_emar['charttime'] - df_emar['scheduletime']
df_emar['delay'] = emar_delay.fillna(pd.Timedelta(0))

In [24]:
# Identifiers, free text and raw timestamps are not used as features
emar_unused_columns = [
    'subject_id', 'emar_id', 'poe_id', 'pharmacy_id',
    'event_txt', 'charttime', 'scheduletime', 'storetime',
]
df_emar = df_emar.drop(columns=emar_unused_columns)

In [25]:
# Missing values become their own 'N/A' category before one-hot encoding
for categorical_col in ('enter_provider_id', 'medication'):
    df_emar[categorical_col] = df_emar[categorical_col].fillna('N/A')

df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [26]:
# Record the training column layout (hadm_id is still among the columns here)
column_names = df_emar.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)

# Write the list of column names to CSV
file_path = os.path.join(output_folder, 'emar_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [27]:
# X_train is another name for df_emar (no copy is made), so edits below also change df_emar
X_train = df_emar

In [28]:
# Turn the Timedelta column into a float number of days via its string form
X_train['delay'] = X_train['delay'].astype(str).apply(convert_to_days)

In [30]:
# Targets are the df_drgcodes rows for admissions that have training records.
# NOTE(review): earlier notes name df_diagnoses as the target, but the code
# uses df_drgcodes — confirm which one is intended.

# Admission ids present in the feature table
train_ids = X_train['hadm_id'].unique()

# Keep only target rows for those admissions
drg_has_training_rows = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[drg_has_training_rows]

In [31]:
# Repeat each admission's target row once per feature row of that admission
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [32]:
# y_train now has one target row per matching feature row
y_train = merged_df

In [33]:
# Drop feature rows whose admission has no target row
valid_ids = y_train['hadm_id'].unique()
emar_has_target = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[emar_has_target]

In [34]:
# Order by hadm_id and drop 

In [35]:
# Sort both tables by admission so rows line up by position, then remove the
# key. Alignment relies on y_train holding one row per X_train row for every
# admission, which the merge into merged_df provides.
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [37]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier

# XGBoost classifier with default settings, used as the per-target estimator
base_classifier = xgb.XGBClassifier()

# Wrap it so that one classifier is fitted for each target column
XGBoost_emar = MultiOutputClassifier(estimator=base_classifier)

# Fit on plain arrays (row i of X_train pairs with row i of y_train)
XGBoost_emar.fit(X_train.values, y_train.values)

In [40]:
from joblib import dump

# Folder that holds every trained learner
output_folder = 'diagnosis_learners'
os.makedirs(output_folder, exist_ok=True)

# Serialise the fitted emar learner into that folder
model_file = os.path.join(output_folder, 'XGBoost_emar.joblib')
dump(XGBoost_emar, model_file)

['diagnosis_learners\\XGBoost_emar.joblib']

### microbiologyevents

In [27]:
# Load the microbiologyevents table
file = "hosp/microbiologyevents.csv"
full_path = f"{path}{file}"
df_microbio = pd.read_csv(full_path)

In [28]:
# Split rows by whether their admission was held out for evaluation
# (the split is by hadm_id, not by subject_id)
microbio_in_evaluation = df_microbio['hadm_id'].isin(evaluation_admissions)

df_microbio_training = df_microbio[~microbio_in_evaluation]
df_microbio_evaluation = df_microbio[microbio_in_evaluation]

In [29]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder first: to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_microbio_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_microbio_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [30]:
# From here on df_microbio refers to the training rows only
df_microbio = df_microbio_training

In [31]:
# Renumber rows 0..n-1 after the evaluation rows were filtered out
df_microbio = df_microbio.reset_index(drop=True)

In [32]:
# Build days_since_admission = charttime - admittime (kept as a Timedelta
# here; it is converted to a float number of days before training)

# Parse the chart timestamp
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Bring in each admission's admittime from the lookup table
df_microbio = pd.merge(df_microbio, df_admittime, how='left', on='hadm_id')

# Rows with a missing timestamp get a zero offset
microbio_elapsed = df_microbio['charttime'] - df_microbio['admittime']
df_microbio['days_since_admission'] = microbio_elapsed.fillna(pd.Timedelta(0))

# admittime was only needed for the subtraction
df_microbio = df_microbio.drop(columns=['admittime'])

In [33]:
# Build the `delay` feature as storetime - charttime
# (charttime is already a datetime column at this point)

df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

# Rows with a missing timestamp get a zero delay
microbio_delay = df_microbio['storetime'] - df_microbio['charttime']
df_microbio['delay'] = microbio_delay.fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, org_itemid (since info in name), spec_itemid, test_itemid, ab_itemid, quantity, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, isolate_num, dilution_value
Impute with N/A and then one hot encode: interpretation, test_name, ab_name (org_name is imputed with 'None' and encoded the same way)

In [34]:
# Identifiers, raw timestamps, item ids and free text are not used as features
microbio_unused_columns = [
    'microevent_id', 'subject_id', 'chartdate', 'charttime', 'test_seq', 'storedate',
    'storetime', 'quantity', 'comments', 'ab_itemid',
    'spec_itemid', 'test_itemid', 'org_itemid', 'micro_specimen_id',
]
df_microbio = df_microbio.drop(columns=microbio_unused_columns)

In [35]:
# Impute null with 0: order_provider_id, isolate_num, dilution_value
for zero_filled_col in ('order_provider_id', 'isolate_num', 'dilution_value'):
    df_microbio[zero_filled_col] = df_microbio[zero_filled_col].fillna(0)

In [36]:
# Give missing values an explicit category, then one-hot encode the four
# text columns (org_name uses 'None' as its placeholder, the others 'N/A')
microbio_placeholders = {
    'interpretation': 'N/A',
    'test_name': 'N/A',
    'ab_name': 'N/A',
    'org_name': 'None',
}
for text_col, placeholder in microbio_placeholders.items():
    df_microbio[text_col] = df_microbio[text_col].fillna(placeholder)

df_microbio = pd.get_dummies(df_microbio, columns=['org_name', 'interpretation', 'ab_name', 'test_name'])

In [37]:
# One-hot encode the remaining categorical columns
microbio_categorical_columns = [
    'order_provider_id', 'spec_type_desc', 'dilution_text', 'dilution_comparison',
]
df_microbio = pd.get_dummies(df_microbio, columns=microbio_categorical_columns)

In [38]:
# Remove any row that still contains a missing value in any column
df_microbio = df_microbio.dropna()

In [39]:

# column_names = data.columns.to_numpy()

# # Convert the array to a DataFrame
# df_column_names = pd.DataFrame(column_names, columns=['Column Names'])

# output_folder = 'LOS_RF_features'
# os.makedirs(output_folder, exist_ok=True)
# file_path = os.path.join(output_folder, 'microbio_features.csv')

# # Save the DataFrame to a CSV file
# df_column_names.to_csv(file_path, index=False)

In [40]:
# Record the training column layout (hadm_id is still among the columns here)
column_names = df_microbio.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)

# Write the list of column names to CSV
file_path = os.path.join(output_folder, 'microbio_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [54]:
# X_train is another name for df_microbio (no copy is made)
X_train = df_microbio

In [55]:
# Turn both Timedelta columns into float numbers of days via their string form
for timedelta_col in ('delay', 'days_since_admission'):
    X_train[timedelta_col] = X_train[timedelta_col].astype(str).apply(convert_to_days)

In [56]:
# Targets are the df_drgcodes rows for admissions that have training records.
# NOTE(review): earlier notes name df_diagnoses as the target, but the code
# uses df_drgcodes — confirm which one is intended.

# Admission ids present in the feature table
train_ids = X_train['hadm_id'].unique()

# Keep only target rows for those admissions
drg_has_training_rows = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[drg_has_training_rows]

In [57]:
# Repeat each admission's target row once per feature row of that admission
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [58]:
# y_train now has one target row per matching feature row
y_train = merged_df

In [59]:
# Drop feature rows whose admission has no target row
valid_ids = y_train['hadm_id'].unique()
microbio_has_target = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[microbio_has_target]

In [60]:
# Sort both tables by admission so rows line up by position, then remove the key
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [61]:
# XGBoost classifier with default settings, used as the per-target estimator
# (xgb and MultiOutputClassifier are imported in the emar training cell)
base_classifier = xgb.XGBClassifier()

# Wrap it so that one classifier is fitted for each target column
XGBoost_microbio = MultiOutputClassifier(estimator=base_classifier)

# Fit on plain arrays (row i of X_train pairs with row i of y_train)
XGBoost_microbio.fit(X_train.values, y_train.values)

In [62]:
# Save model to folder

output_folder = 'diagnosis_learners'
# Create the folder here too, so this cell works even if the emar section was not run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model; joblib is imported at the top of the notebook, so
# this does not rely on the `dump` name imported in the emar section
model_file = os.path.join(output_folder, 'XGBoost_microbio.joblib')
joblib.dump(XGBoost_microbio, model_file)

['diagnosis_learners\\XGBoost_microbio.joblib']

### poe

In [41]:
# Load the poe table
file = "hosp/poe.csv"
full_path = f"{path}{file}"
df_poe = pd.read_csv(full_path)

In [42]:
# Split rows by whether their admission was held out for evaluation
poe_in_evaluation = df_poe['hadm_id'].isin(evaluation_admissions)

df_poe_training = df_poe[~poe_in_evaluation]
df_poe_evaluation = df_poe[poe_in_evaluation]

In [43]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder first: to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_poe_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_poe_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

To drop: poe_id, subject_id, ordertime, discontinue_of_poe_id, discontinued_by_poe_id (all unique), order_status (all inactive)
Encode: order_type, transaction_type
Impute with N/A and then encode: order_subtype, order_provider_id

In [44]:
# From here on df_poe refers to the training rows only
df_poe = df_poe_training

In [45]:
# Renumber rows 0..n-1 after the evaluation rows were filtered out
df_poe = df_poe.reset_index(drop=True)

In [46]:
# Build days_since_admission = ordertime - admittime (kept as a Timedelta
# here; it is converted to a float number of days before training)

# Parse the order timestamp
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'], format='%Y/%m/%d %H:%M:%S')

# Bring in each admission's admittime from the lookup table
df_poe = pd.merge(df_poe, df_admittime, how='left', on='hadm_id')

# Rows with a missing timestamp get a zero offset
poe_elapsed = df_poe['ordertime'] - df_poe['admittime']
df_poe['days_since_admission'] = poe_elapsed.fillna(pd.Timedelta(0))

# admittime was only needed for the subtraction
df_poe = df_poe.drop(columns=['admittime'])

In [47]:
# Identifiers, the raw timestamp and order_status are not used as features
poe_unused_columns = [
    'poe_id', 'subject_id', 'ordertime', 'discontinue_of_poe_id', 'discontinued_by_poe_id',
    'order_status',
]
df_poe = df_poe.drop(columns=poe_unused_columns)

In [48]:
# One-hot encode the two type columns
poe_type_columns = ['order_type', 'transaction_type']
df_poe = pd.get_dummies(df_poe, columns=poe_type_columns)

In [49]:
# Missing values become their own 'N/A' category before one-hot encoding
for categorical_col in ('order_subtype', 'order_provider_id'):
    df_poe[categorical_col] = df_poe[categorical_col].fillna('N/A')

df_poe = pd.get_dummies(df_poe, columns=['order_subtype', 'order_provider_id'])

In [50]:
# column_names = data.columns.to_numpy()

# # Convert the array to a DataFrame
# df_column_names = pd.DataFrame(column_names, columns=['Column Names'])

# output_folder = 'LOS_RF_features'
# os.makedirs(output_folder, exist_ok=True)
# file_path = os.path.join(output_folder, 'poe_features.csv')

# # Save the DataFrame to a CSV file
# df_column_names.to_csv(file_path, index=False)

In [51]:
# Record the training column layout (hadm_id is still among the columns here)
column_names = df_poe.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)

# Write the list of column names to CSV
file_path = os.path.join(output_folder, 'poe_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [74]:
# X_train is another name for df_poe (no copy is made)
X_train = df_poe
# Turn the Timedelta column into a float number of days via its string form
X_train['days_since_admission'] = X_train['days_since_admission'].astype(str).apply(convert_to_days)

In [75]:
# Targets are the df_drgcodes rows for admissions that have training records.
# NOTE(review): earlier notes name df_diagnoses as the target, but the code
# uses df_drgcodes — confirm which one is intended.

# Admission ids present in the feature table
train_ids = X_train['hadm_id'].unique()

# Keep only target rows for those admissions
drg_has_training_rows = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[drg_has_training_rows]

In [76]:
# Repeat each admission's target row once per feature row of that admission
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [77]:
# y_train now has one target row per matching feature row
y_train = merged_df

In [78]:
# Drop feature rows whose admission has no target row
valid_ids = y_train['hadm_id'].unique()
poe_has_target = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[poe_has_target]

In [79]:
# Sort both tables by admission so rows line up by position, then remove the key
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [80]:
# XGBoost classifier with default settings, used as the per-target estimator
# (xgb and MultiOutputClassifier are imported in the emar training cell)
base_classifier = xgb.XGBClassifier()

# Wrap it so that one classifier is fitted for each target column
XGBoost_poe = MultiOutputClassifier(estimator=base_classifier)

# Fit on plain arrays (row i of X_train pairs with row i of y_train)
XGBoost_poe.fit(X_train.values, y_train.values)

In [81]:
# Save model to folder

output_folder = 'diagnosis_learners'
# Create the folder here too, so this cell works even if the emar section was not run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model; joblib is imported at the top of the notebook, so
# this does not rely on the `dump` name imported in the emar section
model_file = os.path.join(output_folder, 'XGBoost_poe.joblib')
joblib.dump(XGBoost_poe, model_file)

['diagnosis_learners\\XGBoost_poe.joblib']

### prescriptions

In [52]:
# Load the prescriptions table
file = "hosp/prescriptions.csv"
full_path = f"{path}{file}"
df_prescriptions = pd.read_csv(full_path)

In [53]:
# Split rows by whether their admission was held out for evaluation
prescriptions_in_evaluation = df_prescriptions['hadm_id'].isin(evaluation_admissions)

df_prescriptions_training = df_prescriptions[~prescriptions_in_evaluation]
df_prescriptions_evaluation = df_prescriptions[prescriptions_in_evaluation]

In [54]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder first: to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_prescriptions_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_prescriptions_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop na and encode: dose_val_rx, form_val_disp
Drop: subject_id, pharmacy_id, starttime, stoptime (after deriving duration), form_rx (mostly null), poe_id, order_provider_id (see note below)
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0 and encode: ndc
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [55]:
# From here on df_prescriptions refers to the training rows only
df_prescriptions = df_prescriptions_training

In [56]:
# Renumber rows 0..n-1 after the evaluation rows were filtered out
df_prescriptions = df_prescriptions.reset_index(drop=True)

In [57]:
# Build the `duration` feature as stoptime - starttime

# Parse both timestamp columns
for time_col in ('stoptime', 'starttime'):
    df_prescriptions[time_col] = pd.to_datetime(df_prescriptions[time_col], format='%Y/%m/%d %H:%M')

# Rows with a missing timestamp get a zero duration
prescription_span = df_prescriptions['stoptime'] - df_prescriptions['starttime']
df_prescriptions['duration'] = prescription_span.fillna(pd.Timedelta(0))

In [58]:
# Discard rows that lack a prescribed dose or a dispensed amount
df_prescriptions = df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'])

In [59]:
# Identifiers, raw timestamps, form_rx and order_provider_id are not used as features
prescription_unused_columns = [
    'subject_id', 'pharmacy_id', 'starttime', 'stoptime', 'form_rx', 'poe_id',
    'order_provider_id',
]
df_prescriptions = df_prescriptions.drop(columns=prescription_unused_columns)

In [60]:
# Missing values in these text columns become their own 'N/A' category
for categorical_col in ('formulary_drug_cd', 'gsn', 'prod_strength', 'route'):
    df_prescriptions[categorical_col] = df_prescriptions[categorical_col].fillna('N/A')

# A missing ndc is recorded as 0 (ndc is then encoded as a category too)
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

prescription_encoded_columns = [
    'formulary_drug_cd', 'gsn', 'prod_strength',
    'route', 'drug_type', 'drug', 'dose_unit_rx', 'form_unit_disp',
    'dose_val_rx', 'form_val_disp', 'ndc',
]
df_prescriptions = pd.get_dummies(df_prescriptions, columns=prescription_encoded_columns)

In [61]:
# A missing doses_per_24_hrs is recorded as 0
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [62]:
# Drop any rows with null values (in any remaining column)
df_prescriptions = df_prescriptions.dropna()

In [63]:
# X_train is another name for df_prescriptions (no copy is made)
X_train = df_prescriptions

In [64]:
# Turn the Timedelta column into a float number of days via its string form
X_train['duration'] = X_train['duration'].astype(str).apply(convert_to_days)

In [65]:
# Record the training column layout as it is before any dimensionality
# reduction (hadm_id is still among the columns here)
column_names = df_prescriptions.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)

# Write the list of column names to CSV
file_path = os.path.join(output_folder, 'prescriptions_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [98]:
# Admission ids present in the feature table
train_ids = X_train['hadm_id'].unique()

# Targets: the df_drgcodes rows for those admissions
drg_has_training_rows = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[drg_has_training_rows]


In [99]:
# Repeat each admission's target row once per feature row of that admission
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [100]:
# y_train now has one target row per matching feature row
y_train = merged_df

In [101]:
# Drop feature rows whose admission has no target row
valid_ids = y_train['hadm_id'].unique()
prescriptions_has_target = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[prescriptions_has_target]

In [102]:
# Sort both tables by admission so rows line up by position, then remove the key
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

#### Dimensionality reduction ???

In [104]:
# Need to reduce from 4890 to 2874 or less

from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 2874

# Fix the seed: TruncatedSVD's default solver is randomized, so without a
# random_state each run would produce a different projection.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the projection on the training feature table, then apply it.
# X_train becomes a plain array with n_components columns.
svd.fit(X_train)
X_train = svd.transform(X_train)

# Persist the fitted projection next to the learners. The prescriptions model
# is trained on the SVD output, so the same fitted transformer is required to
# prepare any data the model is later asked to predict on.
svd_folder = 'diagnosis_learners'
os.makedirs(svd_folder, exist_ok=True)
svd_file = os.path.join(svd_folder, 'TruncatedSVD_prescriptions.joblib')
joblib.dump(svd, svd_file)

# Share of variance explained by each component
explained_variance_ratio = svd.explained_variance_ratio_

# NOTE(review): the recorded output shows the first component holding almost
# all of the variance, which suggests unscaled numeric columns dominate —
# consider scaling before the SVD.
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[9.99929228e-01 4.18836476e-05 4.06233251e-06 ... 1.83926232e-37
 1.43947701e-33 3.14475983e-37]

 Amount of original variance conserved: 1.0


In [106]:
# XGBoost classifier with default settings, used as the per-target estimator
# (xgb and MultiOutputClassifier are imported in the emar training cell)
base_classifier = xgb.XGBClassifier()

# Wrap it so that one classifier is fitted for each target column
XGBoost_prescriptions = MultiOutputClassifier(estimator=base_classifier)

# X_train is the SVD-transformed array here; y_train is still a DataFrame
XGBoost_prescriptions.fit(X_train, y_train)

In [107]:
# Save model to folder

output_folder = 'diagnosis_learners'
# Create the folder here too, so this cell works even if the emar section was not run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model; joblib is imported at the top of the notebook, so
# this does not rely on the `dump` name imported in the emar section
model_file = os.path.join(output_folder, 'XGBoost_prescriptions.joblib')
joblib.dump(XGBoost_prescriptions, model_file)

['diagnosis_learners\\XGBoost_prescriptions.joblib']

### inputevents

In [66]:
# Load the inputevents table
file = "icu/inputevents.csv"
full_path = f"{path}{file}"
df_input = pd.read_csv(full_path)

In [67]:
# Split rows by whether their admission was held out for evaluation
input_in_evaluation = df_input['hadm_id'].isin(evaluation_admissions)

df_input_training = df_input[~input_in_evaluation]
df_input_evaluation = df_input[input_in_evaluation]

In [68]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder first: to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_input_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_input_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [69]:
# From here on df_input refers to the training rows only
df_input = df_input_training

In [70]:
# Renumber rows 0..n-1 after the evaluation rows were filtered out
df_input = df_input.reset_index(drop=True)

In [71]:
# Build the `duration` feature as endtime - starttime

# Parse both timestamp columns
for time_col in ('endtime', 'starttime'):
    df_input[time_col] = pd.to_datetime(df_input[time_col], format='%Y-%m-%d %H:%M:%S')

# Rows with a missing timestamp get a zero duration
input_span = df_input['endtime'] - df_input['starttime']
df_input['duration'] = input_span.fillna(pd.Timedelta(0))

In [72]:
# Build the `recording_delay` feature as storetime - endtime
# (endtime is already a datetime column at this point)

df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

# Rows with a missing timestamp get a zero delay
input_store_lag = df_input['storetime'] - df_input['endtime']
df_input['recording_delay'] = input_store_lag.fillna(pd.Timedelta(0))

In [73]:
# Drop identifiers, raw timestamps and columns not used as features
# ('stay_id' was previously listed twice; it only needs to appear once)
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom','caregiver_id'])

In [74]:
# Missing values in these two columns become their own 'N/A' category
for categorical_col in ('rateuom', 'secondaryordercategoryname'):
    df_input[categorical_col] = df_input[categorical_col].fillna('N/A')

# One-hot encode every categorical column, including itemid
input_encoded_columns = [
    'rateuom', 'secondaryordercategoryname', 'amountuom', 'ordercategoryname',
    'ordercomponenttypedescription', 'ordercategorydescription', 'statusdescription',
    'itemid',
]
df_input = pd.get_dummies(df_input, columns=input_encoded_columns)

In [75]:
# A missing rate or totalamount is recorded as 0
for zero_filled_col in ('rate', 'totalamount'):
    df_input[zero_filled_col] = df_input[zero_filled_col].fillna(0)

In [76]:
# Remove any row that still contains a missing value in any column
df_input = df_input.dropna()
# X_train is another name for df_input (no copy is made)
X_train = df_input

In [77]:
# Turn both Timedelta columns into float numbers of days via their string form
for timedelta_col in ('duration', 'recording_delay'):
    X_train[timedelta_col] = X_train[timedelta_col].astype(str).apply(convert_to_days)

In [78]:
# Admission ids present in the feature table
train_ids = X_train['hadm_id'].unique()

# Targets: the df_drgcodes rows for those admissions
drg_has_training_rows = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[drg_has_training_rows]

In [79]:
# Repeat each admission's target row once per feature row of that admission
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [80]:
# y_train now has one target row per matching feature row
y_train = merged_df

In [81]:
# Drop feature rows whose admission has no target row
valid_ids = y_train['hadm_id'].unique()
input_has_target = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[input_has_target]

In [82]:
# Sort both tables by admission so rows line up by position, then remove the key
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [83]:
# column_names = data.columns.to_numpy()

# # Convert the array to a DataFrame
# df_column_names = pd.DataFrame(column_names, columns=['Column Names'])

# output_folder = 'LOS_RF_features'
# os.makedirs(output_folder, exist_ok=True)
# file_path = os.path.join(output_folder, 'input_features.csv')

# # Save the DataFrame to a CSV file
# df_column_names.to_csv(file_path, index=False)

In [84]:
# Record the final feature columns (hadm_id has already been dropped from X_train)
column_names = X_train.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)

# Write the list of column names to CSV
file_path = os.path.join(output_folder, 'input_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [126]:
# XGBoost classifier with default settings, used as the per-target estimator
# (xgb and MultiOutputClassifier are imported in the emar training cell)
base_classifier = xgb.XGBClassifier()

# Wrap it so that one classifier is fitted for each target column
XGBoost_input = MultiOutputClassifier(estimator=base_classifier)

# Fit on plain arrays (row i of X_train pairs with row i of y_train)
XGBoost_input.fit(X_train.values, y_train.values)

In [127]:
# Save model to folder

output_folder = 'diagnosis_learners'
# Create the folder here too, so this cell works even if the emar section was not run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model; joblib is imported at the top of the notebook, so
# this does not rely on the `dump` name imported in the emar section
model_file = os.path.join(output_folder, 'XGBoost_input.joblib')
joblib.dump(XGBoost_input, model_file)

['diagnosis_learners\\XGBoost_input.joblib']

### procedureevents

In [85]:
# Load the procedure events table from the data directory configured in `path`
file = "icu/procedureevents.csv"
full_path = f"{path}{file}"
df_procedure_events = pd.read_csv(full_path)

In [86]:
# Separate into training and evaluation sets 
df_procedure_events_training = df_procedure_events[~df_procedure_events['hadm_id'].isin(evaluation_admissions)]
df_procedure_events_evaluation = df_procedure_events[df_procedure_events['hadm_id'].isin(evaluation_admissions)]

In [87]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# to_csv does not create missing directories, so make sure the folder exists
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_procedure_events_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder (row index is not written)
df_procedure_events_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
Derive features: duration (endtime - starttime) and recording_delay (storetime - endtime)

In [88]:
# All preprocessing below works on the training split only
df_procedure_events = df_procedure_events_training

In [89]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of keeping it as a column
df_procedure_events = df_procedure_events.reset_index(drop=True)

In [90]:
# duration = endtime - starttime

# Parse both timestamp columns as datetimes
for timestamp_col in ('endtime', 'starttime'):
    df_procedure_events[timestamp_col] = pd.to_datetime(
        df_procedure_events[timestamp_col], format='%Y-%m-%d %H:%M:%S'
    )

elapsed = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Rows where the difference is missing get a zero-length duration
df_procedure_events['duration'] = elapsed.fillna(pd.Timedelta(0))

In [91]:
# recording_delay = storetime - endtime

# Parse storetime as a datetime
df_procedure_events['storetime'] = pd.to_datetime(
    df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S'
)

store_lag = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Rows where the difference is missing get a zero-length delay
df_procedure_events['recording_delay'] = store_lag.fillna(pd.Timedelta(0))

In [92]:
# Remove the columns that are not used as model inputs.
# hadm_id is kept for now: it is needed later to match rows with their labels.
unused_columns = [
    'subject_id', 'stay_id', 'starttime', 'endtime', 'storetime',
    'orderid', 'linkorderid', 'continueinnextdept', 'caregiver_id',
]
df_procedure_events = df_procedure_events.drop(columns=unused_columns)

In [93]:
# Missing locations become their own 'N/A' category
for location_col in ('location', 'locationcategory'):
    df_procedure_events[location_col] = df_procedure_events[location_col].fillna('N/A')

# One-hot encode all categorical columns (itemid is treated as a category too)
categorical_columns = [
    'location', 'locationcategory', 'valueuom',
    'ordercategoryname', 'ordercategorydescription',
    'statusdescription', 'itemid',
]
df_procedure_events = pd.get_dummies(df_procedure_events, columns=categorical_columns)

In [94]:
# X_train is the preprocessed table itself (same object, not a copy),
# so later edits to X_train columns also show up in df_procedure_events
X_train = df_procedure_events

In [95]:
# Convert the Timedelta columns to a float number of days.
# Both columns were built as datetime differences above, so the .dt accessor
# can convert them in one vectorized step. This replaces the string round-trip
# through convert_to_days, which runs row by row and depends on how pandas
# renders a Timedelta as text.
SECONDS_PER_DAY = 24 * 3600
for timedelta_col in ('duration', 'recording_delay'):
    X_train[timedelta_col] = X_train[timedelta_col].dt.total_seconds() / SECONDS_PER_DAY

In [96]:
# column_names = data.columns.to_numpy()

# # Convert the array to a DataFrame
# df_column_names = pd.DataFrame(column_names, columns=['Column Names'])

# output_folder = 'LOS_RF_features'
# os.makedirs(output_folder, exist_ok=True)
# file_path = os.path.join(output_folder, 'procedure_events_features.csv')

# # Save the DataFrame to a CSV file
# df_column_names.to_csv(file_path, index=False)

In [97]:
# Write the current X_train column names to XGBoost_features/procedure_events_features.csv.
# NOTE(review): X_train still contains 'hadm_id' at this point (it is dropped only
# right before fitting), so the saved list has one more entry than the model's
# inputs - confirm that the code reading this file expects that.
output_folder = 'XGBoost_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'procedure_events_features.csv')

# Single-column table holding one column name per row
column_names = X_train.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

# index=False: only the 'Column Names' column is written
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [141]:
# Admissions (hadm_id) that are present in the feature table
train_ids = X_train['hadm_id'].unique()

# Keep only the df_drgcodes rows that belong to one of those admissions
has_features = df_drgcodes['hadm_id'].isin(train_ids)
y_train = df_drgcodes[has_features]


In [142]:
# Inner join on hadm_id: one output row for every (label row, feature row) pair
# that shares an admission id, so the labels are repeated per feature row
merged_df = y_train.merge(X_train['hadm_id'], on='hadm_id', how='inner')

In [143]:
# From here on y_train refers to the merged table (same object as merged_df, not a copy)
y_train = merged_df

In [144]:
# Keep only the feature rows whose admission still has a row in y_train
valid_ids = y_train['hadm_id'].unique()
has_labels = X_train['hadm_id'].isin(valid_ids)
X_train = X_train[has_labels]

In [145]:
# Order both tables by admission id, then remove the id so it is not used
# as a target (y_train) or as a feature (X_train)
y_train = y_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])
X_train = X_train.sort_values(by='hadm_id').drop(columns=['hadm_id'])

In [146]:
# XGBoost classifier with default settings, used as the per-output estimator
base_classifier = xgb.XGBClassifier()

# Wrap it so that every column of y_train gets its own classifier
XGBoost_procedure_events = MultiOutputClassifier(estimator=base_classifier)

# Fit on plain NumPy arrays: features from X_train, target labels from y_train
XGBoost_procedure_events.fit(X_train.to_numpy(), y_train.to_numpy())

In [147]:
# Save model to folder

output_folder = 'diagnosis_learners'
# Create the folder if it is missing; otherwise the dump below fails on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder.
# joblib.dump is called through the module imported at the top of the notebook;
# it returns the list of files written, which is shown as the cell output.
model_file = os.path.join(output_folder, 'XGBoost_procedure_events.joblib')
joblib.dump(XGBoost_procedure_events, model_file)

['diagnosis_learners\\XGBoost_procedure_events.joblib']