### Training of transfer classifiers for ensemble - full pipeline

In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
from joblib import dump
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
path = "C:/Project/Data/"

#### Global functions

In [4]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

In [5]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

#### Select 20% of transfers to use for evaluation (183 samples)

In [6]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [7]:
# drop dishcarged samples
df_transfers = df_transfers[df_transfers['eventtype'] != 'discharge']

# convert time to datetime
df_transfers['intime'] = pd.to_datetime(df_transfers['intime'])
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'])

In [8]:
evaluation_transfers = df_transfers['transfer_id'].sample(n=183, random_state=42).tolist()

# Any records belonging to these transfers will be removed before training 
print(evaluation_transfers)

[37509207, 32050989, 37298974, 39930808, 31077365, 34918384, 36976997, 34503142, 30265082, 30853116, 33256413, 35297811, 37348935, 35636875, 31373193, 39252789, 32283063, 34392585, 37802225, 37471901, 31983963, 33693488, 30193781, 39429848, 34411721, 35258379, 37665491, 35540249, 32660091, 31594060, 37977734, 30744153, 34976513, 31273493, 35643433, 35223874, 35065627, 35509340, 34107647, 34636616, 30458338, 31640785, 32554129, 31582912, 36686656, 38425947, 39641848, 37093652, 30955999, 30619804, 35044342, 31306648, 36668331, 36715666, 32443787, 37578680, 35706577, 33125334, 35186527, 39399961, 31824062, 34372637, 31043383, 38467810, 32695027, 39738665, 32732194, 30540455, 37419259, 39764235, 34696018, 31660580, 31960365, 38228751, 39028384, 36059427, 38433139, 33260006, 38369052, 31320729, 34170353, 33177122, 35682011, 32131502, 37145382, 39739186, 39497668, 36316394, 34176810, 38430513, 34982171, 31950481, 33916615, 30896594, 39346340, 30804580, 31313849, 38779104, 30145190, 38564981,

In [9]:
len(evaluation_transfers)

183

In [10]:
# Target variable (careunit based on transfer_id)

df_target = df_transfers.drop(columns=['subject_id', 'eventtype','hadm_id','intime','outtime'])

In [11]:
df_target

Unnamed: 0,transfer_id,careunit
26,31766090,Med/Surg
27,39182916,Med/Surg
28,33930547,Med/Surg
29,39351025,Medicine
30,37834930,Medicine
...,...,...
1185,31983963,PACU
1186,38367109,Cardiac Surgery
1187,39362807,Medicine/Cardiology
1188,38425947,Medicine/Cardiology


### Emar

In [12]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

# records for 65 different patients 
# 181 unique admissions

#### Preprocessing (on all data)

In [13]:
df_emar = df_emar[df_emar['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [14]:
df_emar = df_emar.reset_index(drop=True)

In [15]:
# convert time to datetime
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'])

In [16]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [17]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','scheduletime','storetime'])

In [18]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [20]:
df_emar['transfer_id'] = float('nan')
for index, row in df_emar.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_emar.at[index, 'transfer_id'] = closest_id

In [None]:
df_emar = df_emar.dropna()

In [None]:
# Separate into training and evaluation sets 

df_emar_training = df_emar[~df_emar['transfer_id'].isin(evaluation_transfers)]
df_emar_evaluation = df_emar[df_emar['transfer_id'].isin(evaluation_transfers)]

In [None]:
df_emar_training

In [None]:
df_emar_evaluation

In [None]:
df_emar_evaluation['transfer_id'].unique().size

In [None]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_emar_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_emar_evaluation.to_csv(file_path, index=False)

#### Training the learner

In [None]:
# change transfer_id to careunit

df_emar_training = pd.merge(df_emar_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_emar_training.drop(columns=['transfer_id'], inplace=True)

In [None]:
data = df_emar_training.drop(columns=['hadm_id','charttime','careunit'])
target = pd.DataFrame(df_emar_training['careunit'])

In [None]:
data['delay']= data['delay'].astype(str)
data['delay']= data['delay'].apply(convert_to_days)

In [None]:
# Initialize and fit
from tqdm import tqdm
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Instantiate the base classifier (decision tree)
base_clf = DecisionTreeClassifier(random_state=42) # best as default

# Instantiate the BaggingClassifier
bagging_clf_emar = BaggingClassifier(estimator=base_clf, n_estimators=10, random_state=42)

bagging_clf_emar.fit(data, np.ravel(target))

In [81]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_emar.joblib')
dump(bagging_clf_emar, model_file)

['target_learners\\bagging_clf_emar.joblib']

### microbiologyevents

In [91]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [92]:
df_microbio = df_microbio[df_microbio['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [93]:
# convert time to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'])

In [94]:
df_microbio['transfer_id'] = float('nan')

for index, row in df_microbio.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_microbio.at[index, 'transfer_id'] = closest_id

df_microbio.dropna(subset=['transfer_id'], inplace=True)

In [95]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [96]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [97]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [98]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [99]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [100]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [101]:
df_microbio = df_microbio.dropna()

In [102]:
# Separate into training and evaluation sets 

# df_microbio_training = df_microbio[~df_microbio['subject_id'].isin(evaluation_patients)]
# df_microbio_evaluation = df_microbio[df_microbio['subject_id'].isin(evaluation_patients)]

df_microbio_training = df_microbio[~df_microbio['transfer_id'].isin(evaluation_transfers)]
df_microbio_evaluation = df_microbio[df_microbio['transfer_id'].isin(evaluation_transfers)]

In [103]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_microbio_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_microbio_evaluation.to_csv(file_path, index=False)

In [178]:
df_microbio_evaluation['transfer_id'].unique().size

28

#### Training the learner

In [123]:
# change transfer_id to careunit

df_microbio_training = pd.merge(df_microbio_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_microbio_training.drop(columns=['transfer_id'], inplace=True)

In [124]:
data = df_microbio_training.drop(columns=['hadm_id','careunit'])
target = pd.DataFrame(df_microbio_training['careunit'])

In [126]:
# Converting duration strings to floats

data['delay']= data['delay'].astype(str)
data['delay']= data['delay'].apply(convert_to_days)
data['days_since_admission'] = data['days_since_admission'].astype(str)
data['days_since_admission'] = data['days_since_admission'].apply(convert_to_days)

In [127]:
# Initialize and fit

base_clf = DecisionTreeClassifier(random_state=42,criterion='entropy',max_leaf_nodes=1000)

# Instantiate the BaggingClassifier
bagging_clf_microbio = BaggingClassifier(estimator=base_clf, n_estimators=12, random_state=42)

for i in tqdm(range(100)):
    # Train the BaggingClassifier
    bagging_clf_microbio.fit(data, np.ravel(target))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:43<00:00,  2.29it/s]


In [128]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_microbio.joblib')
dump(bagging_clf_microbio, model_file)

['target_learners\\bagging_clf_microbio.joblib']

### prescriptions

In [104]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [105]:
df_prescriptions = df_prescriptions[df_prescriptions['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [106]:
# convert time to datetime
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'])

In [107]:
df_prescriptions['transfer_id'] = float('nan')

for index, row in df_prescriptions.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['starttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_prescriptions.at[index, 'transfer_id'] = closest_id

df_prescriptions.dropna(subset=['transfer_id'], inplace=True)

Drop na and encode: dose_val_rx, form_val_disp, order_provider_id
Drop: subject_id, pharmacy_id, starttime, stoptime, form_rx (mostly null), poe_id
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [108]:
# Make a feature of stoptime-starttime called duration 

# Convert to datetime
df_prescriptions['stoptime'] = pd.to_datetime(df_prescriptions['stoptime'], format='%Y/%m/%d %H:%M')
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'], format='%Y/%m/%d %H:%M')

df_prescriptions['duration'] = df_prescriptions['stoptime'] - df_prescriptions['starttime']

# Fill any non time values
df_prescriptions['duration'] = df_prescriptions['duration'].fillna(pd.Timedelta(0))

In [109]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [110]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [111]:
# Impute with N/A and encode
df_prescriptions['formulary_drug_cd'] = df_prescriptions['formulary_drug_cd'].fillna('N/A')
df_prescriptions['gsn'] = df_prescriptions['gsn'].fillna('N/A')
df_prescriptions['prod_strength'] = df_prescriptions['prod_strength'].fillna('N/A')
df_prescriptions['route'] = df_prescriptions['route'].fillna('N/A')

# Impute with 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

df_prescriptions = pd.get_dummies(df_prescriptions, columns=['formulary_drug_cd','gsn','prod_strength',
                                                            'route','drug_type','drug','dose_unit_rx','form_unit_disp',
                                                            'dose_val_rx','form_val_disp','ndc'])

In [112]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [113]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

In [114]:
df_prescriptions

Unnamed: 0,hadm_id,poe_seq,doses_per_24_hrs,transfer_id,duration,formulary_drug_cd_5000MLBAG,formulary_drug_cd_AA5D151000I,formulary_drug_cd_ACD3/1000I,formulary_drug_cd_ACE250,formulary_drug_cd_ACE500I,...,ndc_70860030005.0,ndc_70860077602.0,ndc_71019028507.0,ndc_76014000410.0,ndc_76045000905.0,ndc_76329301205.0,ndc_76329330101.0,ndc_76439034310.0,ndc_78112073623.0,ndc_87701071218.0
9,23831430,830.0,0.0,34492498.0,1 days 11:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,20626031,17.0,0.0,32604416.0,1 days 10:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,23831430,463.0,0.0,37253871.0,1 days 22:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,20297618,253.0,0.0,37726687.0,0 days 09:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,20626031,281.0,0.0,34529190.0,1 days 01:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18080,27708593,192.0,1.0,30744153.0,7 days 05:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18081,20214994,1372.0,0.0,31973139.0,20 days 15:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18083,23473524,780.0,0.0,30896594.0,10 days 05:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18084,27996267,1640.0,0.0,36762745.0,-1 days +23:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
# Converting duration strings to floats

df_prescriptions['duration']= df_prescriptions['duration'].astype(str)
df_prescriptions['duration']= df_prescriptions['duration'].apply(convert_to_days)

In [116]:
# Separate into training and evaluation sets 
df_prescriptions_training = df_prescriptions[~df_prescriptions['transfer_id'].isin(evaluation_transfers)]
df_prescriptions_evaluation = df_prescriptions[df_prescriptions['transfer_id'].isin(evaluation_transfers)]

In [117]:
# change transfer_id to careunit

df_prescriptions_training = pd.merge(df_prescriptions_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_prescriptions_training.drop(columns=['transfer_id'], inplace=True)

In [118]:
# df_prescriptions_evaluation = pd.merge(df_prescriptions_evaluation, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
# df_prescriptions_evaluation.drop(columns=['transfer_id'], inplace=True)
# target_test = pd.DataFrame(df_prescriptions_evaluation['careunit'])

In [119]:
data = df_prescriptions_training.drop(columns=['hadm_id','careunit'])
target = pd.DataFrame(df_prescriptions_training['careunit'])

In [120]:
# data_test = df_prescriptions_evaluation.drop(columns=['hadm_id','careunit'])

In [121]:
data

Unnamed: 0,poe_seq,doses_per_24_hrs,duration,formulary_drug_cd_5000MLBAG,formulary_drug_cd_AA5D151000I,formulary_drug_cd_ACD3/1000I,formulary_drug_cd_ACE250,formulary_drug_cd_ACE500I,formulary_drug_cd_ACET1000I,formulary_drug_cd_ACET1000PB,...,ndc_70860030005.0,ndc_70860077602.0,ndc_71019028507.0,ndc_76014000410.0,ndc_76045000905.0,ndc_76329301205.0,ndc_76329330101.0,ndc_76439034310.0,ndc_78112073623.0,ndc_87701071218.0
0,830.0,0.0,1.458333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,17.0,0.0,1.416667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,463.0,0.0,1.916667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,253.0,0.0,0.375000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,281.0,0.0,1.041667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9219,1038.0,0.0,16.375000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9220,285.0,0.0,14.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9221,1372.0,0.0,20.625000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9222,1640.0,0.0,-0.041667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [122]:
df_prescriptions_evaluation

Unnamed: 0,hadm_id,poe_seq,doses_per_24_hrs,transfer_id,duration,formulary_drug_cd_5000MLBAG,formulary_drug_cd_AA5D151000I,formulary_drug_cd_ACD3/1000I,formulary_drug_cd_ACE250,formulary_drug_cd_ACE500I,...,ndc_70860030005.0,ndc_70860077602.0,ndc_71019028507.0,ndc_76014000410.0,ndc_76045000905.0,ndc_76329301205.0,ndc_76329330101.0,ndc_76439034310.0,ndc_78112073623.0,ndc_87701071218.0
22,20973395,225.0,0.0,39793139.0,5.708333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,23831430,538.0,0.0,31077365.0,0.333333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,23831430,739.0,0.0,31077365.0,8.541667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44,23300884,356.0,0.0,34953924.0,0.916667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
48,27962747,660.0,0.0,33916615.0,2.958333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18057,23300884,414.0,0.0,38564981.0,0.041667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18061,29483621,358.0,0.0,30642078.0,2.791667,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18067,20321825,671.0,4.0,39544317.0,2.458333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18080,27708593,192.0,1.0,30744153.0,7.208333,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction - doesn't work for training?

In [123]:
# # Need to reduce from 4890 to 1844 or less

# from sklearn.decomposition import TruncatedSVD

# # Number of desired features (components)
# n_components = 1844

# # Initialize Truncated SVD with the desired number of components
# svd = TruncatedSVD(n_components=n_components)

# # Fit the Truncated SVD model to the sparse matrix and transform the data
# svd.fit(data_train)
# data_train = svd.transform(data_train)

# # Get the explained variance ratio (how much variance is explained by each component)
# explained_variance_ratio = svd.explained_variance_ratio_

# # Print the transformed matrix and explained variance ratio
# # print("Transformed Matrix:")
# # print(transformed_matrix)
# print("\nExplained Variance Ratio:")
# print(explained_variance_ratio)

# print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [124]:
# data_test = svd.transform(data_test)

In [125]:
# # Concatenate target variable back to reduced frame 

# df_prescriptions_evaluation = pd.concat([pd.DataFrame(data_test), target_test], axis=1)

In [126]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_prescriptions_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_prescriptions_evaluation.to_csv(file_path, index=False)

In [179]:
df_prescriptions_evaluation['transfer_id'].unique().size

72

#### Training the learner

In [33]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

# Instantiate the base classifier (decision tree)
base_clf = DecisionTreeClassifier(random_state=42,criterion='entropy')

# Instantiate the BaggingClassifier
bagging_clf_prescriptions = BaggingClassifier(estimator=base_clf, n_estimators=10, random_state=42)

for i in tqdm(range(100)):
    # Train the BaggingClassifier
    bagging_clf_prescriptions.fit(data, np.ravel(target))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [22:14<00:00, 13.35s/it]


In [34]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_prescriptions.joblib')
dump(bagging_clf_prescriptions, model_file)

['target_learners\\bagging_clf_prescriptions.joblib']

### ingredientevents

In [127]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

#### Preprocessing (on all data)

Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [128]:
df_ingredient = df_ingredient[df_ingredient['hadm_id'].isin(df_transfers['hadm_id'])]

In [129]:
# convert time to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'])

In [130]:
df_ingredient['transfer_id'] = float('nan')

for index, row in df_ingredient.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_ingredient.at[index, 'transfer_id'] = closest_id

df_ingredient.dropna(subset=['transfer_id'], inplace=True)

In [131]:
# Make a duration feature of endtime-starttime 

# Convert to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')


df_ingredient['duration'] = df_ingredient['endtime'] - df_ingredient['starttime']

# Fill any non time values
df_ingredient['duration'] = df_ingredient['duration'].fillna(pd.Timedelta(0))

In [132]:
# make a recording_delay feature of storetime-endtime

# Convert to datetime
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']

# Fill any non time values
df_ingredient['recording_delay'] = df_ingredient['recording_delay'].fillna(pd.Timedelta(0))

In [133]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [134]:
# Impute with N/A and encode
df_ingredient['rateuom'] = df_ingredient['rateuom'].fillna('N/A')
df_ingredient['linkorderid'] = df_ingredient['linkorderid'].fillna('N/A')
df_ingredient = pd.get_dummies(df_ingredient, columns=['rateuom','amountuom','statusdescription','itemid','linkorderid'])

In [135]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

In [136]:
# Converting duration strings to floats
df_ingredient['duration']= df_ingredient['duration'].astype(str)
df_ingredient['duration']= df_ingredient['duration'].apply(convert_to_days)
df_ingredient['recording_delay']= df_ingredient['recording_delay'].astype(str)
df_ingredient['recording_delay']= df_ingredient['recording_delay'].apply(convert_to_days)

In [137]:
# Separate into training and evaluation sets 
df_ingredient_training = df_ingredient[~df_ingredient['transfer_id'].isin(evaluation_transfers)]
df_ingredient_evaluation = df_ingredient[df_ingredient['transfer_id'].isin(evaluation_transfers)]

In [138]:
# change transfer_id to careunit

df_ingredient_training = pd.merge(df_ingredient_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_ingredient_training.drop(columns=['transfer_id'], inplace=True)
data = df_ingredient_training.drop(columns=['hadm_id','careunit'])
target = pd.DataFrame(df_ingredient_training['careunit'])

In [139]:
# # change transfer_id to careunit

# df_ingredient_evaluation = pd.merge(df_ingredient_evaluation, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
# df_ingredient_evaluation.drop(columns=['transfer_id'], inplace=True)
# data_test = df_ingredient_evaluation.drop(columns=['hadm_id','careunit'])
# target_test = pd.DataFrame(df_ingredient_evaluation['careunit'])

In [140]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_ingredient_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_ingredient_evaluation.to_csv(file_path, index=False)

#### Dimensionality reduction - may not work?

In [49]:
# Need to reduce from 7727 to 4116

In [50]:
# from sklearn.decomposition import TruncatedSVD

# # Number of desired features (components)
# n_components = 4116

# # Initialize Truncated SVD with the desired number of components
# svd = TruncatedSVD(n_components=n_components)

# # Fit the Truncated SVD model to the sparse matrix and transform the data
# svd.fit(data_train)
# data_train = svd.transform(data_train)

# # Get the explained variance ratio (how much variance is explained by each component)
# explained_variance_ratio = svd.explained_variance_ratio_

# # Print the transformed matrix and explained variance ratio
# # print("Transformed Matrix:")
# # print(transformed_matrix)
# print("\nExplained Variance Ratio:")
# print(explained_variance_ratio)

# print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[7.20216978e-01 2.74234729e-01 5.54679310e-03 ... 5.37692205e-11
 5.37678712e-11 5.37661146e-11]

 Amount of original variance conserved: 0.9999999492404776


In [51]:
# data_test = svd.transform(data_test)

In [52]:
# # Concatenate target variable back to reduced frame 

# df_ingredient_evaluation = pd.concat([pd.DataFrame(data_test), target_test], axis=1)

In [53]:
# df_ingredient_evaluation

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4107,4108,4109,4110,4111,4112,4113,4114,4115,careunit
0,808.515909,-10.171272,603.142548,0.989442,1.107537,-0.510799,0.049756,0.937576,0.259819,0.561630,...,-2.282557e-05,0.000161,-0.000050,-0.000028,-0.000061,-0.000058,0.000083,1.395689e-04,1.490757e-04,Hematology/Oncology Intermediate
1,808.515890,-10.171278,603.142382,1.101203,0.840248,0.830488,0.377736,0.902300,0.179327,0.600121,...,-6.411198e-06,0.000122,-0.000045,-0.000020,-0.000076,-0.000062,0.000073,1.430144e-04,9.973981e-05,Hematology/Oncology Intermediate
2,339.284653,-8.223694,-4.307784,-1.630870,0.876007,-0.462956,0.039014,-0.185260,-0.242626,0.096684,...,-5.296939e-08,0.000003,-0.000001,-0.000004,0.000002,-0.000007,0.000007,1.009450e-06,2.487608e-06,Hematology/Oncology Intermediate
3,339.284593,-8.223715,-4.309147,-1.368857,0.968457,0.339430,0.084554,-0.305355,0.801207,-0.252976,...,-1.284303e-05,0.000035,-0.000010,0.000002,0.000029,0.000017,0.000014,-4.569827e-06,6.956800e-05,Hematology/Oncology Intermediate
4,254.463541,-6.167755,-3.231260,-1.386881,0.968208,0.340450,0.082272,-0.297063,0.820585,-0.209833,...,-8.855953e-06,0.000028,-0.000007,0.000006,0.000026,0.000018,0.000011,-4.908956e-06,6.269998e-05,Hematology/Oncology Intermediate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2516,508.926699,-12.335635,-6.464756,-1.335431,0.948614,0.335427,0.082602,-0.296396,0.832177,-0.186154,...,-9.965214e-06,0.000028,-0.000012,0.000003,0.000026,0.000013,0.000010,-3.542105e-07,5.964993e-05,Neurology
2517,1425.958919,474.047325,-21.623692,-1.458454,-0.314265,-0.653447,0.023583,0.511828,-0.049323,-0.281115,...,-1.075225e-05,0.000011,-0.000016,-0.000011,-0.000002,-0.000027,0.000002,2.928441e-05,-4.458866e-07,Neurology
2518,1425.958900,474.047318,-21.623858,-1.346692,-0.581555,0.687839,0.351564,0.476552,-0.129815,-0.242624,...,5.662116e-06,-0.000028,-0.000012,-0.000002,-0.000017,-0.000031,-0.000009,3.272990e-05,-4.978175e-05,Neurology
2519,169.642550,-4.111774,-2.152030,-1.666598,0.877996,-0.460677,0.035246,-0.171798,-0.212394,0.164254,...,6.594588e-06,-0.000008,0.000003,0.000003,-0.000003,-0.000004,0.000002,-1.012310e-07,-8.356847e-06,Neurology


In [54]:
# # Save evaluation data for later 
# folder_name = 'EnsembleEvaluationData'

# # Define the file path
# file_path = os.path.join(folder_name, 'df_ingredient_evaluation.csv')

# # Save the DataFrame to a CSV file in the specified folder
# df_ingredient_evaluation.to_csv(file_path, index=False)

In [180]:
df_ingredient_evaluation['transfer_id'].unique().size

22

#### Training the learner

In [25]:
data

Unnamed: 0,amount,rate,originalrate,duration,recording_delay,rateuom_N/A,rateuom_grams/hour,rateuom_mL/hour,rateuom_mcg/kg/min,amountuom_Kcal,...,linkorderid_9985393,linkorderid_9986202,linkorderid_9986595,linkorderid_9988568,linkorderid_9989506,linkorderid_9990254,linkorderid_9990509,linkorderid_9993006,linkorderid_9993329,linkorderid_9996112
0,49.999999,50.000000,50.000000,0.041667,0.000694,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,49.999999,50.000000,50.000000,0.041667,0.000694,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,249.999990,249.999985,250.000000,0.041667,-0.008333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,249.999990,249.999985,250.000000,0.041667,-0.008333,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,38.852669,27.425413,47.080292,0.059028,-0.054167,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17097,49.999999,50.000000,50.000000,0.041667,0.016667,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
17098,120.000000,0.000000,120.000000,0.000694,-0.000694,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17099,120.000000,0.000000,120.000000,0.000694,-0.000694,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17100,500.000000,0.000000,500.000000,0.000694,-0.000694,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# Initialize and fit
from tqdm import tqdm
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

base_clf = DecisionTreeClassifier(random_state=42)

# Instantiate the BaggingClassifier
bagging_clf_ingredient = BaggingClassifier(estimator=base_clf, n_estimators=3, random_state=42)

for i in tqdm(range(100)):
    # Train the BaggingClassifier
    bagging_clf_ingredient.fit(pd.DataFrame(data), np.ravel(target))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [17:45<00:00, 10.65s/it]


In [27]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_ingredient.joblib')
dump(bagging_clf_ingredient, model_file)

['target_learners\\bagging_clf_ingredient.joblib']

### inputevents

In [141]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

#### Preprocessing (on all data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [142]:
df_input = df_input[df_input['hadm_id'].isin(df_transfers['hadm_id'])]

In [143]:
# convert time to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'])

In [144]:
df_input['transfer_id'] = float('nan')

for index, row in df_input.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_input.at[index, 'transfer_id'] = closest_id

df_input.dropna(subset=['transfer_id'], inplace=True)

In [145]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [146]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [147]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [148]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [149]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [150]:
df_input = df_input.dropna()

In [151]:
# Converting duration strings to floats
df_input['duration']= df_input['duration'].astype(str)
df_input['duration']= df_input['duration'].apply(convert_to_days)
df_input['recording_delay']= df_input['recording_delay'].astype(str)
df_input['recording_delay']= df_input['recording_delay'].apply(convert_to_days)

In [152]:
# Separate into training and evaluation sets 
df_input_training = df_input[~df_input['transfer_id'].isin(evaluation_transfers)]
df_input_evaluation = df_input[df_input['transfer_id'].isin(evaluation_transfers)]

In [153]:
# change transfer_id to careunit

df_input_training = pd.merge(df_input_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_input_training.drop(columns=['transfer_id'], inplace=True)
data = df_input_training.drop(columns=['hadm_id','careunit'])
target = pd.DataFrame(df_input_training['careunit'])

In [154]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_input_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_input_evaluation.to_csv(file_path, index=False)

In [181]:
df_input_evaluation['transfer_id'].unique().size

22

#### Training the learner

In [42]:
# Initialize and fit
from tqdm import tqdm
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

base_clf = DecisionTreeClassifier(random_state=42,criterion='entropy')

# Instantiate the BaggingClassifier
bagging_clf_input = BaggingClassifier(estimator=base_clf, n_estimators=10, random_state=42)

for i in tqdm(range(100)):
    # Train the BaggingClassifier
    bagging_clf_input.fit(pd.DataFrame(data), np.ravel(target))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [04:55<00:00,  2.96s/it]


In [43]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_input.joblib')
dump(bagging_clf_input, model_file)

['target_learners\\bagging_clf_input.joblib']

### procedureevents

In [155]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

#### Preprocessing (on all data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
MAKE DURATION FEATURE 

In [156]:
df_procedure_events = df_procedure_events[df_procedure_events['hadm_id'].isin(df_transfers['hadm_id'])]

In [157]:
# convert time to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'])

In [158]:
df_procedure_events['transfer_id'] = float('nan')

for index, row in df_procedure_events.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_procedure_events.at[index, 'transfer_id'] = closest_id

df_procedure_events.dropna(subset=['transfer_id'], inplace=True)

In [159]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [160]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [161]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [162]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

In [163]:
# Converting duration strings to floats
df_procedure_events['duration']= df_procedure_events['duration'].astype(str)
df_procedure_events['duration']= df_procedure_events['duration'].apply(convert_to_days)
df_procedure_events['recording_delay']= df_procedure_events['recording_delay'].astype(str)
df_procedure_events['recording_delay']= df_procedure_events['recording_delay'].apply(convert_to_days)

In [164]:
# Separate into training and evaluation sets 
df_procedure_events_training = df_procedure_events[~df_procedure_events['transfer_id'].isin(evaluation_transfers)]
df_procedure_events_evaluation = df_procedure_events[df_procedure_events['transfer_id'].isin(evaluation_transfers)]

In [165]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_procedure_events_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_procedure_events_evaluation.to_csv(file_path, index=False)

In [166]:
# change transfer_id to careunit

df_procedure_events_training = pd.merge(df_procedure_events_training, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_procedure_events_training.drop(columns=['transfer_id'], inplace=True)
data = df_procedure_events_training.drop(columns=['hadm_id','careunit'])
target = pd.DataFrame(df_procedure_events_training['careunit'])

In [182]:
df_procedure_events_evaluation['transfer_id'].unique().size

20

#### Training the learner

In [57]:
# Initialize and fit
base_clf = DecisionTreeClassifier(random_state=42,criterion='entropy',min_samples_leaf=2)

# Instantiate the BaggingClassifier
bagging_clf_procedure_events = BaggingClassifier(estimator=base_clf, n_estimators=10, random_state=42,bootstrap=False,bootstrap_features=True)

for i in tqdm(range(100)):
    # Train the BaggingClassifier
    bagging_clf_procedure_events.fit(pd.DataFrame(data), np.ravel(target))

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:37<00:00,  2.69it/s]


In [58]:
# Save model to folder

output_folder = 'target_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'bagging_clf_procedure_events.joblib')
dump(bagging_clf_procedure_events, model_file)

['target_learners\\bagging_clf_procedure_events.joblib']

In [59]:
data

Unnamed: 0,value,patientweight,isopenbag,ORIGINALAMOUNT,ORIGINALRATE,duration,recording_delay,location_Left Accessory Basilic,location_Left Accessory Cephalic,location_Left Antecubital,...,itemid_228128,itemid_228129,itemid_228286,itemid_228715,itemid_229351,itemid_229380,itemid_229526,itemid_229532,itemid_229581,itemid_229586
0,3410.0,103.0,1,3410.0,1,2.368056,0.000000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,4732.0,103.0,1,4732.0,1,3.286111,0.009028,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4323.0,103.0,1,4323.0,1,3.002083,0.092361,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4557.0,103.0,1,4557.0,1,3.164583,0.095833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,103.0,0,1.0,0,0.000694,-0.000694,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,12720.0,98.9,1,12720.0,1,8.833333,0.000167,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1045,2481.0,127.7,1,2481.0,1,1.722917,0.000061,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1046,566.0,39.4,1,566.0,1,0.393056,0.000579,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1047,566.0,39.4,1,566.0,1,0.393056,0.000579,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
target.value_counts()

careunit                                        
Cardiac Surgery                                     205
Medicine                                            182
Med/Surg/Trauma                                     167
Med/Surg                                             76
Medicine/Cardiology                                  64
Neurology                                            54
Medical Intensive Care Unit (MICU)                   38
Medical/Surgical Intensive Care Unit (MICU/SICU)     34
Vascular                                             31
Trauma SICU (TSICU)                                  30
Hematology/Oncology                                  29
PACU                                                 27
Transplant                                           27
Med/Surg/GYN                                         23
Cardiac Vascular Intensive Care Unit (CVICU)         22
Coronary Care Unit (CCU)                             11
Neuro Stepdown                                       10