### Training of LOS learners for ensemble - full pipeline

In [1]:
# NOTE
# might need further dimension reduction because I'm removing a lot of data by filtering out the 20 subjects

In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
from joblib import dump
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
path = "C:/Project/Data/"

#### Global functions

In [4]:
def convert_to_days(duration_str):
    """Convert a duration string into a fractional number of days.

    Parameters
    ----------
    duration_str : str
        Text of the form '<days> days <hh>:<mm>:<ss>', e.g. '22 days 20:55:00'.

    Returns
    -------
    float
        The whole duration expressed in days.
    """
    fields = duration_str.split(' days ')  # e.g. ['22', '20:55:00']
    whole_days = float(fields[0])
    clock = fields[1].split(':')  # e.g. ['20', '55', '00']
    hours, minutes, seconds = float(clock[0]), float(clock[1]), float(clock[2])
    # Each clock component is expressed as a fraction of a day before summing
    return whole_days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))

#### Select 20 patients (based on subject_id from patients) to use for evaluation

In [5]:
file = "hosp/patients.csv"
full_path = path + file

df_patients = pd.read_csv(full_path)

In [6]:
# Draw 20 subject_ids with a fixed seed so the draw is repeatable.
# NOTE: `evaluation_patients` is reassigned further down from the sampled evaluation
# admissions, so this list is not the one used to split the tables.
evaluation_patients = df_patients['subject_id'].sample(n=20, random_state=42).tolist()

# Show the sampled subject_ids
print(evaluation_patients)

[10018845, 10011398, 10014354, 10024043, 10035631, 10018328, 10039997, 10004235, 10014078, 10014729, 10010471, 10020187, 10019385, 10021312, 10005817, 10027445, 10026406, 10007818, 10001217, 10016810]


Revised approach: select the evaluation set by hadm_id rather than subject_id.
If a table only has subject_id, remove the subjects that the evaluation admissions belong to (the admissions table maps subject_id to hadm_id).

#### Select 55 (20% of) admissions to use for evaluation

In [7]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [8]:
evaluation_admissions = df_admissions['hadm_id'].sample(n=55, random_state=42).tolist()

# Any records belonging to these admissions will be removed before training 
print(evaluation_admissions)

[27617929, 27553957, 20282368, 27296885, 24980601, 21133938, 25559382, 20611796, 28778757, 28723315, 28998349, 28676446, 29276678, 26842957, 21477991, 25922998, 26706939, 27993466, 28236161, 27259207, 20385771, 24540843, 20900955, 22413744, 27494880, 25103777, 21599196, 21540783, 22585261, 26275841, 22130791, 22490490, 25020332, 29279905, 29483621, 27167814, 25508812, 21607814, 20297618, 29974575, 24912093, 21255400, 29295881, 28829452, 24656677, 29858644, 23488445, 25970245, 22508257, 25742920, 25085565, 22228639, 27660781, 28335091, 27703517]


In [9]:
evaluation_patients = df_admissions[df_admissions['hadm_id'].isin(evaluation_admissions)]

In [10]:
evaluation_patients = evaluation_patients['subject_id'].tolist()

#### Target variable LOS

In [11]:
# LOS based on admissions table (target dataframe)

file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [12]:
df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

In [13]:
# Length of stay is discharge time minus admission time; the same values are keyed
# once by admission (hadm_id) and once by patient (subject_id).
length_of_stay = df_admissions['dischtime'] - df_admissions['admittime']

df_los_hadm = pd.DataFrame({'hadm_id': df_admissions['hadm_id'], 'los': length_of_stay})
df_los_subject = pd.DataFrame({'subject_id': df_admissions['subject_id'], 'los': length_of_stay})

In [14]:
df_los_hadm

Unnamed: 0,hadm_id,los
0,24181354,8 days 23:24:00
1,25926192,7 days 20:12:00
2,23983182,5 days 17:33:00
3,22942076,1 days 17:41:00
4,21606243,2 days 02:11:00
...,...,...
270,24745425,5 days 15:57:00
271,22168393,4 days 12:18:00
272,27708593,7 days 07:10:00
273,23251352,4 days 04:56:00


In [15]:
# Average LOS for each subject_id
df_los_subject = df_los_subject.groupby('subject_id').mean().reset_index()

In [16]:
df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

### omr

In [17]:
file = "hosp/omr.csv"
full_path = path + file

df_omr = pd.read_csv(full_path)

In [18]:
# Split omr by subject: rows of held-out subjects form the evaluation set,
# every other row is training data
omr_is_evaluation = df_omr['subject_id'].isin(evaluation_patients)

df_omr_training = df_omr[~omr_is_evaluation]
df_omr_evaluation = df_omr[omr_is_evaluation]

In [19]:
# Save evaluation data for later 

# Create directory 
folder_name = 'EnsembleEvaluationData'
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_omr_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_omr_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [None]:
df_omr = df_omr_training

In [None]:
df_omr = df_omr.reset_index(drop=True)

In [None]:
# Combine result_name and seq_num into the column name with result_value from the same row as its value 

# Function to combine values from columns into a new column 
def new_columns(row):
    """Return the wide-format column label for one row: '<result_name>_<seq_num>'."""
    prefix = row['result_name'] + '_'
    return prefix + str(row['seq_num'])

new_names = df_omr.apply(new_columns, axis=1) # series of names of combinations 


def add_values(row, colName):
    """Return the row's result_value if the row belongs to column `colName`, otherwise 0.

    A row belongs to `colName` when '<result_name>_<seq_num>' built from the row
    equals `colName`.
    """
    row_label = str(row['result_name'] + '_' + str(row['seq_num']))
    return row['result_value'] if row_label == colName else 0


# Build each '<result_name>_<seq_num>' column exactly once. `new_names` holds one label
# per row, so looping over it directly rebuilds the same column for every repeated label.
for column_label in new_names.unique():
    # Keep result_value on the rows carrying this label and 0 everywhere else
    # (the same values add_values() produces row by row, without a row-wise apply).
    df_omr[column_label] = df_omr['result_value'].where(new_names == column_label, 0)

In [None]:
# Drop seq_num, result_name, result_value
df_omr = df_omr.drop(columns=['seq_num', 'result_name', 'result_value'])
# sequence number doesn't add any useful info

In [None]:
df_omr['subject_id'].value_counts()

# The patient with the most measurements has 391, so every patient could be given 391 features, but most of
# them would be zeroes.
# That may be fine: sparsity reflects that few measurements were taken, which could itself be a factor.
# The number of measurements could also be added as an extra feature.

In [None]:
filtered_df = df_omr[df_omr['subject_id'] == 10019003]
filtered_df

# Preserves every measurement made for each subject across all of their stays 
# Only one entry per row 

In [None]:
backup = df_omr.copy()

In [None]:
# ordering by date (so each patients measurements are chronological from top to bottom)

df_omr = df_omr.sort_values(by=['subject_id', 'chartdate'])

df_omr

# This preserves for example, increase in weight over time 

In [None]:
# drop chartdate since the time shift is not consistent for each subject 
df_omr = df_omr.drop(columns=['chartdate'])

In [None]:
# reset index
df_omr = df_omr.reset_index(drop=True)

In [None]:
df_omr_final = pd.DataFrame()

In [None]:
# Row for each subject, features for every measurement made on them 

# Reshape df_omr (one measurement per row) into df_omr_final: one row per subject_id,
# with a column '<measurement column>_<x>' for each value copied across.
colNames = df_omr.columns.tolist()
colNames.remove('subject_id')

# x is the numeric suffix appended to the measurement column name.
# NOTE(review): x is a single counter shared by every measurement column, and the
# existence check below tests whether the column exists at all (not whether this
# subject already has a value in it) — confirm the suffixes come out as intended.
x = 0
prev_subject = 0


for row in range(len(df_omr)):
    current_subject = df_omr['subject_id'][row] 
    if current_subject != prev_subject:
        x = 0 # reset x
    for i in range(len(colNames)): # for each column
        # Only the first non-zero column of the row is used (see the break below)
        if df_omr.loc[row, colNames[i]] != 0:
            if colNames[i] + '_0' not in df_omr_final.columns: # New column name added
                x = 0 # reset x
            new_name = colNames[i] + '_' + str(x)
            if new_name in df_omr_final.columns and (current_subject == prev_subject): # Trying to add another of the same 
                # measurement for the same patient 
                x += 1
                new_name = colNames[i] + '_' + str(x)
            # Setting by label creates the subject row and/or the column if they do not exist yet
            df_omr_final.loc[current_subject, new_name] = df_omr.loc[row, colNames[i]]
            # presumably copied to defragment the frame after adding a column — TODO confirm
            df_omr_final = df_omr_final.copy()
            break # leave for loop as the rest of the columns will be 0 for this row
    prev_subject = current_subject
    

In [None]:
df_omr_final.fillna(0, inplace=True)
df_omr_final

In [None]:
# Convert all values to numbers 

df_omr_final = df_omr_final.astype(str)

# Function to convert fraction string to decimal
def fraction_to_decimal(fraction_str):
    """Turn an 'a/b' string of two integers into the float a / b.

    Any string that does not split on '/' into exactly two integers is returned
    unchanged.
    """
    pieces = fraction_str.split('/')
    try:
        top, bottom = (int(piece) for piece in pieces)
    except ValueError:
        # Not exactly two integer parts: leave the value as it is
        return fraction_str
    return top / bottom

# Apply the function to the entire DataFrame
df_omr_final = df_omr_final.applymap(fraction_to_decimal)

In [None]:
df_omr_final = df_omr_final.astype(float)
# df_omr_final.info()

In [None]:
# Reset the index and convert it to a column
df_omr_final.reset_index(inplace=True)
df_omr_final.rename(columns={'index': 'subject_id'}, inplace=True)

In [None]:
# Merge the DataFrames based on the ID column
df_omr_final = df_omr_final.merge(df_los_subject, on='subject_id', how='left')

In [None]:
# What does this show?
# Each patient (subject_id is the index of the df) has measurements showing type_sequence_date
# sequence starts from 1 and it is used when the same measurement was taken more than once in a single day
# date starts from 0 and is used when the same measurement for the same patient was taken on a different day
# Note that they were NOT taken on the same date for each patient but the bigger the date integer, the later the measurement
# was taken, relative to that patient's admission  

# Weight (Lbs)_1_0 is the first time the patient was weighed, Weight (Lbs)_3_0 is the third time they were weighed on that 
# same day as they were first weighed
# Weight (Lbs)_1_1 is from a separate (later) date where the patient was weighed again, this is the first measurement 
# from this day 
# Any non applicable measurements are imputed with 0 

Decide, for each measurement type, whether to keep every measurement per patient or to average them into a single value per patient (for measurements that aren’t likely to change):

Remove blood pressure sitting, lying and standing as too few samples 
Take average for height 

Could probably drop a few of the features that are really empty ?

In [None]:
# drop subject_id
df_omr_final = df_omr_final.drop(columns=['subject_id'])

#### Training the learner

In [None]:
data = df_omr_final.drop(columns=['los'])
target = pd.DataFrame(df_omr_final['los'])

In [None]:
# Dimensionality reduction for data

from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 12

# Seed the solver: TruncatedSVD's default 'randomized' algorithm is stochastic, so without
# random_state the components (and the model trained on them) can change between runs.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(data)
data = svd.transform(data)

# NOTE(review): the fitted `svd` is not persisted in this section; evaluation data has to be
# projected with this same fitted transform before the saved model can score it.

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
# Converting duration strings to floats

# Durations are rendered as text; any entry without 'days' in it (e.g. a missing LOS)
# is replaced by a zero duration before parsing
los_text = target['los'].astype(str)
target['los'] = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')

target['los'] = target['los'].apply(convert_to_days)

In [None]:
# Random forest regression

# Convert DataFrame to 1D array using ravel()
target = target.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest_omr = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest_omr.fit(data, target)

# # Predict on the test set
# y_pred = random_forest.predict(X_test)

# # Calculate mean squared error
# mse = mean_squared_error(y_test, y_pred)
# print("Mean Squared Error:", mse)

# # Plot true vs predicted values
# plt.scatter(y_test, y_pred)
# plt.xlabel("True Values")
# plt.ylabel("Predicted Values")
# plt.title("True vs Predicted Values (Random Forest Regression)")
# plt.show()

In [None]:
# Save model to folder

# Create a new directory for the model file
output_folder = 'LOS_RF_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'random_forest_omr.joblib')
dump(random_forest_omr, model_file)

### Admissions

In [20]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [21]:
# Separate into training and evaluation sets 

df_admissions_training = df_admissions[~df_admissions['subject_id'].isin(evaluation_patients)]
df_admissions_evaluation = df_admissions[df_admissions['subject_id'].isin(evaluation_patients)]

In [22]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_admissions_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_admissions_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [None]:
df_admissions = df_admissions_training

In [None]:
df_admissions = df_admissions.reset_index(drop=True)

In [None]:
# Make an ed_duration feature for edouttime - edregtime (how long the patient stayed in the emergency department)

# Convert to datetime
df_admissions['edouttime'] = pd.to_datetime(df_admissions['edouttime'], format='%d/%m/%Y %H:%M')
df_admissions['edregtime'] = pd.to_datetime(df_admissions['edregtime'], format='%d/%m/%Y %H:%M')

df_admissions['ed_duration'] = df_admissions['edouttime'] - df_admissions['edregtime']

# Fill any non time values
df_admissions['ed_duration'] = df_admissions['ed_duration'].fillna(pd.Timedelta(0))

In [None]:
df_admissions = df_admissions.drop(columns=['subject_id', 'admittime', 'dischtime', 'deathtime', 'hospital_expire_flag'
                            , 'edregtime', 'edouttime', 'admit_provider_id','discharge_location'])

# discharge_location is an outcome feature, should not be used to predict LOS as not known beforehand

In [None]:
# Fill Null with N/A and then one hot encode
df_admissions['marital_status'] = df_admissions['marital_status'].fillna('N/A')
df_admissions = pd.get_dummies(df_admissions, columns=['admission_type', 'admission_location', 
                                                      'insurance','language', 'marital_status','race'])

In [None]:
df_admissions = df_admissions.merge(df_los_hadm, on='hadm_id', how='left')
df_admissions = df_admissions.drop(columns=['hadm_id'])

#### Training the learner

In [None]:
data = df_admissions.drop(columns=['los'])
target = pd.DataFrame(df_admissions['los'])

In [None]:
# Converting duration strings to floats

target['los'] = target['los'].astype(str)
target.fillna(0, inplace=True)
target.loc[~target['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

target['los'] = target['los'].apply(convert_to_days)

data['ed_duration']= data['ed_duration'].astype(str)
data['ed_duration']= data['ed_duration'].apply(convert_to_days)

In [None]:
# Random forest model

# Convert DataFrame to 1D array using ravel()
target = target.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest_admissions = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest_admissions.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder here as well (a no-op if it exists) so this cell does not depend on
# the omr section having been run first
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the folder
model_file = os.path.join(output_folder, 'random_forest_admissions.joblib')
dump(random_forest_admissions, model_file)

### Emar

In [23]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

# records for 65 different patients 
# 181 unique admissions

In [24]:
# Separate into training and evaluation sets 

df_emar_training = df_emar[~df_emar['subject_id'].isin(evaluation_patients)]
df_emar_evaluation = df_emar[df_emar['subject_id'].isin(evaluation_patients)]

In [25]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_emar_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_emar_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [None]:
df_emar = df_emar_training

In [None]:
df_emar = df_emar.reset_index(drop=True)

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [None]:
# Make a feature called delay using charttime - scheduletime
# (positive when charttime is later than scheduletime)

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Rows where either timestamp is missing get a delay of zero
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [None]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','charttime','scheduletime','storetime'])

In [None]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [None]:
df_emar = df_emar.merge(df_los_hadm, on='hadm_id', how='left')
df_emar = df_emar.drop(columns=['hadm_id'])

#### Training the learner

In [None]:
data = df_emar.drop(columns=['los'])
target = pd.DataFrame(df_emar['los'])

In [None]:
# Converting duration strings to floats

target['los'] = target['los'].astype(str)
target.fillna(0, inplace=True)
target.loc[~target['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

In [None]:
target['los'] = target['los'].apply(convert_to_days)
data['delay']= data['delay'].astype(str)
data['delay']= data['delay'].apply(convert_to_days)

In [None]:
# Initialize and fit the Random Forest regression model
random_forest_emar = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
# `target` is still a one-column DataFrame in this section; flatten it to 1-D as the other
# learners do, so scikit-learn is not handed a column-vector y
random_forest_emar.fit(data, target.values.ravel())

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder here as well (a no-op if it exists) so this cell does not depend on
# the omr section having been run first
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the folder
model_file = os.path.join(output_folder, 'random_forest_emar.joblib')
dump(random_forest_emar, model_file)

### Emar_detail

In [26]:
file = "hosp/emar_detail.csv"
full_path = path + file

df_emar_detail = pd.read_csv(full_path,low_memory=False)

In [27]:
# Separate into training and evaluation sets 

df_emar_detail_training = df_emar_detail[~df_emar_detail['subject_id'].isin(evaluation_patients)]
df_emar_detail_evaluation = df_emar_detail[df_emar_detail['subject_id'].isin(evaluation_patients)]

In [28]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_emar_detail_evaluation.csv')

# Save only the held-out rows. Writing df_emar_detail here would export the full table,
# training rows included, under the evaluation file name.
df_emar_detail_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [None]:
df_emar_detail = df_emar_detail_training

In [None]:
df_emar_detail = df_emar_detail.reset_index(drop=True)

Fields that have lots of null values:
reason_for_no_barcode: drop
prior_infusion_rate: impute with zeroes
infusion_rate: impute with zeroes
infusion_rate_adjustment: impute with 'N/A', then one hot encoding
infusion_rate_adjustment_amount: impute with zeroes
infusion_rate_unit: impute with 'N/A', then one hot encoding
infusion_complete: impute with 'N/A', then one hot encoding
completion_interval: impute with 0, then ordinal encoding 
new_iv_bag_hung: impute with N, then binary encoding 

Text data to remove but maybe consider later:
product_description, product_description_other

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['reason_for_no_barcode']) # Too hard to encode, adds not much value

In [None]:
# Impute with 0s
df_emar_detail['prior_infusion_rate'] = df_emar_detail['prior_infusion_rate'].fillna(0)
df_emar_detail['infusion_rate'] = df_emar_detail['infusion_rate'].fillna(0)
df_emar_detail['infusion_rate_adjustment_amount'] = df_emar_detail['infusion_rate_adjustment_amount'].fillna(0)

In [None]:
# Impute with N/A and encode
df_emar_detail['infusion_rate_adjustment'] = df_emar_detail['infusion_rate_adjustment'].fillna('N/A')
df_emar_detail['infusion_rate_unit'] = df_emar_detail['infusion_rate_unit'].fillna('N/A')
df_emar_detail['infusion_complete'] = df_emar_detail['infusion_complete'].fillna('N/A')
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['infusion_rate_adjustment','infusion_complete',
                                                         'infusion_rate_unit'])

In [None]:
# Convert completion_interval to minutes; missing values and 'PRN' become 0
completion_interval_minutes = {
    'PRN': 0,
    'within 1 minutes': 1,
    'within 15 minutes': 15,
    'within 30 minutes': 30,
    'within 1 hour': 60,
    'within 1.5 hours': 90,
    'within 2 hours': 120,
    'within 3 hours': 180,
    'within 4 hours': 240,
    'within 5 hours': 300,
    'within 7 hours': 420,
    'within 8 hours': 480,
    'within 12 hours': 720,
    'within 14 hours': 840,
    'within 24 hours': 1440,  # 24 * 60 (was mistyped as 1140)
}
df_emar_detail['completion_interval'] = (
    df_emar_detail['completion_interval'].fillna(0).replace(completion_interval_minutes)
)

In [None]:
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].fillna('N')

In [None]:
# Binary encoding
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].map({'Y': 1, 'N': 0})

In [None]:
# Impute with N/A and one hot encode:
# administration_type
# barcode_type
# complete_dose_not_given
# dose_due_unit
# dose_given_unit
# will_remainder_of_dose_be_given
# product_unit
# product_code
# route
# side
# site

In [None]:
# Categorical fields: a missing value becomes its own 'N/A' category, then every field
# in the list is one-hot encoded
categorical_fields = [
    'administration_type',
    'barcode_type',
    'complete_dose_not_given',
    'dose_due_unit',
    'dose_given_unit',
    'will_remainder_of_dose_be_given',
    'product_unit',
    'product_code',
    'route',
    'side',
    'site',
]
for field in categorical_fields:
    df_emar_detail[field] = df_emar_detail[field].fillna('N/A')

df_emar_detail = pd.get_dummies(df_emar_detail, columns=categorical_fields)

In [None]:
# Impute with zeroes:
# dose_due and dose_given, but also need to deal with some of them being ranges
# product_amount_given
# restart_interval, then ordinal encoding

In [None]:
df_emar_detail['product_amount_given'] = df_emar_detail['product_amount_given'].fillna(0)
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].fillna(0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].fillna(0)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].fillna(0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(str)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(str)

In [None]:
def find_middle_value(range_string):
    """Return the midpoint of a 'low-high' range string, or the input unchanged.

    Parameters
    ----------
    range_string : str
        Either a range such as '1-2' or any other text (a plain number, a placeholder, ...).

    Returns
    -------
    float or str
        The midpoint as a float when the text is a 'low-high' range of two numbers;
        otherwise `range_string` itself, untouched.
    """
    low, dash, high = range_string.partition('-')
    # No dash at all, or a leading dash (a minus sign such as '-5'): not a range
    if not dash or not low:
        return range_string
    try:
        return (float(low) + float(high)) / 2
    except ValueError:
        # The dash was not separating two numbers (e.g. the exponent in '1e-3');
        # leave the text for the later numeric conversion to handle
        return range_string

df_emar_detail['dose_due'] = df_emar_detail['dose_due'].apply(find_middle_value)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].apply(find_middle_value)

In [None]:
# Convert restart_interval to minutes; 'PRN' becomes 0
restart_interval_minutes = {
    'PRN': 0,
    'within 30 minutes': 30,
    'within 1 hour': 60,
    'within 2 hours': 120,
    'within 4 hours': 240,
    'within 24 hours': 1440,  # 24 * 60 (was mistyped as 1140)
}
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace(restart_interval_minutes)

In [None]:
# Impute with N and map to binary encoding:
# continued_infusion_in_other_location
# non_formulary_visual_verification

In [None]:
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].fillna('N')
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].fillna('N')
# Binary encoding
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].map({'Y': 1, 'N': 0})
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].map({'Y': 1, 'N': 0})

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['pharmacy_id']) # Contains NaN values

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['emar_id']) # Practically unique

In [None]:
# Replace blanks with zero
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].replace('___', 0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].replace('___', 0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(float)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(float)

In [None]:
# Impute with N/A or 0
# One hot encode the categorical features 

df_emar_detail['product_description'] = df_emar_detail['product_description'].fillna('N/A')
df_emar_detail['product_description_other'] = df_emar_detail['product_description_other'].fillna('N/A')
df_emar_detail['parent_field_ordinal'] = df_emar_detail['parent_field_ordinal'].fillna(0)
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['product_description_other','product_description'])

In [None]:
df_emar_detail = df_emar_detail.merge(df_los_subject, on='subject_id', how='left')
df_emar_detail = df_emar_detail.drop(columns=['subject_id'])

#### Training the learner

In [None]:
data = df_emar_detail.drop(columns=['los'])
target = pd.DataFrame(df_emar_detail['los'])

In [None]:
# Converting duration strings to floats

target['los'] = target['los'].astype(str)
target.fillna(0, inplace=True)
target.loc[~target['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

target['los'] = target['los'].apply(convert_to_days)

In [None]:
# Random forest model

# Convert DataFrame to 1D array using ravel()
target = target.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest_emar_detail = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest_emar_detail.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder here as well (a no-op if it exists) so this cell does not depend on
# the omr section having been run first
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the folder
model_file = os.path.join(output_folder, 'random_forest_emar_detail.joblib')
dump(random_forest_emar_detail, model_file)

### hcpcsevents

In [29]:
file = "hosp/hcpcsevents.csv"
full_path = path + file

df_hcpcsevents = pd.read_csv(full_path)

# Contains info for 18 different patients
# d_hcpcs has longer descriptions (connected by code) but no other useful info 

In [30]:
# Separate into training and evaluation sets 

df_hcpcsevents_training = df_hcpcsevents[~df_hcpcsevents['subject_id'].isin(evaluation_patients)]
df_hcpcsevents_evaluation = df_hcpcsevents[df_hcpcsevents['subject_id'].isin(evaluation_patients)]

In [31]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_hcpcsevents_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_hcpcsevents_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

To drop: subject_id, chartdate, hcpcs_cd (code that links to longer description in d_hcpcs)

In [None]:
df_hcpcsevents = df_hcpcsevents_training

In [None]:
df_hcpcsevents = df_hcpcsevents.reset_index(drop=True)

In [None]:
# Make a feature for days_since_admission using chartdate - admittime

# Convert to datetime
df_hcpcsevents['chartdate'] = pd.to_datetime(df_hcpcsevents['chartdate'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_hcpcsevents = df_hcpcsevents.merge(df_admittime, on='hadm_id', how='left')

# Discard the time part by truncating to midnight. normalize() keeps the columns as
# datetime64, so the subtraction below is a vectorised timedelta64 result instead of
# arithmetic on Python date objects.
df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.normalize()
df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.normalize()

df_hcpcsevents['days_since_admission'] = df_hcpcsevents['chartdate'] - df_hcpcsevents['admittime']

# Rows with a missing chartdate or admittime get an offset of zero days
df_hcpcsevents['days_since_admission'] = df_hcpcsevents['days_since_admission'].fillna(pd.Timedelta(0))

In [None]:
df_hcpcsevents['days_since_admission'].value_counts()

In [None]:
df_hcpcsevents = df_hcpcsevents.drop(columns=['subject_id','chartdate','hcpcs_cd'])
# Not enough samples to include code as after encoding there would be a lot more features 

In [None]:
df_hcpcsevents = pd.get_dummies(df_hcpcsevents, columns=['short_description'])

In [None]:
df_hcpcsevents = df_hcpcsevents.merge(df_los_hadm, on='hadm_id', how='left')
df_hcpcsevents = df_hcpcsevents.drop(columns=['hadm_id', 'admittime'])

#### Dimensionality reduction

In [None]:
data = df_hcpcsevents.drop(columns=['los'])
target = pd.DataFrame(df_hcpcsevents['los'])

In [None]:
# Need to reduce from 13 to 9

In [None]:
# Convert strings to integers
data['days_since_admission'] = data['days_since_admission'].astype(str)
data['days_since_admission'] = data['days_since_admission'].str.split().str[0].astype(int)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of components to keep after the reduction
n_components = 9

# Seed the solver so the projection is identical on every run
# (TruncatedSVD's default algorithm is randomized)
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Learn the projection from the training features, then project them.
# NOTE(review): the fitted `svd` is not saved in this section; any data scored
# by the model trained below has to be projected with this same object — confirm
# how the evaluation step handles that.
data = svd.fit(data).transform(data)

# Fraction of the original variance captured by each component
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Training the learner

In [None]:
# Turn the LOS duration strings into fractional days

los_text = target['los'].astype(str)

# Anything that does not contain 'days' (e.g. a missing LOS, which astype(str)
# renders as 'nan') is treated as a zero-length stay
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')

target['los'] = los_text.map(convert_to_days)

In [None]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_hcpcsevents = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_hcpcsevents.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_hcpcsevents.joblib')
dump(random_forest_hcpcsevents, model_file)

### labevents

In [17]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

# Information regarding 252 different admissions

In [18]:
# Separate into training and evaluation sets, splitting by admission
# (an earlier version of this cell split on subject_id / evaluation_patients)

in_evaluation = df_labevents['hadm_id'].isin(evaluation_admissions)

df_labevents_training = df_labevents[~in_evaluation]
df_labevents_evaluation = df_labevents[in_evaluation]

In [19]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_labevents_evaluation.csv')

# index=False: the row index is not written to the file
df_labevents_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [20]:
df_labevents = df_labevents_training

In [21]:
df_labevents = df_labevents.reset_index(drop=True)

In [22]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [23]:
# days_since_admission = charttime - admittime (kept as a full-resolution timedelta here)

df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Left join so lab events without a matching admission are kept
df_labevents = df_labevents.merge(df_admittime, how='left', on='hadm_id')

# The time of day is not discarded before subtracting
elapsed = df_labevents['charttime'] - df_labevents['admittime']

# Rows where either timestamp is missing get a zero offset
df_labevents['days_since_admission'] = elapsed.fillna(pd.Timedelta(0))

In [24]:
# delay = storetime - charttime

df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

# A missing storetime or charttime gives NaT, which is replaced by a zero delay
df_labevents['delay'] = (df_labevents['storetime'] - df_labevents['charttime']).fillna(pd.Timedelta(0))

Drop: labevent_id, subject_id, order_provider_id (too many Null), charttime, storetime, comments

In [25]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [26]:
# Binary flag: a missing flag becomes 0 and 'abnormal' becomes 1
df_labevents['flag'] = df_labevents['flag'].fillna(0).replace('abnormal', 1)

In [27]:
# Treat a missing priority as its own 'N/A' category, then one-hot encode it
priority_filled = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents.assign(priority=priority_filled), columns=['priority'])

In [28]:
# One-hot encode the unit of measurement, the specimen id and the lab item id.
# NOTE(review): specimen_id looks like a per-sample identifier; if it is close to
# unique per row this adds roughly one column per specimen — confirm this is intended.
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [29]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()
# Reduced from 107727 rows to 66660

In [30]:
df_labevents = df_labevents.merge(df_los_hadm, on='hadm_id', how='left')
df_labevents = df_labevents.drop(columns=['hadm_id', 'admittime'])

In [31]:
data = df_labevents.drop(columns=['los'])
target = pd.DataFrame(df_labevents['los'])

In [32]:
# Record the training feature names, in column order

output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'labevents_features.csv')

column_names = data.columns.to_numpy()

# One-column frame holding the names, written without the row index
df_column_names = pd.DataFrame({'Column Names': column_names})
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [33]:
# Converting durations to fractional days

# LOS is parsed from its text form; anything that does not contain 'days'
# (e.g. a missing value, which astype(str) renders as 'nan') counts as zero
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# delay is a timedelta column, so convert it numerically instead of formatting
# it as text and re-parsing it. pandas renders a timedelta series as just
# '<n> days' when every value is a whole day, which convert_to_days cannot split.
data['delay'] = data['delay'].dt.total_seconds() / 86400

In [34]:
# Whole days since admission (the days component of the timedelta, i.e. its
# floor), read straight from the timedelta column rather than from its string form
data['days_since_admission'] = data['days_since_admission'].dt.days

In [35]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_labevents = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_labevents.fit(data, target)

In [36]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_labevents.joblib')
dump(random_forest_labevents, model_file)

['LOS_RF_learners\\random_forest_labevents.joblib']

### microbiologyevents

In [32]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

In [33]:
# Separate into training and evaluation sets, splitting by admission
# (an earlier version of this cell split on subject_id / evaluation_patients)

in_evaluation = df_microbio['hadm_id'].isin(evaluation_admissions)

df_microbio_training = df_microbio[~in_evaluation]
df_microbio_evaluation = df_microbio[in_evaluation]

In [34]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_microbio_evaluation.csv')

# index=False: the row index is not written to the file
df_microbio_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [40]:
df_microbio = df_microbio_training

In [41]:
df_microbio = df_microbio.reset_index(drop=True)

In [42]:
# days_since_admission = charttime - admittime

df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Left join so events without a matching admission are kept
df_microbio = df_microbio.merge(df_admittime, how='left', on='hadm_id')

elapsed = df_microbio['charttime'] - df_microbio['admittime']

# Rows where either timestamp is missing get a zero offset
df_microbio['days_since_admission'] = elapsed.fillna(pd.Timedelta(0))

# admittime was only needed for the subtraction
df_microbio = df_microbio.drop(columns=['admittime'])

In [43]:
# delay = storetime - charttime

df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

# A missing storetime or charttime gives NaT, which is replaced by a zero delay
df_microbio['delay'] = (df_microbio['storetime'] - df_microbio['charttime']).fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, quantity, comments, micro_specimen_id (unique identifier for the sample; several measurements can be made on the same sample), and the item ids spec_itemid, test_itemid, org_itemid, ab_itemid (the same information is in the corresponding name/description columns)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, isolate_num, dilution_value
Impute with N/A and then one hot encode: interpretation, test_name, ab_name (org_name is imputed with 'None' and encoded the same way)

In [44]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [45]:
# Impute null with 0: order_provider_id, isolate_num, dilution_value
# (org_itemid and ab_itemid were dropped in the previous step)
for column in ['order_provider_id', 'isolate_num', 'dilution_value']:
    df_microbio[column] = df_microbio[column].fillna(0)

In [46]:
# Give missing categories an explicit label, then one-hot encode
# interpretation, test_name, ab_name and org_name

missing_labels = {
    'interpretation': 'N/A',
    'test_name': 'N/A',
    'ab_name': 'N/A',
    'org_name': 'None',
}
for column, label in missing_labels.items():
    df_microbio[column] = df_microbio[column].fillna(label)

df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [47]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [48]:
df_microbio = df_microbio.dropna()

In [49]:
df_microbio = df_microbio.merge(df_los_hadm, on='hadm_id', how='left')
df_microbio = df_microbio.drop(columns=['hadm_id'])

In [50]:
data = df_microbio.drop(columns=['los'])
target = pd.DataFrame(df_microbio['los'])

In [51]:

# Record the training feature names, in column order

output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'microbio_features.csv')

column_names = data.columns.to_numpy()

# One-column frame holding the names, written without the row index
df_column_names = pd.DataFrame({'Column Names': column_names})
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [52]:
# Converting durations to fractional days

# LOS is parsed from its text form; anything that does not contain 'days'
# (e.g. a missing value, which astype(str) renders as 'nan') counts as zero
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# delay and days_since_admission are timedelta columns, so convert them
# numerically instead of formatting them as text and re-parsing. pandas renders
# a timedelta series as just '<n> days' when every value is a whole day, which
# convert_to_days cannot split.
for column in ['delay', 'days_since_admission']:
    data[column] = data[column].dt.total_seconds() / 86400

In [53]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_microbio = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_microbio.fit(data, target)

In [54]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_microbio.joblib')
dump(random_forest_microbio, model_file)

['LOS_RF_learners\\random_forest_microbio.joblib']

### patients

In [35]:
file = "hosp/patients.csv"
full_path = path + file

df_patients = pd.read_csv(full_path)

In [36]:
# Separate into training and evaluation sets, splitting by patient

in_evaluation = df_patients['subject_id'].isin(evaluation_patients)

df_patients_training = df_patients[~in_evaluation]
df_patients_evaluation = df_patients[in_evaluation]

In [37]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_patients_evaluation.csv')

# index=False: the row index is not written to the file
df_patients_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

In [None]:
df_patients = df_patients_training

In [None]:
df_patients = df_patients.reset_index(drop=True)

Drop: anchor_year (this is the shifted year), dod (an outcome value)
Encode: gender (M to 0 and F to 1)
Dummies: anchor_year_group  

In [None]:
# Drop
df_patients = df_patients.drop(columns=['anchor_year','dod']) 
# Since this is the shifted year and dod is an outcome value

In [None]:
# Encode gender as a number: M -> 0, F -> 1
# (dod was dropped above, so it is not encoded here)
df_patients['gender'] = df_patients['gender'].replace({'M': 0, 'F': 1})

In [None]:
# Dummies: anchor_year_group  
df_patients = pd.get_dummies(df_patients, columns=['anchor_year_group'])

In [None]:
df_patients = df_patients.merge(df_los_subject, on='subject_id', how='left')
df_patients = df_patients.drop(columns=['subject_id'])

In [None]:
data = df_patients.drop(columns=['los'])
target = pd.DataFrame(df_patients['los'])

#### Training the learner

In [None]:
# Turn the LOS duration strings into fractional days

los_text = target['los'].astype(str)

# Anything that does not contain 'days' (e.g. a missing LOS, which astype(str)
# renders as 'nan') is treated as a zero-length stay
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')

target['los'] = los_text.map(convert_to_days)

In [None]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_patients = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_patients.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_patients.joblib')
dump(random_forest_patients, model_file)

### pharmacy

In [38]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

In [39]:
# Separate into training and evaluation sets, splitting by patient
# NOTE(review): this table has a hadm_id column but is split on subject_id /
# evaluation_patients, whereas labevents, microbiologyevents and poe are split on
# hadm_id / evaluation_admissions — confirm the two hold-out definitions are meant to differ.

in_evaluation = df_pharmacy['subject_id'].isin(evaluation_patients)

df_pharmacy_training = df_pharmacy[~in_evaluation]
df_pharmacy_evaluation = df_pharmacy[in_evaluation]

In [40]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_pharmacy_evaluation.csv')

# index=False: the row index is not written to the file
df_pharmacy_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

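Drop: subject_id, pharmacy_id, poe_id, starttime, stoptime, entertime, verifiedtime, expirationdate, fill_quantity
(starttime/stoptime and entertime/verifiedtime are first turned into duration features; disp_sched is dropped after being multi-label encoded)
Encode: proc_type, status
Impute with N/A and encode: infusion_type, sliding_scale, duration_interval, expiration_unit, dispensation, medication, route, frequency
Impute with 0: lockout_interval, doses_per_24_hrs, expiration_value, basal_rate, one_hr_max (any remaining nulls, e.g. in duration, are filled with 0 just before training)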

In [None]:
df_pharmacy = df_pharmacy_training

In [None]:
df_pharmacy = df_pharmacy.reset_index(drop=True)

In [None]:
# medication_duration = stoptime - starttime

for column in ['stoptime', 'starttime']:
    df_pharmacy[column] = pd.to_datetime(df_pharmacy[column], format='%Y/%m/%d %H:%M')

# A missing start or stop time gives NaT, which is replaced by a zero duration
df_pharmacy['medication_duration'] = (df_pharmacy['stoptime'] - df_pharmacy['starttime']).fillna(pd.Timedelta(0))

In [None]:
# verification_delay = verifiedtime - entertime

for column in ['verifiedtime', 'entertime']:
    df_pharmacy[column] = pd.to_datetime(df_pharmacy[column], format='%Y/%m/%d %H:%M')

# A missing timestamp gives NaT, which is replaced by a zero delay
df_pharmacy['verification_delay'] = (df_pharmacy['verifiedtime'] - df_pharmacy['entertime']).fillna(pd.Timedelta(0))

In [None]:
# Placeholder schedule used where disp_sched is missing
fill_value = [0]

# fillna does not accept a list as the fill value, so supply a Series holding
# one placeholder list per row. It is built on df_pharmacy's own index so the
# fill lines up row-for-row whatever the index looks like.
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(
    pd.Series([fill_value] * len(df_pharmacy), index=df_pharmacy.index)
)

In [None]:
# Multi-label encode disp_sched: one indicator column per schedule token

def _disp_sched_tokens(value):
    """Return the schedule tokens held in a single disp_sched cell.

    Strings are split on commas ('08, 20' -> ['08', '20']); iterating over
    the string itself would yield single characters instead. Lists/tuples/sets
    (the placeholder written for missing values is a list) have each item
    converted to str. Any other scalar becomes a one-token list.
    """
    if isinstance(value, str):
        return [token.strip() for token in value.split(',') if token.strip()]
    if isinstance(value, (list, tuple, set)):
        return [str(item) for item in value]
    return [str(value)]

# NOTE(review): assumes disp_sched is a comma-separated list of times such as
# '08, 20' — confirm against the data
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(_disp_sched_tokens)

mlb = MultiLabelBinarizer()

# One 0/1 column per distinct token, aligned to df_pharmacy's rows
encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [None]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [None]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [None]:
# Label missing categories as 'N/A', then one-hot encode the same columns
na_encoded_columns = ['infusion_type', 'sliding_scale', 'duration_interval', 'expiration_unit',
                      'dispensation', 'medication', 'route', 'frequency']

for column in na_encoded_columns:
    df_pharmacy[column] = df_pharmacy[column].fillna('N/A')

df_pharmacy = pd.get_dummies(df_pharmacy, columns=na_encoded_columns)

In [None]:
# Impute with 0: lockout_interval, doses_per_24_hrs, expiration_value, basal_rate, one_hr_max
# (duration is not filled here)
for column in ['lockout_interval', 'doses_per_24_hrs', 'expiration_value', 'basal_rate', 'one_hr_max']:
    df_pharmacy[column] = df_pharmacy[column].fillna(0)

In [None]:
df_pharmacy = df_pharmacy.merge(df_los_hadm, on='hadm_id', how='left')
df_pharmacy = df_pharmacy.drop(columns=['hadm_id'])

In [None]:
data = df_pharmacy.drop(columns=['los'])
target = pd.DataFrame(df_pharmacy['los'])

#### Training the learner

In [None]:
# Converting durations to numbers

# LOS is parsed from its text form; anything that does not contain 'days'
# (e.g. a missing value, which astype(str) renders as 'nan') counts as zero
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Both features are timedelta columns, so they are read numerically instead of
# being formatted as text and re-parsed. pandas renders a timedelta series as
# just '<n> days' when every value is a whole day, which convert_to_days cannot split.

# medication_duration: fractional days
data['medication_duration'] = data['medication_duration'].dt.total_seconds() / 86400

# verification_delay: whole days only, as before.
# NOTE(review): delays shorter than a day all become 0 (or -1 when negative) —
# confirm that whole-day resolution is intended for this feature.
data['verification_delay'] = data['verification_delay'].dt.days

In [None]:
data.fillna(0, inplace=True)

In [None]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_pharmacy = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_pharmacy.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_pharmacy.joblib')
dump(random_forest_pharmacy, model_file)

### poe

In [41]:
file = "hosp/poe.csv"
full_path = path + file

df_poe = pd.read_csv(full_path)

In [42]:
# Separate into training and evaluation sets, splitting by admission
# (an earlier version of this cell split on subject_id / evaluation_patients)

in_evaluation = df_poe['hadm_id'].isin(evaluation_admissions)

df_poe_training = df_poe[~in_evaluation]
df_poe_evaluation = df_poe[in_evaluation]

In [43]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_poe_evaluation.csv')

# index=False: the row index is not written to the file
df_poe_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

To drop: poe_id, subject_id, ordertime, discontinue_of_poe_id, discontinued_by_poe_id (all unique), order_status (all inactive)
Encode: order_type, transaction_type
Impute with N/A and then encode: order_subtype, order_provider_id

In [58]:
df_poe = df_poe_training

In [59]:
df_poe = df_poe.reset_index(drop=True)

In [60]:
# days_since_admission = ordertime - admittime

# ordertime includes seconds, unlike the other tables' timestamps
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'], format='%Y/%m/%d %H:%M:%S')

# Left join so orders without a matching admission are kept
df_poe = df_poe.merge(df_admittime, how='left', on='hadm_id')

elapsed = df_poe['ordertime'] - df_poe['admittime']

# Rows where either timestamp is missing get a zero offset
df_poe['days_since_admission'] = elapsed.fillna(pd.Timedelta(0))

# admittime was only needed for the subtraction
df_poe = df_poe.drop(columns=['admittime'])

In [61]:
# Drop 
df_poe = df_poe.drop(columns=['poe_id','subject_id','ordertime','discontinue_of_poe_id','discontinued_by_poe_id',
                                       'order_status'])

In [62]:
# Encode
df_poe = pd.get_dummies(df_poe, columns=['order_type','transaction_type'])

In [63]:
# Label missing categories as 'N/A', then one-hot encode the same columns
na_encoded_columns = ['order_subtype', 'order_provider_id']

for column in na_encoded_columns:
    df_poe[column] = df_poe[column].fillna('N/A')

df_poe = pd.get_dummies(df_poe, columns=na_encoded_columns)

In [64]:
df_poe = df_poe.merge(df_los_hadm, on='hadm_id', how='left')
df_poe = df_poe.drop(columns=['hadm_id'])

In [65]:
data = df_poe.drop(columns=['los'])
target = pd.DataFrame(df_poe['los'])

In [66]:
# Record the training feature names, in column order

output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'poe_features.csv')

column_names = data.columns.to_numpy()

# One-column frame holding the names, written without the row index
df_column_names = pd.DataFrame({'Column Names': column_names})
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [67]:
# Converting durations to fractional days

# LOS is parsed from its text form; anything that does not contain 'days'
# (e.g. a missing value, which astype(str) renders as 'nan') counts as zero
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# days_since_admission is a timedelta column, so convert it numerically instead
# of formatting it as text and re-parsing it. pandas renders a timedelta series
# as just '<n> days' when every value is a whole day, which convert_to_days cannot split.
data['days_since_admission'] = data['days_since_admission'].dt.total_seconds() / 86400

In [68]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_poe = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_poe.fit(data, target)

In [69]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_poe.joblib')
dump(random_forest_poe, model_file)

['LOS_RF_learners\\random_forest_poe.joblib']

### poe_detail

In [44]:
file = "hosp/poe_detail.csv"
full_path = path + file

df_poe_detail = pd.read_csv(full_path)

In [45]:
# Separate into training and evaluation sets, splitting by patient

in_evaluation = df_poe_detail['subject_id'].isin(evaluation_patients)

df_poe_detail_training = df_poe_detail[~in_evaluation]
df_poe_detail_evaluation = df_poe_detail[in_evaluation]

In [46]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_poe_detail_evaluation.csv')

# index=False: the row index is not written to the file
df_poe_detail_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

To drop: poe_id
Encode: field_name, field_value
subject_id for los and then drop

In [73]:
df_poe_detail = df_poe_detail_training

In [74]:
df_poe_detail

Unnamed: 0,poe_id,poe_seq,subject_id,field_name,field_value
0,10011398-23,23,10011398,Admit to,Surgery
1,10011398-103,103,10011398,Transfer to,Surgery
2,10011398-163,163,10011398,Discharge Planning,Finalized
3,10011398-109,109,10011398,Tubes & Drains type,Chest tube
4,10011398-35,35,10011398,Tubes & Drains type,Chest tube
...,...,...,...,...,...
3790,10021118-7,7,10021118,Admit category,Admit to inpatient
3791,10021118-24,24,10021118,Code status,Resuscitate (Full code)
3792,10021118-100,100,10021118,Tubes & Drains type,Indwelling urinary catheter (IUC) - Foley
3793,10021118-227,227,10021118,Tubes & Drains type,Indwelling urinary catheter (IUC) - Foley


In [75]:
df_poe_detail = df_poe_detail.reset_index(drop=True)

In [76]:
# Drop 
df_poe_detail = df_poe_detail.drop(columns=['poe_id'])

In [77]:
# Encode
df_poe_detail = pd.get_dummies(df_poe_detail, columns=['field_name','field_value'])

In [78]:
df_poe_detail = df_poe_detail.merge(df_los_subject, on='subject_id', how='left')
df_poe_detail = df_poe_detail.drop(columns=['subject_id'])

In [79]:
data = df_poe_detail.drop(columns=['los'])
target = pd.DataFrame(df_poe_detail['los'])

In [80]:
# Record the training feature names, in column order

output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'poe_detail_features.csv')

column_names = data.columns.to_numpy()

# One-column frame holding the names, written without the row index
df_column_names = pd.DataFrame({'Column Names': column_names})
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [81]:
# Turn the LOS duration strings into fractional days

los_text = target['los'].astype(str)

# Anything that does not contain 'days' (e.g. a missing LOS, which astype(str)
# renders as 'nan') is treated as a zero-length stay
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')

target['los'] = los_text.map(convert_to_days)

In [82]:
# Random forest model

# Flatten the single-column target frame into the 1D array scikit-learn expects
target = target.to_numpy().ravel()

# Fixed seed keeps the fitted forest reproducible between runs
random_forest_poe_detail = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=10)
random_forest_poe_detail.fit(data, target)

In [83]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder on first use so a fresh run does not fail on a missing directory
os.makedirs(output_folder, exist_ok=True)

# Write the fitted learner to disk
model_file = os.path.join(output_folder, 'random_forest_poe_detail.joblib')
dump(random_forest_poe_detail, model_file)

['LOS_RF_learners\\random_forest_poe_detail.joblib']

### prescriptions

In [47]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

In [48]:
# Separate into training and evaluation sets, splitting by patient
# NOTE(review): this table has a hadm_id column but is split on subject_id /
# evaluation_patients, whereas labevents, microbiologyevents and poe are split on
# hadm_id / evaluation_admissions — confirm the two hold-out definitions are meant to differ.

in_evaluation = df_prescriptions['subject_id'].isin(evaluation_patients)

df_prescriptions_training = df_prescriptions[~in_evaluation]
df_prescriptions_evaluation = df_prescriptions[in_evaluation]

In [49]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if needed; to_csv does not create missing directories
os.makedirs(folder_name, exist_ok=True)

file_path = os.path.join(folder_name, 'df_prescriptions_evaluation.csv')

# index=False: the row index is not written to the file
df_prescriptions_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop na and encode: dose_val_rx, form_val_disp
Drop: subject_id, pharmacy_id, starttime, stoptime (after deriving duration), form_rx (mostly null), poe_id, order_provider_id (see note below)
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs, ndc (ndc is then one hot encoded)

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [None]:
df_prescriptions = df_prescriptions_training

In [None]:
df_prescriptions = df_prescriptions.reset_index(drop=True)

In [None]:
# duration = stoptime - starttime

for column in ['stoptime', 'starttime']:
    df_prescriptions[column] = pd.to_datetime(df_prescriptions[column], format='%Y/%m/%d %H:%M')

# A missing start or stop time gives NaT, which is replaced by a zero duration
df_prescriptions['duration'] = (df_prescriptions['stoptime'] - df_prescriptions['starttime']).fillna(pd.Timedelta(0))

In [None]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [None]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [None]:
# Fill the gaps in the categorical columns, then one-hot encode them

text_columns = ['formulary_drug_cd', 'gsn', 'prod_strength', 'route']
for column in text_columns:
    df_prescriptions[column] = df_prescriptions[column].fillna('N/A')

# A missing ndc code is represented by 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

categorical_columns = text_columns + ['drug_type', 'drug', 'dose_unit_rx', 'form_unit_disp',
                                      'dose_val_rx', 'form_val_disp', 'ndc']
df_prescriptions = pd.get_dummies(df_prescriptions, columns=categorical_columns)

In [None]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [None]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

In [None]:
df_prescriptions = df_prescriptions.merge(df_los_hadm, on='hadm_id', how='left')
df_prescriptions = df_prescriptions.drop(columns=['hadm_id'])

In [None]:
data = df_prescriptions.drop(columns=['los'])
target = pd.DataFrame(df_prescriptions['los'])

#### Dimensionality reduction

In [None]:
# Converting durations to fractional days

# LOS is parsed from its text form; anything that does not contain 'days'
# (e.g. a missing value, which astype(str) renders as 'nan') counts as zero
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# duration is a timedelta column, so convert it numerically instead of
# formatting it as text and re-parsing it. pandas renders a timedelta series as
# just '<n> days' when every value is a whole day, which convert_to_days cannot split.
data['duration'] = data['duration'].dt.total_seconds() / 86400

In [None]:
# Need to reduce from 4890 to 2874 or less

from sklearn.decomposition import TruncatedSVD

# Number of components to keep after the reduction
n_components = 2874

# Seed the solver so the projection is identical on every run
# (TruncatedSVD's default algorithm is randomized)
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Learn the projection from the training features, then project them.
# NOTE(review): the fitted `svd` is not saved in this section; any data scored
# by the model trained below has to be projected with this same object — confirm
# how the evaluation step handles that.
data = svd.fit(data).transform(data)

# Fraction of the original variance captured by each component
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Training the learner

In [None]:
# Random forest learner for the prescriptions table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_prescriptions = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_prescriptions.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_prescriptions.joblib')
dump(random_forest_prescriptions, model_file)

### procedures_icd

In [50]:
file = "hosp/procedures_icd.csv"
full_path = path + file

df_procedures = pd.read_csv(full_path)

In [51]:
# Separate into training and evaluation sets 

# df_procedures_training = df_procedures[~df_procedures['subject_id'].isin(evaluation_patients)]
# df_procedures_evaluation = df_procedures[df_procedures['subject_id'].isin(evaluation_patients)]

df_procedures_training = df_procedures[~df_procedures['hadm_id'].isin(evaluation_admissions)]
df_procedures_evaluation = df_procedures[df_procedures['hadm_id'].isin(evaluation_admissions)]

In [52]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_procedures_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_procedures_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, chartdate
Encode: icd_code

In [106]:
df_procedures = df_procedures_training

In [107]:
df_procedures = df_procedures.reset_index(drop=True)

In [108]:
# Derive days_since_admission = chartdate - admission date (whole days)

df_procedures = (
    df_procedures
    # Parse the chart date strings
    .assign(chartdate=lambda frame: pd.to_datetime(frame['chartdate'], format='%Y-%m-%d'))
    # Bring in each admission's admittime
    .merge(df_admittime, on='hadm_id', how='left')
    # Keep only the calendar date of both timestamps
    .assign(
        admittime=lambda frame: frame['admittime'].dt.date,
        chartdate=lambda frame: frame['chartdate'].dt.date,
    )
    # Missing differences fall back to a zero offset
    .assign(
        days_since_admission=lambda frame: (
            frame['chartdate'] - frame['admittime']
        ).fillna(pd.Timedelta(0))
    )
    # admittime was only needed for the subtraction
    .drop(columns=['admittime'])
)

In [109]:
# Drop 
df_procedures = df_procedures.drop(columns=['subject_id','chartdate'])

In [110]:
# Encode
df_procedures = pd.get_dummies(df_procedures, columns=['icd_code'])

In [111]:
df_procedures = df_procedures.merge(df_los_hadm, on='hadm_id', how='left')
df_procedures = df_procedures.drop(columns=['hadm_id'])

In [112]:
data = df_procedures.drop(columns=['los'])
target = pd.DataFrame(df_procedures['los'])

In [113]:
# Write the ordered list of feature columns to LOS_RF_features/procedures_features.csv
output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'procedures_features.csv')

# One-column frame holding the feature names in their current order
column_names = data.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

df_column_names.to_csv(file_path, index=False)

#### Dimensionality reduction

In [114]:
# Convert timedelta-like values to numbers

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# the left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# days_since_admission: keep the leading token of the text form as an integer day count
data['days_since_admission'] = (
    data['days_since_admission'].astype(str).str.split().str[0].astype(int)
)

In [115]:
# Need to reduce from 355 to 115

from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 115

# Seed the solver so the projection, and therefore the forest trained on it,
# is the same on every run (matches the random_state used for the forests).
# NOTE(review): the fitted `svd` is not saved in this cell; the same projection is
# needed to transform evaluation data — confirm it is persisted elsewhere.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the projection on the training features, then replace them with their projection
svd.fit(data)
data = svd.transform(data)

# Share of variance explained by each retained component
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[3.45480616e-01 4.44957161e-01 1.93438956e-01 5.16315510e-04
 4.92956540e-04 3.96487536e-04 3.67158021e-04 3.36794019e-04
 2.83744621e-04 2.58429181e-04 2.52220606e-04 2.29863113e-04
 2.29726367e-04 2.19606609e-04 2.01159644e-04 2.01137042e-04
 1.93402232e-04 1.84791418e-04 1.72406978e-04 1.71482944e-04
 1.69952813e-04 1.60400245e-04 1.43677212e-04 1.43675452e-04
 1.43076186e-04 1.42911018e-04 1.34616158e-04 1.14941904e-04
 1.14942002e-04 1.14941473e-04 1.14934627e-04 1.14941897e-04
 1.13820429e-04 1.13612439e-04 1.04825486e-04 8.62057271e-05
 8.62056158e-05 8.62055575e-05 8.62052188e-05 8.62025816e-05
 8.62052606e-05 8.62051451e-05 8.62051161e-05 8.62017107e-05
 8.62043598e-05 8.41369440e-05 8.08357877e-05 7.47934610e-05
 5.74680466e-05 5.74678085e-05 5.74677056e-05 5.74670899e-05
 5.74668777e-05 5.74664468e-05 5.74657617e-05 5.74643990e-05
 5.74647151e-05 5.74639142e-05 5.74606080e-05 5.74635130e-05
 5.74623926e-05 5.74622919e-05 5.74593345e-05 5.74590134e-

#### Training the learner

In [116]:
# Random forest learner for the procedures_icd table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_procedures = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_procedures.fit(data, target)

In [117]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_procedures.joblib')
dump(random_forest_procedures, model_file)

['LOS_RF_learners\\random_forest_procedures.joblib']

### services

In [53]:
file = "hosp/services.csv"
full_path = path + file

df_services = pd.read_csv(full_path)

In [54]:
# Separate into training and evaluation sets
# NOTE(review): this split filters on subject_id / evaluation_patients, whereas the
# procedures_icd, transfers and inputevents splits filter on hadm_id /
# evaluation_admissions — confirm which rule is intended for this table.

df_services_training = df_services[~df_services['subject_id'].isin(evaluation_patients)]
df_services_evaluation = df_services[df_services['subject_id'].isin(evaluation_patients)]

In [55]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_services_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_services_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, transfertime
Impute with N/A and encode: prev_service
Encode: curr_service

In [None]:
df_services = df_services_training

In [None]:
df_services = df_services.reset_index(drop=True)

In [None]:
# Derive days_since_admission = transfertime - admittime

df_services = (
    df_services
    # Parse the transfer timestamps
    .assign(
        transfertime=lambda frame: pd.to_datetime(
            frame['transfertime'], format='%Y-%m-%d %H:%M:%S'
        )
    )
    # Bring in each admission's admittime
    .merge(df_admittime, on='hadm_id', how='left')
    # Missing differences fall back to a zero offset
    .assign(
        days_since_admission=lambda frame: (
            frame['transfertime'] - frame['admittime']
        ).fillna(pd.Timedelta(0))
    )
    # admittime was only needed for the subtraction
    .drop(columns=['admittime'])
)

In [None]:
# Drop 
df_services = df_services.drop(columns=['subject_id','transfertime'])

In [None]:
# Give missing prev_service its own 'N/A' level, then one-hot encode both service columns
service_columns = ['prev_service', 'curr_service']
df_services = pd.get_dummies(
    df_services.assign(prev_service=df_services['prev_service'].fillna('N/A')),
    columns=service_columns,
)

In [None]:
df_services = df_services.merge(df_los_hadm, on='hadm_id', how='left')
df_services = df_services.drop(columns=['hadm_id'])

In [None]:
data = df_services.drop(columns=['los'])
target = pd.DataFrame(df_services['los'])

#### Training the learner

In [None]:
# Convert timedelta-like values to fractional days

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# the left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Same text -> days conversion for the days_since_admission feature
data['days_since_admission'] = data['days_since_admission'].astype(str).apply(convert_to_days)

In [None]:
# Random forest learner for the services table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_services = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_services.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_services.joblib')
dump(random_forest_services, model_file)

### transfers

In [56]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [57]:
# Separate into training and evaluation sets 

# df_transfers_training = df_transfers[~df_transfers['subject_id'].isin(evaluation_patients)]
# df_transfers_evaluation = df_transfers[df_transfers['subject_id'].isin(evaluation_patients)]

df_transfers_training = df_transfers[~df_transfers['hadm_id'].isin(evaluation_admissions)]
df_transfers_evaluation = df_transfers[df_transfers['hadm_id'].isin(evaluation_admissions)]

In [58]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_transfers_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_transfers_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, transfer_id, intime, outtime
Encode: eventtype
Impute with N/A and encode: careunit

In [121]:
df_transfers = df_transfers_training

In [122]:
df_transfers = df_transfers.reset_index(drop=True)

In [123]:
# Derive days_since_admission = intime - admittime

df_transfers = (
    df_transfers
    # Parse the transfer-in timestamps
    .assign(
        intime=lambda frame: pd.to_datetime(frame['intime'], format='%Y-%m-%d %H:%M:%S')
    )
    # Bring in each admission's admittime
    .merge(df_admittime, on='hadm_id', how='left')
    # Missing differences fall back to a zero offset
    .assign(
        days_since_admission=lambda frame: (
            frame['intime'] - frame['admittime']
        ).fillna(pd.Timedelta(0))
    )
    # admittime was only needed for the subtraction
    .drop(columns=['admittime'])
)

In [124]:
# Derive duration = outtime - intime

# Parse the transfer-out timestamps (intime is used as already stored in the frame)
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero duration
df_transfers['duration'] = (
    df_transfers['outtime'] - df_transfers['intime']
).fillna(pd.Timedelta(0))

In [125]:
# Drop 
df_transfers = df_transfers.drop(columns=['subject_id','transfer_id','intime','outtime'])

In [126]:
# Give missing careunit its own 'N/A' level, then one-hot encode eventtype and careunit
transfer_categoricals = ['eventtype', 'careunit']
df_transfers = pd.get_dummies(
    df_transfers.assign(careunit=df_transfers['careunit'].fillna('N/A')),
    columns=transfer_categoricals,
)

In [127]:
df_transfers = df_transfers.merge(df_los_hadm, on='hadm_id', how='left')
df_transfers = df_transfers.drop(columns=['hadm_id'])

In [128]:
data = df_transfers.drop(columns=['los'])
target = pd.DataFrame(df_transfers['los'])

In [129]:
# Write the ordered list of feature columns to LOS_RF_features/transfers_features.csv
output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'transfers_features.csv')

# One-column frame holding the feature names in their current order
column_names = data.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [130]:
# Convert timedelta-like values to fractional days

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# the left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Same text -> days conversion for both timedelta features
for delta_col in ('duration', 'days_since_admission'):
    data[delta_col] = data[delta_col].astype(str).apply(convert_to_days)

In [131]:
# Random forest learner for the transfers table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_transfers = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_transfers.fit(data, target)

In [132]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_transfers.joblib')
dump(random_forest_transfers, model_file)

['LOS_RF_learners\\random_forest_transfers.joblib']

### icustays

In [59]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

In [60]:
# Separate into training and evaluation sets
# NOTE(review): this split filters on subject_id / evaluation_patients, whereas the
# procedures_icd, transfers and inputevents splits filter on hadm_id /
# evaluation_admissions — confirm which rule is intended for this table.

df_icustays_training = df_icustays[~df_icustays['subject_id'].isin(evaluation_patients)]
df_icustays_evaluation = df_icustays[df_icustays['subject_id'].isin(evaluation_patients)]

In [61]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_icustays_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_icustays_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, stay_id, intime, outtime
Encode: first_careunit, last_careunit

In [None]:
df_icustays = df_icustays_training

In [None]:
df_icustays = df_icustays.reset_index(drop=True)

In [None]:
# Derive days_since_admission = intime - admittime

df_icustays = (
    df_icustays
    # Parse the ICU-in timestamps
    .assign(
        intime=lambda frame: pd.to_datetime(frame['intime'], format='%Y-%m-%d %H:%M:%S')
    )
    # Bring in each admission's admittime
    .merge(df_admittime, on='hadm_id', how='left')
    # Missing differences fall back to a zero offset
    .assign(
        days_since_admission=lambda frame: (
            frame['intime'] - frame['admittime']
        ).fillna(pd.Timedelta(0))
    )
    # admittime was only needed for the subtraction
    .drop(columns=['admittime'])
)

In [None]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [None]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

In [None]:
df_icustays = df_icustays.merge(df_los_hadm, on='hadm_id', how='left')
df_icustays = df_icustays.drop(columns=['hadm_id'])

In [None]:
# Split the preprocessed icustays frame into features and target.
# (Previously this read df_services, so the icustays learner was trained on services data.)
data = df_icustays.drop(columns=['los'])
target = pd.DataFrame(df_icustays['los'])

#### Training the learner

In [None]:
# Convert timedelta-like values to fractional days

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# a left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Same text -> days conversion for the days_since_admission feature
data['days_since_admission'] = data['days_since_admission'].astype(str).apply(convert_to_days)

In [None]:
# Random forest learner for the icustays table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_icustays = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_icustays.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_icustays.joblib')
dump(random_forest_icustays, model_file)

### ingredientevents - Still to run

In [62]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

In [63]:
# Separate into training and evaluation sets
# NOTE(review): this split filters on subject_id / evaluation_patients, whereas the
# procedures_icd, transfers and inputevents splits filter on hadm_id /
# evaluation_admissions — confirm which rule is intended for this table.

df_ingredient_training = df_ingredient[~df_ingredient['subject_id'].isin(evaluation_patients)]
df_ingredient_evaluation = df_ingredient[df_ingredient['subject_id'].isin(evaluation_patients)]

In [64]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_ingredient_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_ingredient_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [None]:
df_ingredient = df_ingredient_training

In [None]:
df_ingredient = df_ingredient.reset_index(drop=True)

In [None]:
# Derive duration = endtime - starttime

# Parse both timestamp columns in place
for time_col in ('endtime', 'starttime'):
    df_ingredient[time_col] = pd.to_datetime(df_ingredient[time_col], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero duration
df_ingredient['duration'] = (
    df_ingredient['endtime'] - df_ingredient['starttime']
).fillna(pd.Timedelta(0))

In [None]:
# Derive recording_delay = storetime - endtime

# Parse the store timestamps (endtime is used as already stored in the frame)
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero delay
df_ingredient['recording_delay'] = (
    df_ingredient['storetime'] - df_ingredient['endtime']
).fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [None]:
# Give missing values in these columns an explicit 'N/A' level
for na_col in ('rateuom', 'linkorderid'):
    df_ingredient[na_col] = df_ingredient[na_col].fillna('N/A')

# One-hot encode the unit, status, item and link-order columns
ingredient_categoricals = ['rateuom', 'amountuom', 'statusdescription', 'itemid', 'linkorderid']
df_ingredient = pd.get_dummies(df_ingredient, columns=ingredient_categoricals)

In [None]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

In [None]:
df_ingredient = df_ingredient.merge(df_los_hadm, on='hadm_id', how='left')
df_ingredient = df_ingredient.drop(columns=['hadm_id'])

#### Dimensionality reduction

In [None]:
data = df_ingredient.drop(columns=['los'])
target = pd.DataFrame(df_ingredient['los'])

In [None]:
# Need to reduce from 7727 to 4116

In [None]:
# Convert timedelta-like values to fractional days

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# the left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Same text -> days conversion for both timedelta features
for delta_col in ('duration', 'recording_delay'):
    data[delta_col] = data[delta_col].astype(str).apply(convert_to_days)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 4116

# Seed the solver so the projection, and therefore the forest trained on it,
# is the same on every run (matches the random_state used for the forests).
# NOTE(review): the fitted `svd` is not saved in this cell; the same projection is
# needed to transform evaluation data — confirm it is persisted elsewhere.
svd = TruncatedSVD(n_components=n_components, random_state=42)

# Fit the projection on the training features, then replace them with their projection
svd.fit(data)
data = svd.transform(data)

# Share of variance explained by each retained component
explained_variance_ratio = svd.explained_variance_ratio_

print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Training the learner

In [None]:
# Random forest learner for the ingredientevents table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_ingredient = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_ingredient.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_ingredient.joblib')
dump(random_forest_ingredient, model_file)

### inputevents

In [65]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

In [66]:
# Separate into training and evaluation sets 

# df_input_training = df_input[~df_input['subject_id'].isin(evaluation_patients)]
# df_input_evaluation = df_input[df_input['subject_id'].isin(evaluation_patients)]

df_input_training = df_input[~df_input['hadm_id'].isin(evaluation_admissions)]
df_input_evaluation = df_input[df_input['hadm_id'].isin(evaluation_admissions)]

In [67]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_input_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_input_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [136]:
df_input = df_input_training

In [137]:
df_input = df_input.reset_index(drop=True)

In [138]:
# Derive duration = endtime - starttime

# Parse both timestamp columns in place
for time_col in ('endtime', 'starttime'):
    df_input[time_col] = pd.to_datetime(df_input[time_col], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero duration
df_input['duration'] = (df_input['endtime'] - df_input['starttime']).fillna(pd.Timedelta(0))

In [139]:
# Derive recording_delay = storetime - endtime

# Parse the store timestamps (endtime is used as already stored in the frame)
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero delay
df_input['recording_delay'] = (
    df_input['storetime'] - df_input['endtime']
).fillna(pd.Timedelta(0))

In [140]:
# Drop the columns that are not used as features
# ('stay_id' was listed twice in the original list; each label now appears once)
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom','caregiver_id'])

In [141]:
# Give missing values in these columns an explicit 'N/A' level
for na_col in ('rateuom', 'secondaryordercategoryname'):
    df_input[na_col] = df_input[na_col].fillna('N/A')

# One-hot encode the unit, order-category, status and item columns
input_categoricals = [
    'rateuom', 'secondaryordercategoryname', 'amountuom', 'ordercategoryname',
    'ordercomponenttypedescription', 'ordercategorydescription', 'statusdescription',
    'itemid',
]
df_input = pd.get_dummies(df_input, columns=input_categoricals)

In [142]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [143]:
df_input = df_input.dropna()

In [144]:
df_input = df_input.merge(df_los_hadm, on='hadm_id', how='left')
df_input = df_input.drop(columns=['hadm_id'])

In [145]:
data = df_input.drop(columns=['los'])
target = pd.DataFrame(df_input['los'])

In [146]:
# Convert timedelta-like values to fractional days

# Render los as text; anything without a 'days' part (e.g. the missing values left by
# the left merge on hadm_id) is replaced by a zero-length stay before parsing.
los_text = target['los'].astype(str)
los_text = los_text.where(los_text.str.contains('days'), '0 days 00:00:00')
target['los'] = los_text.apply(convert_to_days)

# Same text -> days conversion for both timedelta features
for delta_col in ('duration', 'recording_delay'):
    data[delta_col] = data[delta_col].astype(str).apply(convert_to_days)

In [147]:
# Write the ordered list of feature columns to LOS_RF_features/input_features.csv
output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)
file_path = os.path.join(output_folder, 'input_features.csv')

# One-column frame holding the feature names in their current order
column_names = data.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [148]:
# Random forest learner for the inputevents table

# Flatten the one-column target frame into the 1-D array passed to fit()
target = target.to_numpy().ravel()

# 100 trees, depth capped at 10, fixed seed; then train on the prepared features
random_forest_input = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
random_forest_input.fit(data, target)

In [149]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() cannot fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_input.joblib')
dump(random_forest_input, model_file)

['LOS_RF_learners\\random_forest_input.joblib']

### outputevents

In [68]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

In [69]:
# Separate into training and evaluation sets
# NOTE(review): this split filters on subject_id / evaluation_patients, whereas the
# procedures_icd, transfers and inputevents splits filter on hadm_id /
# evaluation_admissions — confirm which rule is intended for this table.

df_output_training = df_output[~df_output['subject_id'].isin(evaluation_patients)]
df_output_evaluation = df_output[df_output['subject_id'].isin(evaluation_patients)]

In [70]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_output_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_output_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, charttime, storetime, valueuom, stay_id, caregiver_id
Encode: itemid

In [None]:
df_output = df_output_training

In [None]:
df_output = df_output.reset_index(drop=True)

In [None]:
# Derive days_since_admission = charttime - admittime

df_output = (
    df_output
    # Parse the chart timestamps
    .assign(
        charttime=lambda frame: pd.to_datetime(frame['charttime'], format='%Y-%m-%d %H:%M:%S')
    )
    # Bring in each admission's admittime
    .merge(df_admittime, on='hadm_id', how='left')
    # Missing differences fall back to a zero offset
    .assign(
        days_since_admission=lambda frame: (
            frame['charttime'] - frame['admittime']
        ).fillna(pd.Timedelta(0))
    )
    # admittime was only needed for the subtraction
    .drop(columns=['admittime'])
)

In [None]:
# Derive recording_delay = storetime - charttime

# Parse the store timestamps (charttime is used as already stored in the frame)
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

# Missing differences fall back to a zero delay
df_output['recording_delay'] = (
    df_output['storetime'] - df_output['charttime']
).fillna(pd.Timedelta(0))

In [None]:
# Drop the columns that are not used as features
# ('storetime' was listed twice in the original list; each label now appears once)
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','valueuom','caregiver_id'])

In [None]:
# Encode: one-hot encode itemid, replacing it with one indicator column per distinct item id
df_output = pd.get_dummies(df_output, columns=['itemid'])

In [None]:
# Attach the length-of-stay target per admission, then discard the join key
df_output = (
    df_output
    .merge(df_los_hadm, on='hadm_id', how='left')
    .drop(columns=['hadm_id'])
)

In [None]:
data = df_ingredient.drop(columns=['los'])
target = pd.DataFrame(df_ingredient['los'])

In [None]:
# Converting duration strings to floats

target['los'] = target['los'].astype(str)
target.fillna(0, inplace=True)
target.loc[~target['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
target['los'] = target['los'].apply(convert_to_days)
data['recording_delay']= data['recording_delay'].astype(str)
data['recording_delay']= data['recording_delay'].apply(convert_to_days)

In [None]:
data['duration']= data['duration'].astype(str)
data['duration']= data['duration'].apply(convert_to_days)

#### Training the learner

In [None]:
# Random forest model

# Flatten the single-column target to a 1D array for fitting.
# np.ravel accepts a DataFrame as well as an ndarray, so re-running this cell is safe
# (target.values would raise once target has already been converted to an array).
target = np.ravel(target)

# Initialize and fit the Random Forest regression model (fixed random_state for reproducibility)
random_forest_output = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest_output.fit(data, target)

In [None]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() does not fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_output.joblib')
dump(random_forest_output, model_file)

### procedureevents

In [71]:
# Read the ICU procedure events table from the shared data directory
file = "icu/procedureevents.csv"
full_path = f"{path}{file}"

df_procedure_events = pd.read_csv(full_path)

In [72]:
# Separate into training and evaluation sets
# The split is made per admission (hadm_id in evaluation_admissions), not per subject_id.
procedure_eval_mask = df_procedure_events['hadm_id'].isin(evaluation_admissions)

df_procedure_events_training = df_procedure_events[~procedure_eval_mask]
df_procedure_events_evaluation = df_procedure_events[procedure_eval_mask]

In [73]:
# Save evaluation data for later
folder_name = 'EnsembleEvaluationData'

# Create the folder if it is missing so the save also works on a fresh run
os.makedirs(folder_name, exist_ok=True)

# Define the file path
file_path = os.path.join(folder_name, 'df_procedure_events_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder (row index is not written)
df_procedure_events_evaluation.to_csv(file_path, index=False)

#### Preprocessing (on training data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
Derive: duration (endtime - starttime) and recording_delay (storetime - endtime)

In [153]:
# From here on df_procedure_events refers to the training rows only; the evaluation rows were written to CSV above.
df_procedure_events = df_procedure_events_training

In [154]:
# Renumber the rows from 0 and discard the index left over from the boolean filtering.
df_procedure_events = df_procedure_events.reset_index(drop=True)

In [155]:
# Make a duration feature using endtime-starttime

# Parse both timestamp columns with the same fixed format
timestamp_format = '%Y-%m-%d %H:%M:%S'
for time_col in ('endtime', 'starttime'):
    df_procedure_events[time_col] = pd.to_datetime(df_procedure_events[time_col], format=timestamp_format)

# Length of each procedure; missing differences become a zero timedelta
procedure_span = df_procedure_events['endtime'] - df_procedure_events['starttime']
df_procedure_events['duration'] = procedure_span.fillna(pd.Timedelta(0))

In [156]:
# Make a recording_delay feature using storetime-endtime
# (endtime must already have been converted to datetime by the previous cell)

# Parse storetime with the same fixed timestamp format
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

# Time between the end of the procedure and its storage; missing differences become a zero timedelta
procedure_store_lag = df_procedure_events['storetime'] - df_procedure_events['endtime']
df_procedure_events['recording_delay'] = procedure_store_lag.fillna(pd.Timedelta(0))

In [157]:
# Drop identifier, order-bookkeeping and raw timestamp columns that are not used as features
# (hadm_id is kept because the LOS merge further down joins on it)
df_procedure_events = df_procedure_events.drop(
    columns=[
        'subject_id', 'stay_id',
        'starttime', 'endtime', 'storetime',
        'orderid', 'linkorderid', 'continueinnextdept',
        'caregiver_id',
    ]
)

In [158]:
# Impute with N/A and encode
# Missing locations get the explicit category 'N/A' so they receive their own indicator column
location_cols = ['location', 'locationcategory']
df_procedure_events[location_cols] = df_procedure_events[location_cols].fillna('N/A')

# One-hot encode every categorical column, the two location columns first
categorical_cols = location_cols + ['valueuom', 'ordercategoryname', 'ordercategorydescription',
                                    'statusdescription', 'itemid']
df_procedure_events = pd.get_dummies(df_procedure_events, columns=categorical_cols)

In [159]:
# Attach the length-of-stay target per admission, then discard the join key
df_procedure_events = (
    df_procedure_events
    .merge(df_los_hadm, on='hadm_id', how='left')
    .drop(columns=['hadm_id'])
)

In [160]:
# Features are every column except los; the target is los as a single-column DataFrame
target = df_procedure_events['los'].to_frame()
data = df_procedure_events.drop(columns='los')

In [161]:
# Converting duration strings to floats

# Target: stringify, then treat anything that is not an 'N days hh:mm:ss' string as a zero stay
target['los'] = target['los'].astype(str)
# NOTE: after astype(str) the column contains no nulls, so this fillna has nothing left to fill
target.fillna(0, inplace=True)
not_a_duration = ~target['los'].str.contains('days')
target.loc[not_a_duration, 'los'] = '0 days 00:00:00'
target['los'] = target['los'].apply(convert_to_days)

# Features: the two timedelta columns go through the same string -> float-days conversion
for timedelta_col in ('duration', 'recording_delay'):
    data[timedelta_col] = data[timedelta_col].astype(str)
    data[timedelta_col] = data[timedelta_col].apply(convert_to_days)

In [162]:
# Persist the training feature names (one per row, under the header 'Column Names')
output_folder = 'LOS_RF_features'
os.makedirs(output_folder, exist_ok=True)

# Column labels of the feature frame as an array, wrapped in a one-column DataFrame
column_names = data.columns.to_numpy()
df_column_names = pd.DataFrame({'Column Names': column_names})

# Write the names to CSV without the row index
file_path = os.path.join(output_folder, 'procedure_events_features.csv')
df_column_names.to_csv(file_path, index=False)

#### Training the learner

In [163]:
# Random forest model

# Flatten the single-column target to a 1D array for fitting.
# np.ravel accepts a DataFrame as well as an ndarray, so re-running this cell is safe
# (target.values would raise once target has already been converted to an array).
target = np.ravel(target)

# Initialize and fit the Random Forest regression model (fixed random_state for reproducibility)
random_forest_procedure_events = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest_procedure_events.fit(data, target)

In [164]:
# Save model to folder

output_folder = 'LOS_RF_learners'
# Create the folder if it is missing so dump() does not fail on a fresh run
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in that folder
model_file = os.path.join(output_folder, 'random_forest_procedure_events.joblib')
dump(random_forest_procedure_events, model_file)

['LOS_RF_learners\\random_forest_procedure_events.joblib']