### Training of procedure classifiers for ensemble - full pipeline

In [264]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os
from tqdm import tqdm

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [265]:
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [266]:
path = "C:/Project/Data/"

In [267]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

#### Global functions

In [268]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

### Target variable calculation

In [269]:
file = "hosp/procedures_icd.csv"
full_path = path + file

df_procedures = pd.read_csv(full_path)

In [270]:
file = "hosp/d_icd_procedures.csv"
full_path = path + file

df_codes = pd.read_csv(full_path)

In [271]:
# drop unneeded columns 
df_procedures = df_procedures.drop(columns=['subject_id', 'seq_num','icd_version'])

In [272]:
# Concatenate code values for each group
concat_df = df_procedures.groupby(['hadm_id', 'chartdate'])['icd_code'].agg(lambda x: ','.join(x)).reset_index()

# Split the comma-separated codes into individual columns
split_df = concat_df['icd_code'].str.get_dummies(',')

# Concatenate the original DataFrame with the one-hot encoded columns
df_procedures = pd.concat([concat_df, split_df], axis=1)

In [273]:
df_procedures = df_procedures.drop(columns='icd_code')

In [274]:
# convert time to datetime
df_procedures['chartdate'] = pd.to_datetime(df_procedures['chartdate'])

In [275]:
df_procedures

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,20044587,2113-08-25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,2143-09-27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20199380,2144-10-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,2144-10-31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,2137-02-25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,29820177,2150-07-10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
387,29839885,2170-10-08,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388,29842315,2155-12-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
389,29974575,2131-02-27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Select 55 (20% of) admissions to use for evaluation

In [276]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [277]:
evaluation_admissions = df_admissions['hadm_id'].sample(n=55, random_state=42).tolist()

# Any records belonging to these admissions will be removed before training 
print(evaluation_admissions)

[27617929, 27553957, 20282368, 27296885, 24980601, 21133938, 25559382, 20611796, 28778757, 28723315, 28998349, 28676446, 29276678, 26842957, 21477991, 25922998, 26706939, 27993466, 28236161, 27259207, 20385771, 24540843, 20900955, 22413744, 27494880, 25103777, 21599196, 21540783, 22585261, 26275841, 22130791, 22490490, 25020332, 29279905, 29483621, 27167814, 25508812, 21607814, 20297618, 29974575, 24912093, 21255400, 29295881, 28829452, 24656677, 29858644, 23488445, 25970245, 22508257, 25742920, 25085565, 22228639, 27660781, 28335091, 27703517]


### Emar

In [415]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

# records for 65 different patients 
# 181 unique admissions

#### Preprocessing (on all data)

In [416]:
df_emar = df_emar[df_emar['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id

In [417]:
df_emar = df_emar.reset_index(drop=True)

In [418]:
# convert time to datetime
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'])

In [419]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [420]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','scheduletime','storetime'])

In [421]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [423]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_emar.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['charttime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [424]:
# df_emar['icd_code'] = float('nan')
# for index, row in df_emar.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['charttime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_emar.at[index, 'icd_code'] = str(code)


In [492]:
df_emar = data_new

In [493]:
# Separate into training and evaluation sets 

df_emar_training = df_emar[~df_emar['hadm_id'].isin(evaluation_admissions)]
df_emar_evaluation = df_emar[df_emar['hadm_id'].isin(evaluation_admissions)]

In [494]:
df_emar_training_target = codes.loc[df_emar_training.index]
df_emar_evaluation_target = codes.loc[df_emar_evaluation.index]

In [495]:
df_emar_training_target

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,21322534,2155-05-09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,27738145,2187-02-11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14231,27996267,2148-01-31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14232,22429197,2148-01-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14233,22429197,2148-01-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14235,29366372,2167-05-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [496]:
target = df_emar_training_target

In [497]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [498]:
target

Unnamed: 0,hadm_id,chartdate,021009W,02100A8,02100A9,02100Z9,02110Z3,02H633Z,02HV33Z,02RF38Z,...,B211YZZ,B214YZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,21322534,2155-05-09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,27738145,2187-02-11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14231,27996267,2148-01-31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14232,22429197,2148-01-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14233,22429197,2148-01-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14235,29366372,2167-05-05,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [499]:
df_emar_evaluation_target = df_emar_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [500]:
# Get the intersection of columns between the two DataFrames
common_columns = df_emar_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_emar_evaluation_target = df_emar_evaluation_target[common_columns]

In [501]:
# # Define the file path
# file_path = os.path.join(folder_name, 'df_emar_evaluation_target.csv')

# # Save the DataFrame to a CSV file in the specified folder
# df_emar_evaluation_target.to_csv(file_path, index=False)

In [502]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_emar_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_emar_evaluation.to_csv(file_path, index=False)

In [503]:
# Define the file path
file_path = os.path.join(folder_name, 'df_emar_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_emar_evaluation_target.to_csv(file_path, index=False)

#### Training the learner

In [504]:
# data = df_icustays_training.drop(columns=['hadm_id','icd_code'])
# # target = pd.DataFrame(df_icustays_training['icd_code'])
# target = codes

data = df_emar_training.drop(columns=['hadm_id','charttime'])

# target = df_icustays['icd_code']
target = target.drop(columns=['hadm_id', 'chartdate'])

In [505]:
# Converting duration strings to floats

data['delay']= data['delay'].astype(str)
data['delay']= data['delay'].apply(convert_to_days)

#### Multi output logistic

In [506]:
data.values.shape

(10886, 573)

In [507]:
target.values.shape

(10886, 141)

In [508]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', solver='lbfgs',max_iter=1000)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_emar = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_emar.fit(data.values, target.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [509]:
# Save model to folder

output_folder = 'procedures_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_emar.joblib')
dump(logistic_clf_emar, model_file)

['procedures_learners\\logistic_clf_emar.joblib']

#### Training the learner

In [36]:
# data = df_emar_training.drop(columns=['hadm_id','charttime','icd_code'])
# target = pd.DataFrame(df_emar_training['icd_code'])

In [38]:
# data['delay']= data['delay'].astype(str)
# data['delay']= data['delay'].apply(convert_to_days)

In [41]:
# # Initialize and fit
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_emar = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', solver='lbfgs',max_iter=1000)

# logistic_clf_emar.fit(data, np.ravel(target))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### microbiologyevents

In [569]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [570]:
df_microbio = df_microbio[df_microbio['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [571]:
# convert time to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'])

In [572]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_microbio.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['charttime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [573]:
df_microbio = data_new

In [574]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [575]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [576]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [577]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [578]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [579]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [580]:
df_microbio = df_microbio.dropna()

In [581]:
# Separate into training and evaluation sets 

df_microbio_training = df_microbio[~df_microbio['hadm_id'].isin(evaluation_admissions)]
df_microbio_evaluation = df_microbio[df_microbio['hadm_id'].isin(evaluation_admissions)]

In [582]:
df_microbio_training_target = codes.loc[df_microbio_training.index]
df_microbio_evaluation_target = codes.loc[df_microbio_evaluation.index]

In [583]:
target = df_microbio_training_target

In [584]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [586]:
df_microbio_evaluation_target = df_microbio_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [587]:
# Get the intersection of columns between the two DataFrames
common_columns = df_microbio_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_microbio_evaluation_target = df_microbio_evaluation_target[common_columns]

In [588]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_microbio_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_microbio_evaluation.to_csv(file_path, index=False)

In [589]:
# Define the file path
file_path = os.path.join(folder_name, 'df_microbio_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_microbio_evaluation_target.to_csv(file_path, index=False)

#### Training the learner

In [555]:
data = df_microbio_training.drop(columns=['hadm_id'])
target = df_microbio_training_target

In [560]:
# Converting duration strings to floats

data['delay']= data['delay'].astype(str)
data['delay']= data['delay'].apply(convert_to_days)
data['days_since_admission'] = data['days_since_admission'].astype(str)
data['days_since_admission'] = data['days_since_admission'].apply(convert_to_days)

#### Multi output logistic

In [590]:
data.values.shape

(888, 191)

In [596]:
target.values.shape

(888, 127)

In [595]:
target = target.drop(columns=['hadm_id','chartdate'])

In [64]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_microbio = LogisticRegression(random_state = 42, fit_intercept=False,max_iter=100, multi_class='multinomial', 
#                                        solver='saga', penalty='l1')

# logistic_clf_microbio.fit(data, np.ravel(target))



In [597]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, fit_intercept=False,max_iter=100, multi_class='multinomial', 
                                       solver='saga', penalty='l1')

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_microbio = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_microbio.fit(data.values, target.values)







In [598]:
# Save model to folder

output_folder = 'procedures_learners' 

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_microbio.joblib')
dump(logistic_clf_microbio, model_file)

['procedures_learners\\logistic_clf_microbio.joblib']

In [47]:
# df_microbio['icd_code'] = float('nan')
# for index, row in df_microbio.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['charttime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_microbio.at[index, 'icd_code'] = str(code)

#### Preprocessing (on all data)

#### Training the learner

In [555]:
data = df_microbio_training.drop(columns=['hadm_id'])
target = df_microbio_training_target

#### Multi output logistic

In [590]:
data.values.shape

(888, 191)

In [596]:
target.values.shape

(888, 127)

In [595]:
target = target.drop(columns=['hadm_id','chartdate'])

In [599]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, fit_intercept=False,max_iter=100, multi_class='multinomial', 
                                       solver='saga', penalty='l1')

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_microbio = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_microbio.fit(data.values, target.values)

### pharmacy

In [658]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [659]:
df_pharmacy = df_pharmacy[df_pharmacy['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [660]:
# stoptime-starttime for a duration feature

# Convert to datetime
df_pharmacy['stoptime'] = pd.to_datetime(df_pharmacy['stoptime'], format='%Y/%m/%d %H:%M')
df_pharmacy['starttime'] = pd.to_datetime(df_pharmacy['starttime'], format='%Y/%m/%d %H:%M')


df_pharmacy['medication_duration'] = df_pharmacy['stoptime'] - df_pharmacy['starttime']

# Fill any non time values
df_pharmacy['medication_duration'] = df_pharmacy['medication_duration'].fillna(pd.Timedelta(0))

In [661]:
# verifiedtime - entertime for verification_delay feature 

# Convert to datetime
df_pharmacy['verifiedtime'] = pd.to_datetime(df_pharmacy['verifiedtime'], format='%Y/%m/%d %H:%M')
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'], format='%Y/%m/%d %H:%M')

df_pharmacy['verification_delay'] = df_pharmacy['verifiedtime'] - df_pharmacy['entertime']

# Fill any non time values
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].fillna(pd.Timedelta(0))

In [662]:
fill_value = [0] 

# Fill null values with the list
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(pd.Series([fill_value]*len(df_pharmacy)))

In [663]:
# Convert all categories to strings
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].astype(str)
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(lambda x: [str(item) for item in x])

mlb = MultiLabelBinarizer()

encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [664]:
# convert time to datetime
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'])

In [665]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_pharmacy.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['entertime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [666]:
df_pharmacy = data_new

In [667]:
# df_pharmacy['icd_code'] = float('nan')
# for index, row in df_pharmacy.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['entertime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_pharmacy.at[index, 'icd_code'] = str(code)

# df_pharmacy.dropna(subset=['icd_code'], inplace=True)

In [668]:
df_pharmacy

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,starttime,stoptime,medication,proc_type,status,entertime,...,4,5,6,7,8,9,[,],a,n
0,10027602,28166872,24340150,,2201-10-30 12:00:00,NaT,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:32:11,...,0,0,0,0,0,0,1,1,0,0
1,10027602,28166872,14435820,,2201-10-30 12:00:00,NaT,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:54:34,...,0,0,0,0,0,0,1,1,0,0
2,10027602,28166872,40720238,,2201-10-30 12:00:00,NaT,Fentanyl Citrate,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:32:11,...,0,0,0,0,0,0,1,1,0,0
3,10027602,28166872,27168639,,2201-10-30 12:00:00,NaT,Fentanyl Citrate,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:54:34,...,0,0,0,0,0,0,1,1,0,0
4,10027602,28166872,62845687,,2201-10-31 12:00:00,NaT,Lorazepam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-31 12:02:42,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6887,10014354,22508257,86519836,10014354-956,2148-05-15 10:00:00,2148-05-15 09:00:00,MoviPrep,Unit Dose,Discontinued,2148-05-15 09:11:01,...,0,0,0,0,0,0,0,0,0,0
6888,10014354,22508257,1794302,10014354-984,2148-05-16 13:00:00,2148-05-17 13:00:00,MoviPrep,Unit Dose,Discontinued,2148-05-16 12:05:01,...,1,0,0,0,0,0,0,0,0,0
6889,10014354,22508257,33207382,10014354-992,2148-05-17 09:00:00,2148-05-18 05:00:00,MoviPrep,Unit Dose,Discontinued,2148-05-17 08:56:51,...,0,0,0,0,0,0,0,0,0,0
6890,10014354,22508257,12690116,10014354-1000,2148-05-17 21:00:00,2148-05-18 05:00:00,MoviPrep,Unit Dose,Discontinued,2148-05-17 21:17:45,...,0,0,0,0,0,0,0,0,0,0


In [669]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [670]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [671]:
# Impute with N/A and encode
df_pharmacy['infusion_type'] = df_pharmacy['infusion_type'].fillna('N/A')
df_pharmacy['sliding_scale'] = df_pharmacy['sliding_scale'].fillna('N/A')
df_pharmacy['duration_interval'] = df_pharmacy['duration_interval'].fillna('N/A')
df_pharmacy['expiration_unit'] = df_pharmacy['expiration_unit'].fillna('N/A')
df_pharmacy['dispensation'] = df_pharmacy['dispensation'].fillna('N/A')
df_pharmacy['medication'] = df_pharmacy['medication'].fillna('N/A')
df_pharmacy['route'] = df_pharmacy['route'].fillna('N/A')
df_pharmacy['frequency'] = df_pharmacy['frequency'].fillna('N/A')
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['infusion_type','sliding_scale','duration_interval','expiration_unit',
                                                  'dispensation','medication','route','frequency'])

In [672]:
# Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value
df_pharmacy['lockout_interval'] = df_pharmacy['lockout_interval'].fillna(0)
df_pharmacy['doses_per_24_hrs'] = df_pharmacy['doses_per_24_hrs'].fillna(0)
df_pharmacy['expiration_value'] = df_pharmacy['expiration_value'].fillna(0)
df_pharmacy['basal_rate'] = df_pharmacy['basal_rate'].fillna(0)
df_pharmacy['one_hr_max'] = df_pharmacy['one_hr_max'].fillna(0)

In [673]:
# Function to convert timedeltas to floats
def convert_timedelta_to_float(value):
    if isinstance(value, pd.Timedelta):
        return value.total_seconds()
    else:
        return value

# Apply the function to the column
df_pharmacy['duration'] = df_pharmacy['duration'].apply(convert_timedelta_to_float)

In [674]:
# Separate into training and evaluation sets 

df_pharmacy_training = df_pharmacy[~df_pharmacy['hadm_id'].isin(evaluation_admissions)]
df_pharmacy_evaluation = df_pharmacy[df_pharmacy['hadm_id'].isin(evaluation_admissions)]

In [675]:
df_pharmacy_training_target = codes.loc[df_pharmacy_training.index]
df_pharmacy_evaluation_target = codes.loc[df_pharmacy_evaluation.index]

In [676]:
target = df_pharmacy_training_target

In [677]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [678]:
df_pharmacy_evaluation_target = df_pharmacy_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [679]:
# Get the intersection of columns between the two DataFrames
common_columns = df_pharmacy_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_pharmacy_evaluation_target = df_pharmacy_evaluation_target[common_columns]

In [680]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_pharmacy_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_pharmacy_evaluation.to_csv(file_path, index=False)

In [681]:
# Define the file path
file_path = os.path.join(folder_name, 'df_pharmacy_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_pharmacy_evaluation_target.to_csv(file_path, index=False)

In [682]:
# # Save evaluation data for later 
# folder_name = 'EnsembleEvaluationData'

# # Define the file path
# file_path = os.path.join(folder_name, 'df_pharmacy_evaluation.csv')

# # Save the DataFrame to a CSV file in the specified folder
# df_pharmacy_evaluation.to_csv(file_path, index=False)

#### Training the learner

In [683]:
data

Unnamed: 0,lockout_interval,basal_rate,one_hr_max,doses_per_24_hrs,expiration_value,medication_duration,verification_delay,Unnamed: 8,",",0,...,frequency_QTHUR,frequency_TID,frequency_TID W/MEALS,frequency_TID:PRN,frequency_TITRATE TO,frequency_TITRATE TO RASS,frequency_X1,frequency_X1 PRN,frequency_X1:PRN,frequency_X2 PRN
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6873,0.0,0.0,0.0,0.0,365.0,1.875000,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6878,0.0,0.0,0.0,0.0,365.0,5.791667,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6879,0.0,0.0,0.0,0.0,365.0,5.791667,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6880,0.0,0.0,0.0,0.0,36.0,4.916667,0.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [684]:
data = df_pharmacy_training.drop(columns=['hadm_id','duration'])
target = target.drop(columns=['hadm_id','chartdate'])

In [685]:
# Converting duration strings to floats

data['medication_duration']= data['medication_duration'].astype(str)
data['medication_duration']= data['medication_duration'].apply(convert_to_days)
data['verification_delay'] = data['verification_delay'].astype(str)
data['verification_delay'] = data['verification_delay'].apply(convert_to_days)

In [686]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_pharmacy = LogisticRegression(random_state = 42, fit_intercept=False, multi_class='multinomial', solver='lbfgs',
#                           max_iter=1000)

# logistic_clf_pharmacy.fit(data, np.ravel(target))

In [687]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, fit_intercept=False, multi_class='multinomial', solver='lbfgs',
                          max_iter=1000)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_pharmacy = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_pharmacy.fit(data.values, target.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [688]:
# Save model to folder

output_folder = 'procedures_learners' 

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_pharmacy.joblib')
dump(logistic_clf_pharmacy, model_file)

['procedures_learners\\logistic_clf_pharmacy.joblib']

### icustays

In [349]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [350]:
df_icustays = df_icustays[df_icustays['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [351]:
# convert time to datetime
df_icustays['outtime'] = pd.to_datetime(df_icustays['outtime'])

In [352]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_icustays.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['outtime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [353]:
# df_icustays['icd_code'] = float('nan')
# for index, row in df_icustays.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['outtime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_icustays.at[index, 'icd_code'] = str(code)

# df_icustays.dropna(subset=['icd_code'], inplace=True)

In [354]:
df_icustays = data_new

In [355]:
# make a feature called days_since_admission using intime-admittime

# Convert to datetime
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_icustays = df_icustays.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_icustays['days_since_admission'] = df_icustays['intime'] - df_icustays['admittime']

# Fill any non time values
df_icustays['days_since_admission'] = df_icustays['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_icustays = df_icustays.drop(columns=['admittime'])

In [356]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [357]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

In [358]:
# Separate into training and evaluation sets 

df_icustays_training = df_icustays[~df_icustays['hadm_id'].isin(evaluation_admissions)]
df_icustays_evaluation = df_icustays[df_icustays['hadm_id'].isin(evaluation_admissions)]

In [359]:
df_icustays_training.index

Int64Index([ 0,  1,  2,  4,  5,  6,  7,  9, 10, 12, 13, 15, 16, 17, 18, 19, 20,
            21, 22, 23, 25, 26, 27, 28, 29, 30, 31],
           dtype='int64')

In [360]:
df_icustays_training_target = codes.loc[df_icustays_training.index]
df_icustays_evaluation_target = codes.loc[df_icustays_evaluation.index]

In [361]:
df_icustays_evaluation_target

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
3,28998349,2116-12-27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,28829452,2113-09-16,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,25085565,2186-09-18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,27660781,2117-03-12,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24,29279905,2153-04-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [362]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_icustays_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_icustays_evaluation.to_csv(file_path, index=False)

#### Training the learner

In [364]:
# data = df_icustays_training.drop(columns=['hadm_id','icd_code'])
# # target = pd.DataFrame(df_icustays_training['icd_code'])
# target = codes

data = df_icustays_training.drop(columns=['hadm_id'])

# target = df_icustays['icd_code']
target = df_icustays_training_target.drop(columns=['hadm_id', 'chartdate'])

In [365]:
# Converting duration strings to floats

data['days_since_admission']= data['days_since_admission'].astype(str)
data['days_since_admission']= data['days_since_admission'].apply(convert_to_days)

#### Multi output logistic

In [368]:
target

Unnamed: 0,0039,0040,0041,0045,0051,0066,0069,0091,0096,009600Z,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [398]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

print(target)

    02H633Z  0BH17EZ  0D9630Z  0D963ZX  0DH63UZ  0DH98UZ  0DJ08ZZ  0JDM0ZZ  \
0         0        0        0        0        1        0        0        0   
1         0        0        0        0        0        0        0        0   
2         0        0        0        0        0        0        0        0   
4         0        0        0        0        0        0        0        0   
5         0        0        0        0        0        0        0        0   
6         0        0        0        0        0        0        0        0   
7         0        0        0        0        0        0        0        0   
9         0        0        0        0        0        0        0        0   
10        0        0        0        0        0        0        0        0   
12        0        0        1        1        0        1        0        0   
13        0        0        0        0        0        0        0        0   
15        1        0        0        0        0        0        

In [402]:
df_icustays_evaluation_target = df_icustays_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [404]:
# Get the intersection of columns between the two DataFrames
common_columns = df_icustays_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_icustays_evaluation_target = df_icustays_evaluation_target[common_columns]

In [413]:
df_icustays_evaluation_target

Unnamed: 0,02H633Z,0BH17EZ,0D9630Z,0D963ZX,0DH63UZ,0DH98UZ,0DJ08ZZ,0JDM0ZZ,0JDP0ZZ,0JH63XZ,...,4523,5491,5A1945Z,5A1D70Z,8605,8628,9427,9604,966,9671
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [412]:
# Define the file path
file_path = os.path.join(folder_name, 'df_icustays_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_icustays_evaluation_target.to_csv(file_path, index=False)

In [406]:
data.values.shape

(27, 14)

In [407]:
target.values.shape

(27, 30)

In [408]:
np.unique(target.values)

array([0, 1], dtype=int64)

In [409]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
                            fit_intercept=False, solver='lbfgs', max_iter=100)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_icustays = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_icustays.fit(data.values, target.values)

In [410]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_icustays = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
#                            fit_intercept=False, solver='lbfgs', max_iter=100)

# logistic_clf_icustays.fit(data, np.ravel(target))

In [411]:
# Save model to folder

output_folder = 'procedures_learners' 

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_icustays.joblib')
dump(logistic_clf_icustays, model_file)

['procedures_learners\\logistic_clf_icustays.joblib']

### inputevents

In [689]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

#### Preprocessing (on all data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [690]:
df_input = df_input[df_input['hadm_id'].isin(df_procedures['hadm_id'])]

In [691]:
# convert time to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'])

In [693]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_input.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['endtime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [694]:
df_input = data_new

In [696]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [697]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [698]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [699]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [700]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [701]:
df_input = df_input.dropna()

In [702]:
# Converting duration strings to floats
df_input['duration']= df_input['duration'].astype(str)
df_input['duration']= df_input['duration'].apply(convert_to_days)
df_input['recording_delay']= df_input['recording_delay'].astype(str)
df_input['recording_delay']= df_input['recording_delay'].apply(convert_to_days)

In [703]:
# Separate into training and evaluation sets 

df_input_training = df_input[~df_input['hadm_id'].isin(evaluation_admissions)]
df_input_evaluation = df_input[df_input['hadm_id'].isin(evaluation_admissions)]

In [707]:
df_input_training_target = codes.loc[df_input_training.index]
df_input_evaluation_target = codes.loc[df_input_evaluation.index]

In [708]:
target = df_input_training_target

In [709]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [710]:
df_input_evaluation_target = df_input_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [711]:
# Get the intersection of columns between the two DataFrames
common_columns = df_input_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_input_evaluation_target = df_input_evaluation_target[common_columns]

In [722]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_input_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_input_evaluation.to_csv(file_path, index=False)

In [723]:
# Define the file path
file_path = os.path.join(folder_name, 'df_input_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_input_evaluation_target.to_csv(file_path, index=False)

#### Training the learner

In [725]:
data = df_input_training.drop(columns=['hadm_id','duration'])
target = target.drop(columns=['hadm_id','chartdate'])

In [728]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
                                           solver='lbfgs', max_iter=1000)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_input = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_input.fit(data.values, target.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [729]:
# Save model to folder

output_folder = 'procedures_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_input.joblib')
dump(logistic_clf_input, model_file)

['procedures_learners\\logistic_clf_input.joblib']

In [160]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_input = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
#                                            solver='lbfgs', max_iter=1000)

# logistic_clf_input.fit(data, np.ravel(target))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### outputevents

In [737]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [738]:
df_output = df_output[df_output['hadm_id'].isin(df_procedures['hadm_id'])]

In [739]:
# convert time to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'])

In [740]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_output.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['charttime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [741]:
df_output = data_new

In [742]:
# Make a days_since_admission feature using charttime-admittime 

# Convert to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_output = df_output.merge(df_admittime, on='hadm_id', how='left')

df_output['days_since_admission'] = df_output['charttime'] - df_output['admittime']

# Fill any non time values
df_output['days_since_admission'] = df_output['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_output = df_output.drop(columns=['admittime'])

In [743]:
# Make a recording_delay feature using storetime-charttime

# Convert to datetime
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

df_output['recording_delay'] = df_output['storetime'] - df_output['charttime']

# Fill any non time values
df_output['recording_delay'] = df_output['recording_delay'].fillna(pd.Timedelta(0))

In [744]:
# Drop 
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','storetime','valueuom','caregiver_id'])

In [745]:
#Encode
df_output = pd.get_dummies(df_output, columns=['itemid'])

In [746]:
# Separate into training and evaluation sets 

df_output_training = df_output[~df_output['hadm_id'].isin(evaluation_admissions)]
df_output_evaluation = df_output[df_output['hadm_id'].isin(evaluation_admissions)]

In [747]:
df_output_training_target = codes.loc[df_output_training.index]
df_output_evaluation_target = codes.loc[df_output_evaluation.index]

In [748]:
target = df_output_training_target

In [749]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [750]:
df_output_evaluation_target

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
1,24540843,2117-03-18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,29276678,2116-03-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33,29276678,2116-03-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
34,29276678,2116-03-02,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,29276678,2116-03-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4719,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4720,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4721,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4722,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [751]:
df_output_evaluation_target = df_output_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [752]:
# Get the intersection of columns between the two DataFrames
common_columns = df_output_evaluation_target.columns.intersection(target.columns)

# Keep only the columns in df1 that are also in df2
df_output_evaluation_target = df_output_evaluation_target[common_columns]

In [753]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_output_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_output_evaluation.to_csv(file_path, index=False)

In [754]:
# Define the file path
file_path = os.path.join(folder_name, 'df_output_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_output_evaluation_target.to_csv(file_path, index=False)

In [755]:
# df_output['icd_code'] = float('nan')
# for index, row in df_output.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['charttime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_output.at[index, 'icd_code'] = str(code)

# df_output.dropna(subset=['icd_code'], inplace=True)

#### Training the learner

In [756]:
data = df_output_training.drop(columns=['hadm_id'])
# target = pd.DataFrame(df_output_training['icd_code'])

In [757]:
data['days_since_admission']= data['days_since_admission'].astype(str)
data['days_since_admission']= data['days_since_admission'].apply(convert_to_days)
data['recording_delay']= data['recording_delay'].astype(str)
data['recording_delay']= data['recording_delay'].apply(convert_to_days)

In [765]:
target = target.drop(columns=['hadm_id','chartdate'])

In [766]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
                                           solver='lbfgs', max_iter=1000)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_output = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_output.fit(data.values, target.values)

In [767]:
# Save model to folder

output_folder = 'procedures_learners'

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_output.joblib')
dump(logistic_clf_output, model_file)

['procedures_learners\\logistic_clf_output.joblib']

In [182]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_output = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
#                                            solver='lbfgs', max_iter=1000)

# logistic_clf_output.fit(data, np.ravel(target))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### procedureevents

In [768]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

#### Preprocessing (on all data)

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory

In [769]:
df_procedure_events = df_procedure_events[df_procedure_events['hadm_id'].isin(df_procedures['hadm_id'])]

In [770]:
# convert time to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'])

In [772]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_procedure_events.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['endtime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [773]:
df_procedure_events = data_new

In [774]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [775]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [776]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [777]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

In [778]:
# Converting duration strings to floats
df_procedure_events['duration']= df_procedure_events['duration'].astype(str)
df_procedure_events['duration']= df_procedure_events['duration'].apply(convert_to_days)
df_procedure_events['recording_delay']= df_procedure_events['recording_delay'].astype(str)
df_procedure_events['recording_delay']= df_procedure_events['recording_delay'].apply(convert_to_days)

In [779]:
# Separate into training and evaluation sets 
df_procedure_events_training = df_procedure_events[~df_procedure_events['hadm_id'].isin(evaluation_admissions)]
df_procedure_events_evaluation = df_procedure_events[df_procedure_events['hadm_id'].isin(evaluation_admissions)]

In [780]:
df_procedure_events_training_target = codes.loc[df_procedure_events_training.index]
df_procedure_events_evaluation_target = codes.loc[df_procedure_events_evaluation.index]

In [798]:
target = df_procedure_events_training_target

In [799]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [800]:
df_procedure_events_evaluation_target

Unnamed: 0,0040,0045,0221,02C03ZZ,02H633Z,02HV33Z,0390,0391,03HY32Z,03LP3DZ,...,9462,9604,966,9671,9672,9910,B211YZZ,B543ZZ3,B548ZZA,DW021ZZ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
521,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
641,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [804]:
df_procedure_events_evaluation_target = df_procedure_events_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [805]:
# Get the intersection of columns between the two DataFrames
common_columns = df_procedure_events_evaluation_target.columns.intersection(target.columns)

df_procedure_events_evaluation_target = df_procedure_events_evaluation_target[common_columns]

In [806]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_procedure_events_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_procedure_events_evaluation.to_csv(file_path, index=False)

In [807]:
# Define the file path
file_path = os.path.join(folder_name, 'df_procedure_events_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_procedure_events_evaluation_target.to_csv(file_path, index=False)

In [808]:
data = df_procedure_events_training.drop(columns=['hadm_id'])
target = target.drop(columns=['hadm_id','chartdate'])

#### Training the learner

In [809]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
                                           solver='lbfgs', max_iter=1000)

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_procedure_events = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_procedure_events.fit(data.values, target.values)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [810]:
# Save model to folder

output_folder = 'procedures_learners'
os.makedirs(output_folder, exist_ok=True)

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_procedure_events.joblib')
dump(logistic_clf_procedure_events, model_file)

['procedures_learners\\logistic_clf_procedure_events.joblib']

In [187]:
# df_procedure_events['icd_code'] = float('nan')
# for index, row in df_procedure_events.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['endtime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_procedure_events.at[index, 'icd_code'] = str(code)

# df_procedure_events.dropna(subset=['icd_code'], inplace=True)

In [201]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_procedure_events = LogisticRegression(random_state = 42, penalty = None, multi_class='multinomial', 
#                                            solver='lbfgs', max_iter=1000)

# logistic_clf_procedure_events.fit(data, np.ravel(target))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### datetimeevents

In [813]:
file = "icu/datetimeevents.csv"
full_path = path + file

df_datetime_events = pd.read_csv(full_path)

#### Preprocessing (on all data)

In [814]:
df_datetime_events = df_datetime_events[df_datetime_events['hadm_id'].isin(df_procedures['hadm_id'])]

In [815]:
# convert time to datetime
df_datetime_events['value'] = pd.to_datetime(df_datetime_events['value'])

In [816]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_datetime_events.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['value']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [817]:
df_datetime_events = data_new

In [818]:
# Drop 
df_datetime_events = df_datetime_events.drop(columns=['warning','value','subject_id','stay_id','caregiver_id',
                                                     'charttime','storetime','valueuom'])

In [819]:
# Encode
df_datetime_events = pd.get_dummies(df_datetime_events, columns=['itemid'])

In [820]:
# Separate into training and evaluation sets 
df_datetime_events_training = df_datetime_events[~df_datetime_events['hadm_id'].isin(evaluation_admissions)]
df_datetime_events_evaluation = df_datetime_events[df_datetime_events['hadm_id'].isin(evaluation_admissions)]

In [821]:
df_datetime_events_training_target = codes.loc[df_datetime_events_training.index]
df_datetime_events_evaluation_target = codes.loc[df_datetime_events_evaluation.index]

In [822]:
target = df_datetime_events_training_target

In [823]:
# Drop columns with all zeros
target = target.loc[:, (target != 0).any(axis=0)]

In [824]:
df_datetime_events_evaluation_target

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
163,29276678,2116-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
164,29276678,2116-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
165,29276678,2116-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
166,29276678,2116-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
167,29276678,2116-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10569,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10570,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10571,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10572,28998349,2116-12-07,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [825]:
df_datetime_events_evaluation_target = df_datetime_events_evaluation_target.drop(columns=['hadm_id', 'chartdate'])

In [826]:
# Get the intersection of columns between the two DataFrames
common_columns = df_datetime_events_evaluation_target.columns.intersection(target.columns)

df_datetime_events_evaluation_target = df_datetime_events_evaluation_target[common_columns]

In [832]:
# Save evaluation data for later 
folder_name = 'EnsembleEvaluationData'

# Define the file path
file_path = os.path.join(folder_name, 'df_datetime_events_evaluation.csv')

# Save the DataFrame to a CSV file in the specified folder
df_datetime_events_evaluation.to_csv(file_path, index=False)

In [833]:
# Define the file path
file_path = os.path.join(folder_name, 'df_datetime_events_evaluation_target.csv')

# Save the DataFrame to a CSV file in the specified folder
df_datetime_events_evaluation_target.to_csv(file_path, index=False)

In [834]:
data = df_datetime_events_training.drop(columns=['hadm_id'])
target = target.drop(columns=['hadm_id','chartdate'])

#### Training the learner

In [839]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# Initialize logistic regression classifier
logistic_reg = LogisticRegression(random_state = 42, multi_class='multinomial', solver='lbfgs')

# Wrap logistic regression classifier in MultiOutputClassifier
logistic_clf_datetime_events = MultiOutputClassifier(logistic_reg)

# Train the multi-output logistic regression model
logistic_clf_datetime_events.fit(data.values, target.values)

In [840]:
# Save model to folder

output_folder = 'procedures_learners'

# Save the trained model to a file in the new folder
model_file = os.path.join(output_folder, 'logistic_clf_datetime_events.joblib')
dump(logistic_clf_datetime_events, model_file)

['procedures_learners\\logistic_clf_datetime_events.joblib']

In [213]:
# df_datetime_events['icd_code'] = float('nan')
# for index, row in df_datetime_events.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['value']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_datetime_events.at[index, 'icd_code'] = str(code)

# df_datetime_events.dropna(subset=['icd_code'], inplace=True)

In [223]:
# # Initialize and fit

# # Standardize features
# scaler = StandardScaler()
# data = scaler.fit_transform(data)

# # # Instantiate the logistic regression model
# logistic_clf_datetime_events = LogisticRegression(random_state = 42, multi_class='multinomial', solver='lbfgs')

# logistic_clf_datetime_events.fit(data, np.ravel(target))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [217]:
# # Save evaluation data for later 
# folder_name = 'EnsembleEvaluationData'

# # Define the file path
# file_path = os.path.join(folder_name, 'df_datetime_events_evaluation.csv')

# # Save the DataFrame to a CSV file in the specified folder
# df_datetime_events_evaluation.to_csv(file_path, index=False)

In [221]:
# data = df_datetime_events_training.drop(columns=['icd_code','hadm_id'])
# target = pd.DataFrame(df_datetime_events_training['icd_code'])