In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [113]:
path = "C:/Project/Data/"

In [3]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

In [4]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

### Target variable calculation

In [5]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [6]:
# drop dishcarged samples
df_transfers = df_transfers[df_transfers['eventtype'] != 'discharge']

In [7]:
# drop unneeded columns 
df_transfers = df_transfers.drop(columns=['subject_id', 'eventtype'])
# Keeping outtime because when it is the same admission but multiple transfers, need to filter all input data to
# apply to only one transfer 

In [235]:
df_transfers['careunit'].value_counts()

Emergency Department                                236
Medicine                                             77
Med/Surg                                             48
Neurology                                            46
Medicine/Cardiology                                  43
Transplant                                           39
Cardiac Surgery                                      39
Medical Intensive Care Unit (MICU)                   36
Discharge Lounge                                     36
Surgical Intensive Care Unit (SICU)                  33
Medical/Surgical Intensive Care Unit (MICU/SICU)     32
Hematology/Oncology                                  31
Cardiac Vascular Intensive Care Unit (CVICU)         31
Emergency Department Observation                     26
Med/Surg/Trauma                                      25
PACU                                                 25
Hematology/Oncology Intermediate                     23
Trauma SICU (TSICU)                             

In [9]:
# convert time to datetime
df_transfers['intime'] = pd.to_datetime(df_transfers['intime'])
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'])

In [10]:
# Change hadm_id to index??
# df_transfers.set_index('hadm_id', inplace=True)

### emar

In [11]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

In [12]:
df_emar = df_emar[df_emar['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [13]:
df_emar

Unnamed: 0,subject_id,hadm_id,emar_id,emar_seq,poe_id,pharmacy_id,enter_provider_id,charttime,medication,event_txt,scheduletime,storetime
0,10005909,20199380.0,10005909-74,74,10005909-97,96110427.0,,2144-10-31 05:56:00,Magnesium Sulfate,,2144-10-31 05:56:00,2144-10-31 05:56:00
1,10005909,20199380.0,10005909-79,79,10005909-97,96110427.0,,2144-10-31 08:00:00,Magnesium Sulfate,,2144-10-31 08:00:00,2144-10-31 08:15:00
2,10008287,22168393.0,10008287-32,32,10008287-58,,P26PKF,2145-09-28 20:15:00,Potassium Chloride Replacement (Critical Care ...,,2145-09-28 20:15:00,2145-09-28 20:38:00
3,10010471,21322534.0,10010471-33,33,10010471-51,52131847.0,,2155-05-08 21:45:00,Metoprolol Tartrate,,2155-05-08 21:45:00,2155-05-08 22:40:00
4,10015272,27993466.0,10015272-31,31,10015272-48,88758875.0,,2137-06-13 08:36:00,Metoprolol Tartrate,,2137-06-13 08:36:00,2137-06-13 08:36:00
...,...,...,...,...,...,...,...,...,...,...,...,...
35830,10037861,24540843.0,10037861-371,371,10037861-385,,,2117-03-17 19:00:00,Midazolam,Infusion Reconciliation Not Done,2117-03-17 19:00:00,2117-03-17 18:33:00
35831,10018423,29366372.0,10018423-20,20,10018423-64,,P401QD,2167-05-04 17:00:00,Heparin,Stopped - Unscheduled in Other Location,2167-05-04 17:00:00,2167-05-04 18:19:00
35832,10014354,28335091.0,10014354-259,259,10014354-301,,,2147-04-28 17:00:00,Heparin,Stopped - Unscheduled in Other Location,2106-09-29 00:00:00,2147-04-28 22:34:00
35833,10019003,27525946.0,10019003-255,255,10019003-538,,,2153-04-14 17:00:00,PHENYLEPHrine,Stopped - Unscheduled in Other Location,2153-04-14 17:00:00,2153-04-15 02:28:00


In [14]:
# convert time to datetime
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'])

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [15]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [16]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','scheduletime','storetime'])

In [17]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [18]:
df_emar['delay'].value_counts()

0 days 00:00:00        15637
0 days 00:01:00          778
-1 days +23:59:00        349
0 days 00:14:00          286
0 days 00:09:00          281
                       ...  
-1 days +21:39:00          1
0 days 04:14:00            1
0 days 06:29:00            1
-1 days +22:37:00          1
14821 days 17:00:00        1
Name: delay, Length: 407, dtype: int64

In [19]:
df_emar.head()

Unnamed: 0,hadm_id,emar_seq,charttime,delay,enter_provider_id_N/A,enter_provider_id_P00SP9,enter_provider_id_P019KI,enter_provider_id_P01QR6,enter_provider_id_P02FO8,enter_provider_id_P02IVL,...,medication_amLODIPine,medication_ibrutinib,medication_irbesartan,medication_moxifloxacin,medication_nitroglycerin,medication_rifAXIMin,medication_ruxolitinib,medication_sevelamer CARBONATE,medication_vancomycin,medication_venetoclax
0,20199380.0,74,2144-10-31 05:56:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20199380.0,79,2144-10-31 08:00:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22168393.0,32,2145-09-28 20:15:00,0 days,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21322534.0,33,2155-05-08 21:45:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,27993466.0,31,2137-06-13 08:36:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df_emar['transfer_id'] = float('nan')
for index, row in df_emar.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_emar.at[index, 'transfer_id'] = closest_id



In [21]:
df_emar 

Unnamed: 0,hadm_id,emar_seq,charttime,delay,enter_provider_id_N/A,enter_provider_id_P00SP9,enter_provider_id_P019KI,enter_provider_id_P01QR6,enter_provider_id_P02FO8,enter_provider_id_P02IVL,...,medication_ibrutinib,medication_irbesartan,medication_moxifloxacin,medication_nitroglycerin,medication_rifAXIMin,medication_ruxolitinib,medication_sevelamer CARBONATE,medication_vancomycin,medication_venetoclax,transfer_id
0,20199380.0,74,2144-10-31 05:56:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,20199380.0,79,2144-10-31 08:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
2,22168393.0,32,2145-09-28 20:15:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33348260.0
3,21322534.0,33,2155-05-08 21:45:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30185783.0
4,27993466.0,31,2137-06-13 08:36:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37282690.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35830,24540843.0,371,2117-03-17 19:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
35831,29366372.0,20,2167-05-04 17:00:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,38137273.0
35832,28335091.0,259,2147-04-28 17:00:00,14821 days 17:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
35833,27525946.0,255,2153-04-14 17:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,32694014.0


In [22]:
df_emar = df_emar.dropna()

In [23]:
df_emar

Unnamed: 0,hadm_id,emar_seq,charttime,delay,enter_provider_id_N/A,enter_provider_id_P00SP9,enter_provider_id_P019KI,enter_provider_id_P01QR6,enter_provider_id_P02FO8,enter_provider_id_P02IVL,...,medication_ibrutinib,medication_irbesartan,medication_moxifloxacin,medication_nitroglycerin,medication_rifAXIMin,medication_ruxolitinib,medication_sevelamer CARBONATE,medication_vancomycin,medication_venetoclax,transfer_id
2,22168393.0,32,2145-09-28 20:15:00,0 days,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33348260.0
3,21322534.0,33,2155-05-08 21:45:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30185783.0
4,27993466.0,31,2137-06-13 08:36:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37282690.0
5,27738145.0,14,2187-02-10 17:13:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34578020.0
6,24104168.0,50,2169-01-16 02:27:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,36184711.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35828,22429197.0,58,2147-12-31 19:00:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34311920.0
35829,22429197.0,109,2148-01-02 07:00:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34311920.0
35831,29366372.0,20,2167-05-04 17:00:00,0 days,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,38137273.0
35833,27525946.0,255,2153-04-14 17:00:00,0 days,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,32694014.0


#### Split into train and test

In [24]:
# Need to change transfer_id to careunit

df_emar = pd.merge(df_emar, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_emar.drop(columns=['transfer_id'], inplace=True)

In [25]:
data = df_emar.drop(columns=['careunit','hadm_id'])
target = df_emar['careunit']

# Split the dataset into training and testing sets
emar_data_train, emar_data_test, emar_label_train, emar_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_data_train.shape, emar_label_train.shape)
print("Testing set shape:", emar_data_test.shape, emar_label_test.shape)

Training set shape: (11296, 651) (11296,)
Testing set shape: (2824, 651) (2824,)


In [26]:
# uncomment and run if changes are made

emar_data_train.to_csv('emar_data_train.csv', index=False)
emar_data_test.to_csv('emar_data_test.csv', index=False)

emar_label_train.to_csv('emar_label_train.csv', index=False)
emar_label_test.to_csv('emar_label_test.csv', index=False)

#### Dimensionality reduction

In [27]:
# Fine

### labevents

In [28]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

In [29]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [30]:
df_labevents = df_labevents[df_labevents['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [31]:
# convert time to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'])

In [32]:
df_labevents['transfer_id'] = float('nan')
for index, row in df_labevents.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_labevents.at[index, 'transfer_id'] = closest_id


In [33]:
# for index, row in df_labevents.iterrows():
#     df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

#     filtered_rows = df_transfers_subset[(df_transfers_subset['intime'] <= row['charttime']) & (row['charttime'] <= df_transfers_subset['outtime'])]
    
#     # If there are matching rows in DataFrame 2, assign 'transfer_id' from DataFrame 2 to DataFrame 1
#     if not filtered_rows.empty:
#         df_labevents.at[index, 'transfer_id'] = filtered_rows['transfer_id'].iloc[0]

In [34]:
df_labevents

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,transfer_id
0,172061,10014354,29600294.0,1808066,51277,,2148-08-16 00:00:00,2148-08-16 01:30:00,15.40,15.40,%,10.5,15.5,,ROUTINE,,39864867.0
1,172062,10014354,29600294.0,1808066,51279,,2148-08-16 00:00:00,2148-08-16 01:30:00,3.35,3.35,m/uL,4.6,6.1,abnormal,ROUTINE,,39864867.0
2,172068,10014354,29600294.0,1808066,52172,,2148-08-16 00:00:00,2148-08-16 01:30:00,49.70,49.70,fL,35.1,46.3,abnormal,ROUTINE,,39864867.0
3,172063,10014354,29600294.0,1808066,51301,,2148-08-16 00:00:00,2148-08-16 01:30:00,20.30,20.30,K/uL,4.0,10.0,abnormal,ROUTINE,,39864867.0
4,172050,10014354,29600294.0,1808066,51249,,2148-08-16 00:00:00,2148-08-16 01:30:00,31.10,31.10,g/dL,32.0,37.0,abnormal,ROUTINE,,39864867.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107722,254700,10021487,28998349.0,78791160,50804,,2116-12-07 18:59:00,2116-12-07 19:00:00,35.00,35.00,mEq/L,21.0,30.0,abnormal,,,36003339.0
107723,254702,10021487,28998349.0,78791160,50818,,2116-12-07 18:59:00,2116-12-07 19:00:00,56.00,56.00,mm Hg,35.0,45.0,abnormal,,,36003339.0
107724,254707,10021487,28998349.0,78791160,52033,,2116-12-07 18:59:00,2116-12-07 18:59:00,0.00,,,,,,,___,36003339.0
107725,254706,10021487,28998349.0,78791160,50825,,2116-12-07 18:59:00,2116-12-07 18:59:00,39.70,39.70,,,,,,,36003339.0


In [35]:
# Make a feature for days_since_admission using charttime - admittime

# Convert to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# Fill any non time values
df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [36]:
# Add storetime - charttime feature called delay

# Convert to datetime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# Fill any non time values
df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

Drop: labevent_id, subject_id, order_provider_id (too many Null), charttime, storetime, comments

In [37]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [38]:
# For flag make abnormal = 1 and fill Null with 0
df_labevents['flag'] = df_labevents['flag'].fillna(0)
df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [39]:
# For priority fill Null with N/A and then one hot encode
df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [40]:
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [41]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()

#### Split into train and test

In [42]:

df_labevents = pd.merge(df_labevents, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_labevents.drop(columns=['transfer_id'], inplace=True)
data = df_labevents.drop(columns=['careunit','hadm_id'])
target = df_labevents['careunit']


# Split the dataset into training and testing sets
labevents_data_train, labevents_data_test, labevents_label_train, labevents_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", labevents_data_train.shape, labevents_label_train.shape)
print("Testing set shape:", labevents_data_test.shape, labevents_label_test.shape)

Training set shape: (29245, 11680) (29245,)
Testing set shape: (7312, 11680) (7312,)


In [43]:
# uncomment and run if changes are made

labevents_data_train.to_csv('labevents_data_train.csv', index=False)
labevents_data_test.to_csv('labevents_data_test.csv', index=False)

labevents_label_train.to_csv('labevents_label_train.csv', index=False)
labevents_label_test.to_csv('labevents_label_test.csv', index=False)

In [44]:
labevents_data_train

Unnamed: 0,value,valuenum,ref_range_lower,ref_range_upper,flag,admittime,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
26340,137.0,137.0,133.0,145.0,0,2120-05-12 12:53:00,1 days 04:46:00,0 days 01:03:00,0,0,...,0,0,0,0,0,0,0,0,0,0
31949,58.1,58.1,25.0,36.5,1,2115-10-09 20:28:00,2 days 01:22:00,0 days 00:26:00,0,0,...,0,0,0,0,0,0,0,0,0,0
19672,32.3,32.3,32.0,37.0,0,2187-02-10 18:57:00,1 days 18:31:00,0 days 00:20:00,0,0,...,0,0,0,0,0,0,0,0,0,0
13620,197.0,197.0,70.0,400.0,0,2137-08-04 00:07:00,5 days 05:10:00,0 days 07:00:00,0,1,...,0,0,0,0,0,0,0,0,0,0
15337,30.7,30.7,26.0,32.0,0,2155-10-17 18:01:00,0 days 13:19:00,0 days 03:03:00,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,30.6,30.6,26.0,32.0,0,2155-07-10 17:48:00,2 days 15:34:00,0 days 00:32:00,0,0,...,0,0,0,0,0,0,0,0,0,0
6265,0.0,150.0,25.0,36.5,1,2175-03-20 23:29:00,1 days 06:35:00,0 days 02:17:00,0,0,...,0,0,0,0,0,0,0,0,0,0
11284,3.7,3.7,3.3,5.1,0,2166-08-21 23:09:00,1 days 12:04:00,0 days 00:59:00,0,0,...,0,0,0,0,0,0,0,0,0,0
860,7.0,7.0,8.0,20.0,1,2112-09-17 19:13:00,8 days 04:47:00,0 days 01:08:00,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [45]:
# Fine

### microbiologyevents

In [27]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

In [28]:
df_microbio.head(5)

Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,org_name,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments
0,36,10000032,25742920.0,7814634,,2180-08-06 00:00:00,2180-08-06 20:35:00,70070,SWAB,1,...,,,,,,,,,,No VRE isolated.
1,15,10000032,22595853.0,5717063,,2180-05-07 00:00:00,2180-05-07 00:19:00,70070,SWAB,1,...,,,,,,,,,,No VRE isolated.
2,32,10000032,29079034.0,5901894,,2180-07-24 00:00:00,2180-07-24 00:55:00,70070,SWAB,1,...,,,,,,,,,,No VRE isolated.
3,7013,10020944,29974575.0,4646730,,2131-02-27 00:00:00,2131-02-27 17:41:00,70070,SWAB,1,...,ENTEROCOCCUS SP.,1.0,,90015.0,VANCOMYCIN,>256,,,R,
4,12898,10037975,27617929.0,1636367,,2185-01-17 00:00:00,2185-01-17 21:32:00,70070,SWAB,1,...,,,,,,,,,,No VRE isolated.


In [29]:
df_microbio = df_microbio[df_microbio['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [30]:
# convert time to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'])

In [31]:
df_microbio['transfer_id'] = float('nan')

for index, row in df_microbio.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_microbio.at[index, 'transfer_id'] = closest_id

df_microbio.dropna(subset=['transfer_id'], inplace=True)

In [32]:
# df_microbio.dropna(subset=['transfer_id'], inplace=True)
df_microbio 

Unnamed: 0,microevent_id,subject_id,hadm_id,micro_specimen_id,order_provider_id,chartdate,charttime,spec_itemid,spec_type_desc,test_seq,...,isolate_num,quantity,ab_itemid,ab_name,dilution_text,dilution_comparison,dilution_value,interpretation,comments,transfer_id
2,32,10000032,29079034.0,5901894,,2180-07-24 00:00:00,2180-07-24 00:55:00,70070,SWAB,1,...,,,,,,,,,No VRE isolated.,37682908.0
3,7013,10020944,29974575.0,4646730,,2131-02-27 00:00:00,2131-02-27 17:41:00,70070,SWAB,1,...,1.0,,90015.0,VANCOMYCIN,>256,,,R,,39351025.0
5,10771,10031757,28477280.0,7612706,,2137-10-18 00:00:00,2137-10-18 00:58:00,70070,SWAB,1,...,,,,,,,,,No VRE isolated.,34732250.0
7,14015,10038999,27189241.0,2819316,,2131-05-25 00:00:00,2131-05-25 05:07:00,70070,SWAB,1,...,,,,,,,,,No VRE isolated.,39884883.0
9,7290,10021487,27660781.0,5069142,,2117-03-03 00:00:00,2117-03-03 16:27:00,70070,SWAB,1,...,,,,,,,,,No VRE isolated.,35065627.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2883,3190,10009049,22995465.0,5127191,,2174-05-26 00:00:00,2174-05-26 17:00:00,70057,Rapid Respiratory Viral Screen & Culture,1,...,,,,,,,,,Negative for Respiratory Viral Antigen. Speci...,36991846.0
2886,4676,10014354,26486158.0,6440602,,2148-08-25 00:00:00,2148-08-25 14:28:00,70057,Rapid Respiratory Viral Screen & Culture,2,...,,,,,,,,,___,38144928.0
2888,6540,10019003,29279905.0,5275671,,2153-03-28 00:00:00,2153-03-28 10:52:00,70057,Rapid Respiratory Viral Screen & Culture,2,...,,,,,,,,,Negative for Respiratory Viral Antigen. Speci...,32636269.0
2891,6884,10020640,27984218.0,7142398,,2153-02-13 00:00:00,2153-02-13 05:33:00,70057,Rapid Respiratory Viral Screen & Culture,2,...,,,,,,,,,Negative for Respiratory Viral Antigen. Speci...,37407527.0


In [33]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [34]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [35]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [36]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [37]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [38]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [39]:
df_microbio = df_microbio.dropna()

In [40]:
df_microbio

Unnamed: 0,hadm_id,isolate_num,dilution_value,transfer_id,days_since_admission,delay,org_name_ACINETOBACTER BAUMANNII COMPLEX,org_name_ANAEROBIC GRAM POSITIVE ROD(S),org_name_BACTEROIDES FRAGILIS GROUP,org_name_BETA STREPTOCOCCUS GROUP B,...,dilution_text_<=4,dilution_text_=>16,dilution_text_=>32,dilution_text_=>4,dilution_text_=>64,dilution_text_=>8,dilution_text_>256,dilution_comparison_<=,dilution_comparison_=,dilution_comparison_=>
0,29079034.0,0.0,0.0,37682908.0,0 days 12:20:00,3 days 10:20:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,29974575.0,1.0,0.0,39351025.0,0 days 02:07:00,3 days 19:28:00,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,28477280.0,0.0,0.0,34732250.0,5 days 02:15:00,2 days 07:48:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,27189241.0,0.0,0.0,39884883.0,2 days 07:18:00,3 days 11:06:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,27660781.0,0.0,0.0,35065627.0,0 days 00:28:00,1 days 16:48:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1150,22995465.0,0.0,0.0,36991846.0,0 days 08:39:00,0 days 17:34:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1151,26486158.0,0.0,0.0,38144928.0,2 days 23:10:00,2 days 21:44:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1152,29279905.0,0.0,0.0,32636269.0,0 days 11:27:00,0 days 04:28:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1153,27984218.0,0.0,0.0,37407527.0,0 days 05:11:00,0 days 05:57:00,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [41]:
df_microbio = pd.merge(df_microbio, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_microbio.drop(columns=['transfer_id'], inplace=True)
data = df_microbio.drop(columns=['careunit','hadm_id'])
target = df_microbio['careunit']

# Split the dataset into training and testing sets
microbio_data_train, microbio_data_test, microbio_label_train, microbio_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", microbio_data_train.shape, microbio_label_train.shape)
print("Testing set shape:", microbio_data_test.shape, microbio_label_test.shape)

Training set shape: (924, 184) (924,)
Testing set shape: (231, 184) (231,)


In [42]:
# uncomment and run if changes are made

microbio_data_train.to_csv('microbio_data_train.csv', index=False)
microbio_data_test.to_csv('microbio_data_test.csv', index=False)

microbio_label_train.to_csv('microbio_label_train.csv', index=False)
microbio_label_test.to_csv('microbio_label_test.csv', index=False)

In [43]:
microbio_data_train

Unnamed: 0,isolate_num,dilution_value,days_since_admission,delay,org_name_ACINETOBACTER BAUMANNII COMPLEX,org_name_ANAEROBIC GRAM POSITIVE ROD(S),org_name_BACTEROIDES FRAGILIS GROUP,org_name_BETA STREPTOCOCCUS GROUP B,org_name_BETA STREPTOCOCCUS GROUP G,org_name_CANCELLED,...,dilution_text_<=4,dilution_text_=>16,dilution_text_=>32,dilution_text_=>4,dilution_text_=>64,dilution_text_=>8,dilution_text_>256,dilution_comparison_<=,dilution_comparison_=,dilution_comparison_=>
996,1.0,0.25,1 days 09:58:00,2 days 23:38:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
58,0.0,0.00,0 days 01:06:00,2 days 10:45:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
333,0.0,0.00,1 days 06:15:00,6 days 09:48:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
332,0.0,0.00,1 days 12:23:00,5 days 21:53:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
837,1.0,1.00,2 days 21:54:00,5 days 19:01:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,1.0,0.25,0 days 02:26:00,3 days 19:20:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1095,1.0,0.00,5 days 13:44:00,2 days 09:44:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1130,0.0,0.00,1 days 18:59:00,1 days 20:52:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,1.0,32.00,5 days 01:44:00,1 days 19:49:00,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


#### Dimensionality reduction

In [44]:
# Fine

### pharmacy

In [45]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

In [46]:
df_pharmacy.head(2)

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,starttime,stoptime,medication,proc_type,status,entertime,...,basal_rate,one_hr_max,doses_per_24_hrs,duration,duration_interval,expiration_value,expiration_unit,expirationdate,dispensation,fill_quantity
0,10027602,28166872,24340150,,2201-10-30 12:00:00,,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:32:11,...,,,,,,,,,,
1,10027602,28166872,14435820,,2201-10-30 12:00:00,,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:54:34,...,,,,,,,,,,


In [47]:
# stoptime-starttime for a duration feature

# Convert to datetime
df_pharmacy['stoptime'] = pd.to_datetime(df_pharmacy['stoptime'], format='%Y/%m/%d %H:%M')
df_pharmacy['starttime'] = pd.to_datetime(df_pharmacy['starttime'], format='%Y/%m/%d %H:%M')


df_pharmacy['medication_duration'] = df_pharmacy['stoptime'] - df_pharmacy['starttime']

# Fill any non time values
df_pharmacy['medication_duration'] = df_pharmacy['medication_duration'].fillna(pd.Timedelta(0))

In [48]:
# verifiedtime - entertime for verification_delay feature 

# Convert to datetime
df_pharmacy['verifiedtime'] = pd.to_datetime(df_pharmacy['verifiedtime'], format='%Y/%m/%d %H:%M')
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'], format='%Y/%m/%d %H:%M')

df_pharmacy['verification_delay'] = df_pharmacy['verifiedtime'] - df_pharmacy['entertime']

# Fill any non time values
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].fillna(pd.Timedelta(0))

In [49]:
fill_value = [0] 

# Fill null values with the list
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(pd.Series([fill_value]*len(df_pharmacy)))

In [50]:
# Convert all categories to strings
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(lambda x: [str(item) for item in x])

mlb = MultiLabelBinarizer()

encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [51]:
df_pharmacy = df_pharmacy[df_pharmacy['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [52]:
# convert time to datetime
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'])

In [53]:
df_pharmacy['transfer_id'] = float('nan')

for index, row in df_pharmacy.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['entertime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_pharmacy.at[index, 'transfer_id'] = closest_id

df_pharmacy.dropna(subset=['transfer_id'], inplace=True)

In [54]:
df_pharmacy

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,starttime,stoptime,medication,proc_type,status,entertime,...,1,2,3,4,5,6,7,8,9,transfer_id
0,10027602,28166872,24340150,,2201-10-30 12:00:00,NaT,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:32:11,...,0,0,0,0,0,0,0,0,0,34302052.0
1,10027602,28166872,14435820,,2201-10-30 12:00:00,NaT,Midazolam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:54:34,...,0,0,0,0,0,0,0,0,0,34302052.0
2,10027602,28166872,40720238,,2201-10-30 12:00:00,NaT,Fentanyl Citrate,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:32:11,...,0,0,0,0,0,0,0,0,0,34302052.0
3,10027602,28166872,27168639,,2201-10-30 12:00:00,NaT,Fentanyl Citrate,Miscellaneous Charges,Inactive (Due to a change order),2201-10-30 12:54:34,...,0,0,0,0,0,0,0,0,0,34302052.0
4,10027602,28166872,62845687,,2201-10-31 12:00:00,NaT,Lorazepam,Miscellaneous Charges,Inactive (Due to a change order),2201-10-31 12:02:42,...,0,0,0,0,0,0,0,0,0,34302052.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15292,10040025,27996267,12370020,10040025-1667,2148-01-25 13:00:00,2148-01-31 08:00:00,Propofol,Unit Dose,Discontinued,2148-01-25 12:47:50,...,0,0,0,0,0,0,0,0,0,36762745.0
15293,10040025,27996267,41395270,10040025-1696,2148-01-26 10:00:00,2148-01-31 08:00:00,Dexmedetomidine,IV Piggyback,Discontinued,2148-01-26 09:33:18,...,0,0,0,0,0,0,0,0,0,36762745.0
15295,10014354,22508257,89432721,10014354-850,2148-05-11 03:00:00,2148-05-11 03:00:00,MoviPrep,Unit Dose,Discontinued,2148-05-11 02:23:23,...,0,0,1,0,1,0,1,0,0,38278621.0
15296,10014354,22508257,5777610,10014354-885,2148-05-11 04:00:00,2148-05-12 04:00:00,MoviPrep,Unit Dose,Expired,2148-05-11 03:37:33,...,0,0,1,0,1,0,1,0,0,38278621.0


drop: subject_id, pharmacy_id, poe_id, starttime, stoptime, entertime, verifiedtime, disp_sched, basal_rate, one_hr_max,
expirationdate, fill_quantity
Encode: proc_type, status
Impute with N/A and encode: infusion_type, sliding_scale, duration_interval, expiration_unit, dispensation, medication, route, frequency
Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value

In [55]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [56]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [57]:
# Impute with N/A and encode
df_pharmacy['infusion_type'] = df_pharmacy['infusion_type'].fillna('N/A')
df_pharmacy['sliding_scale'] = df_pharmacy['sliding_scale'].fillna('N/A')
df_pharmacy['duration_interval'] = df_pharmacy['duration_interval'].fillna('N/A')
df_pharmacy['expiration_unit'] = df_pharmacy['expiration_unit'].fillna('N/A')
df_pharmacy['dispensation'] = df_pharmacy['dispensation'].fillna('N/A')
df_pharmacy['medication'] = df_pharmacy['medication'].fillna('N/A')
df_pharmacy['route'] = df_pharmacy['route'].fillna('N/A')
df_pharmacy['frequency'] = df_pharmacy['frequency'].fillna('N/A')
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['infusion_type','sliding_scale','duration_interval','expiration_unit',
                                                  'dispensation','medication','route','frequency'])

In [58]:
# Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value
df_pharmacy['lockout_interval'] = df_pharmacy['lockout_interval'].fillna(0)
df_pharmacy['doses_per_24_hrs'] = df_pharmacy['doses_per_24_hrs'].fillna(0)
df_pharmacy['expiration_value'] = df_pharmacy['expiration_value'].fillna(0)
df_pharmacy['basal_rate'] = df_pharmacy['basal_rate'].fillna(0)
df_pharmacy['one_hr_max'] = df_pharmacy['one_hr_max'].fillna(0)

In [59]:
df_pharmacy['verification_delay'].value_counts()


0 days 00:00:00    8882
0 days 00:41:04       1
0 days 00:35:21       1
0 days 00:46:02       1
0 days 00:09:38       1
0 days 00:47:05       1
0 days 03:18:10       1
0 days 01:20:56       1
0 days 00:21:07       1
0 days 00:14:44       1
0 days 02:42:28       1
0 days 00:48:56       1
0 days 00:30:31       1
0 days 00:16:04       1
0 days 01:51:16       1
0 days 00:49:02       1
0 days 00:11:18       1
0 days 00:32:32       1
0 days 00:53:06       1
0 days 02:43:05       1
0 days 00:09:18       1
0 days 00:05:24       1
0 days 00:06:49       1
0 days 00:05:43       1
0 days 00:11:51       1
0 days 00:15:41       1
0 days 00:15:10       1
0 days 01:01:07       1
0 days 00:05:37       1
0 days 02:04:12       1
0 days 00:07:06       1
0 days 00:11:12       1
0 days 00:07:47       1
0 days 00:13:06       1
0 days 04:22:04       1
0 days 00:05:12       1
0 days 00:06:36       1
0 days 02:39:10       1
0 days 00:27:54       1
0 days 00:05:19       1
0 days 01:17:33       1
Name: verificati

In [60]:
# Function to convert timedelta to string
def timedelta_to_string(td):
    td_str = "{:02}:{:02}:{:02}".format(td.seconds // 3600, (td.seconds // 60) % 60, td.seconds % 60)
    if td.days:
        td_str = "{} day, {}".format(td.days, td_str)
    return td_str

# Apply the function to the whole column
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].apply(timedelta_to_string)

In [61]:
df_pharmacy['verification_delay'].value_counts()

00:00:00    8882
00:41:04       1
00:35:21       1
00:46:02       1
00:09:38       1
00:47:05       1
03:18:10       1
01:20:56       1
00:21:07       1
00:14:44       1
02:42:28       1
00:48:56       1
00:30:31       1
00:16:04       1
01:51:16       1
00:49:02       1
00:11:18       1
00:32:32       1
00:53:06       1
02:43:05       1
00:09:18       1
00:05:24       1
00:06:49       1
00:05:43       1
00:11:51       1
00:15:41       1
00:15:10       1
01:01:07       1
00:05:37       1
02:04:12       1
00:07:06       1
00:11:12       1
00:07:47       1
00:13:06       1
04:22:04       1
00:05:12       1
00:06:36       1
02:39:10       1
00:27:54       1
00:05:19       1
01:17:33       1
Name: verification_delay, dtype: int64

In [62]:
def convert_to_days_zero(duration_str):
    time_parts = duration_str.split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

In [63]:
df_pharmacy['verification_delay']= df_pharmacy['verification_delay'].apply(convert_to_days_zero)

#### Split into train and test

In [64]:
df_pharmacy = pd.merge(df_pharmacy, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_pharmacy.drop(columns=['transfer_id'], inplace=True)
data = df_pharmacy.drop(columns=['careunit','hadm_id'])
target = df_pharmacy['careunit']


# Split the dataset into training and testing sets
pharmacy_data_train, pharmacy_data_test, pharmacy_label_train, pharmacy_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", pharmacy_data_train.shape, pharmacy_label_train.shape)
print("Testing set shape:", pharmacy_data_test.shape, pharmacy_label_test.shape)

Training set shape: (7137, 661) (7137,)
Testing set shape: (1785, 661) (1785,)


In [65]:
# uncomment and run if changes are made

pharmacy_data_train.to_csv('pharmacy_data_train.csv', index=False)
pharmacy_data_test.to_csv('pharmacy_data_test.csv', index=False)

pharmacy_label_train.to_csv('pharmacy_label_train.csv', index=False)
pharmacy_label_test.to_csv('pharmacy_label_test.csv', index=False)

In [66]:
pharmacy_data_train

Unnamed: 0,lockout_interval,basal_rate,one_hr_max,doses_per_24_hrs,duration,expiration_value,medication_duration,verification_delay,Unnamed: 9,",",...,frequency_STAT,frequency_TID,frequency_TID W/MEALS,frequency_TID:PRN,frequency_TITRATE TO,frequency_TITRATE TO RASS,frequency_X1,frequency_X1 PRN,frequency_X1:PRN,frequency_X2 PRN
1922,0.0,0.0,0.0,3.0,,36.0,6 days 06:00:00,0.0,1,1,...,0,0,0,0,0,0,0,0,0,0
7029,0.0,0.0,0.0,0.0,,36.0,0 days 00:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
1108,0.0,0.0,0.0,0.0,,36.0,14 days 02:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
6601,0.0,0.0,0.0,0.0,,36.0,-2 days +00:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5474,0.0,0.0,0.0,0.0,4.0,0.0,0 days 14:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.0,0.0,0.0,1.0,,365.0,6 days 00:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5191,0.0,0.0,0.0,0.0,,0.0,0 days 20:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,0.0,0.0,0.0,0.0,,365.0,4 days 09:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0.0,0.0,0.0,0.0,,36.0,4 days 23:00:00,0.0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [67]:
# Fine

### prescriptions

In [68]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

In [69]:
df_prescriptions

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
0,10027602,28166872,27168639,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,
1,10027602,28166872,40720238,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,
2,10027602,28166872,62845687,,,,2201-10-31 12:00:00,,MAIN,Lorazepam,...,,,,,,,,,,
3,10027602,28166872,24340150,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,
4,10027602,28166872,14435820,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18082,10038081,20755971,33730068,10038081-378,378.0,P92KOD,2115-10-11 14:00:00,2115-10-13 07:00:00,MAIN,Artificial Tears,...,030016,2.305060e+07,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES
18083,10002428,23473524,87358294,10002428-780,780.0,P71IN4,2156-05-12 13:00:00,2156-05-22 18:00:00,MAIN,Artificial Tears,...,030016,2.305060e+07,0.3mL UD,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES
18084,10040025,27996267,81941017,10040025-1640,1640.0,P52ORO,2148-01-26 19:00:00,2148-01-26 18:00:00,MAIN,OxyCODONE (Immediate Release),...,046474,9.046446e+08,15mg Tablet,,5-10,mg,0.3333-0.6667,TAB,,PO/NG
18085,10014354,26228185,46019806,10014354-3105,3105.0,P748G6,2150-05-01 01:00:00,2150-05-01 09:00:00,MAIN,Carbamide Peroxide 6.5%,...,008120,7.811207e+10,15mL Bottle,,5-10,DROP,0.3333-0.6667,BTL,1.0,BOTH EARS


In [70]:
df_prescriptions = df_prescriptions[df_prescriptions['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [71]:
# convert time to datetime
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'])

In [72]:
df_prescriptions['transfer_id'] = float('nan')

for index, row in df_prescriptions.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['starttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_prescriptions.at[index, 'transfer_id'] = closest_id

df_prescriptions.dropna(subset=['transfer_id'], inplace=True)

In [73]:
df_prescriptions

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,transfer_id
0,10027602,28166872,27168639,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,32391858.0
1,10027602,28166872,40720238,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,32391858.0
2,10027602,28166872,62845687,,,,2201-10-31 12:00:00,,MAIN,Lorazepam,...,,,,,,,,,,34302052.0
3,10027602,28166872,24340150,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,32391858.0
4,10027602,28166872,14435820,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,32391858.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18080,10022880,27708593,96503181,10022880-192,192.0,P76RCG,2177-03-12 14:00:00,2177-03-19 19:00:00,MAIN,Artificial Tears Preserv. Free,...,2.305060e+07,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,1.0,BOTH EYES,30744153.0
18081,10003400,20214994,86647694,10003400-1372,1372.0,P61VWF,2137-02-27 05:00:00,2137-03-19 20:00:00,MAIN,Artificial Tears,...,2.305060e+07,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES,31973139.0
18083,10002428,23473524,87358294,10002428-780,780.0,P71IN4,2156-05-12 13:00:00,2156-05-22 18:00:00,MAIN,Artificial Tears,...,2.305060e+07,0.3mL UD,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES,30896594.0
18084,10040025,27996267,81941017,10040025-1640,1640.0,P52ORO,2148-01-26 19:00:00,2148-01-26 18:00:00,MAIN,OxyCODONE (Immediate Release),...,9.046446e+08,15mg Tablet,,5-10,mg,0.3333-0.6667,TAB,,PO/NG,36762745.0


Drop na and encode: dose_val_rx, form_val_disp, order_provider_id
Drop: subject_id, pharmacy_id, starttime, stoptime, form_rx (mostly null), poe_id
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [74]:
# Make a feature of stoptime-starttime called duration 

# Convert to datetime
df_prescriptions['stoptime'] = pd.to_datetime(df_prescriptions['stoptime'], format='%Y/%m/%d %H:%M')
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'], format='%Y/%m/%d %H:%M')

df_prescriptions['duration'] = df_prescriptions['stoptime'] - df_prescriptions['starttime']

# Fill any non time values
df_prescriptions['duration'] = df_prescriptions['duration'].fillna(pd.Timedelta(0))

In [75]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [76]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [77]:
# Impute with N/A and encode
df_prescriptions['formulary_drug_cd'] = df_prescriptions['formulary_drug_cd'].fillna('N/A')
df_prescriptions['gsn'] = df_prescriptions['gsn'].fillna('N/A')
df_prescriptions['prod_strength'] = df_prescriptions['prod_strength'].fillna('N/A')
df_prescriptions['route'] = df_prescriptions['route'].fillna('N/A')

# Impute with 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

df_prescriptions = pd.get_dummies(df_prescriptions, columns=['formulary_drug_cd','gsn','prod_strength',
                                                            'route','drug_type','drug','dose_unit_rx','form_unit_disp',
                                                            'dose_val_rx','form_val_disp','ndc'])

In [78]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [79]:
# df_prescriptions

In [80]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

In [81]:
df_prescriptions

Unnamed: 0,hadm_id,poe_seq,doses_per_24_hrs,transfer_id,duration,formulary_drug_cd_5000MLBAG,formulary_drug_cd_AA5D151000I,formulary_drug_cd_ACD3/1000I,formulary_drug_cd_ACE250,formulary_drug_cd_ACE500I,...,ndc_70860030005.0,ndc_70860077602.0,ndc_71019028507.0,ndc_76014000410.0,ndc_76045000905.0,ndc_76329301205.0,ndc_76329330101.0,ndc_76439034310.0,ndc_78112073623.0,ndc_87701071218.0
9,23831430,830.0,0.0,34492498.0,1 days 11:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,20626031,17.0,0.0,32604416.0,1 days 10:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,23831430,463.0,0.0,37253871.0,1 days 22:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,20297618,253.0,0.0,37726687.0,0 days 09:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,20626031,281.0,0.0,34529190.0,1 days 01:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18080,27708593,192.0,1.0,30744153.0,7 days 05:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18081,20214994,1372.0,0.0,31973139.0,20 days 15:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18083,23473524,780.0,0.0,30896594.0,10 days 05:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18084,27996267,1640.0,0.0,36762745.0,-1 days +23:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [82]:
df_prescriptions = pd.merge(df_prescriptions, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_prescriptions.drop(columns=['transfer_id'], inplace=True)
data = df_prescriptions.drop(columns=['careunit','hadm_id'])
target = df_prescriptions['careunit']

# Split the dataset into training and testing sets
prescriptions_data_train, prescriptions_data_test, prescriptions_label_train, prescriptions_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", prescriptions_data_train.shape, prescriptions_label_train.shape)
print("Testing set shape:", prescriptions_data_test.shape, prescriptions_label_test.shape)

Training set shape: (8682, 4071) (8682,)
Testing set shape: (2171, 4071) (2171,)


In [83]:
# uncomment and run if changes are made

prescriptions_data_train.to_csv('prescriptions_data_train.csv', index=False)
prescriptions_data_test.to_csv('prescriptions_data_test.csv', index=False)

prescriptions_label_train.to_csv('prescriptions_label_train.csv', index=False)
prescriptions_label_test.to_csv('prescriptions_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 4890 to 2874 or less

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "prescriptions_data_train.csv"
full_path = path + file

prescriptions_data_train = pd.read_csv(full_path)

file = "prescriptions_data_test.csv"
full_path = path + file

prescriptions_data_test = pd.read_csv(full_path)

file = "prescriptions_label_train.csv"
full_path = path + file

prescriptions_label_train = pd.read_csv(full_path)

file = "prescriptions_label_test.csv"
full_path = path + file

prescriptions_label_test = pd.read_csv(full_path)

### chartevents

In [None]:
file = "icu/chartevents.csv"
full_path = path + file

df_chart = pd.read_csv(full_path)

In [None]:
df_chart

In [None]:
df_chart = df_chart[df_chart['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [None]:
# convert time to datetime
df_chart['charttime'] = pd.to_datetime(df_chart['charttime'])

In [None]:
df_chart['transfer_id'] = float('nan')

for index, row in df_chart.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_chart.at[index, 'transfer_id'] = closest_id

df_chart.dropna(subset=['transfer_id'], inplace=True)

In [None]:
df_chart

Drop: subject_id, charttime, storetime, stay_id, caregiver_id (the person who documented the data)
Encode: value,itemid
Impute with 0: valuenum, warning
Impute with N/A and encode: valueuom

In [None]:
# Make a days_since_admission feature of charttime-admittime 

# Convert to datetime
df_chart['charttime'] = pd.to_datetime(df_chart['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_chart = df_chart.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_chart['days_since_admission'] = df_chart['charttime'] - df_chart['admittime']

# Fill any non time values
df_chart['days_since_admission'] = df_chart['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_chart = df_chart.drop(columns=['admittime'])

In [None]:
# Make a delay feature of storetime-charttime

# Convert to datetime
df_chart['storetime'] = pd.to_datetime(df_chart['storetime'], format='%Y-%m-%d %H:%M:%S')

df_chart['delay'] = df_chart['storetime'] - df_chart['charttime']

# Fill any non time values
df_chart['delay'] = df_chart['delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_chart = df_chart.drop(columns=['subject_id','charttime','storetime', 'stay_id','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_chart['valueuom'] = df_chart['valueuom'].fillna('N/A')
df_chart = pd.get_dummies(df_chart, columns=['valueuom','value','itemid'])

In [None]:
# Impute with 0
df_chart['valuenum'] = df_chart['valuenum'].fillna(0)
df_chart['warning'] = df_chart['warning'].fillna(0)

In [None]:
df_chart

#### Split into train and test

In [None]:
df_chart = pd.merge(df_chart, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_chart.drop(columns=['transfer_id'], inplace=True)
data = df_chart.drop(columns=['careunit','hadm_id'])
target = df_chart['careunit']


# Split the dataset into training and testing sets
chart_data_train, chart_data_test, chart_label_train, chart_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", chart_data_train.shape, chart_label_train.shape)
print("Testing set shape:", chart_data_test.shape, chart_label_test.shape)

In [None]:
# uncomment and run if changes are made

chart_data_train.to_csv('chart_data_train.csv', index=False)
chart_data_test.to_csv('chart_data_test.csv', index=False)

chart_label_train.to_csv('chart_label_train.csv', index=False)
chart_label_test.to_csv('chart_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### icustays

In [84]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

In [85]:
df_icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10018328,23786647,31269608,Neuro Stepdown,Neuro Stepdown,2154-04-24 23:03:44,2154-05-02 15:55:21,7.702512
1,10020187,24104168,37509585,Neuro Surgical Intensive Care Unit (Neuro SICU),Neuro Stepdown,2169-01-15 04:56:00,2169-01-20 15:47:50,5.452662
2,10020187,26842957,32554129,Neuro Intermediate,Neuro Intermediate,2170-02-24 18:18:46,2170-02-25 15:15:26,0.872685
3,10012853,27882036,31338022,Trauma SICU (TSICU),Trauma SICU (TSICU),2176-11-26 02:34:49,2176-11-29 20:58:54,3.766725
4,10020740,25826145,32145159,Trauma SICU (TSICU),Trauma SICU (TSICU),2150-06-03 20:12:32,2150-06-04 21:05:58,1.037106


In [86]:
df_icustays = df_icustays[df_icustays['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [87]:
# convert time to datetime
df_icustays['outtime'] = pd.to_datetime(df_icustays['outtime'])

In [88]:
df_icustays['transfer_id'] = float('nan')

for index, row in df_icustays.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['outtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_icustays.at[index, 'transfer_id'] = closest_id

df_icustays.dropna(subset=['transfer_id'], inplace=True)

In [89]:
df_icustays

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los,transfer_id
4,10020740,25826145,32145159,Trauma SICU (TSICU),Trauma SICU (TSICU),2150-06-03 20:12:32,2150-06-04 21:05:58,1.037106,33082837.0
5,10039708,23819016,38559363,Trauma SICU (TSICU),Trauma SICU (TSICU),2140-06-18 01:41:00,2140-06-19 21:47:16,1.837685,35921271.0
11,10021487,28998349,38197705,Trauma SICU (TSICU),Trauma SICU (TSICU),2116-12-03 01:02:00,2116-12-18 17:34:03,15.688924,34346963.0
13,10017492,27417763,36035031,Trauma SICU (TSICU),Trauma SICU (TSICU),2116-06-27 17:35:34,2116-06-27 20:26:18,0.118565,32758321.0
14,10017492,27417763,39543480,Trauma SICU (TSICU),Trauma SICU (TSICU),2116-06-26 20:35:09,2116-06-27 15:44:27,0.798125,36035031.0
15,10018501,28479513,35446858,Trauma SICU (TSICU),Trauma SICU (TSICU),2141-07-31 00:01:00,2141-08-01 22:36:21,1.941215,30558231.0
16,10008454,20291550,31959184,Trauma SICU (TSICU),Trauma SICU (TSICU),2110-11-30 17:11:36,2110-12-05 16:48:24,4.983889,37921160.0
17,10026354,24547356,36091287,Trauma SICU (TSICU),Trauma SICU (TSICU),2119-10-26 08:33:32,2119-10-27 17:50:50,1.387014,39113170.0
18,10029291,22205327,36059427,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2123-02-20 04:13:00,2123-02-26 12:03:56,6.327037,35146796.0
25,10026255,22059910,31248398,Coronary Care Unit (CCU),Coronary Care Unit (CCU),2201-07-07 19:40:00,2201-07-08 15:43:15,0.83559,36929093.0


Drop: subject_id, stay_id, intime, outtime
Encode: first_careunit, last_careunit

In [90]:
# make a feature called days_since_admission using intime-admittime

# Convert to datetime
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_icustays = df_icustays.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_icustays['days_since_admission'] = df_icustays['intime'] - df_icustays['admittime']

# Fill any non time values
df_icustays['days_since_admission'] = df_icustays['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_icustays = df_icustays.drop(columns=['admittime'])

In [91]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [92]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

In [93]:
df_icustays

Unnamed: 0,hadm_id,icu_los,transfer_id,days_since_admission,first_careunit_Cardiac Vascular Intensive Care Unit (CVICU),first_careunit_Coronary Care Unit (CCU),first_careunit_Medical Intensive Care Unit (MICU),first_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),first_careunit_Surgical Intensive Care Unit (SICU),first_careunit_Trauma SICU (TSICU),last_careunit_Cardiac Vascular Intensive Care Unit (CVICU),last_careunit_Coronary Care Unit (CCU),last_careunit_Medical Intensive Care Unit (MICU),last_careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),last_careunit_Surgical Intensive Care Unit (SICU),last_careunit_Trauma SICU (TSICU)
0,25826145,1.037106,33082837.0,0 days 00:00:32,0,0,0,0,0,1,0,0,0,0,0,1
1,23819016,1.837685,35921271.0,0 days 01:19:00,0,0,0,0,0,1,0,0,0,0,0,1
2,28998349,15.688924,34346963.0,0 days 00:39:00,0,0,0,0,0,1,0,0,0,0,0,1
3,27417763,0.118565,32758321.0,0 days 23:10:34,0,0,0,0,0,1,0,0,0,0,0,1
4,27417763,0.798125,36035031.0,0 days 02:10:09,0,0,0,0,0,1,0,0,0,0,0,1
5,28479513,1.941215,30558231.0,0 days 01:27:00,0,0,0,0,0,1,0,0,0,0,0,1
6,20291550,4.983889,37921160.0,0 days 10:40:36,0,0,0,0,0,1,0,0,0,0,0,1
7,24547356,1.387014,39113170.0,0 days 01:22:32,0,0,0,0,0,1,0,0,0,0,0,1
8,22205327,6.327037,35146796.0,0 days 02:14:00,0,1,0,0,0,0,0,1,0,0,0,0
9,22059910,0.83559,36929093.0,0 days 01:25:00,0,1,0,0,0,0,0,1,0,0,0,0


#### Split into train and test

In [94]:
df_icustays = pd.merge(df_icustays, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_icustays.drop(columns=['transfer_id'], inplace=True)
data = df_icustays.drop(columns=['careunit','hadm_id'])
target = df_icustays['careunit']

# Split the dataset into training and testing sets
icustays_data_train, icustays_data_test, icustays_label_train, icustays_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", icustays_data_train.shape, icustays_label_train.shape)
print("Testing set shape:", icustays_data_test.shape, icustays_label_test.shape)

Training set shape: (36, 14) (36,)
Testing set shape: (10, 14) (10,)


In [95]:
# uncomment and run if changes are made

icustays_data_train.to_csv('icustays_data_train.csv', index=False)
icustays_data_test.to_csv('icustays_data_test.csv', index=False)

icustays_label_train.to_csv('icustays_label_train.csv', index=False)
icustays_label_test.to_csv('icustays_label_test.csv', index=False)

#### Dimensionality reduction

In [96]:
# Fine

### ingredientevents

In [97]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

In [98]:
df_ingredient

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid,statusdescription,originalamount,originalrate
0,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,227074,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000
1,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,220490,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000
2,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,220490,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000
3,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,226509,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000
4,10005817,20626031,32604416,92805,2132-12-15 16:35:00,2132-12-15 18:00:00,2132-12-15 16:42:00,220490,38.852669,ml,27.425413,mL/hour,1386365,3042892,ChangeDose/Rate,0,47.080292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,10019003,29279905,34107647,68979,2153-03-28 21:58:00,2153-03-28 22:58:00,2153-03-28 23:22:00,227074,49.999999,ml,50.000000,mL/hour,6547485,6547485,FinishedRunning,0,50.000000
25724,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,227075,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000
25725,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,220490,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000
25726,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,220490,500.000000,ml,,,9142525,9142525,FinishedRunning,0,500.000000


In [99]:
df_ingredient = df_ingredient[df_ingredient['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [100]:
# convert time to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'])

In [101]:
df_ingredient['transfer_id'] = float('nan')

for index, row in df_ingredient.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_ingredient.at[index, 'transfer_id'] = closest_id

df_ingredient.dropna(subset=['transfer_id'], inplace=True)

In [102]:
df_ingredient

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid,statusdescription,originalamount,originalrate,transfer_id
0,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,227074,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000,34529190.0
1,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,220490,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000,34529190.0
2,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,220490,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000,34529190.0
3,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,226509,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000,34529190.0
4,10005817,20626031,32604416,92805,2132-12-15 16:35:00,2132-12-15 18:00:00,2132-12-15 16:42:00,220490,38.852669,ml,27.425413,mL/hour,1386365,3042892,ChangeDose/Rate,0,47.080292,34529190.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,10019003,29279905,34107647,68979,2153-03-28 21:58:00,2153-03-28 22:58:00,2153-03-28 23:22:00,227074,49.999999,ml,50.000000,mL/hour,6547485,6547485,FinishedRunning,0,50.000000,32636269.0
25724,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,227075,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000,32636269.0
25725,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,220490,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000,32636269.0
25726,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,220490,500.000000,ml,,,9142525,9142525,FinishedRunning,0,500.000000,32636269.0


Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [103]:
# Make a duration feature of endtime-starttime 

# Convert to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')


df_ingredient['duration'] = df_ingredient['endtime'] - df_ingredient['starttime']

# Fill any non time values
df_ingredient['duration'] = df_ingredient['duration'].fillna(pd.Timedelta(0))

In [104]:
# make a recording_delay feature of storetime-endtime

# Convert to datetime
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']

# Fill any non time values
df_ingredient['recording_delay'] = df_ingredient['recording_delay'].fillna(pd.Timedelta(0))

In [105]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [106]:
# Impute with N/A and encode
df_ingredient['rateuom'] = df_ingredient['rateuom'].fillna('N/A')
df_ingredient['linkorderid'] = df_ingredient['linkorderid'].fillna('N/A')
df_ingredient = pd.get_dummies(df_ingredient, columns=['rateuom','amountuom','statusdescription','itemid','linkorderid'])

In [107]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

In [108]:
df_ingredient

Unnamed: 0,hadm_id,amount,rate,originalrate,transfer_id,duration,recording_delay,rateuom_N/A,rateuom_grams/hour,rateuom_mL/hour,...,linkorderid_9985393,linkorderid_9986202,linkorderid_9986595,linkorderid_9988568,linkorderid_9989506,linkorderid_9990254,linkorderid_9990509,linkorderid_9993006,linkorderid_9993329,linkorderid_9996112
0,20626031,49.999999,50.000000,50.000000,34529190.0,0 days 01:00:00,0 days 00:01:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,20626031,49.999999,50.000000,50.000000,34529190.0,0 days 01:00:00,0 days 00:01:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,20626031,249.999990,249.999985,250.000000,34529190.0,0 days 01:00:00,-1 days +23:48:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,20626031,249.999990,249.999985,250.000000,34529190.0,0 days 01:00:00,-1 days +23:48:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,20626031,38.852669,27.425413,47.080292,34529190.0,0 days 01:25:00,-1 days +22:42:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,29279905,49.999999,50.000000,50.000000,32636269.0,0 days 01:00:00,0 days 00:24:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
25724,29279905,120.000000,0.000000,120.000000,32636269.0,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25725,29279905,120.000000,0.000000,120.000000,32636269.0,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25726,29279905,500.000000,0.000000,500.000000,32636269.0,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [109]:
df_ingredient = pd.merge(df_ingredient, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_ingredient.drop(columns=['transfer_id'], inplace=True)
data = df_ingredient.drop(columns=['careunit','hadm_id'])
target = df_ingredient['careunit']

# Split the dataset into training and testing sets
ingredient_data_train, ingredient_data_test, ingredient_label_train, ingredient_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", ingredient_data_train.shape, ingredient_label_train.shape)
print("Testing set shape:", ingredient_data_test.shape, ingredient_label_test.shape)

Training set shape: (15698, 6032) (15698,)
Testing set shape: (3925, 6032) (3925,)


In [110]:
# uncomment and run if changes are made

ingredient_data_train.to_csv('ingredient_data_train.csv', index=False)
ingredient_data_test.to_csv('ingredient_data_test.csv', index=False)

ingredient_label_train.to_csv('ingredient_label_train.csv', index=False)
ingredient_label_test.to_csv('ingredient_label_test.csv', index=False)

#### Dimensionality reduction

In [111]:
# Need to reduce from 7727 to 4116

In [112]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "ingredient_data_train.csv"
full_path = path + file

ingredient_data_train = pd.read_csv(full_path)

file = "ingredient_data_test.csv"
full_path = path + file

ingredient_data_test = pd.read_csv(full_path)

file = "ingredient_label_train.csv"
full_path = path + file

ingredient_label_train = pd.read_csv(full_path)

file = "ingredient_label_test.csv"
full_path = path + file

ingredient_label_test = pd.read_csv(full_path)

### inputevents

In [174]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

In [175]:
df_input

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercomponenttypedescription,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate
0,10005817,20626031,32604416,4793,2132-12-16 19:50:00,2132-12-16 19:51:00,2132-12-16 19:50:00,225798,1.000000,dose,...,Main order parameter,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000
1,10005817,20626031,32604416,92805,2132-12-15 20:15:00,2132-12-15 20:16:00,2132-12-15 20:11:00,225798,1.000000,dose,...,Main order parameter,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000
2,10005817,20626031,32604416,20310,2132-12-17 09:15:00,2132-12-17 09:16:00,2132-12-17 09:28:00,225798,1.000000,dose,...,Main order parameter,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000
3,10005817,20626031,32604416,79166,2132-12-16 09:36:00,2132-12-16 09:37:00,2132-12-16 09:37:00,225798,1.000000,dose,...,Main order parameter,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000
4,10005817,20626031,32604416,92805,2132-12-15 20:10:00,2132-12-15 21:10:00,2132-12-15 20:10:00,221456,2.000000,grams,...,Additives ...,Continuous IV,91.0,100.0,ml,0,0,FinishedRunning,2.000000,0.033333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20399,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,221385,0.500000,mg,...,Main order parameter,Drug Push,96.0,,,0,0,FinishedRunning,0.500000,0.500000
20400,10019003,29279905,34107647,83144,2153-03-30 00:00:00,2153-03-30 00:01:00,2153-03-30 01:24:00,221385,0.500000,mg,...,Main order parameter,Drug Push,96.0,,,0,0,FinishedRunning,0.500000,0.500000
20401,10019003,29279905,34107647,68979,2153-03-28 02:57:00,2153-03-28 04:48:00,2153-03-28 02:58:00,221906,0.319770,mg,...,Main order parameter,Continuous Med,96.0,250.0,ml,0,0,ChangeDose/Rate,8.000000,0.030000
20402,10019003,29279905,34107647,68979,2153-03-28 06:05:00,2153-03-28 08:00:00,2153-03-28 06:11:00,221906,0.110377,mg,...,Main order parameter,Continuous Med,96.0,250.0,ml,0,0,Paused,7.532536,0.010000


In [176]:
df_input = df_input[df_input['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [177]:
# convert time to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'])

In [178]:
df_input['transfer_id'] = float('nan')

for index, row in df_input.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_input.at[index, 'transfer_id'] = closest_id

df_input.dropna(subset=['transfer_id'], inplace=True)

In [179]:
df_input

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,...,ordercategorydescription,patientweight,totalamount,totalamountuom,isopenbag,continueinnextdept,statusdescription,originalamount,originalrate,transfer_id
0,10005817,20626031,32604416,4793,2132-12-16 19:50:00,2132-12-16 19:51:00,2132-12-16 19:50:00,225798,1.000000,dose,...,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000,34529190.0
1,10005817,20626031,32604416,92805,2132-12-15 20:15:00,2132-12-15 20:16:00,2132-12-15 20:11:00,225798,1.000000,dose,...,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000,34529190.0
2,10005817,20626031,32604416,20310,2132-12-17 09:15:00,2132-12-17 09:16:00,2132-12-17 09:28:00,225798,1.000000,dose,...,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000,34529190.0
3,10005817,20626031,32604416,79166,2132-12-16 09:36:00,2132-12-16 09:37:00,2132-12-16 09:37:00,225798,1.000000,dose,...,Drug Push,91.0,500.0,ml,0,0,FinishedRunning,1.000000,1.000000,34529190.0
4,10005817,20626031,32604416,92805,2132-12-15 20:10:00,2132-12-15 21:10:00,2132-12-15 20:10:00,221456,2.000000,grams,...,Continuous IV,91.0,100.0,ml,0,0,FinishedRunning,2.000000,0.033333,34529190.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20399,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,221385,0.500000,mg,...,Drug Push,96.0,,,0,0,FinishedRunning,0.500000,0.500000,32636269.0
20400,10019003,29279905,34107647,83144,2153-03-30 00:00:00,2153-03-30 00:01:00,2153-03-30 01:24:00,221385,0.500000,mg,...,Drug Push,96.0,,,0,0,FinishedRunning,0.500000,0.500000,32636269.0
20401,10019003,29279905,34107647,68979,2153-03-28 02:57:00,2153-03-28 04:48:00,2153-03-28 02:58:00,221906,0.319770,mg,...,Continuous Med,96.0,250.0,ml,0,0,ChangeDose/Rate,8.000000,0.030000,32636269.0
20402,10019003,29279905,34107647,68979,2153-03-28 06:05:00,2153-03-28 08:00:00,2153-03-28 06:11:00,221906,0.110377,mg,...,Continuous Med,96.0,250.0,ml,0,0,Paused,7.532536,0.010000,32636269.0


Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [180]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [181]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [182]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [183]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [184]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [185]:
df_input = df_input.dropna()

In [186]:
df_input

Unnamed: 0,hadm_id,amount,rate,patientweight,totalamount,isopenbag,originalamount,originalrate,transfer_id,duration,...,itemid_229070,itemid_229071,itemid_229072,itemid_229295,itemid_229296,itemid_229420,itemid_229615,itemid_229616,itemid_229639,itemid_229654
0,20626031,1.000000,0.000000,91.0,500.0,0,1.000000,1.000000,34529190.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
1,20626031,1.000000,0.000000,91.0,500.0,0,1.000000,1.000000,34529190.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
2,20626031,1.000000,0.000000,91.0,500.0,0,1.000000,1.000000,34529190.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
3,20626031,1.000000,0.000000,91.0,500.0,0,1.000000,1.000000,34529190.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
4,20626031,2.000000,0.000000,91.0,100.0,0,2.000000,0.033333,34529190.0,0 days 01:00:00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20399,29279905,0.500000,0.000000,96.0,0.0,0,0.500000,0.500000,32636269.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
20400,29279905,0.500000,0.000000,96.0,0.0,0,0.500000,0.500000,32636269.0,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
20401,29279905,0.319770,0.030008,96.0,250.0,0,8.000000,0.030000,32636269.0,0 days 01:51:00,...,0,0,0,0,0,0,0,0,0,0
20402,29279905,0.110377,0.009998,96.0,250.0,0,7.532536,0.010000,32636269.0,0 days 01:55:00,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [187]:
df_input = pd.merge(df_input, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_input.drop(columns=['transfer_id'], inplace=True)
data = df_input.drop(columns=['careunit','hadm_id'])
target = df_input['careunit']

# Split the dataset into training and testing sets
input_data_train, input_data_test, input_label_train, input_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", input_data_train.shape, input_label_train.shape)
print("Testing set shape:", input_data_test.shape, input_label_test.shape)

Training set shape: (12633, 210) (12633,)
Testing set shape: (3159, 210) (3159,)


In [188]:
# uncomment and run if changes are made

input_data_train.to_csv('input_data_train.csv', index=False)
input_data_test.to_csv('input_data_test.csv', index=False)

input_label_train.to_csv('input_label_train.csv', index=False)
input_label_test.to_csv('input_label_test.csv', index=False)

In [189]:
input_data_train

Unnamed: 0,amount,rate,patientweight,totalamount,isopenbag,originalamount,originalrate,duration,recording_delay,rateuom_N/A,...,itemid_229070,itemid_229071,itemid_229072,itemid_229295,itemid_229296,itemid_229420,itemid_229615,itemid_229616,itemid_229639,itemid_229654
5828,312.000015,50.000004,80.0,100.0,0,1000.000000,50.000004,0 days 01:18:00,-1 days +22:42:00,0,...,0,0,0,0,0,0,0,0,0,0
5146,50.000000,0.000000,69.9,50.0,0,50.000000,0.000000,0 days 00:01:00,-1 days +23:59:00,1,...,0,0,0,0,0,0,0,0,0,0
2133,50.000000,0.000000,43.0,50.0,0,50.000000,0.000000,0 days 00:01:00,-1 days +23:59:00,1,...,0,0,0,0,0,0,0,0,0,0
10547,0.273831,100.182121,112.5,50.0,0,2.224043,100.000000,0 days 02:44:00,-1 days +21:33:00,0,...,0,0,0,0,0,0,0,0,0,0
12574,99.999997,0.000000,143.0,0.0,0,100.000000,100.000000,0 days 00:01:00,-1 days +23:59:00,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,0.075499,0.040004,69.9,250.0,0,3.056317,0.040000,0 days 00:27:00,0 days 00:31:00,0,...,0,0,0,0,0,0,0,0,0,0
13418,240.000000,0.000000,74.8,240.0,0,240.000000,240.000000,0 days 00:01:00,0 days 00:59:00,1,...,0,0,0,0,0,0,0,0,0,0
5390,83.749998,25.000000,91.0,500.0,0,500.000000,25.000000,0 days 03:21:00,-1 days +20:43:00,0,...,0,0,0,0,0,0,0,0,0,0
860,2.061250,7.274999,97.0,250.0,0,234.915100,7.274500,0 days 00:17:00,-1 days +23:43:00,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [190]:
# Fine

### outputevents

In [191]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

In [192]:
df_output

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom
0,10002428,23473524,35479615,29441,2156-05-15 18:00:00,2156-05-15 17:42:00,226583,600,ml
1,10002428,23473524,35479615,29441,2156-05-15 12:00:00,2156-05-15 12:08:00,226559,60,ml
2,10002428,23473524,35479615,29441,2156-05-15 13:00:00,2156-05-15 13:00:00,226559,45,ml
3,10002428,23473524,35479615,29441,2156-05-15 08:00:00,2156-05-15 08:39:00,226559,125,ml
4,10002428,23473524,35479615,29441,2156-05-15 14:00:00,2156-05-15 13:56:00,226559,60,ml
...,...,...,...,...,...,...,...,...,...
9357,10016742,29281842,37057036,82943,2178-07-07 11:00:00,2178-07-07 13:25:00,226559,230,ml
9358,10016742,29281842,37057036,82943,2178-07-07 15:00:00,2178-07-07 17:40:00,226559,120,ml
9359,10016742,29281842,37057036,82943,2178-07-07 09:00:00,2178-07-07 09:25:00,226559,250,ml
9360,10016742,29281842,37057036,82943,2178-07-07 18:00:00,2178-07-07 18:18:00,226559,80,ml


In [193]:
df_output = df_output[df_output['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [194]:
# convert time to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'])

In [195]:
df_output['transfer_id'] = float('nan')

for index, row in df_output.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_output.at[index, 'transfer_id'] = closest_id

df_output.dropna(subset=['transfer_id'], inplace=True)

In [196]:
df_output

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom,transfer_id
0,10002428,23473524,35479615,29441,2156-05-15 18:00:00,2156-05-15 17:42:00,226583,600,ml,30896594.0
1,10002428,23473524,35479615,29441,2156-05-15 12:00:00,2156-05-15 12:08:00,226559,60,ml,30896594.0
2,10002428,23473524,35479615,29441,2156-05-15 13:00:00,2156-05-15 13:00:00,226559,45,ml,30896594.0
3,10002428,23473524,35479615,29441,2156-05-15 08:00:00,2156-05-15 08:39:00,226559,125,ml,30896594.0
4,10002428,23473524,35479615,29441,2156-05-15 14:00:00,2156-05-15 13:56:00,226559,60,ml,30896594.0
...,...,...,...,...,...,...,...,...,...,...
9351,10018328,23786647,31269608,75774,2154-04-27 02:00:00,2154-04-27 02:43:00,226559,30,ml,36211955.0
9352,10018328,23786647,31269608,75774,2154-04-26 22:00:00,2154-04-26 22:46:00,226559,110,ml,36211955.0
9353,10018328,23786647,31269608,75774,2154-04-27 08:00:00,2154-04-27 07:46:00,226559,100,ml,36211955.0
9354,10018328,23786647,31269608,75774,2154-04-27 01:00:00,2154-04-27 02:43:00,226559,30,ml,36211955.0


Drop: subject_id, charttime, storetime, valueuom, stay_id, caregiver_id'
Encode: itemid

In [197]:
# Make a days_since_admission feature using charttime-admittime 

# Convert to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_output = df_output.merge(df_admittime, on='hadm_id', how='left')

df_output['days_since_admission'] = df_output['charttime'] - df_output['admittime']

# Fill any non time values
df_output['days_since_admission'] = df_output['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_output = df_output.drop(columns=['admittime'])

In [198]:
# Make a recording_delay feature using storetime-charttime

# Convert to datetime
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

df_output['recording_delay'] = df_output['storetime'] - df_output['charttime']

# Fill any non time values
df_output['recording_delay'] = df_output['recording_delay'].fillna(pd.Timedelta(0))

In [199]:
# Drop 
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','storetime','valueuom','caregiver_id'])

In [200]:
#Encode
df_output = pd.get_dummies(df_output, columns=['itemid'])

In [201]:
df_output

Unnamed: 0,hadm_id,value,transfer_id,days_since_admission,recording_delay,itemid_226559,itemid_226560,itemid_226561,itemid_226567,itemid_226571,...,itemid_226626,itemid_226627,itemid_226631,itemid_226632,itemid_226633,itemid_227488,itemid_227489,itemid_227510,itemid_227511,itemid_227701
0,23473524,600,30896594.0,4 days 03:11:00,-1 days +23:42:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,23473524,60,30896594.0,3 days 21:11:00,0 days 00:08:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,23473524,45,30896594.0,3 days 22:11:00,0 days 00:00:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,23473524,125,30896594.0,3 days 17:11:00,0 days 00:39:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,23473524,60,30896594.0,3 days 23:11:00,-1 days +23:56:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7864,23786647,30,36211955.0,2 days 22:45:00,0 days 00:43:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7865,23786647,110,36211955.0,2 days 18:45:00,0 days 00:46:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7866,23786647,100,36211955.0,3 days 04:45:00,-1 days +23:46:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7867,23786647,30,36211955.0,2 days 21:45:00,0 days 01:43:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [202]:
df_output = pd.merge(df_output, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_output.drop(columns=['transfer_id'], inplace=True)
data = df_output.drop(columns=['careunit','hadm_id'])
target = df_output['careunit']

# Split the dataset into training and testing sets
output_data_train, output_data_test, output_label_train, output_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", output_data_train.shape, output_label_train.shape)
print("Testing set shape:", output_data_test.shape, output_label_test.shape)

Training set shape: (6295, 38) (6295,)
Testing set shape: (1574, 38) (1574,)


In [203]:
# uncomment and run if changes are made

output_data_train.to_csv('output_data_train.csv', index=False)
output_data_test.to_csv('output_data_test.csv', index=False)

output_label_train.to_csv('output_label_train.csv', index=False)
output_label_test.to_csv('output_label_test.csv', index=False)

In [204]:
output_data_train

Unnamed: 0,value,days_since_admission,recording_delay,itemid_226559,itemid_226560,itemid_226561,itemid_226567,itemid_226571,itemid_226573,itemid_226575,...,itemid_226626,itemid_226627,itemid_226631,itemid_226632,itemid_226633,itemid_227488,itemid_227489,itemid_227510,itemid_227511,itemid_227701
7750,350,2 days 08:31:00,0 days 00:39:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,45,0 days 20:45:00,0 days 00:08:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5521,300,2 days 03:02:00,0 days 00:15:00,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3548,50,6 days 03:26:00,0 days 02:57:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
415,200,4 days 13:26:00,0 days 00:53:00,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5226,35,1 days 22:01:00,0 days 00:10:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,150,5 days 16:32:00,0 days 01:41:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,75,5 days 20:26:00,0 days 00:44:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7603,50,0 days 20:04:00,0 days 00:02:00,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [205]:
# Fine

### procedureevents

In [206]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

In [207]:
df_procedure_events

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,value,valueuom,...,orderid,linkorderid,ordercategoryname,ordercategorydescription,patientweight,isopenbag,continueinnextdept,statusdescription,ORIGINALAMOUNT,ORIGINALRATE
0,10027445,26275841,34499716,10712.0,2142-07-31 01:54:00,2142-08-02 10:44:00,2142-08-02 10:44:00,225792,3410.0,min,...,532221,532221,Ventilation,ContinuousProcess,103.0,1,0,FinishedRunning,3410.0,1
1,10027445,26275841,34499716,80518.0,2142-07-31 08:18:00,2142-08-03 15:10:00,2142-08-03 15:23:00,224263,4732.0,min,...,401769,401769,Invasive Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4732.0,1
2,10027445,26275841,34499716,96407.0,2142-07-31 06:00:00,2142-08-03 06:03:00,2142-08-03 08:16:00,224275,4323.0,min,...,9714245,9714245,Peripheral Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4323.0,1
3,10027445,26275841,34499716,96407.0,2142-07-31 02:00:00,2142-08-03 05:57:00,2142-08-03 08:15:00,224275,4557.0,min,...,2870557,2870557,Peripheral Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4557.0,1
4,10027445,26275841,34499716,,2142-08-03 08:00:00,2142-08-03 21:06:00,2142-08-03 21:06:00.090,224277,786.0,min,...,4920092,4920092,Peripheral Lines,ContinuousProcess,103.0,1,0,FinishedRunning,786.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463,10016742,27568122,30425410,80518.0,2178-07-23 17:02:00,2178-07-23 17:03:00,2178-07-23 17:02:00,225400,1.0,,...,9107536,9107536,Procedures,Task,60.0,0,0,FinishedRunning,1.0,0
1464,10016742,27568122,30425410,,2178-07-22 09:30:00,2178-07-25 16:42:00,2178-07-25 16:42:46.113,229351,4752.0,min,...,5156714,5156714,Tubes,ContinuousProcess,60.0,1,0,FinishedRunning,4752.0,1
1465,10000032,29079034,39553978,,2180-07-23 14:24:00,2180-07-23 23:50:00,2180-07-23 23:50:49.983,224275,566.0,min,...,6497934,6497934,Peripheral Lines,ContinuousProcess,39.4,1,0,FinishedRunning,566.0,1
1466,10000032,29079034,39553978,,2180-07-23 14:24:00,2180-07-23 23:50:00,2180-07-23 23:50:49.983,224277,566.0,min,...,9643097,9643097,Peripheral Lines,ContinuousProcess,39.4,1,0,FinishedRunning,566.0,1


In [208]:
df_procedure_events = df_procedure_events[df_procedure_events['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [209]:
# convert time to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'])

In [210]:
df_procedure_events['transfer_id'] = float('nan')

for index, row in df_procedure_events.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_procedure_events.at[index, 'transfer_id'] = closest_id

df_procedure_events.dropna(subset=['transfer_id'], inplace=True)

In [211]:
# for index, row in df_procedure_events.iterrows():
#     df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

#     filtered_rows = df_transfers_subset[(df_transfers_subset['intime'] <= row['endtime']) & (row['endtime'] <= df_transfers_subset['outtime'])]
    
#     # If there are matching rows in DataFrame 2, assign 'transfer_id' from DataFrame 2 to DataFrame 1
#     if not filtered_rows.empty:
#         df_procedure_events.at[index, 'transfer_id'] = filtered_rows['transfer_id'].iloc[0]

In [212]:
df_procedure_events

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,value,valueuom,...,linkorderid,ordercategoryname,ordercategorydescription,patientweight,isopenbag,continueinnextdept,statusdescription,ORIGINALAMOUNT,ORIGINALRATE,transfer_id
0,10027445,26275841,34499716,10712.0,2142-07-31 01:54:00,2142-08-02 10:44:00,2142-08-02 10:44:00,225792,3410.0,min,...,532221,Ventilation,ContinuousProcess,103.0,1,0,FinishedRunning,3410.0,1,33419940.0
1,10027445,26275841,34499716,80518.0,2142-07-31 08:18:00,2142-08-03 15:10:00,2142-08-03 15:23:00,224263,4732.0,min,...,401769,Invasive Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4732.0,1,33419940.0
2,10027445,26275841,34499716,96407.0,2142-07-31 06:00:00,2142-08-03 06:03:00,2142-08-03 08:16:00,224275,4323.0,min,...,9714245,Peripheral Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4323.0,1,33419940.0
3,10027445,26275841,34499716,96407.0,2142-07-31 02:00:00,2142-08-03 05:57:00,2142-08-03 08:15:00,224275,4557.0,min,...,2870557,Peripheral Lines,ContinuousProcess,103.0,1,0,FinishedRunning,4557.0,1,33419940.0
5,10027445,26275841,34499716,80518.0,2142-07-31 09:23:00,2142-07-31 09:24:00,2142-07-31 09:23:00,228129,1.0,,...,1796876,Communication,Task,103.0,0,0,FinishedRunning,1.0,0,33419940.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,10014354,27487226,34600477,98270.0,2148-06-30 03:00:00,2148-06-30 03:01:00,2148-06-30 04:41:00,225402,1.0,,...,4416054,Procedures,Task,127.7,0,0,FinishedRunning,1.0,0,35186527.0
1459,10014354,27487226,34600477,,2148-06-30 03:38:00,2148-07-01 20:59:00,2148-07-01 20:59:05.277,229532,2481.0,min,...,3819281,Invasive Lines,ContinuousProcess,127.7,1,0,FinishedRunning,2481.0,1,36231959.0
1465,10000032,29079034,39553978,,2180-07-23 14:24:00,2180-07-23 23:50:00,2180-07-23 23:50:49.983,224275,566.0,min,...,6497934,Peripheral Lines,ContinuousProcess,39.4,1,0,FinishedRunning,566.0,1,35888873.0
1466,10000032,29079034,39553978,,2180-07-23 14:24:00,2180-07-23 23:50:00,2180-07-23 23:50:49.983,224277,566.0,min,...,9643097,Peripheral Lines,ContinuousProcess,39.4,1,0,FinishedRunning,566.0,1,35888873.0


Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
MAKE DURATION FEATURE 

In [213]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [214]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [215]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [216]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

In [217]:
df_procedure_events

Unnamed: 0,hadm_id,value,patientweight,isopenbag,ORIGINALAMOUNT,ORIGINALRATE,transfer_id,duration,recording_delay,location_Left Accessory Basilic,...,itemid_228128,itemid_228129,itemid_228286,itemid_228715,itemid_229351,itemid_229380,itemid_229526,itemid_229532,itemid_229581,itemid_229586
0,26275841,3410.0,103.0,1,3410.0,1,33419940.0,2 days 08:50:00,0 days 00:00:00,0,...,0,0,0,0,0,0,0,0,0,0
1,26275841,4732.0,103.0,1,4732.0,1,33419940.0,3 days 06:52:00,0 days 00:13:00,0,...,0,0,0,0,0,0,0,0,0,0
2,26275841,4323.0,103.0,1,4323.0,1,33419940.0,3 days 00:03:00,0 days 02:13:00,0,...,0,0,0,0,0,0,0,0,0,0
3,26275841,4557.0,103.0,1,4557.0,1,33419940.0,3 days 03:57:00,0 days 02:18:00,0,...,0,0,0,0,0,0,0,0,0,0
5,26275841,1.0,103.0,0,1.0,0,33419940.0,0 days 00:01:00,-1 days +23:59:00,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,27487226,1.0,127.7,0,1.0,0,35186527.0,0 days 00:01:00,0 days 01:40:00,0,...,0,0,0,0,0,0,0,0,0,0
1459,27487226,2481.0,127.7,1,2481.0,1,36231959.0,1 days 17:21:00,0 days 00:00:05.277000,0,...,0,0,0,0,0,0,0,1,0,0
1465,29079034,566.0,39.4,1,566.0,1,35888873.0,0 days 09:26:00,0 days 00:00:49.983000,0,...,0,0,0,0,0,0,0,0,0,0
1466,29079034,566.0,39.4,1,566.0,1,35888873.0,0 days 09:26:00,0 days 00:00:49.983000,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [218]:
df_procedure_events = pd.merge(df_procedure_events, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_procedure_events.drop(columns=['transfer_id'], inplace=True)
data = df_procedure_events.drop(columns=['careunit','hadm_id'])
target = df_procedure_events['careunit']


# Split the dataset into training and testing sets
procedure_events_data_train, procedure_events_data_test, procedure_events_label_train, procedure_events_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedure_events_data_train.shape, procedure_events_label_train.shape)
print("Testing set shape:", procedure_events_data_test.shape, procedure_events_label_test.shape)

Training set shape: (978, 152) (978,)
Testing set shape: (245, 152) (245,)


In [219]:
# uncomment and run if changes are made

procedure_events_data_train.to_csv('procedure_events_data_train.csv', index=False)
procedure_events_data_test.to_csv('procedure_events_data_test.csv', index=False)

procedure_events_label_train.to_csv('procedure_events_label_train.csv', index=False)
procedure_events_label_test.to_csv('procedure_events_label_test.csv', index=False)

In [220]:
procedure_events_data_train

Unnamed: 0,value,patientweight,isopenbag,ORIGINALAMOUNT,ORIGINALRATE,duration,recording_delay,location_Left Accessory Basilic,location_Left Accessory Cephalic,location_Left Antecubital,...,itemid_228128,itemid_228129,itemid_228286,itemid_228715,itemid_229351,itemid_229380,itemid_229526,itemid_229532,itemid_229581,itemid_229586
584,999.0,88.7,1,999.0,1,0 days 16:39:00,0 days 00:00:43.800000,0,0,0,...,0,0,0,0,0,0,0,0,0,0
966,1.0,143.0,0,1.0,0,0 days 00:01:00,0 days 04:32:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,1.0,103.0,0,1.0,0,0 days 00:01:00,-1 days +23:59:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
514,1.0,127.0,0,1.0,0,0 days 00:01:00,0 days 04:38:00,0,0,0,...,0,1,0,0,0,0,0,0,0,0
424,1.0,49.8,0,1.0,0,0 days 00:01:00,0 days 00:13:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1044,660.0,80.0,1,660.0,1,0 days 11:00:00,0 days 00:10:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1095,2745.0,64.0,1,2745.0,1,1 days 21:45:00,0 days 00:24:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1130,30.0,53.0,1,30.0,1,0 days 00:30:00,0 days 00:03:00,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,1873.0,80.1,1,1873.0,1,1 days 07:13:00,0 days 00:00:35.503000,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [221]:
# Fine

### datetimeevents

In [222]:
file = "icu/datetimeevents.csv"
full_path = path + file

df_datetime_events = pd.read_csv(full_path)

In [223]:
df_datetime_events

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom,warning
0,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225343,2156-05-11 00:00:00,Date,0
1,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225348,2156-05-11 00:00:00,Date,0
2,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225345,2156-05-14 09:00:00,Date and Time,0
3,10002428,23473524,35479615,29441,2156-05-15 09:00:00,2156-05-15 10:51:00,224186,2156-05-15 09:00:00,Date and Time,0
4,10002428,23473524,35479615,29441,2156-05-15 09:00:00,2156-05-15 10:51:00,224187,2156-05-15 10:50:00,Date,0
...,...,...,...,...,...,...,...,...,...,...
15275,10016742,29281842,37057036,82943,2178-07-07 08:00:00,2178-07-07 09:37:00,225754,2178-07-06 00:00:00,Date,0
15276,10016742,29281842,37057036,82943,2178-07-07 12:00:00,2178-07-07 14:03:00,224187,2178-07-07 14:03:00,Date,0
15277,10016742,29281842,37057036,82943,2178-07-07 12:00:00,2178-07-07 14:03:00,224184,2178-07-07 14:03:00,Date,0
15278,10016742,29281842,37057036,82943,2178-07-07 12:00:00,2178-07-07 14:03:00,224183,2178-07-07 14:03:00,Date,0


In [224]:
df_datetime_events = df_datetime_events[df_datetime_events['hadm_id'].isin(df_transfers['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [225]:
# convert time to datetime
df_datetime_events['value'] = pd.to_datetime(df_datetime_events['value'])

In [226]:
df_datetime_events['transfer_id'] = float('nan')

for index, row in df_datetime_events.iterrows():
    # Filter target df based on 'hadm_id'
    df_transfers_subset = df_transfers[df_transfers['hadm_id'] == row['hadm_id']]

    datetime_value = row['value']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
    if not filtered_transfers.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_transfers['intime'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['transfer_id']

        # Assign the id to the current row in the first dataframe
        df_datetime_events.at[index, 'transfer_id'] = closest_id

df_datetime_events.dropna(subset=['transfer_id'], inplace=True)

In [227]:
df_datetime_events

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valueuom,warning,transfer_id
0,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225343,2156-05-11 00:00:00,Date,0,33978784.0
1,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225348,2156-05-11 00:00:00,Date,0,33978784.0
2,10002428,23473524,35479615,29441,2156-05-15 10:50:00,2156-05-15 10:50:00,225345,2156-05-14 09:00:00,Date and Time,0,30896594.0
3,10002428,23473524,35479615,29441,2156-05-15 09:00:00,2156-05-15 10:51:00,224186,2156-05-15 09:00:00,Date and Time,0,30896594.0
4,10002428,23473524,35479615,29441,2156-05-15 09:00:00,2156-05-15 10:51:00,224187,2156-05-15 10:50:00,Date,0,30896594.0
...,...,...,...,...,...,...,...,...,...,...,...
15266,10020187,26842957,32554129,36862,2170-02-24 20:00:00,2170-02-24 21:10:00,224288,2170-02-24 00:00:00,Date,0,35095353.0
15267,10020187,26842957,32554129,36862,2170-02-25 04:00:00,2170-02-25 04:21:00,225755,2170-02-24 00:00:00,Date,0,35095353.0
15268,10020187,26842957,32554129,36862,2170-02-24 20:00:00,2170-02-24 21:10:00,224285,2170-02-24 00:00:00,Date,0,35095353.0
15272,10016742,29281842,37057036,82943,2178-07-07 12:00:00,2178-07-07 14:01:00,229352,2178-07-03 00:00:00,Date,0,36180820.0


Drop: subject_id, hadm_id, stay_id, caregiver_id, charttime, storetime, valueuom
Encode: itemid

In [228]:
# Drop 
df_datetime_events = df_datetime_events.drop(columns=['warning','value','subject_id','stay_id','hadm_id','caregiver_id',
                                                     'charttime','storetime','valueuom'])


In [229]:
# Encode
df_datetime_events = pd.get_dummies(df_datetime_events, columns=['itemid'])

In [230]:
df_datetime_events 

Unnamed: 0,transfer_id,itemid_224183,itemid_224184,itemid_224185,itemid_224186,itemid_224187,itemid_224261,itemid_224262,itemid_224279,itemid_224280,...,itemid_227911,itemid_228291,itemid_228793,itemid_228794,itemid_229352,itemid_229353,itemid_229732,itemid_229733,itemid_229738,itemid_229739
0,33978784.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,33978784.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,30896594.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,30896594.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,30896594.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15266,35095353.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15267,35095353.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15268,35095353.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15272,36180820.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


#### Split into train and test

In [231]:
df_datetime_events = pd.merge(df_datetime_events, df_transfers[['transfer_id', 'careunit']], on='transfer_id', how='left')
df_datetime_events.drop(columns=['transfer_id'], inplace=True)
data = df_datetime_events.drop(columns=['careunit'])
target = df_datetime_events['careunit']

# Split the dataset into training and testing sets
datetime_events_data_train, datetime_events_data_test, datetime_events_label_train, datetime_events_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", datetime_events_data_train.shape, datetime_events_label_train.shape)
print("Testing set shape:", datetime_events_data_test.shape, datetime_events_label_test.shape)

Training set shape: (10395, 87) (10395,)
Testing set shape: (2599, 87) (2599,)


In [232]:
# uncomment and run if changes are made

datetime_events_data_train.to_csv('datetime_events_data_train.csv', index=False)
datetime_events_data_test.to_csv('datetime_events_data_test.csv', index=False)

datetime_events_label_train.to_csv('datetime_events_label_train.csv', index=False)
datetime_events_label_test.to_csv('datetime_events_label_test.csv', index=False)

In [233]:
datetime_events_data_train

Unnamed: 0,itemid_224183,itemid_224184,itemid_224185,itemid_224186,itemid_224187,itemid_224261,itemid_224262,itemid_224279,itemid_224280,itemid_224282,...,itemid_227911,itemid_228291,itemid_228793,itemid_228794,itemid_229352,itemid_229353,itemid_229732,itemid_229733,itemid_229738,itemid_229739
7864,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6064,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12214,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5191,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5390,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [234]:
# Fine