In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
path = "C:/Project/Data/"

In [3]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

In [4]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

### Target variable calculation

In [5]:
file = "hosp/procedures_icd.csv"
full_path = path + file

df_procedures = pd.read_csv(full_path)

In [6]:
df_procedures

Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,10011398,27505812,3,2146-12-15,3961,9
1,10011398,27505812,2,2146-12-15,3615,9
2,10011398,27505812,1,2146-12-15,3614,9
3,10014729,23300884,4,2125-03-23,3897,9
4,10014729,23300884,1,2125-03-20,3403,9
...,...,...,...,...,...,...
717,10004733,27411876,3,2174-12-20,4513,9
718,10021118,24490144,4,2161-11-19,5A1221Z,10
719,10021118,24490144,3,2161-11-19,06BP4ZZ,10
720,10021118,24490144,1,2161-11-19,02100Z9,10


In [7]:
file = "hosp/d_icd_procedures.csv"
full_path = path + file

df_codes = pd.read_csv(full_path)

In [8]:
df_codes

Unnamed: 0,icd_code,icd_version,long_title
0,0039,9,Other computer assisted surgery
1,0048,9,Insertion of four or more vascular stents
2,0074,9,"Hip bearing surface, metal-on-polyethylene"
3,0077,9,"Hip bearing surface, ceramic-on-polyethylene"
4,0126,9,Insertion of catheter(s) into cranial cavity o...
...,...,...,...
85252,F15Z68Z,10,Computerized Dynamic Posturography Assessment ...
85253,HZ37ZZZ,10,Individual Counseling for Substance Abuse Trea...
85254,X27L395,10,Dilation of Proximal Left Popliteal Artery wit...
85255,X2C0361,10,"Extirpation of Matter from Coronary Artery, On..."


In [9]:
# drop unneeded columns 
df_procedures = df_procedures.drop(columns=['subject_id', 'seq_num','icd_version'])

In [10]:
df_procedures

Unnamed: 0,hadm_id,chartdate,icd_code
0,27505812,2146-12-15,3961
1,27505812,2146-12-15,3615
2,27505812,2146-12-15,3614
3,23300884,2125-03-23,3897
4,23300884,2125-03-20,3403
...,...,...,...
717,27411876,2174-12-20,4513
718,24490144,2161-11-19,5A1221Z
719,24490144,2161-11-19,06BP4ZZ
720,24490144,2161-11-19,02100Z9


In [11]:
# Concatenate code values for each group
concat_df = df_procedures.groupby(['hadm_id', 'chartdate'])['icd_code'].agg(lambda x: ','.join(x)).reset_index()

# Split the comma-separated codes into individual columns
split_df = concat_df['icd_code'].str.get_dummies(',')

# Concatenate the original DataFrame with the one-hot encoded columns
df_procedures = pd.concat([concat_df, split_df], axis=1)

In [12]:
df_procedures = df_procedures.drop(columns='icd_code')

In [13]:
df_procedures

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,20044587,2113-08-25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,2143-09-27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20199380,2144-10-29,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,2144-10-31,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,2137-02-25,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,29820177,2150-07-10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
387,29839885,2170-10-08,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388,29842315,2155-12-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
389,29974575,2131-02-27,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# convert time to datetime
df_procedures['chartdate'] = pd.to_datetime(df_procedures['chartdate'])

### emar

In [15]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

In [16]:
df_emar = df_emar[df_emar['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id

In [17]:
df_emar

Unnamed: 0,subject_id,hadm_id,emar_id,emar_seq,poe_id,pharmacy_id,enter_provider_id,charttime,medication,event_txt,scheduletime,storetime
0,10005909,20199380.0,10005909-74,74,10005909-97,96110427.0,,2144-10-31 05:56:00,Magnesium Sulfate,,2144-10-31 05:56:00,2144-10-31 05:56:00
1,10005909,20199380.0,10005909-79,79,10005909-97,96110427.0,,2144-10-31 08:00:00,Magnesium Sulfate,,2144-10-31 08:00:00,2144-10-31 08:15:00
2,10008287,22168393.0,10008287-32,32,10008287-58,,P26PKF,2145-09-28 20:15:00,Potassium Chloride Replacement (Critical Care ...,,2145-09-28 20:15:00,2145-09-28 20:38:00
3,10010471,21322534.0,10010471-33,33,10010471-51,52131847.0,,2155-05-08 21:45:00,Metoprolol Tartrate,,2155-05-08 21:45:00,2155-05-08 22:40:00
5,10019777,27738145.0,10019777-14,14,10019777-41,,,2187-02-10 17:13:00,Morphine Sulfate,,2187-02-10 17:13:00,2187-02-10 17:13:00
...,...,...,...,...,...,...,...,...,...,...,...,...
35830,10037861,24540843.0,10037861-371,371,10037861-385,,,2117-03-17 19:00:00,Midazolam,Infusion Reconciliation Not Done,2117-03-17 19:00:00,2117-03-17 18:33:00
35831,10018423,29366372.0,10018423-20,20,10018423-64,,P401QD,2167-05-04 17:00:00,Heparin,Stopped - Unscheduled in Other Location,2167-05-04 17:00:00,2167-05-04 18:19:00
35832,10014354,28335091.0,10014354-259,259,10014354-301,,,2147-04-28 17:00:00,Heparin,Stopped - Unscheduled in Other Location,2106-09-29 00:00:00,2147-04-28 22:34:00
35833,10019003,27525946.0,10019003-255,255,10019003-538,,,2153-04-14 17:00:00,PHENYLEPHrine,Stopped - Unscheduled in Other Location,2153-04-14 17:00:00,2153-04-15 02:28:00


In [18]:
# convert time to datetime
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'])

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [19]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [20]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','scheduletime','storetime'])

In [21]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [22]:
df_emar['delay'].value_counts()

0 days 00:00:00        13231
0 days 00:01:00          648
-1 days +23:59:00        301
0 days 00:14:00          237
0 days 00:15:00          228
                       ...  
-1 days +20:32:00          1
-1 days +22:55:00          1
-1 days +21:00:00          1
-1 days +21:15:00          1
14821 days 17:00:00        1
Name: delay, Length: 392, dtype: int64

In [23]:
df_emar

Unnamed: 0,hadm_id,emar_seq,charttime,delay,enter_provider_id_N/A,enter_provider_id_P00SP9,enter_provider_id_P01QR6,enter_provider_id_P02FO8,enter_provider_id_P02IVL,enter_provider_id_P048FQ,...,medication_Zinc Sulfate,medication_Zolpidem Tartrate,medication_amLODIPine,medication_ibrutinib,medication_irbesartan,medication_moxifloxacin,medication_nitroglycerin,medication_rifAXIMin,medication_sevelamer CARBONATE,medication_venetoclax
0,20199380.0,74,2144-10-31 05:56:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20199380.0,79,2144-10-31 08:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22168393.0,32,2145-09-28 20:15:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21322534.0,33,2155-05-08 21:45:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,27738145.0,14,2187-02-10 17:13:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35830,24540843.0,371,2117-03-17 19:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35831,29366372.0,20,2167-05-04 17:00:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35832,28335091.0,259,2147-04-28 17:00:00,14821 days 17:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35833,27525946.0,255,2153-04-14 17:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# df_emar['icd_code'] = float('nan')
codes = pd.DataFrame()

for index, row in df_emar.iterrows():
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    # Get the datetime value of that emar sample
    datetime_value = row['charttime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = codes.append(closest_id, ignore_index=True)
        #print(closest_id)

        # Assign the id to the current row in the first dataframe
        #df_emar.at[index, 'icd_code'] = str(code)
        # NEED TO CONCATENATE CODES TO DF_EMAR AT CURRENT INDEX
       # df_emar.loc[index] = df_emar.loc[index].values + codes

  codes = codes.append(closest_id, ignore_index=True)


In [25]:
codes

Unnamed: 0,hadm_id,chartdate,0039,0040,0041,0045,0051,0066,0069,0091,...,B41GYZZ,B518YZA,B51W1ZZ,B543ZZ3,B548ZZA,B54BZZA,BT1DYZZ,BT1FYZZ,D7021ZZ,DW021ZZ
0,21322534,2155-05-09,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,27738145,2187-02-11,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,24104168,2169-01-20,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14232,22429197,2148-01-01,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14233,22429197,2148-01-04,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14234,24540843,2117-03-18,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14235,29366372,2167-05-05,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df_emar 

Unnamed: 0,hadm_id,emar_seq,charttime,delay,enter_provider_id_N/A,enter_provider_id_P00SP9,enter_provider_id_P01QR6,enter_provider_id_P02FO8,enter_provider_id_P02IVL,enter_provider_id_P048FQ,...,medication_Zinc Sulfate,medication_Zolpidem Tartrate,medication_amLODIPine,medication_ibrutinib,medication_irbesartan,medication_moxifloxacin,medication_nitroglycerin,medication_rifAXIMin,medication_sevelamer CARBONATE,medication_venetoclax
0,20199380.0,74,2144-10-31 05:56:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20199380.0,79,2144-10-31 08:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22168393.0,32,2145-09-28 20:15:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,21322534.0,33,2155-05-08 21:45:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,27738145.0,14,2187-02-10 17:13:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35830,24540843.0,371,2117-03-17 19:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35831,29366372.0,20,2167-05-04 17:00:00,0 days 00:00:00,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35832,28335091.0,259,2147-04-28 17:00:00,14821 days 17:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35833,27525946.0,255,2153-04-14 17:00:00,0 days 00:00:00,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df_emar = df_emar.dropna()

In [28]:
# target = codes 

#### Split into train and test

In [29]:
data = df_emar.drop(columns=['icd_code'])
target = df_emar['icd_code']

# Split the dataset into training and testing sets
emar_data_train, emar_data_test, emar_label_train, emar_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_data_train.shape, emar_label_train.shape)
print("Testing set shape:", emar_data_test.shape, emar_label_test.shape)

KeyError: "['icd_code'] not found in axis"

In [None]:
# uncomment and run if changes are made

emar_data_train.to_csv('emar_data_train.csv', index=False)
emar_data_test.to_csv('emar_data_test.csv', index=False)

emar_label_train.to_csv('emar_label_train.csv', index=False)
emar_label_test.to_csv('emar_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### labevents

In [29]:
# file = "hosp/labevents.csv"
# full_path = path + file

# df_labevents = pd.read_csv(full_path)

In [30]:
# df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [31]:
# df_labevents = df_labevents[df_labevents['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same transfer_id 

In [32]:
# convert time to datetime
# df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'])

In [33]:
# df_labevents['icd_code'] = float('nan')
# for index, row in df_labevents.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['charttime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
#     if not filtered_procedures.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_procedures['chartdate'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
#         code = pd.array(closest_id)[0]

#         # Assign the id to the current row in the first dataframe
#         df_labevents.at[index, 'icd_code'] = str(code)



In [34]:
# df_labevents

Unnamed: 0,labevent_id,subject_id,hadm_id,specimen_id,itemid,order_provider_id,charttime,storetime,value,valuenum,valueuom,ref_range_lower,ref_range_upper,flag,priority,comments,icd_code
54,172548,10014354,26486158.0,29769552,51237,,2148-08-24 00:00:00,2148-08-24 01:29:00,1.1,1.1,,0.9,1.1,,ROUTINE,,3E04305
55,172550,10014354,26486158.0,29769552,51275,,2148-08-24 00:00:00,2148-08-24 01:29:00,45.0,45.0,sec,25.0,36.5,abnormal,ROUTINE,,3E04305
56,172549,10014354,26486158.0,29769552,51274,,2148-08-24 00:00:00,2148-08-24 01:29:00,12.1,12.1,sec,9.4,12.5,,ROUTINE,,3E04305
57,172547,10014354,26486158.0,29769552,51214,,2148-08-24 00:00:00,2148-08-24 01:29:00,468.0,468.0,mg/dL,180.0,400.0,abnormal,ROUTINE,,3E04305
58,172568,10014354,26486158.0,46932894,51255,,2148-08-24 00:00:00,2148-08-24 01:32:00,1.0,1.0,%,0.0,0.0,abnormal,ROUTINE,,3E04305
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107722,254700,10021487,28998349.0,78791160,50804,,2116-12-07 18:59:00,2116-12-07 19:00:00,35.0,35.0,mEq/L,21.0,30.0,abnormal,,,3324
107723,254702,10021487,28998349.0,78791160,50818,,2116-12-07 18:59:00,2116-12-07 19:00:00,56.0,56.0,mm Hg,35.0,45.0,abnormal,,,3324
107724,254707,10021487,28998349.0,78791160,52033,,2116-12-07 18:59:00,2116-12-07 18:59:00,0.0,,,,,,,___,3324
107725,254706,10021487,28998349.0,78791160,50825,,2116-12-07 18:59:00,2116-12-07 18:59:00,39.7,39.7,,,,,,,3324


In [35]:
# # Make a feature for days_since_admission using charttime - admittime

# # Convert to datetime
# df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# # Add admittime column from other dataframe
# df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # # Discard the time part and keep only the date
# # df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# # df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

# df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# # Fill any non time values
# df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [36]:
# # Add storetime - charttime feature called delay

# # Convert to datetime
# df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

# df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# # Fill any non time values
# df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

Drop: labevent_id, subject_id, order_provider_id (too many Null), charttime, storetime, comments

In [37]:
# df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [38]:
# # For flag make abnormal = 1 and fill Null with 0
# df_labevents['flag'] = df_labevents['flag'].fillna(0)
# df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [39]:
# # For priority fill Null with N/A and then one hot encode
# df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
# df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [40]:
# df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [41]:
# Drop any rows with null values 
# df_labevents = df_labevents.dropna()

In [42]:
# df_labevents

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,icd_code,admittime,days_since_admission,delay,...,itemid_52281,itemid_52285,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52425,itemid_52427,itemid_52955,itemid_53153
0,26486158.0,1.10,1.10,0.90,1.10,0,3E04305,2148-08-22 15:18:00,1 days 08:42:00,0 days 01:29:00,...,0,0,0,0,0,0,0,0,0,0
1,26486158.0,45.00,45.00,25.00,36.50,1,3E04305,2148-08-22 15:18:00,1 days 08:42:00,0 days 01:29:00,...,0,0,0,0,0,0,0,0,0,0
2,26486158.0,12.10,12.10,9.40,12.50,0,3E04305,2148-08-22 15:18:00,1 days 08:42:00,0 days 01:29:00,...,0,0,0,0,0,0,0,0,0,0
3,26486158.0,468.00,468.00,180.00,400.00,1,3E04305,2148-08-22 15:18:00,1 days 08:42:00,0 days 01:29:00,...,0,0,0,0,0,0,0,0,0,0
4,26486158.0,1.00,1.00,0.00,0.00,1,3E04305,2148-08-22 15:18:00,1 days 08:42:00,0 days 01:32:00,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69962,28998349.0,4.70,4.70,3.30,5.10,0,3324,2116-12-03 00:23:00,4 days 18:36:00,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
69963,28998349.0,7.39,7.39,7.35,7.45,0,3324,2116-12-03 00:23:00,4 days 18:36:00,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
69964,28998349.0,145.00,145.00,85.00,105.00,1,3324,2116-12-03 00:23:00,4 days 18:36:00,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0
69966,28998349.0,35.00,35.00,21.00,30.00,1,3324,2116-12-03 00:23:00,4 days 18:36:00,0 days 00:01:00,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [43]:
# data = df_labevents.drop(columns=['icd_code'])

# target = df_labevents['icd_code']

# # Split the dataset into training and testing sets
# labevents_data_train, labevents_data_test, labevents_label_train, labevents_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# # Print the shapes of the resulting training and testing sets
# print("Training set shape:", labevents_data_train.shape, labevents_label_train.shape)
# print("Testing set shape:", labevents_data_test.shape, labevents_label_test.shape)

Training set shape: (24548, 7869) (24548,)
Testing set shape: (6137, 7869) (6137,)


In [44]:
# # uncomment and run if changes are made

# labevents_data_train.to_csv('labevents_data_train.csv', index=False)
# labevents_data_test.to_csv('labevents_data_test.csv', index=False)

# labevents_label_train.to_csv('labevents_label_train.csv', index=False)
# labevents_label_test.to_csv('labevents_label_test.csv', index=False)

In [45]:
# labevents_data_train

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,admittime,days_since_admission,delay,priority_N/A,...,itemid_52281,itemid_52285,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52425,itemid_52427,itemid_52955,itemid_53153
67540,22413744.0,15.7,15.7,9.4,12.5,1,2191-01-15 01:55:00,0 days 03:01:00,0 days 01:53:00,0,...,0,0,0,0,0,0,0,0,0,0
68912,29276678.0,2.0,2.0,1.6,2.6,0,2116-02-27 20:55:00,1 days 16:03:00,0 days 01:11:00,0,...,0,0,0,0,0,0,0,0,0,0
20520,26486158.0,0.0,18.0,10.0,18.0,0,2148-08-22 15:18:00,5 days 14:54:00,0 days 01:25:00,0,...,0,0,0,0,0,0,0,0,0,0
22855,21540783.0,0.0,45.0,29.0,201.0,0,2118-10-08 14:00:00,0 days 05:15:00,0 days 00:59:00,0,...,0,0,0,0,0,0,0,0,0,0
21636,26486158.0,18.0,18.0,0.0,40.0,0,2148-08-22 15:18:00,0 days 17:55:00,0 days 03:09:00,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67986,28157142.0,3.5,3.5,3.3,5.1,0,2176-11-14 18:02:00,4 days 20:55:00,0 days 00:05:00,1,...,0,0,0,0,0,0,0,0,0,0
13630,29483621.0,106.0,106.0,150.0,440.0,1,2136-11-04 20:43:00,1 days 09:22:00,0 days 02:42:00,0,...,0,0,0,0,0,0,0,0,0,0
2496,23052851.0,1.3,1.3,0.4,1.1,1,2135-01-15 20:55:00,10 days 16:05:00,0 days 00:44:00,0,...,0,0,0,0,0,0,0,0,0,0
37377,22987108.0,95.0,95.0,96.0,108.0,1,2146-06-10 16:37:00,16 days 11:51:00,0 days 01:41:00,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [46]:
# Need to reduce to 4909

### microbiologyevents

In [None]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

In [None]:
# df_microbio['micro_specimen_id'].value_counts()

In [None]:
df_microbio.head(5)

In [None]:
df_microbio = df_microbio[df_microbio['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'])

In [None]:
df_microbio['icd_code'] = float('nan')
for index, row in df_microbio.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_microbio.at[index, 'icd_code'] = str(code)



In [None]:
df_microbio.dropna(subset=['icd_code'], inplace=True)
df_microbio 

In [None]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [None]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [None]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [None]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [None]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [None]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [None]:
df_microbio = df_microbio.dropna()

In [None]:
df_microbio

#### Split into train and test

In [None]:
data = df_microbio.drop(columns=['icd_code'])

target = df_microbio['icd_code']

# Split the dataset into training and testing sets
microbio_data_train, microbio_data_test, microbio_label_train, microbio_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", microbio_data_train.shape, microbio_label_train.shape)
print("Testing set shape:", microbio_data_test.shape, microbio_label_test.shape)

In [None]:
# uncomment and run if changes are made

microbio_data_train.to_csv('microbio_data_train.csv', index=False)
microbio_data_test.to_csv('microbio_data_test.csv', index=False)

microbio_label_train.to_csv('microbio_label_train.csv', index=False)
microbio_label_test.to_csv('microbio_label_test.csv', index=False)

In [None]:
microbio_data_train

#### Dimensionality reduction

In [65]:
# Need to reduce to 181

### pharmacy

In [None]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

In [None]:
df_pharmacy.head(2)

In [None]:
# stoptime-starttime for a duration feature

# Convert to datetime
df_pharmacy['stoptime'] = pd.to_datetime(df_pharmacy['stoptime'], format='%Y/%m/%d %H:%M')
df_pharmacy['starttime'] = pd.to_datetime(df_pharmacy['starttime'], format='%Y/%m/%d %H:%M')


df_pharmacy['medication_duration'] = df_pharmacy['stoptime'] - df_pharmacy['starttime']

# Fill any non time values
df_pharmacy['medication_duration'] = df_pharmacy['medication_duration'].fillna(pd.Timedelta(0))

In [None]:
# verifiedtime - entertime for verification_delay feature 

# Convert to datetime
df_pharmacy['verifiedtime'] = pd.to_datetime(df_pharmacy['verifiedtime'], format='%Y/%m/%d %H:%M')
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'], format='%Y/%m/%d %H:%M')

df_pharmacy['verification_delay'] = df_pharmacy['verifiedtime'] - df_pharmacy['entertime']

# Fill any non time values
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].fillna(pd.Timedelta(0))

In [None]:
fill_value = [0] 

# Fill null values with the list
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(pd.Series([fill_value]*len(df_pharmacy)))

In [None]:
# Convert all categories to strings
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(lambda x: [str(item) for item in x])

mlb = MultiLabelBinarizer()

encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [None]:
df_pharmacy = df_pharmacy[df_pharmacy['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'])

In [None]:
df_pharmacy['icd_code'] = float('nan')
for index, row in df_pharmacy.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['entertime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_pharmacy.at[index, 'icd_code'] = str(code)

df_pharmacy.dropna(subset=['icd_code'], inplace=True)

In [None]:
df_pharmacy

drop: subject_id, pharmacy_id, poe_id, starttime, stoptime, entertime, verifiedtime, disp_sched, basal_rate, one_hr_max,
expirationdate, fill_quantity
Encode: proc_type, status
Impute with N/A and encode: infusion_type, sliding_scale, duration_interval, expiration_unit, dispensation, medication, route, frequency
Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value

In [None]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [None]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [None]:
# Impute with N/A and encode
df_pharmacy['infusion_type'] = df_pharmacy['infusion_type'].fillna('N/A')
df_pharmacy['sliding_scale'] = df_pharmacy['sliding_scale'].fillna('N/A')
df_pharmacy['duration_interval'] = df_pharmacy['duration_interval'].fillna('N/A')
df_pharmacy['expiration_unit'] = df_pharmacy['expiration_unit'].fillna('N/A')
df_pharmacy['dispensation'] = df_pharmacy['dispensation'].fillna('N/A')
df_pharmacy['medication'] = df_pharmacy['medication'].fillna('N/A')
df_pharmacy['route'] = df_pharmacy['route'].fillna('N/A')
df_pharmacy['frequency'] = df_pharmacy['frequency'].fillna('N/A')
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['infusion_type','sliding_scale','duration_interval','expiration_unit',
                                                  'dispensation','medication','route','frequency'])

In [None]:
# Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value
df_pharmacy['lockout_interval'] = df_pharmacy['lockout_interval'].fillna(0)
df_pharmacy['doses_per_24_hrs'] = df_pharmacy['doses_per_24_hrs'].fillna(0)
df_pharmacy['expiration_value'] = df_pharmacy['expiration_value'].fillna(0)
df_pharmacy['basal_rate'] = df_pharmacy['basal_rate'].fillna(0)
df_pharmacy['one_hr_max'] = df_pharmacy['one_hr_max'].fillna(0)

In [None]:
df_pharmacy

#### Split into train and test

In [None]:
data = df_pharmacy.drop(columns=['icd_code'])

target = df_pharmacy['icd_code']


# Split the dataset into training and testing sets
pharmacy_data_train, pharmacy_data_test, pharmacy_label_train, pharmacy_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", pharmacy_data_train.shape, pharmacy_label_train.shape)
print("Testing set shape:", pharmacy_data_test.shape, pharmacy_label_test.shape)

In [None]:
# uncomment and run if changes are made

pharmacy_data_train.to_csv('pharmacy_data_train.csv', index=False)
pharmacy_data_test.to_csv('pharmacy_data_test.csv', index=False)

pharmacy_label_train.to_csv('pharmacy_label_train.csv', index=False)
pharmacy_label_test.to_csv('pharmacy_label_test.csv', index=False)

In [None]:
pharmacy_data_train

#### Dimensionality reduction

In [84]:
# Fine

### prescriptions

In [85]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

In [86]:
df_prescriptions

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,gsn,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route
0,10027602,28166872,27168639,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,
1,10027602,28166872,40720238,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,
2,10027602,28166872,62845687,,,,2201-10-31 12:00:00,,MAIN,Lorazepam,...,,,,,,,,,,
3,10027602,28166872,24340150,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,
4,10027602,28166872,14435820,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18082,10038081,20755971,33730068,10038081-378,378.0,P92KOD,2115-10-11 14:00:00,2115-10-13 07:00:00,MAIN,Artificial Tears,...,030016,2.305060e+07,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES
18083,10002428,23473524,87358294,10002428-780,780.0,P71IN4,2156-05-12 13:00:00,2156-05-22 18:00:00,MAIN,Artificial Tears,...,030016,2.305060e+07,0.3mL UD,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES
18084,10040025,27996267,81941017,10040025-1640,1640.0,P52ORO,2148-01-26 19:00:00,2148-01-26 18:00:00,MAIN,OxyCODONE (Immediate Release),...,046474,9.046446e+08,15mg Tablet,,5-10,mg,0.3333-0.6667,TAB,,PO/NG
18085,10014354,26228185,46019806,10014354-3105,3105.0,P748G6,2150-05-01 01:00:00,2150-05-01 09:00:00,MAIN,Carbamide Peroxide 6.5%,...,008120,7.811207e+10,15mL Bottle,,5-10,DROP,0.3333-0.6667,BTL,1.0,BOTH EARS


In [87]:
df_prescriptions = df_prescriptions[df_prescriptions['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [88]:
df_prescriptions = df_prescriptions.copy()

In [89]:
# convert time to datetime
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'])

In [90]:
df_prescriptions['icd_code'] = float('nan')
for index, row in df_prescriptions.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['starttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_prescriptions.at[index, 'icd_code'] = str(code)

df_prescriptions.dropna(subset=['icd_code'], inplace=True)

In [91]:
df_prescriptions.dropna(subset=['icd_code'], inplace=True)
df_prescriptions

Unnamed: 0,subject_id,hadm_id,pharmacy_id,poe_id,poe_seq,order_provider_id,starttime,stoptime,drug_type,drug,...,ndc,prod_strength,form_rx,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,doses_per_24_hrs,route,icd_code
0,10027602,28166872,27168639,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,966
1,10027602,28166872,40720238,,,,2201-10-30 12:00:00,,MAIN,Fentanyl Citrate,...,,,,,,,,,,966
2,10027602,28166872,62845687,,,,2201-10-31 12:00:00,,MAIN,Lorazepam,...,,,,,,,,,,966
3,10027602,28166872,24340150,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,966
4,10027602,28166872,14435820,,,,2201-10-30 12:00:00,,MAIN,Midazolam,...,,,,,,,,,,966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,10002428,28662225,26836589,10002428-421,421.0,P37E1E,2156-04-22 11:00:00,2156-04-22 19:00:00,MAIN,Artificial Tears,...,23050601.0,0.3mL UD,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES,966
18077,10018081,28861356,92992187,10018081-1061,1061.0,P17N0B,2134-08-08 15:00:00,2134-08-13 21:00:00,MAIN,Artificial Tears Preserv. Free,...,23050601.0,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,0.0,BOTH EYES,8628
18078,10020944,29974575,12050915,10020944-285,285.0,P96HOQ,2131-02-27 21:00:00,2131-03-13 21:00:00,MAIN,Artificial Tears,...,23050601.0,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES,3891
18081,10003400,20214994,86647694,10003400-1372,1372.0,P61VWF,2137-02-27 05:00:00,2137-03-19 20:00:00,MAIN,Artificial Tears,...,23050601.0,0.4 mL DROPPERETTE,,1-2,DROP,0.1667-0.3333,DRP,,BOTH EYES,966


Drop na and encode: dose_val_rx, form_val_disp, order_provider_id
Drop: subject_id, pharmacy_id, starttime, stoptime, form_rx (mostly null), poe_id
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [92]:
# Make a feature of stoptime-starttime called duration 

# Convert to datetime
df_prescriptions['stoptime'] = pd.to_datetime(df_prescriptions['stoptime'], format='%Y/%m/%d %H:%M')
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'], format='%Y/%m/%d %H:%M')

df_prescriptions['duration'] = df_prescriptions['stoptime'] - df_prescriptions['starttime']

# Fill any non time values
df_prescriptions['duration'] = df_prescriptions['duration'].fillna(pd.Timedelta(0))

In [93]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [94]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [95]:
# Impute with N/A and encode
df_prescriptions['formulary_drug_cd'] = df_prescriptions['formulary_drug_cd'].fillna('N/A')
df_prescriptions['gsn'] = df_prescriptions['gsn'].fillna('N/A')
df_prescriptions['prod_strength'] = df_prescriptions['prod_strength'].fillna('N/A')
df_prescriptions['route'] = df_prescriptions['route'].fillna('N/A')

# Impute with 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

df_prescriptions = pd.get_dummies(df_prescriptions, columns=['formulary_drug_cd','gsn','prod_strength',
                                                            'route','drug_type','drug','dose_unit_rx','form_unit_disp',
                                                            'dose_val_rx','form_val_disp','ndc'])

In [96]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [97]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

In [98]:
df_prescriptions

Unnamed: 0,hadm_id,poe_seq,doses_per_24_hrs,icd_code,duration,formulary_drug_cd_5000MLBAG,formulary_drug_cd_AA5D151000I,formulary_drug_cd_ACD3/1000I,formulary_drug_cd_ACE250,formulary_drug_cd_ACE500I,...,ndc_69315090401.0,ndc_69543037910.0,ndc_70092903505.0,ndc_70860045110.0,ndc_70860077602.0,ndc_71019028507.0,ndc_74300008186.0,ndc_76329301205.0,ndc_76329330101.0,ndc_76439034310.0
9,23831430,830.0,0.0,9427,1 days 11:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,20626031,17.0,0.0,3961,1 days 10:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13,21027282,786.0,0.0,4311,10 days 22:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,23831430,895.0,0.0,9427,3 days 19:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,23831430,463.0,0.0,9427,1 days 22:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18075,28662225,421.0,0.0,966,0 days 08:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18077,28861356,1061.0,0.0,8628,5 days 06:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18078,29974575,285.0,0.0,3891,14 days 00:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18081,20214994,1372.0,0.0,966,20 days 15:00:00,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [99]:
data = df_prescriptions.drop(columns=['icd_code'])

target = df_prescriptions['icd_code']

# Split the dataset into training and testing sets
prescriptions_data_train, prescriptions_data_test, prescriptions_label_train, prescriptions_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", prescriptions_data_train.shape, prescriptions_label_train.shape)
print("Testing set shape:", prescriptions_data_test.shape, prescriptions_label_test.shape)

Training set shape: (6543, 3728) (6543,)
Testing set shape: (1636, 3728) (1636,)


In [100]:
# uncomment and run if changes are made

prescriptions_data_train.to_csv('prescriptions_data_train.csv', index=False)
prescriptions_data_test.to_csv('prescriptions_data_test.csv', index=False)

prescriptions_label_train.to_csv('prescriptions_label_train.csv', index=False)
prescriptions_label_test.to_csv('prescriptions_label_test.csv', index=False)

#### Dimensionality reduction

In [101]:
# Need to reduce to 1308

### icustays

In [None]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

In [None]:
df_procedures

In [None]:
df_icustays = df_icustays[df_icustays['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_icustays['outtime'] = pd.to_datetime(df_icustays['outtime'])

In [None]:
codes = pd.DataFrame()
data_new = pd.DataFrame()

for index, row in df_icustays.iterrows():
#     print(row['outtime'])
    # Filter procedures to have the same 'hadm_id' as that row in emar
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]
#     print(df_procedures_subset['chartdate'])
    # Get the datetime value of that emar sample
    datetime_value = row['outtime']
    
    # Filter out procedures that are later than the datetime value in emar
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    # if it is empty it means there are no procedures for that admission after this sample was taken 
    if not filtered_procedures.empty:
#         print(filtered_procedures)
#         print('next')
        data_new = pd.concat([data_new, pd.DataFrame([row])], ignore_index=True)
        # Find the closest datetime value in the filtered second dataframe
        # Closest to datetime_value
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]
#         codes = codes + pd.DataFrame(closest_id)
        codes = pd.concat([codes, closest_id], ignore_index=True)

In [None]:
codes

In [None]:
data_new

Drop: subject_id, stay_id, intime, outtime
Encode: first_careunit, last_careunit

In [None]:
df_icustays = data_new

In [None]:
# make a feature called days_since_admission using intime-admittime

# Convert to datetime
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_icustays = df_icustays.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_icustays['days_since_admission'] = df_icustays['intime'] - df_icustays['admittime']

# Fill any non time values
df_icustays['days_since_admission'] = df_icustays['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_icustays = df_icustays.drop(columns=['admittime'])

In [None]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [None]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

In [None]:
df_icustays

#### Split into train and test

In [None]:
data = df_icustays

# target = df_icustays['icd_code']
target = codes.drop(columns=['hadm_id', 'chartdate'])

# Split the dataset into training and testing sets
icustays_data_train, icustays_data_test, icustays_label_train, icustays_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", icustays_data_train.shape, icustays_label_train.shape)
print("Testing set shape:", icustays_data_test.shape, icustays_label_test.shape)

In [None]:
data

In [None]:
# uncomment and run if changes are made

icustays_data_train.to_csv('icustays_data_train.csv', index=False)
icustays_data_test.to_csv('icustays_data_test.csv', index=False)

icustays_label_train.to_csv('icustays_label_train.csv', index=False)
icustays_label_test.to_csv('icustays_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### ingredientevents

In [122]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

In [123]:
df_ingredient

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid,statusdescription,originalamount,originalrate
0,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,227074,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000
1,10005817,20626031,32604416,4793,2132-12-17 05:00:00,2132-12-17 06:00:00,2132-12-17 06:01:00,220490,49.999999,ml,50.000000,mL/hour,7330951,7330951,FinishedRunning,0,50.000000
2,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,220490,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000
3,10005817,20626031,32604416,20310,2132-12-17 12:00:00,2132-12-17 13:00:00,2132-12-17 12:48:00,226509,249.999990,ml,249.999985,mL/hour,5334154,5334154,FinishedRunning,0,250.000000
4,10005817,20626031,32604416,92805,2132-12-15 16:35:00,2132-12-15 18:00:00,2132-12-15 16:42:00,220490,38.852669,ml,27.425413,mL/hour,1386365,3042892,ChangeDose/Rate,0,47.080292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,10019003,29279905,34107647,68979,2153-03-28 21:58:00,2153-03-28 22:58:00,2153-03-28 23:22:00,227074,49.999999,ml,50.000000,mL/hour,6547485,6547485,FinishedRunning,0,50.000000
25724,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,227075,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000
25725,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,220490,120.000000,ml,,,103707,103707,FinishedRunning,0,120.000000
25726,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,220490,500.000000,ml,,,9142525,9142525,FinishedRunning,0,500.000000


In [124]:
df_ingredient = df_ingredient[df_ingredient['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [125]:
# convert time to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'])


In [126]:
df_ingredient['icd_code'] = float('nan')
for index, row in df_ingredient.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_ingredient.at[index, 'icd_code'] = str(code)

df_ingredient.dropna(subset=['icd_code'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['icd_code'] = float('nan')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient.dropna(subset=['icd_code'], inplace=True)


In [127]:
# df_ingredient['icd_code'] = float('nan')

# for index, row in df_ingredient.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['endtime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_procedures_subset[df_procedures_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_ingredient.at[index, 'icd_code'] = closest_id

# df_ingredient.dropna(subset=['icd_code'], inplace=True)

In [128]:
df_ingredient

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,starttime,endtime,storetime,itemid,amount,amountuom,rate,rateuom,orderid,linkorderid,statusdescription,originalamount,originalrate,icd_code
159,10027445,29163082,36084484,32481,2142-08-30 15:00:00,2142-08-30 19:00:00,2142-08-30 15:50:00,227074,40.000001,ml,10.0,mL/hour,2625155,2625155,Stopped,0,1000.0,3596
160,10027445,29163082,36084484,32481,2142-08-30 15:00:00,2142-08-30 19:00:00,2142-08-30 15:50:00,220490,40.000001,ml,10.0,mL/hour,2625155,2625155,Stopped,0,1000.0,3596
161,10027445,29163082,36084484,32481,2142-08-30 15:00:00,2142-08-30 15:01:00,2142-08-30 18:43:00,220490,105.000000,ml,,,2975887,2975887,FinishedRunning,0,105.0,3596
162,10027445,29163082,36084484,32481,2142-08-30 15:00:00,2142-08-30 15:01:00,2142-08-30 18:43:00,227074,105.000000,ml,,,2975887,2975887,FinishedRunning,0,105.0,3596
163,10027445,29163082,36084484,32481,2142-08-30 16:00:00,2142-08-30 16:01:00,2142-08-30 18:44:00,227074,150.000000,ml,,,1148588,1148588,FinishedRunning,0,150.0,3596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,10019003,29279905,34107647,68979,2153-03-28 21:58:00,2153-03-28 22:58:00,2153-03-28 23:22:00,227074,49.999999,ml,50.0,mL/hour,6547485,6547485,FinishedRunning,0,50.0,0D598ZZ
25724,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,227075,120.000000,ml,,,103707,103707,FinishedRunning,0,120.0,0D598ZZ
25725,10019003,29279905,34107647,68979,2153-03-28 02:58:00,2153-03-28 02:59:00,2153-03-28 02:58:00,220490,120.000000,ml,,,103707,103707,FinishedRunning,0,120.0,0D598ZZ
25726,10019003,29279905,34107647,88156,2153-03-29 20:58:00,2153-03-29 20:59:00,2153-03-29 20:58:00,220490,500.000000,ml,,,9142525,9142525,FinishedRunning,0,500.0,0D598ZZ


Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [129]:
# Make a duration feature of endtime-starttime 

# Convert to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')


df_ingredient['duration'] = df_ingredient['endtime'] - df_ingredient['starttime']

# Fill any non time values
df_ingredient['duration'] = df_ingredient['duration'].fillna(pd.Timedelta(0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingred

In [130]:
# make a recording_delay feature of storetime-endtime

# Convert to datetime
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']

# Fill any non time values
df_ingredient['recording_delay'] = df_ingredient['recording_delay'].fillna(pd.Timedelta(0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ingredient['r

In [131]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [132]:
# Impute with N/A and encode
df_ingredient['rateuom'] = df_ingredient['rateuom'].fillna('N/A')
df_ingredient['linkorderid'] = df_ingredient['linkorderid'].fillna('N/A')
df_ingredient = pd.get_dummies(df_ingredient, columns=['rateuom','amountuom','statusdescription','itemid','linkorderid'])

In [133]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

In [134]:
df_ingredient

Unnamed: 0,hadm_id,amount,rate,originalrate,icd_code,duration,recording_delay,rateuom_N/A,rateuom_grams/hour,rateuom_mL/hour,...,linkorderid_9985393,linkorderid_9986202,linkorderid_9986595,linkorderid_9988568,linkorderid_9989506,linkorderid_9990254,linkorderid_9990509,linkorderid_9993006,linkorderid_9993329,linkorderid_9996112
159,29163082,40.000001,10.0,1000.0,3596,0 days 04:00:00,-1 days +20:50:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
160,29163082,40.000001,10.0,1000.0,3596,0 days 04:00:00,-1 days +20:50:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
161,29163082,105.000000,0.0,105.0,3596,0 days 00:01:00,0 days 03:42:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
162,29163082,105.000000,0.0,105.0,3596,0 days 00:01:00,0 days 03:42:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
163,29163082,150.000000,0.0,150.0,3596,0 days 00:01:00,0 days 02:43:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25723,29279905,49.999999,50.0,50.0,0D598ZZ,0 days 01:00:00,0 days 00:24:00,0,0,1,...,0,0,0,0,0,0,0,0,0,0
25724,29279905,120.000000,0.0,120.0,0D598ZZ,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25725,29279905,120.000000,0.0,120.0,0D598ZZ,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0
25726,29279905,500.000000,0.0,500.0,0D598ZZ,0 days 00:01:00,-1 days +23:59:00,1,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [135]:
data = df_ingredient.drop(columns=['icd_code'])

target = df_ingredient['icd_code']
# Split the dataset into training and testing sets
ingredient_data_train, ingredient_data_test, ingredient_label_train, ingredient_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", ingredient_data_train.shape, ingredient_label_train.shape)
print("Testing set shape:", ingredient_data_test.shape, ingredient_label_test.shape)

Training set shape: (11208, 4147) (11208,)
Testing set shape: (2802, 4147) (2802,)


In [136]:
# uncomment and run if changes are made

ingredient_data_train.to_csv('ingredient_data_train.csv', index=False)
ingredient_data_test.to_csv('ingredient_data_test.csv', index=False)

ingredient_label_train.to_csv('ingredient_label_train.csv', index=False)
ingredient_label_test.to_csv('ingredient_label_test.csv', index=False)

#### Dimensionality reduction

In [137]:
# Need to reduce from 7727 to 4116

In [138]:
# path = "C:/Users/jenni/OneDrive/Desktop/IP/"
# file = "ingredient_data_train.csv"
# full_path = path + file

# ingredient_data_train = pd.read_csv(full_path)

# file = "ingredient_data_test.csv"
# full_path = path + file

# ingredient_data_test = pd.read_csv(full_path)

# file = "ingredient_label_train.csv"
# full_path = path + file

# ingredient_label_train = pd.read_csv(full_path)

# file = "ingredient_label_test.csv"
# full_path = path + file

# ingredient_label_test = pd.read_csv(full_path)

### inputevents

In [None]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

In [None]:
df_input

In [None]:
df_input = df_input[df_input['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'])

In [None]:
df_input['icd_code'] = float('nan')
for index, row in df_input.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_input.at[index, 'icd_code'] = str(code)

df_input.dropna(subset=['icd_code'], inplace=True)

In [None]:
# df_input['icd_code'] = float('nan')

# for index, row in df_input.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['endtime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_procedures_subset[df_procedures_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_input.at[index, 'icd_code'] = closest_id

# df_input.dropna(subset=['icd_code'], inplace=True)

In [None]:
df_input

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [None]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [None]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [None]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [None]:
df_input = df_input.dropna()

In [None]:
df_input

#### Split into train and test

In [None]:
data = df_input.drop(columns=['icd_code'])

target = df_input['icd_code']

# Split the dataset into training and testing sets
input_data_train, input_data_test, input_label_train, input_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", input_data_train.shape, input_label_train.shape)
print("Testing set shape:", input_data_test.shape, input_label_test.shape)

In [None]:
# uncomment and run if changes are made

input_data_train.to_csv('input_data_train.csv', index=False)
input_data_test.to_csv('input_data_test.csv', index=False)

input_label_train.to_csv('input_label_train.csv', index=False)
input_label_test.to_csv('input_label_test.csv', index=False)

In [None]:
input_data_train

#### Dimensionality reduction

In [None]:
# Fine

### outputevents

In [None]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

In [None]:
df_output

In [None]:
df_output = df_output[df_output['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'])

In [None]:
df_output['icd_code'] = float('nan')
for index, row in df_output.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['charttime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_output.at[index, 'icd_code'] = str(code)

df_output.dropna(subset=['icd_code'], inplace=True)

In [None]:
# df_output['icd_code'] = float('nan')

# for index, row in df_output.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['charttime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_output.at[index, 'icd_code'] = closest_id

# df_output.dropna(subset=['icd_code'], inplace=True)

In [None]:
df_output

Drop: subject_id, charttime, storetime, valueuom, stay_id, caregiver_id'
Encode: itemid

In [None]:
# Make a days_since_admission feature using charttime-admittime 

# Convert to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_output = df_output.merge(df_admittime, on='hadm_id', how='left')

df_output['days_since_admission'] = df_output['charttime'] - df_output['admittime']

# Fill any non time values
df_output['days_since_admission'] = df_output['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_output = df_output.drop(columns=['admittime'])

In [None]:
# Make a recording_delay feature using storetime-charttime

# Convert to datetime
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

df_output['recording_delay'] = df_output['storetime'] - df_output['charttime']

# Fill any non time values
df_output['recording_delay'] = df_output['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','storetime','valueuom','caregiver_id'])

In [None]:
#Encode
df_output = pd.get_dummies(df_output, columns=['itemid'])

In [None]:
df_output

#### Split into train and test

In [None]:
data = df_output.drop(columns=['icd_code'])

target = df_output['icd_code']

# Split the dataset into training and testing sets
output_data_train, output_data_test, output_label_train, output_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", output_data_train.shape, output_label_train.shape)
print("Testing set shape:", output_data_test.shape, output_label_test.shape)

In [None]:
# uncomment and run if changes are made

output_data_train.to_csv('output_data_train.csv', index=False)
output_data_test.to_csv('output_data_test.csv', index=False)

output_label_train.to_csv('output_label_train.csv', index=False)
output_label_test.to_csv('output_label_test.csv', index=False)

In [None]:
output_data_train

#### Dimensionality reduction

In [None]:
# Fine

### procedureevents

In [None]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

In [None]:
df_procedure_events

In [None]:
df_procedure_events = df_procedure_events[df_procedure_events['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [None]:
# convert time to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'])

In [None]:
df_procedure_events['icd_code'] = float('nan')
for index, row in df_procedure_events.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['endtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_procedure_events.at[index, 'icd_code'] = str(code)

df_procedure_events.dropna(subset=['icd_code'], inplace=True)

In [None]:
# df_procedure_events['icd_code'] = float('nan')

# for index, row in df_procedure_events.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['endtime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_transfers_subset[df_transfers_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_procedure_events.at[index, 'icd_code'] = closest_id

# df_procedure_events.dropna(subset=['icd_code'], inplace=True)

In [None]:
df_procedure_events

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
MAKE DURATION FEATURE 

In [None]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [None]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

In [None]:
df_procedure_events

#### Split into train and test

In [None]:
data = df_procedure_events.drop(columns=['icd_code'])

target = df_procedure_events['icd_code']


# Split the dataset into training and testing sets
procedure_events_data_train, procedure_events_data_test, procedure_events_label_train, procedure_events_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedure_events_data_train.shape, procedure_events_label_train.shape)
print("Testing set shape:", procedure_events_data_test.shape, procedure_events_label_test.shape)

In [None]:
# uncomment and run if changes are made

procedure_events_data_train.to_csv('procedure_events_data_train.csv', index=False)
procedure_events_data_test.to_csv('procedure_events_data_test.csv', index=False)

procedure_events_label_train.to_csv('procedure_events_label_train.csv', index=False)
procedure_events_label_test.to_csv('procedure_events_label_test.csv', index=False)

In [None]:
procedure_events_data_train

#### Dimensionality reduction

In [None]:
# Fine

### hcpcsevents

In [189]:
# d_hcpcs has longer descriptions (connected by code) but no other useful info 

In [190]:
file = "hosp/hcpcsevents.csv"
full_path = path + file

df_hcpcsevents = pd.read_csv(full_path)

In [191]:
df_hcpcsevents.head()

Unnamed: 0,subject_id,hadm_id,chartdate,hcpcs_cd,seq_num,short_description
0,10005348,29176490,2129-05-22,93454,1,Cardiovascular
1,10005348,29176490,2129-05-22,92921,2,Cardiovascular
2,10004457,21039249,2140-09-17,92980,1,Cardiovascular
3,10004457,25559382,2148-09-14,93455,1,Cardiovascular
4,10039708,27504040,2142-07-06,64415,2,Nervous system


To drop: subject_id, chartdate, hcpcs_cd (code that links to longer description in d_hcpcs)

In [192]:
# Make a feature for days_since_admission using chartdate - admittime

# Convert to datetime
df_hcpcsevents['chartdate'] = pd.to_datetime(df_hcpcsevents['chartdate'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_hcpcsevents = df_hcpcsevents.merge(df_admittime, on='hadm_id', how='left')

# Discard the time part and keep only the date
df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_hcpcsevents['days_since_admission'] = df_hcpcsevents['chartdate'] - df_hcpcsevents['admittime']

# Fill any non time values
df_hcpcsevents['days_since_admission'] = df_hcpcsevents['days_since_admission'].fillna(pd.Timedelta(0))

In [193]:
df_hcpcsevents = df_hcpcsevents.drop(columns=['subject_id','hcpcs_cd'])
# Not enough samples to include code as after encoding there would be a lot more features

In [194]:
df_hcpcsevents = pd.get_dummies(df_hcpcsevents, columns=['short_description'])

In [195]:
# convert time to datetime
df_hcpcsevents['chartdate'] = pd.to_datetime(df_hcpcsevents['chartdate'])

In [196]:
df_hcpcsevents['icd_code'] = float('nan')
for index, row in df_hcpcsevents.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['chartdate']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_hcpcsevents.at[index, 'icd_code'] = str(code)

df_hcpcsevents.dropna(subset=['icd_code'], inplace=True)

In [197]:
# df_hcpcsevents['icd_code'] = float('nan')

# for index, row in df_hcpcsevents.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['chartdate']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_procedures_subset[df_procedures_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_hcpcsevents.at[index, 'icd_code'] = closest_id

# df_hcpcsevents.dropna(subset=['icd_code'], inplace=True)

In [198]:
df_hcpcsevents

Unnamed: 0,hadm_id,chartdate,seq_num,admittime,days_since_admission,short_description_Cardiovascular,short_description_Cardiovascular system,short_description_Digestive system,short_description_Endocrine system,short_description_Hemic and lymphatic systems,short_description_Hospital observation per hr,short_description_Hospital observation services,short_description_Integumentary system,short_description_Musculoskeletal system,short_description_Nervous system,short_description_Perc drug-el cor stent sing,icd_code
7,29820177,2150-07-09,1,2150-07-09,0 days,0,0,1,0,0,0,0,0,0,0,0,9705
55,29820177,2150-07-09,2,2150-07-09,0 days,0,0,0,0,0,0,1,0,0,0,0,9705


#### Split into train and test

In [199]:
data = df_hcpcsevents.drop(columns=['icd_code'])

target = df_hcpcsevents['icd_code']


# Split the dataset into training and testing sets
hcpcsevents_data_train, hcpcsevents_data_test, hcpcsevents_label_train, hcpcsevents_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", hcpcsevents_data_train.shape, hcpcsevents_label_train.shape)
print("Testing set shape:", hcpcsevents_data_test.shape, hcpcsevents_label_test.shape)

Training set shape: (1, 16) (1,)
Testing set shape: (1, 16) (1,)


In [200]:
# uncomment and run if changes are made

hcpcsevents_data_train.to_csv('hcpcsevents_data_train.csv', index=False)
hcpcsevents_data_test.to_csv('hcpcsevents_data_test.csv', index=False)

hcpcsevents_label_train.to_csv('hcpcsevents_label_train.csv', index=False)
hcpcsevents_label_test.to_csv('hcpcsevents_label_test.csv', index=False)

In [201]:
hcpcsevents_data_train

Unnamed: 0,hadm_id,chartdate,seq_num,admittime,days_since_admission,short_description_Cardiovascular,short_description_Cardiovascular system,short_description_Digestive system,short_description_Endocrine system,short_description_Hemic and lymphatic systems,short_description_Hospital observation per hr,short_description_Hospital observation services,short_description_Integumentary system,short_description_Musculoskeletal system,short_description_Nervous system,short_description_Perc drug-el cor stent sing
7,29820177,2150-07-09,1,2150-07-09,0 days,0,0,1,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [202]:
# ???

### poe

In [203]:
file = "hosp/poe.csv"
full_path = path + file

df_poe = pd.read_csv(full_path)

In [204]:
df_poe.head()

Unnamed: 0,poe_id,poe_seq,subject_id,hadm_id,ordertime,order_type,order_subtype,transaction_type,discontinue_of_poe_id,discontinued_by_poe_id,order_provider_id,order_status
0,10002930-456,456,10002930,20282368,2201-03-23 19:14:33,General Care,Other,New,,,P04TDP,Inactive
1,10002930-454,454,10002930,20282368,2201-03-23 19:14:33,General Care,Vitals/Monitoring,New,,,P04TDP,Inactive
2,10002930-455,455,10002930,20282368,2201-03-23 19:14:33,General Care,Activity,New,,,P04TDP,Inactive
3,10002930-453,453,10002930,20282368,2201-03-23 19:14:33,IV therapy,IV access,New,,,P04TDP,Inactive
4,10002930-452,452,10002930,20282368,2201-03-23 19:14:33,ADT orders,Admit,New,,,P04TDP,Inactive


To drop: poe_id, subject_id, ordertime, discontinue_of_poe_id, discontinued_by_poe_id (all unique), order_status (all inactive)
Encode: order_type, transaction_type
Impute with N/A and then encode: order_subtype, order_provider_id

In [205]:
# make a feature of ordertime - admittime for days_since_admission

# Convert to datetime
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'], format='%Y/%m/%d %H:%M:%S')

# Add admittime column from other dataframe
df_poe = df_poe.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_poe['days_since_admission'] = df_poe['ordertime'] - df_poe['admittime']

# Fill any non time values
df_poe['days_since_admission'] = df_poe['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_poe = df_poe.drop(columns=['admittime'])

In [206]:
# Drop 
df_poe = df_poe.drop(columns=['poe_id','subject_id','discontinue_of_poe_id','discontinued_by_poe_id',
                                       'order_status'])

In [207]:
# Encode
df_poe = pd.get_dummies(df_poe, columns=['order_type','transaction_type'])

In [208]:
# Impute with N/A and encode
df_poe['order_subtype'] = df_poe['order_subtype'].fillna('N/A')
df_poe['order_provider_id'] = df_poe['order_provider_id'].fillna('N/A')
df_poe = pd.get_dummies(df_poe, columns=['order_subtype','order_provider_id'])

In [209]:
df_poe = df_poe[df_poe['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [210]:
# convert time to datetime
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'])

In [211]:
df_poe['icd_code'] = float('nan')
for index, row in df_poe.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['ordertime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_poe.at[index, 'icd_code'] = str(code)

df_poe.dropna(subset=['icd_code'], inplace=True)

In [212]:
# df_poe['icd_code'] = float('nan')

# for index, row in df_poe.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['ordertime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_procedures_subset[df_procedures_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_poe.at[index, 'icd_code'] = closest_id

# df_poe.dropna(subset=['icd_code'], inplace=True)

In [213]:
df_poe

Unnamed: 0,poe_seq,hadm_id,ordertime,days_since_admission,order_type_ADT orders,order_type_Blood Bank,order_type_Cardiology,order_type_Consults,order_type_Critical Care,order_type_General Care,...,order_provider_id_P99MMB,order_provider_id_P99N27,order_provider_id_P99ND7,order_provider_id_P99Q9I,order_provider_id_P99QP7,order_provider_id_P99U21,order_provider_id_P99WOA,order_provider_id_P99YFT,order_provider_id_P99Z7J,icd_code
89,153,26275841,2142-07-31 10:16:32,0 days 09:44:32,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,966
90,154,26275841,2142-07-31 10:16:32,0 days 09:44:32,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,966
91,151,26275841,2142-07-31 10:16:32,0 days 09:44:32,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,966
92,150,26275841,2142-07-31 10:16:32,0 days 09:44:32,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,966
93,152,26275841,2142-07-31 10:16:32,0 days 09:44:32,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45075,573,26134779,2149-09-14 11:53:30,1 days 04:17:30,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0W9G3ZZ
45076,542,26134779,2149-09-13 05:57:14,-1 days +22:21:14,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0W9G3ZZ
45077,596,26134779,2149-09-16 18:46:15,3 days 11:10:15,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0W9G3ZZ
45078,571,26134779,2149-09-14 11:52:47,1 days 04:16:47,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0W9G3ZZ


#### Split into train and test

In [214]:
data = df_poe.drop(columns=['icd_code'])

target = df_poe['icd_code']


# Split the dataset into training and testing sets
poe_data_train, poe_data_test, poe_label_train, poe_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", poe_data_train.shape, poe_label_train.shape)
print("Testing set shape:", poe_data_test.shape, poe_label_test.shape)

Training set shape: (16739, 1460) (16739,)
Testing set shape: (4185, 1460) (4185,)


In [215]:
# uncomment and run if changes are made

poe_data_train.to_csv('poe_data_train.csv', index=False)
poe_data_test.to_csv('poe_data_test.csv', index=False)

poe_label_train.to_csv('poe_label_train.csv', index=False)
poe_label_test.to_csv('poe_label_test.csv', index=False)

In [216]:
poe_data_train

Unnamed: 0,poe_seq,hadm_id,ordertime,days_since_admission,order_type_ADT orders,order_type_Blood Bank,order_type_Cardiology,order_type_Consults,order_type_Critical Care,order_type_General Care,...,order_provider_id_P99MD0,order_provider_id_P99MMB,order_provider_id_P99N27,order_provider_id_P99ND7,order_provider_id_P99Q9I,order_provider_id_P99QP7,order_provider_id_P99U21,order_provider_id_P99WOA,order_provider_id_P99YFT,order_provider_id_P99Z7J
38195,1131,20214994,2137-03-03 12:20:05,7 days 02:20:05,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
827,1757,29276678,2116-03-06 22:39:33,8 days 01:44:33,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
24858,911,23831430,2150-04-14 07:09:18,33 days 15:35:18,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37620,234,28662225,2156-04-19 12:24:42,6 days 22:08:42,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26873,2192,26486158,2148-08-22 15:40:54,0 days 00:22:54,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24945,791,23831430,2150-04-05 08:08:36,24 days 16:34:36,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
25630,225,23831430,2150-03-19 02:21:31,7 days 10:47:31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13100,786,20364112,2149-10-01 18:25:31,-1 days +23:29:31,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2325,223,28829452,2113-09-15 02:55:16,2 days 12:13:16,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [217]:
# ???

### services

In [218]:
file = "hosp/services.csv"
full_path = path + file

df_services = pd.read_csv(full_path)

In [219]:
df_services.head()

Unnamed: 0,subject_id,hadm_id,transfertime,prev_service,curr_service
0,10001725,25563031,2110-04-11 15:09:36,,GYN
1,10019003,28003918,2148-12-21 03:32:53,,GYN
2,10007818,22987108,2146-06-10 16:38:18,,MED
3,10004235,24181354,2196-02-24 14:39:31,,MED
4,10026255,22059910,2201-07-07 18:16:14,,MED


Drop: subject_id, transfertime
Impute with N/A and encode: prev_service
Encode: curr_service

In [220]:
# Make a feature called days_since_admission using transfertime-admittime 

# Convert to datetime
df_services['transfertime'] = pd.to_datetime(df_services['transfertime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_services = df_services.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_services['days_since_admission'] = df_services['transfertime'] - df_services['admittime']

# Fill any non time values
df_services['days_since_admission'] = df_services['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_services = df_services.drop(columns=['admittime'])

In [221]:
# Drop 
df_services = df_services.drop(columns=['subject_id'])

In [222]:
# Impute with N/A and encode
df_services['prev_service'] = df_services['prev_service'].fillna('N/A')
df_services = pd.get_dummies(df_services, columns=['prev_service','curr_service'])

In [223]:
df_services = df_services[df_services['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [224]:
# convert time to datetime
df_services['transfertime'] = pd.to_datetime(df_services['transfertime'])

In [225]:
df_services['icd_code'] = float('nan')
for index, row in df_services.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['transfertime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_services.at[index, 'icd_code'] = str(code)

df_services.dropna(subset=['icd_code'], inplace=True)

In [226]:
# df_services['icd_code'] = float('nan')

# for index, row in df_services.iterrows():
#     # Filter target df based on 'hadm_id'
#     df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

#     datetime_value = row['transfertime']
    
#     # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
#     filtered_transfers = df_procedures_subset[df_procedures_subset['intime'] > datetime_value]
    
#     if not filtered_transfers.empty:

#         # Find the closest datetime value in the filtered second dataframe
#         closest_datetime = filtered_transfers['intime'].min()

#         # Get the id of the sample with the closest datetime value
#         closest_id = filtered_transfers[filtered_transfers['intime'] == closest_datetime]['icd_code']

#         # Assign the id to the current row in the first dataframe
#         df_services.at[index, 'icd_code'] = closest_id

# df_services.dropna(subset=['icd_code'], inplace=True)

In [227]:
df_services

Unnamed: 0,hadm_id,transfertime,days_since_admission,prev_service_CMED,prev_service_CSURG,prev_service_MED,prev_service_N/A,prev_service_NMED,prev_service_NSURG,prev_service_OMED,...,curr_service_NMED,curr_service_NSURG,curr_service_OMED,curr_service_ORTHO,curr_service_PSYCH,curr_service_SURG,curr_service_TRAUM,curr_service_TSURG,curr_service_VSURG,icd_code
2,22987108,2146-06-10 16:38:18,0 days 00:01:18,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3E0G76Z
3,24181354,2196-02-24 14:39:31,0 days 00:01:31,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3897
8,22756440,2172-03-17 16:36:05,4 days 16:49:05,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,9608
9,22942076,2111-11-13 23:40:00,0 days 00:01:00,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3893
13,28661809,2135-01-04 19:27:00,0 days 21:33:00,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0BH17EZ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,24159665,2137-08-10 05:32:24,-1 days +18:32:24,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,3989
314,20199380,2144-10-28 23:20:44,0 days 00:00:44,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,3E03317
315,27996267,2148-01-23 12:18:49,0 days 00:00:49,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,02HV33Z
316,25933959,2147-12-29 19:37:01,0 days 00:01:01,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0LBW0ZZ


#### Split into train and test

In [228]:
data = df_services.drop(columns=['icd_code'])
target = df_services['icd_code']


# Split the dataset into training and testing sets
services_data_train, services_data_test, services_label_train, services_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", services_data_train.shape, services_label_train.shape)
print("Testing set shape:", services_data_test.shape, services_label_test.shape)

Training set shape: (122, 27) (122,)
Testing set shape: (31, 27) (31,)


In [229]:
# uncomment and run if changes are made

services_data_train.to_csv('services_data_train.csv', index=False)
services_data_test.to_csv('services_data_test.csv', index=False)

services_label_train.to_csv('services_label_train.csv', index=False)
services_label_test.to_csv('services_label_test.csv', index=False)

In [230]:
services_data_train

Unnamed: 0,hadm_id,transfertime,days_since_admission,prev_service_CMED,prev_service_CSURG,prev_service_MED,prev_service_N/A,prev_service_NMED,prev_service_NSURG,prev_service_OMED,...,curr_service_MED,curr_service_NMED,curr_service_NSURG,curr_service_OMED,curr_service_ORTHO,curr_service_PSYCH,curr_service_SURG,curr_service_TRAUM,curr_service_TSURG,curr_service_VSURG
208,25508812,2155-05-22 21:47:18,0 days 00:01:18,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
155,23559586,2137-08-17 17:36:52,13 days 17:29:52,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
42,28477280,2137-10-14 17:36:40,1 days 18:53:40,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
101,23052851,2135-01-15 20:56:36,0 days 00:01:36,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
176,24597018,2157-11-18 22:57:43,0 days 00:01:43,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,21133938,2175-03-20 23:29:26,0 days 00:00:26,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,27660781,2117-03-03 15:59:43,0 days 00:00:43,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
28,28196804,2193-11-23 19:16:36,0 days 00:01:36,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
206,27525946,2153-04-12 19:07:59,0 days 00:00:59,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0


#### Dimensionality reduction

In [231]:
# ???

### transfers

In [278]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [279]:
df_transfers.head()

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime
0,10009049,22995465.0,30030230,discharge,,2174-05-31 14:21:47,
1,10025612,23403708.0,32533329,discharge,,2125-10-03 12:25:27,
2,10020786,23488445.0,37922399,discharge,,2189-06-13 17:25:44,
3,10014078,25809882.0,34694622,discharge,,2166-08-26 14:49:42,
4,10039831,26924951.0,37155928,discharge,,2116-01-02 14:35:02,


Drop: subject_id, transfer_id, intime
Encode: eventtype
Impute with N/A and encode: careunit

In [280]:
df_transfers = df_transfers[df_transfers['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_transfers that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code 

In [281]:
# convert time to datetime
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'])

In [282]:
df_transfers['icd_code'] = float('nan')
for index, row in df_transfers.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['outtime']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_transfers.at[index, 'icd_code'] = str(code)

df_transfers.dropna(subset=['icd_code'], inplace=True)

In [283]:
df_transfers

Unnamed: 0,subject_id,hadm_id,transfer_id,eventtype,careunit,intime,outtime,icd_code
37,10007818,22987108.0,35618083,admit,Transplant,2146-06-10 16:38:18,2146-06-22 11:46:29,03LP3DZ
53,10025612,23403708.0,31224664,admit,Discharge Lounge,2125-09-25 00:56:40,2125-09-25 13:23:24,3348
54,10003046,26048429.0,33774064,admit,Discharge Lounge,2154-01-02 00:15:52,2154-01-02 15:57:15,4639
56,10006053,22942076.0,30853116,transfer,Discharge Lounge,2111-11-14 00:14:10,2111-11-14 00:19:12,3893
66,10007058,22954658.0,38967799,ED,Emergency Department,2167-11-07 17:57:00,2167-11-07 20:22:00,02C03ZZ
...,...,...,...,...,...,...,...,...
1175,10002428,28662225.0,33987268,transfer,Medical Intensive Care Unit (MICU),2156-04-12 16:24:18,2156-04-17 15:57:08,3893
1179,10004733,27411876.0,34978658,transfer,Medicine,2174-12-12 20:03:01,2174-12-13 17:47:18,4523
1181,10004733,27411876.0,39508129,transfer,Neurology,2174-12-13 17:47:18,2174-12-13 22:18:45,4523
1183,10004733,27411876.0,39635619,admit,Medical/Surgical Intensive Care Unit (MICU/SICU),2174-12-04 11:28:24,2174-12-12 20:03:01,4523


In [284]:
# Make a days_since_admission feature of intime-admittime

# Convert to datetime
df_transfers['intime'] = pd.to_datetime(df_transfers['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_transfers = df_transfers.merge(df_admittime, on='hadm_id', how='left')

df_transfers['days_since_admission'] = df_transfers['intime'] - df_transfers['admittime']

# Fill any non time values
df_transfers['days_since_admission'] = df_transfers['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_transfers = df_transfers.drop(columns=['admittime'])

In [285]:
# Make a duration feature of outtime-intime 

# Convert to datetime
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'], format='%Y-%m-%d %H:%M:%S')

df_transfers['duration'] = df_transfers['outtime'] - df_transfers['intime']

# Fill any non time values
df_transfers['duration'] = df_transfers['duration'].fillna(pd.Timedelta(0))

In [286]:
# Drop 
df_transfers = df_transfers.drop(columns=['subject_id','intime'])

In [287]:
# Impute with N/A and encode
df_transfers['careunit'] = df_transfers['careunit'].fillna('N/A')
df_transfers = pd.get_dummies(df_transfers, columns=['eventtype','careunit'])

In [288]:
df_transfers

Unnamed: 0,hadm_id,transfer_id,outtime,icd_code,days_since_admission,duration,eventtype_ED,eventtype_admit,eventtype_transfer,careunit_Cardiac Surgery,...,careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),careunit_Medicine,careunit_Medicine/Cardiology,careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),careunit_Neurology,careunit_PACU,careunit_Surgical Intensive Care Unit (SICU),careunit_Transplant,careunit_Trauma SICU (TSICU),careunit_Vascular
0,22987108.0,35618083,2146-06-22 11:46:29,03LP3DZ,0 days 00:01:18,11 days 19:08:11,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,23403708.0,31224664,2125-09-25 13:23:24,3348,-1 days +17:41:40,0 days 12:26:44,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,26048429.0,33774064,2154-01-02 15:57:15,4639,-1 days +17:00:52,0 days 15:41:23,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,22942076.0,30853116,2111-11-14 00:19:12,3893,0 days 00:35:10,0 days 00:05:02,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,22954658.0,38967799,2167-11-07 20:22:00,02C03ZZ,-1 days +22:52:00,0 days 02:25:00,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,28662225.0,33987268,2156-04-17 15:57:08,3893,0 days 02:08:18,4 days 23:32:50,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
236,27411876.0,34978658,2174-12-13 17:47:18,4523,8 days 08:35:01,0 days 21:44:17,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
237,27411876.0,39508129,2174-12-13 22:18:45,4523,9 days 06:19:18,0 days 04:31:27,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
238,27411876.0,39635619,2174-12-12 20:03:01,4523,0 days 00:00:24,8 days 08:34:37,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [289]:
data = df_transfers.drop(columns=['icd_code'])

target = df_transfers['icd_code']


# Split the dataset into training and testing sets
transfers_data_train, transfers_data_test, transfers_label_train, transfers_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", transfers_data_train.shape, transfers_label_train.shape)
print("Testing set shape:", transfers_data_test.shape, transfers_label_test.shape)

Training set shape: (192, 29) (192,)
Testing set shape: (48, 29) (48,)


In [290]:
# uncomment and run if changes are made


transfers_data_train.to_csv('transfers_data_train.csv', index=False)
transfers_data_test.to_csv('transfers_data_test.csv', index=False)

transfers_label_train.to_csv('transfers_label_train.csv', index=False)
transfers_label_test.to_csv('transfers_label_test.csv', index=False)

In [293]:
transfers_data_train

Unnamed: 0,hadm_id,transfer_id,outtime,days_since_admission,duration,eventtype_ED,eventtype_admit,eventtype_transfer,careunit_Cardiac Surgery,careunit_Cardiac Vascular Intensive Care Unit (CVICU),...,careunit_Medical/Surgical Intensive Care Unit (MICU/SICU),careunit_Medicine,careunit_Medicine/Cardiology,careunit_Neuro Surgical Intensive Care Unit (Neuro SICU),careunit_Neurology,careunit_PACU,careunit_Surgical Intensive Care Unit (SICU),careunit_Transplant,careunit_Trauma SICU (TSICU),careunit_Vascular
139,23721604.0,32259566,2179-03-27 19:48:00,-1 days +19:48:00,0 days 05:33:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,24540843.0,34531557,2117-03-14 17:35:24,0 days 00:00:58,0 days 01:00:26,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
204,27189241.0,30272878,2131-05-22 21:50:33,-1 days +22:44:00,0 days 01:17:33,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
186,29483621.0,34948767,2136-11-04 22:12:00,-1 days +19:25:00,0 days 06:04:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207,27996267.0,36822441,2148-01-24 04:50:17,0 days 02:05:57,0 days 14:26:20,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,28477280.0,30458995,2137-10-14 17:08:34,0 days 00:01:57,1 days 18:23:37,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
14,28889419.0,32206630,2125-02-27 10:03:08,-1 days +20:59:41,0 days 05:48:27,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
92,24698912.0,31626291,2192-05-12 09:31:00,-1 days +08:44:00,0 days 17:05:00,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
179,29483621.0,37667168,2136-11-10 21:15:15,0 days 01:29:00,5 days 23:03:15,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,0


#### Dimensionality reduction

In [294]:
# fine

### Datetimeevents

In [None]:
file = "icu/datetimeevents.csv"
full_path = path + file

df_datetime_events = pd.read_csv(full_path)

In [None]:
df_datetime_events

In [None]:
df_datetime_events = df_datetime_events[df_datetime_events['hadm_id'].isin(df_procedures['hadm_id'])]

# For each sample, get the rows from df_procedures that have the same hadm_id
# Go through this list and if the event time is between the in and out transfer time, assign it the same icd_code

In [None]:
# convert time to datetime
df_datetime_events['value'] = pd.to_datetime(df_datetime_events['value'])

In [None]:
df_datetime_events['icd_code'] = float('nan')
for index, row in df_datetime_events.iterrows():
    # Filter target df based on 'hadm_id'
    df_procedures_subset = df_procedures[df_procedures['hadm_id'] == row['hadm_id']]

    datetime_value = row['value']
    
    # Filter out datetime values in the second dataframe that are later than the datetime value in the first dataframe
    filtered_procedures = df_procedures_subset[df_procedures_subset['chartdate'] > datetime_value]
    
    if not filtered_procedures.empty:

        # Find the closest datetime value in the filtered second dataframe
        closest_datetime = filtered_procedures['chartdate'].min()

        # Get the id of the sample with the closest datetime value
        closest_id = filtered_procedures[filtered_procedures['chartdate'] == closest_datetime]['icd_code']
        code = pd.array(closest_id)[0]

        # Assign the id to the current row in the first dataframe
        df_datetime_events.at[index, 'icd_code'] = str(code)

df_datetime_events.dropna(subset=['icd_code'], inplace=True)

In [None]:
df_datetime_events

In [None]:
# Drop 
df_datetime_events = df_datetime_events.drop(columns=['warning','value','subject_id','stay_id','hadm_id','caregiver_id',
                                                     'charttime','storetime','valueuom'])

In [None]:
# Encode
df_datetime_events = pd.get_dummies(df_datetime_events, columns=['itemid'])

In [None]:
df_datetime_events 

#### Split into train and test

In [None]:
data = df_datetime_events.drop(columns=['icd_code'])

target = df_datetime_events['icd_code']


# Split the dataset into training and testing sets
datetime_events_data_train, datetime_events_data_test, datetime_events_label_train, datetime_events_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", datetime_events_data_train.shape, datetime_events_label_train.shape)
print("Testing set shape:", datetime_events_data_test.shape, datetime_events_label_test.shape)

In [None]:
# uncomment and run if changes are made

datetime_events_data_train.to_csv('datetime_events_data_train.csv', index=False)
datetime_events_data_test.to_csv('datetime_events_data_test.csv', index=False)

datetime_events_label_train.to_csv('datetime_events_label_train.csv', index=False)
datetime_events_label_test.to_csv('datetime_events_label_test.csv', index=False)

In [None]:
datetime_events_data_train

#### Dimensionality reduction

In [None]:
# Fine