In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
path = "C:/Project/Data/"

In [3]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

### Target variable calculation

In [4]:
# admission id and one hot encoding of the icd_codes (or drgcodes) related to it
# first one-hot encode the code column and then aggregate by id

In [5]:
file = "hosp/diagnoses_icd.csv"
full_path = path + file

df_diagnoses = pd.read_csv(full_path)

In [6]:
df_diagnoses.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,10035185,22580999,3,4139,9
1,10035185,22580999,10,V707,9
2,10035185,22580999,1,41401,9
3,10035185,22580999,9,3899,9
4,10035185,22580999,11,V8532,9


In [7]:
df_diagnoses = df_diagnoses.drop(columns=['subject_id','seq_num','icd_version'])

In [8]:
df_diagnoses.head()

Unnamed: 0,hadm_id,icd_code
0,22580999,4139
1,22580999,V707
2,22580999,41401
3,22580999,3899
4,22580999,V8532


In [9]:
one_hot_encoded = pd.get_dummies(df_diagnoses['icd_code'])

df_encoded = pd.concat([df_diagnoses[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [10]:
df_diagnoses = df_aggregated
df_diagnoses

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,29820177,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
273,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
file = "hosp/drgcodes.csv"
full_path = path + file

df_drgcodes = pd.read_csv(full_path)

In [12]:
df_drgcodes.head()

Unnamed: 0,subject_id,hadm_id,drg_type,drg_code,description,drg_severity,drg_mortality
0,10004235,22187210,HCFA,864,FEVER,,
1,10026255,22059910,HCFA,180,RESPIRATORY NEOPLASMS W MCC,,
2,10032725,20611640,HCFA,54,NERVOUS SYSTEM NEOPLASMS W MCC,,
3,10005866,21636229,HCFA,393,OTHER DIGESTIVE SYSTEM DIAGNOSES W MCC,,
4,10008454,20291550,HCFA,956,"LIMB REATTACHMENT, HIP & FEMUR PROC FOR MULTIP...",,


In [13]:
df_drgcodes = df_drgcodes.drop(columns=['subject_id','drg_type','description','drg_severity','drg_mortality'])

In [14]:
one_hot_encoded = pd.get_dummies(df_drgcodes['drg_code'])

df_encoded = pd.concat([df_drgcodes[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [15]:
df_drgcodes = df_aggregated
df_drgcodes

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,29802992,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### admissions

In [None]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [None]:
# df_admissions['subject_id'].value_counts().head(50)

In [None]:
df_admissions.head()

To drop: subject_id, admittime, dischtime, deathtime, hospital_expire_flag

In [None]:
# Make an ed_duration feature for edouttime - edregtime (how long the patient stayed in the emergency department)

# Convert to datetime
df_admissions['edouttime'] = pd.to_datetime(df_admissions['edouttime'], format='%d/%m/%Y %H:%M')
df_admissions['edregtime'] = pd.to_datetime(df_admissions['edregtime'], format='%d/%m/%Y %H:%M')

df_admissions['ed_duration'] = df_admissions['edouttime'] - df_admissions['edregtime']

# Fill any non time values
df_admissions['ed_duration'] = df_admissions['ed_duration'].fillna(pd.Timedelta(0))

In [None]:
df_admissions = df_admissions.drop(columns=['subject_id', 'admittime', 'dischtime', 'deathtime', 'hospital_expire_flag'
                            , 'edregtime', 'edouttime', 'admit_provider_id','discharge_location'])

In [None]:
# Fill Null with N/A and then one hot encode
df_admissions['marital_status'] = df_admissions['marital_status'].fillna('N/A')
df_admissions = pd.get_dummies(df_admissions, columns=['admission_type', 'admission_location', 
                                                      'insurance','language', 'marital_status','race'])

In [None]:
df_admissions

#### Split into train and test

In [None]:
data = df_admissions

# Split the dataset into training and testing sets
admissions_data_train, admissions_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", admissions_data_train.shape)
print("Testing set shape:", admissions_data_test.shape)

In [None]:
# uncomment and run if changes are made

admissions_data_train.to_csv('admissions_data_train.csv', index=False)
admissions_data_test.to_csv('admissions_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### emar

In [None]:
# records for 65 different patients 
# 181 unique admissions

In [None]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

In [None]:
df_emar.info()
# print(df_emar.columns.tolist())

In [None]:
df_emar.head()

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [None]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [None]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','charttime','scheduletime','storetime'])

In [None]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [None]:
df_emar.info()

In [None]:
df_emar['delay'].value_counts()

#### Split into train and test

In [None]:
data = df_emar

# Split the dataset into training and testing sets
emar_data_train, emar_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_data_train.shape)
print("Testing set shape:", emar_data_test.shape)

In [None]:
# uncomment and run if changes are made

emar_data_train.to_csv('emar_data_train.csv', index=False)
emar_data_test.to_csv('emar_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### emar_detail

In [None]:
file = "hosp/emar_detail.csv"
full_path = path + file

df_emar_detail = pd.read_csv(full_path,low_memory=False)

Fields that have lots of null values:
reason_for_no_barcode: drop
prior_infusion_rate: impute with zeroes
infusion_rate: impute with zeroes
infusion_rate_adjustment: impute with 'N/A', then one hot encoding
infusion_rate_adjustment_amount: impute with zeroes
infusion_rate_unit: impute with 'N/A', then one hot encoding
infusion_complete: impute with 'N/A', then one hot encoding
completion_interval: impute with 0, then ordinal encoding 
new_iv_bag_hung: impute with N, then binary encoding 

Text data to remove but maybe consider later:
product_description, product_description_other

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['reason_for_no_barcode']) # Too hard to encode, adds not much value

In [None]:
# Impute with 0s
df_emar_detail['prior_infusion_rate'] = df_emar_detail['prior_infusion_rate'].fillna(0)
df_emar_detail['infusion_rate'] = df_emar_detail['infusion_rate'].fillna(0)
df_emar_detail['infusion_rate_adjustment_amount'] = df_emar_detail['infusion_rate_adjustment_amount'].fillna(0)

In [None]:
# Impute with N/A and encode
df_emar_detail['infusion_rate_adjustment'] = df_emar_detail['infusion_rate_adjustment'].fillna('N/A')
df_emar_detail['infusion_rate_unit'] = df_emar_detail['infusion_rate_unit'].fillna('N/A')
df_emar_detail['infusion_complete'] = df_emar_detail['infusion_complete'].fillna('N/A')
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['infusion_rate_adjustment','infusion_complete',
                                                         'infusion_rate_unit'])

In [None]:
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].fillna(0)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('PRN', 0)
#Converting all the intervals to minutes
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 2 hours', 120)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 4 hours', 240)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1 hour', 60)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1.5 hours', 90)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 8 hours', 480)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 15 minutes', 15)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 12 hours', 720)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 30 minutes', 30)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 24 hours', 1140)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1 minutes', 1)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 14 hours', 840)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 7 hours', 420)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 5 hours', 300)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 3 hours', 180)

In [None]:
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].fillna('N')

In [None]:
# Binary encoding
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].map({'Y': 1, 'N': 0})

In [None]:
# Impute with N/A and one hot encode:
# administration_type
# barcode_type
# complete_dose_not_given
# dose_due_unit
# dose_given_unit
# will_remainder_of_dose_be_given
# product_unit
# product_code
# route
# side
# site

In [None]:
df_emar_detail['administration_type'] = df_emar_detail['administration_type'].fillna('N/A')
df_emar_detail['barcode_type'] = df_emar_detail['barcode_type'].fillna('N/A')
df_emar_detail['complete_dose_not_given'] = df_emar_detail['complete_dose_not_given'].fillna('N/A')
df_emar_detail['dose_due_unit'] = df_emar_detail['dose_due_unit'].fillna('N/A')
df_emar_detail['dose_given_unit'] = df_emar_detail['dose_given_unit'].fillna('N/A')
df_emar_detail['will_remainder_of_dose_be_given'] = df_emar_detail['will_remainder_of_dose_be_given'].fillna('N/A')
df_emar_detail['product_unit'] = df_emar_detail['product_unit'].fillna('N/A')
df_emar_detail['product_code'] = df_emar_detail['product_code'].fillna('N/A')
df_emar_detail['route'] = df_emar_detail['route'].fillna('N/A')
df_emar_detail['side'] = df_emar_detail['side'].fillna('N/A')
df_emar_detail['site'] = df_emar_detail['site'].fillna('N/A')
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['administration_type','barcode_type','complete_dose_not_given',
                                                        'dose_due_unit','dose_given_unit',
                                                        'will_remainder_of_dose_be_given','product_unit','product_code',
                                                        'route','side','site'])

In [None]:
# Impute with zeroes:
# dose_due and dose_given, but also need to deal with some of them being ranges
# product_amount_given
# restart_interval, then ordinal encoding

In [None]:
df_emar_detail['product_amount_given'] = df_emar_detail['product_amount_given'].fillna(0)
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].fillna(0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].fillna(0)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].fillna(0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(str)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(str)

In [None]:
def find_middle_value(range_string):
    if '-' in range_string:
        start, end = map(float, range_string.split('-'))
        return (start + end) / 2
    else:
        return range_string

df_emar_detail['dose_due'] = df_emar_detail['dose_due'].apply(find_middle_value)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].apply(find_middle_value)

In [None]:
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('PRN', 0)
#Converting all the intervals to minutes
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 2 hours', 120)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 4 hours', 240)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 1 hour', 60)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 30 minutes', 30)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 24 hours', 1140)

In [None]:
# Impute with N and map to binary encoding:
# continued_infusion_in_other_location
# non_formulary_visual_verification

In [None]:
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].fillna('N')
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].fillna('N')
# Binary encoding
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].map({'Y': 1, 'N': 0})
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].map({'Y': 1, 'N': 0})

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['pharmacy_id']) # Contains NaN values 

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['emar_id']) # Practically unique

In [None]:
# Replace blanks with zero
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].replace('___', 0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].replace('___', 0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(float)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(float)

In [None]:
# Impute with N/A or 0
# One hot encode the categorical features 

df_emar_detail['product_description'] = df_emar_detail['product_description'].fillna('N/A')
df_emar_detail['product_description_other'] = df_emar_detail['product_description_other'].fillna('N/A')
df_emar_detail['parent_field_ordinal'] = df_emar_detail['parent_field_ordinal'].fillna(0)
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['product_description_other','product_description'])

In [None]:
df_emar_detail

#### Split into train and test

In [None]:
data = df_emar_detail

# Split the dataset into training and testing sets
emar_detail_data_train, emar_detail_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_detail_data_train.shape)
print("Testing set shape:", emar_detail_data_test.shape)

In [None]:
# uncomment and run if changes are made

emar_detail_data_train.to_csv('emar_detail_data_train.csv', index=False)
emar_detail_data_test.to_csv('emar_detail_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### hcpcsevents

Contains info for 18 different patients 

In [None]:
# d_hcpcs has longer descriptions (connected by code) but no other useful info 

In [None]:
file = "hosp/hcpcsevents.csv"
full_path = path + file

df_hcpcsevents = pd.read_csv(full_path)

In [None]:
df_hcpcsevents.head()

In [None]:
# patient, admission, date, uniquely identifying billed code, sequence number, description

To drop: subject_id, chartdate, hcpcs_cd (code that links to longer description in d_hcpcs)

In [None]:
df_hcpcsevents

In [None]:
# Make a feature for days_since_admission using chartdate - admittime

# Convert to datetime
df_hcpcsevents['chartdate'] = pd.to_datetime(df_hcpcsevents['chartdate'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_hcpcsevents = df_hcpcsevents.merge(df_admittime, on='hadm_id', how='left')

# Discard the time part and keep only the date
df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_hcpcsevents['days_since_admission'] = df_hcpcsevents['chartdate'] - df_hcpcsevents['admittime']

# Fill any non time values
df_hcpcsevents['days_since_admission'] = df_hcpcsevents['days_since_admission'].fillna(pd.Timedelta(0))

In [None]:
df_hcpcsevents['days_since_admission'].value_counts()

In [None]:
df_hcpcsevents = df_hcpcsevents.drop(columns=['subject_id','chartdate','hcpcs_cd'])
# Not enough samples to include code as after encoding there would be a lot more features 

In [None]:
df_hcpcsevents = pd.get_dummies(df_hcpcsevents, columns=['short_description'])

In [None]:
df_hcpcsevents

#### Split into train and test

In [None]:
data = df_hcpcsevents

# Split the dataset into training and testing sets
hcpcsevents_data_train, hcpcsevents_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", hcpcsevents_data_train.shape)
print("Testing set shape:", hcpcsevents_data_test.shape)

In [None]:
# uncomment and run if changes are made

hcpcsevents_data_train.to_csv('hcpcsevents_data_train.csv', index=False)
hcpcsevents_data_test.to_csv('hcpcsevents_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 15 to 9

### labevents

In [None]:
# Information regarding 252 different admissions

In [None]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

In [None]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [None]:
df_labevents.info()

In [None]:
df_labevents['storetime']

In [None]:
# Make a feature for days_since_admission using charttime - admittime

# Convert to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# Fill any non time values
df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [None]:
# Add storetime - charttime feature called delay

# Convert to datetime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# Fill any non time values
df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

Drop: labevent_id, subject_id, order_provider_id (too many Null), charttime, storetime, comments

In [None]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [None]:
# For flag make abnormal = 1 and fill Null with 0
df_labevents['flag'] = df_labevents['flag'].fillna(0)
df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [None]:
# For priority fill Null with N/A and then one hot encode
df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [None]:
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [None]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()
# Reduced from 107727 rows to 66660

#### Split into train and test

In [None]:
data = df_labevents

# Split the dataset into training and testing sets
labevents_data_train, labevents_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", labevents_data_train.shape)
print("Testing set shape:", labevents_data_test.shape)

In [None]:
# uncomment and run if changes are made

labevents_data_train.to_csv('labevents_data_train.csv', index=False)
labevents_data_test.to_csv('labevents_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 11681 to 10665

### microbiologyevents

In [None]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

In [None]:
df_microbio.head(5)

In [None]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [None]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [None]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [None]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [None]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [None]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [None]:
df_microbio = df_microbio.dropna()

In [None]:
df_microbio

#### Split into train and test

In [None]:
data = df_microbio

# Split the dataset into training and testing sets
microbio_data_train, microbio_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", microbio_data_train.shape)
print("Testing set shape:", microbio_data_test.shape)

In [None]:
# uncomment and run if changes are made

microbio_data_train.to_csv('microbio_data_train.csv', index=False)
microbio_data_test.to_csv('microbio_data_test.csv', index=False)

In [None]:
microbio_data_train

#### Dimensionality reduction

In [None]:
# Fine

### patients

In [None]:
file = "hosp/patients.csv"
full_path = path + file

df_patients = pd.read_csv(full_path)

In [None]:
df_patients

In [None]:
df_patients['anchor_age'].value_counts

Drop: anchor_year
Encode: gender (M to 0 and F to 1), dod (change all to 1 and nulls to 0)
Dummies: anchor_year_group  

In [None]:
# Drop
df_patients = df_patients.drop(columns=['anchor_year','dod']) 
# Since this is the shifted year and dod is an outcome value

In [None]:
# Encode: gender (M to 0 and F to 1), dod (change all to 1 and nulls to 0)
df_patients['gender'] = df_patients['gender'].replace('M', 0)
df_patients['gender'] = df_patients['gender'].replace('F', 1)

In [None]:
# Dummies: anchor_year_group  
df_patients = pd.get_dummies(df_patients, columns=['anchor_year_group'])

In [None]:
df_patients

#### Split into train and test

In [None]:
data = df_patients

# Split the dataset into training and testing sets
patients_data_train, patients_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", patients_data_train.shape)
print("Testing set shape:", patients_data_test.shape)

In [None]:
# uncomment and run if changes are made

patients_data_train.to_csv('patients_data_train.csv', index=False)
patients_data_test.to_csv('patients_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### pharmacy

In [None]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

In [None]:
df_pharmacy.head(2)

In [None]:
df_pharmacy.columns

drop: subject_id, pharmacy_id, poe_id, starttime, stoptime, entertime, verifiedtime, disp_sched, basal_rate, one_hr_max,
expirationdate, fill_quantity
Encode: proc_type, status
Impute with N/A and encode: infusion_type, sliding_scale, duration_interval, expiration_unit, dispensation, medication, route, frequency
Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value

In [None]:
# stoptime-starttime for a duration feature

# Convert to datetime
df_pharmacy['stoptime'] = pd.to_datetime(df_pharmacy['stoptime'], format='%Y/%m/%d %H:%M')
df_pharmacy['starttime'] = pd.to_datetime(df_pharmacy['starttime'], format='%Y/%m/%d %H:%M')


df_pharmacy['medication_duration'] = df_pharmacy['stoptime'] - df_pharmacy['starttime']

# Fill any non time values
df_pharmacy['medication_duration'] = df_pharmacy['medication_duration'].fillna(pd.Timedelta(0))

In [None]:
# verifiedtime - entertime for verification_delay feature 

# Convert to datetime
df_pharmacy['verifiedtime'] = pd.to_datetime(df_pharmacy['verifiedtime'], format='%Y/%m/%d %H:%M')
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'], format='%Y/%m/%d %H:%M')

df_pharmacy['verification_delay'] = df_pharmacy['verifiedtime'] - df_pharmacy['entertime']

# Fill any non time values
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].fillna(pd.Timedelta(0))

In [None]:
fill_value = [0] 

# Fill null values with the list
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(pd.Series([fill_value]*len(df_pharmacy)))

In [None]:
# Convert all categories to strings
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(lambda x: [str(item) for item in x])

mlb = MultiLabelBinarizer()

encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [None]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [None]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [None]:
# Impute with N/A and encode
df_pharmacy['infusion_type'] = df_pharmacy['infusion_type'].fillna('N/A')
df_pharmacy['sliding_scale'] = df_pharmacy['sliding_scale'].fillna('N/A')
df_pharmacy['duration_interval'] = df_pharmacy['duration_interval'].fillna('N/A')
df_pharmacy['expiration_unit'] = df_pharmacy['expiration_unit'].fillna('N/A')
df_pharmacy['dispensation'] = df_pharmacy['dispensation'].fillna('N/A')
df_pharmacy['medication'] = df_pharmacy['medication'].fillna('N/A')
df_pharmacy['route'] = df_pharmacy['route'].fillna('N/A')
df_pharmacy['frequency'] = df_pharmacy['frequency'].fillna('N/A')
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['infusion_type','sliding_scale','duration_interval','expiration_unit',
                                                  'dispensation','medication','route','frequency'])

In [None]:
# Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value
df_pharmacy['lockout_interval'] = df_pharmacy['lockout_interval'].fillna(0)
df_pharmacy['doses_per_24_hrs'] = df_pharmacy['doses_per_24_hrs'].fillna(0)
df_pharmacy['expiration_value'] = df_pharmacy['expiration_value'].fillna(0)
df_pharmacy['basal_rate'] = df_pharmacy['basal_rate'].fillna(0)
df_pharmacy['one_hr_max'] = df_pharmacy['one_hr_max'].fillna(0)

#### Split into train and test

In [None]:
data = df_pharmacy

# Split the dataset into training and testing sets
pharmacy_data_train, pharmacy_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", pharmacy_data_train.shape)
print("Testing set shape:", pharmacy_data_test.shape)

In [None]:
# uncomment and run if changes are made

pharmacy_data_train.to_csv('pharmacy_data_train.csv', index=False)
pharmacy_data_test.to_csv('pharmacy_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### poe

In [None]:
file = "hosp/poe.csv"
full_path = path + file

df_poe = pd.read_csv(full_path)

To drop: poe_id, subject_id, ordertime, discontinue_of_poe_id, discontinued_by_poe_id (all unique), order_status (all inactive)
Encode: order_type, transaction_type
Impute with N/A and then encode: order_subtype, order_provider_id

In [None]:
# make a feature of ordertime - admittime for days_since_admission

# Convert to datetime
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'], format='%Y/%m/%d %H:%M:%S')

# Add admittime column from other dataframe
df_poe = df_poe.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_poe['days_since_admission'] = df_poe['ordertime'] - df_poe['admittime']

# Fill any non time values
df_poe['days_since_admission'] = df_poe['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_poe = df_poe.drop(columns=['admittime'])

In [None]:
# Drop 
df_poe = df_poe.drop(columns=['poe_id','subject_id','ordertime','discontinue_of_poe_id','discontinued_by_poe_id',
                                       'order_status'])

In [None]:
# Encode
df_poe = pd.get_dummies(df_poe, columns=['order_type','transaction_type'])

In [None]:
# Impute with N/A and encode
df_poe['order_subtype'] = df_poe['order_subtype'].fillna('N/A')
df_poe['order_provider_id'] = df_poe['order_provider_id'].fillna('N/A')
df_poe = pd.get_dummies(df_poe, columns=['order_subtype','order_provider_id'])

In [None]:
df_poe

#### Split into train and test

In [None]:
data = df_poe

# Split the dataset into training and testing sets
poe_data_train, poe_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", poe_data_train.shape)
print("Testing set shape:", poe_data_test.shape)

In [None]:
# uncomment and run if changes are made

poe_data_train.to_csv('poe_data_train.csv', index=False)
poe_data_test.to_csv('poe_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### prescriptions

In [None]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

In [None]:
df_prescriptions.head()

Drop na and encode: dose_val_rx, form_val_disp, order_provider_id
Drop: subject_id, pharmacy_id, starttime, stoptime, form_rx (mostly null), poe_id
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [None]:
# Make a feature of stoptime-starttime called duration 

# Convert to datetime
df_prescriptions['stoptime'] = pd.to_datetime(df_prescriptions['stoptime'], format='%Y/%m/%d %H:%M')
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'], format='%Y/%m/%d %H:%M')

df_prescriptions['duration'] = df_prescriptions['stoptime'] - df_prescriptions['starttime']

# Fill any non time values
df_prescriptions['duration'] = df_prescriptions['duration'].fillna(pd.Timedelta(0))

In [None]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [None]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [None]:
# Impute with N/A and encode
df_prescriptions['formulary_drug_cd'] = df_prescriptions['formulary_drug_cd'].fillna('N/A')
df_prescriptions['gsn'] = df_prescriptions['gsn'].fillna('N/A')
df_prescriptions['prod_strength'] = df_prescriptions['prod_strength'].fillna('N/A')
df_prescriptions['route'] = df_prescriptions['route'].fillna('N/A')

# Impute with 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

df_prescriptions = pd.get_dummies(df_prescriptions, columns=['formulary_drug_cd','gsn','prod_strength',
                                                            'route','drug_type','drug','dose_unit_rx','form_unit_disp',
                                                            'dose_val_rx','form_val_disp','ndc'])

In [None]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [None]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

#### Split into train and test

In [None]:
data = df_prescriptions

# Split the dataset into training and testing sets
prescriptions_data_train, prescriptions_data_test= train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", prescriptions_data_train.shape)
print("Testing set shape:", prescriptions_data_test.shape)

In [None]:
# uncomment and run if changes are made

prescriptions_data_train.to_csv('prescriptions_data_train.csv', index=False)
prescriptions_data_test.to_csv('prescriptions_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 4890 to 2874 or less

### procedures_icd

In [None]:
file = "hosp/procedures_icd.csv"
full_path = path + file

df_procedures = pd.read_csv(full_path)

In [None]:
df_procedures['icd_code'].value_counts()

Drop: subject_id, chartdate
Encode: icd_code

In [None]:
# make a feature called days_since_admission of chartdate - admitdate

# Convert to datetime
df_procedures['chartdate'] = pd.to_datetime(df_procedures['chartdate'], format='%Y-%m-%d')

# Add admittime column from other dataframe
df_procedures = df_procedures.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
df_procedures['admittime'] = df_procedures['admittime'].dt.date
df_procedures['chartdate'] = df_procedures['chartdate'].dt.date

df_procedures['days_since_admission'] = df_procedures['chartdate'] - df_procedures['admittime']

# Fill any non time values
df_procedures['days_since_admission'] = df_procedures['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_procedures = df_procedures.drop(columns=['admittime'])

In [None]:
# Drop 
df_procedures = df_procedures.drop(columns=['subject_id','chartdate'])

In [None]:
# Encode
df_procedures = pd.get_dummies(df_procedures, columns=['icd_code'])

#### Split into train and test

In [None]:
data = df_procedures

# Split the dataset into training and testing sets
procedures_data_train, procedures_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedures_data_train.shape)
print("Testing set shape:", procedures_data_test.shape)

In [None]:
# uncomment and run if changes are made

procedures_data_train.to_csv('procedures_data_train.csv', index=False)
procedures_data_test.to_csv('procedures_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 355 to 115

### services

In [None]:
file = "hosp/services.csv"
full_path = path + file

df_services = pd.read_csv(full_path)

In [None]:
df_services.head()

Drop: subject_id, transfertime
Impute with N/A and encode: prev_service
Encode: curr_service

In [None]:
# Make a feature called days_since_admission using transfertime-admittime 

# Convert to datetime
df_services['transfertime'] = pd.to_datetime(df_services['transfertime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_services = df_services.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_services['days_since_admission'] = df_services['transfertime'] - df_services['admittime']

# Fill any non time values
df_services['days_since_admission'] = df_services['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_services = df_services.drop(columns=['admittime'])

In [None]:
# Drop 
df_services = df_services.drop(columns=['subject_id','transfertime'])

In [None]:
# Impute with N/A and encode
df_services['prev_service'] = df_services['prev_service'].fillna('N/A')
df_services = pd.get_dummies(df_services, columns=['prev_service','curr_service'])

#### Split into train and test

In [None]:
data = df_services

# Split the dataset into training and testing sets
services_data_train, services_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", services_data_train.shape)
print("Testing set shape:", services_data_test.shape)

In [None]:
# uncomment and run if changes are made

services_data_train.to_csv('services_data_train.csv', index=False)
services_data_test.to_csv('services_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### transfers

In [None]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [None]:
df_transfers.head()

Drop: subject_id, transfer_id, intime, outtime
Encode: eventtype
Impute with N/A and encode: careunit

In [None]:
# Make a days_since_admission feature of intime-admittime

# Convert to datetime
df_transfers['intime'] = pd.to_datetime(df_transfers['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_transfers = df_transfers.merge(df_admittime, on='hadm_id', how='left')

df_transfers['days_since_admission'] = df_transfers['intime'] - df_transfers['admittime']

# Fill any non time values
df_transfers['days_since_admission'] = df_transfers['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_transfers = df_transfers.drop(columns=['admittime'])

In [None]:
# Make a duration feature of outtime-intime 

# Convert to datetime
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'], format='%Y-%m-%d %H:%M:%S')

df_transfers['duration'] = df_transfers['outtime'] - df_transfers['intime']

# Fill any non time values
df_transfers['duration'] = df_transfers['duration'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_transfers = df_transfers.drop(columns=['subject_id','transfer_id','intime','outtime'])

In [None]:
# Impute with N/A and encode
df_transfers['careunit'] = df_transfers['careunit'].fillna('N/A')
df_transfers = pd.get_dummies(df_transfers, columns=['eventtype','careunit'])

#### Split into train and test

In [None]:
data = df_transfers

# Split the dataset into training and testing sets
transfers_data_train, transfers_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", transfers_data_train.shape)
print("Testing set shape:", transfers_data_test.shape)

In [None]:
# uncomment and run if changes are made

transfers_data_train.to_csv('transfers_data_train.csv', index=False)
transfers_data_test.to_csv('transfers_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### chartevents

In [16]:
file = "icu/chartevents.csv"
full_path = path + file

df_chart = pd.read_csv(full_path)

In [17]:
df_chart['itemid'].value_counts()

227969    19330
220045    13913
220210    13913
220277    13540
220048    12460
          ...  
229448        1
227847        1
229592        1
225743        1
229160        1
Name: itemid, Length: 1318, dtype: int64

In [18]:
df_chart.head()

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,10005817,20626031,32604416,6770.0,2132-12-16 00:00:00,2132-12-15 23:45:00,225054,On,,,0.0
1,10005817,20626031,32604416,6770.0,2132-12-16 00:00:00,2132-12-15 23:43:00,223769,100,100.0,%,0.0
2,10005817,20626031,32604416,6770.0,2132-12-16 00:00:00,2132-12-15 23:47:00,223956,Atrial demand,,,0.0
3,10005817,20626031,32604416,6770.0,2132-12-16 00:00:00,2132-12-15 23:47:00,224866,Yes,,,0.0
4,10005817,20626031,32604416,6770.0,2132-12-16 00:00:00,2132-12-15 23:45:00,227341,No,0.0,,0.0


Drop: subject_id, charttime, storetime, stay_id, caregiver_id (the person who documented the data)
Encode: value,itemid
Impute with 0: valuenum, warning
Impute with N/A and encode: valueuom

In [19]:
# Make a days_since_admission feature of charttime-admittime 

# Convert to datetime
df_chart['charttime'] = pd.to_datetime(df_chart['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_chart = df_chart.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_chart['days_since_admission'] = df_chart['charttime'] - df_chart['admittime']

# Fill any non time values
df_chart['days_since_admission'] = df_chart['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_chart = df_chart.drop(columns=['admittime'])

In [20]:
# Make a delay feature of storetime-charttime

# Convert to datetime
df_chart['storetime'] = pd.to_datetime(df_chart['storetime'], format='%Y-%m-%d %H:%M:%S')

df_chart['delay'] = df_chart['storetime'] - df_chart['charttime']

# Fill any non time values
df_chart['delay'] = df_chart['delay'].fillna(pd.Timedelta(0))

In [21]:
# Drop 
df_chart = df_chart.drop(columns=['subject_id','charttime','storetime', 'stay_id','caregiver_id'])

In [22]:
# Impute with N/A and encode
df_chart['valueuom'] = df_chart['valueuom'].fillna('N/A')
df_chart = pd.get_dummies(df_chart, columns=['valueuom','value','itemid'])

MemoryError: Unable to allocate 2.64 GiB for an array with shape (4242, 668862) and data type uint8

In [None]:
# Impute with 0
df_chart['valuenum'] = df_chart['valuenum'].fillna(0)
df_chart['warning'] = df_chart['warning'].fillna(0)

In [None]:
# df_chart.describe()

#### Split into train and test

In [None]:
data = df_chart

# Split the dataset into training and testing sets
chart_data_train, chart_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", chart_data_train.shape)
print("Testing set shape:", chart_data_test.shape)

In [23]:
# uncomment and run if changes are made

chart_data_train.to_csv('chart_data_train.csv', index=False)
chart_data_test.to_csv('chart_data_test.csv', index=False)

NameError: name 'chart_data_train' is not defined

#### Dimensionality reduction

In [None]:
# Fine

### icustays

In [24]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

In [25]:
df_icustays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,10018328,23786647,31269608,Neuro Stepdown,Neuro Stepdown,2154-04-24 23:03:44,2154-05-02 15:55:21,7.702512
1,10020187,24104168,37509585,Neuro Surgical Intensive Care Unit (Neuro SICU),Neuro Stepdown,2169-01-15 04:56:00,2169-01-20 15:47:50,5.452662
2,10020187,26842957,32554129,Neuro Intermediate,Neuro Intermediate,2170-02-24 18:18:46,2170-02-25 15:15:26,0.872685
3,10012853,27882036,31338022,Trauma SICU (TSICU),Trauma SICU (TSICU),2176-11-26 02:34:49,2176-11-29 20:58:54,3.766725
4,10020740,25826145,32145159,Trauma SICU (TSICU),Trauma SICU (TSICU),2150-06-03 20:12:32,2150-06-04 21:05:58,1.037106


In [26]:
df_icustays['outtime'].value_counts()

2154-05-02 15:55:21    1
2141-12-18 14:16:17    1
2137-10-14 17:08:34    1
2118-11-19 20:34:51    1
2120-05-14 16:28:21    1
                      ..
2150-03-28 22:20:47    1
2129-01-05 14:11:03    1
2156-04-26 18:58:41    1
2131-03-08 18:30:38    1
2177-03-29 18:03:36    1
Name: outtime, Length: 140, dtype: int64

Drop: subject_id, stay_id, intime, outtime
Encode: first_careunit, last_careunit

In [27]:
# make a feature called days_since_admission using intime-admittime

# Convert to datetime
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_icustays = df_icustays.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_icustays['days_since_admission'] = df_icustays['intime'] - df_icustays['admittime']

# Fill any non time values
df_icustays['days_since_admission'] = df_icustays['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_icustays = df_icustays.drop(columns=['admittime'])

In [28]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [29]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

#### Split into train and test

In [30]:
data = df_icustays

# Split the dataset into training and testing sets
icustays_data_train, icustays_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", icustays_data_train.shape)
print("Testing set shape:", icustays_data_test.shape)

Training set shape: (112, 21)
Testing set shape: (28, 21)


In [31]:
# uncomment and run if changes are made

icustays_data_train.to_csv('icustays_data_train.csv', index=False)
icustays_data_test.to_csv('icustays_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### ingredientevents

In [32]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

In [33]:
df_ingredient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25728 entries, 0 to 25727
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   subject_id         25728 non-null  int64  
 1   hadm_id            25728 non-null  int64  
 2   stay_id            25728 non-null  int64  
 3   caregiver_id       25728 non-null  int64  
 4   starttime          25728 non-null  object 
 5   endtime            25728 non-null  object 
 6   storetime          25728 non-null  object 
 7   itemid             25728 non-null  int64  
 8   amount             25728 non-null  float64
 9   amountuom          25728 non-null  object 
 10  rate               16643 non-null  float64
 11  rateuom            16643 non-null  object 
 12  orderid            25728 non-null  int64  
 13  linkorderid        25728 non-null  int64  
 14  statusdescription  25728 non-null  object 
 15  originalamount     25728 non-null  int64  
 16  originalrate       257

In [34]:
df_ingredient['storetime']

0        2132-12-17 06:01:00
1        2132-12-17 06:01:00
2        2132-12-17 12:48:00
3        2132-12-17 12:48:00
4        2132-12-15 16:42:00
                ...         
25723    2153-03-28 23:22:00
25724    2153-03-28 02:58:00
25725    2153-03-28 02:58:00
25726    2153-03-29 20:58:00
25727    2153-03-29 20:58:00
Name: storetime, Length: 25728, dtype: object

Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [36]:
# Make a duration feature of endtime-starttime 

# Convert to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')


df_ingredient['duration'] = df_ingredient['endtime'] - df_ingredient['starttime']

# Fill any non time values
df_ingredient['duration'] = df_ingredient['duration'].fillna(pd.Timedelta(0))

In [37]:
# make a recording_delay feature of storetime-endtime

# Convert to datetime
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']

# Fill any non time values
df_ingredient['recording_delay'] = df_ingredient['recording_delay'].fillna(pd.Timedelta(0))

In [38]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [39]:
# Impute with N/A and encode
df_ingredient['rateuom'] = df_ingredient['rateuom'].fillna('N/A')
df_ingredient['linkorderid'] = df_ingredient['linkorderid'].fillna('N/A')
df_ingredient = pd.get_dummies(df_ingredient, columns=['rateuom','amountuom','statusdescription','itemid','linkorderid'])

In [40]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

#### Split into train and test

In [41]:
data = df_ingredient
# Split the dataset into training and testing sets
ingredient_data_train, ingredient_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", ingredient_data_train.shape)
print("Testing set shape:", ingredient_data_test.shape)

Training set shape: (20582, 7729)
Testing set shape: (5146, 7729)


In [42]:
# uncomment and run if changes are made

ingredient_data_train.to_csv('ingredient_data_train.csv', index=False)
ingredient_data_test.to_csv('ingredient_data_test.csv', index=False)


#### Dimensionality reduction

In [None]:
# Need to reduce from 7727 to 4116

### inputevents

In [43]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

In [44]:
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20404 entries, 0 to 20403
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   subject_id                     20404 non-null  int64  
 1   hadm_id                        20404 non-null  int64  
 2   stay_id                        20404 non-null  int64  
 3   caregiver_id                   20404 non-null  int64  
 4   starttime                      20404 non-null  object 
 5   endtime                        20404 non-null  object 
 6   storetime                      20404 non-null  object 
 7   itemid                         20404 non-null  int64  
 8   amount                         20404 non-null  float64
 9   amountuom                      20404 non-null  object 
 10  rate                           11038 non-null  float64
 11  rateuom                        11038 non-null  object 
 12  orderid                        20404 non-null 

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [45]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [46]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [47]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [48]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [49]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [50]:
df_input = df_input.dropna()

#### Split into train and test

In [51]:
data = df_input

# Split the dataset into training and testing sets
input_data_train, input_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", input_data_train.shape)
print("Testing set shape:", input_data_test.shape)

Training set shape: (16323, 222)
Testing set shape: (4081, 222)


In [52]:
# uncomment and run if changes are made

input_data_train.to_csv('input_data_train.csv', index=False)
input_data_test.to_csv('input_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### outputevents

In [53]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

Drop: subject_id, charttime, storetime, valueuom, stay_id, caregiver_id'
Encode: itemid

In [55]:
# Make a days_since_admission feature using charttime-admittime 

# Convert to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_output = df_output.merge(df_admittime, on='hadm_id', how='left')

df_output['days_since_admission'] = df_output['charttime'] - df_output['admittime']

# Fill any non time values
df_output['days_since_admission'] = df_output['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_output = df_output.drop(columns=['admittime'])

In [56]:
# Make a recording_delay feature using storetime-charttime

# Convert to datetime
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

df_output['recording_delay'] = df_output['storetime'] - df_output['charttime']

# Fill any non time values
df_output['recording_delay'] = df_output['recording_delay'].fillna(pd.Timedelta(0))

In [57]:
# Drop 
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','storetime','valueuom','caregiver_id'])

In [58]:
#Encode
df_output = pd.get_dummies(df_output, columns=['itemid'])

#### Split into train and test

In [59]:
data = df_output

# Split the dataset into training and testing sets
output_data_train, output_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", output_data_train.shape)
print("Testing set shape:", output_data_test.shape)

Training set shape: (7489, 43)
Testing set shape: (1873, 43)


In [60]:
# uncomment and run if changes are made

output_data_train.to_csv('output_data_train.csv', index=False)
output_data_test.to_csv('output_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### procedureevents

In [61]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

In [62]:
df_procedure_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1468 entries, 0 to 1467
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   subject_id                1468 non-null   int64  
 1   hadm_id                   1468 non-null   int64  
 2   stay_id                   1468 non-null   int64  
 3   caregiver_id              1226 non-null   float64
 4   starttime                 1468 non-null   object 
 5   endtime                   1468 non-null   object 
 6   storetime                 1468 non-null   object 
 7   itemid                    1468 non-null   int64  
 8   value                     1468 non-null   float64
 9   valueuom                  1468 non-null   object 
 10  location                  353 non-null    object 
 11  locationcategory          353 non-null    object 
 12  orderid                   1468 non-null   int64  
 13  linkorderid               1468 non-null   int64  
 14  ordercat

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
MAKE DURATION FEATURE 

In [63]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [64]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [65]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [66]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

#### Split into train and test

In [67]:
data = df_procedure_events

# Split the dataset into training and testing sets
procedure_events_data_train, procedure_events_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedure_events_data_train.shape)
print("Testing set shape:", procedure_events_data_test.shape)

Training set shape: (1174, 163)
Testing set shape: (294, 163)


In [68]:
# uncomment and run if changes are made

procedure_events_data_train.to_csv('procedure_events_data_train.csv', index=False)
procedure_events_data_test.to_csv('procedure_events_data_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine