In [1]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [2]:
path = "C:/Project/Data/"

### Target variable calculation

In [3]:
# LOS based on admissions table (target dataframe)

file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [4]:
df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

In [5]:
df_los_hadm = pd.DataFrame()
df_los_subject = pd.DataFrame()

df_los_subject['subject_id'] = df_admissions['subject_id']
df_los_hadm['hadm_id'] = df_admissions['hadm_id']
df_los_hadm['los'] = df_admissions['dischtime']-df_admissions['admittime']
df_los_subject['los'] = df_admissions['dischtime']-df_admissions['admittime']

In [6]:
df_los_hadm

Unnamed: 0,hadm_id,los
0,24181354,8 days 23:24:00
1,25926192,7 days 20:12:00
2,23983182,5 days 17:33:00
3,22942076,1 days 17:41:00
4,21606243,2 days 02:11:00
...,...,...
270,24745425,5 days 15:57:00
271,22168393,4 days 12:18:00
272,27708593,7 days 07:10:00
273,23251352,4 days 04:56:00


In [7]:
# Average LOS for each subject_id
df_los_subject = df_los_subject.groupby('subject_id').mean().reset_index()

In [8]:
df_los_subject

Unnamed: 0,subject_id,los
0,10000032,1 days 10:40:00
1,10001217,6 days 08:30:30
2,10001725,2 days 23:52:00
3,10002428,5 days 14:54:34.285714285
4,10002495,6 days 21:24:00
...,...,...
95,10038999,9 days 02:41:30
96,10039708,7 days 11:56:06
97,10039831,5 days 07:19:00
98,10039997,2 days 07:47:00


In [9]:
df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

In [10]:
# df_admittime

### omr

In [None]:
# optional to do
# Make a chartdate - admittime feature called days_since_admission 
# - Can't because hadm not provided here 

In [None]:
file = "hosp/omr.csv"
full_path = path + file

df_omr = pd.read_csv(full_path)

In [None]:
df_omr.info()

In [None]:
# Combine result_name and seq_num into the column name with result_value from the same row as its value 

# Function to combine values from columns into a new column 
def new_columns(row):
    return row['result_name'] + '_' + str(row['seq_num'])

new_names = df_omr.apply(new_columns, axis=1) # series of names of combinations 


def add_values(row, colName):
    name = row['result_name'] + '_' + str(row['seq_num'])
    if str(name) == colName:
        return row['result_value']
    else:
        return 0


for i in range(len(new_names)):
    df_omr[new_names[i]] = df_omr.apply(add_values, args=(new_names[i],), axis=1)

In [None]:
# Drop seq_num, result_name, result_value
df_omr = df_omr.drop(columns=['seq_num', 'result_name', 'result_value'])
# sequence number doesn't add any useful info

In [None]:
df_omr['subject_id'].value_counts()

# The patient with the most measurements has 391 so could make it 391 features for everyone but most will have lots of 
# zeroes
# Fine as sparcity represents not taking many measurements which could also be a factor?
# Could have number of measurements as an additional feature too

In [None]:
filtered_df = df_omr[df_omr['subject_id'] == 10019003]
filtered_df

# Preserves every measurement made for each subject across all of their stays 
# Only one entry per row 

In [None]:
backup = df_omr.copy()

In [None]:
# ordering by date (so each patients measurements are chronological from top to bottom)

df_omr = df_omr.sort_values(by=['subject_id', 'chartdate'])

df_omr

# This preserves for example, increase in weight over time 

In [None]:
# drop chartdate since the time shift is not consistent for each subject 
df_omr = df_omr.drop(columns=['chartdate'])

In [None]:
# reset index
df_omr = df_omr.reset_index(drop=True)

In [None]:
# df_omr.head(20)

In [None]:
df_omr_final = pd.DataFrame()

In [None]:
# Row for each subject, features for every measurement made on them 

colNames = df_omr.columns.tolist()
colNames.remove('subject_id')

x = 0
prev_subject = 0


for row in range(len(df_omr)):
    current_subject = df_omr['subject_id'][row] 
    if current_subject != prev_subject:
        x = 0 # reset x
    for i in range(len(colNames)): # for each column
        if df_omr.loc[row, colNames[i]] != 0:
            if colNames[i] + '_0' not in df_omr_final.columns: # New column name added
                x = 0 # reset x
            new_name = colNames[i] + '_' + str(x)
            if new_name in df_omr_final.columns and (current_subject == prev_subject): # Trying to add another of the same 
                # measurement for the same patient 
                x += 1
                new_name = colNames[i] + '_' + str(x)
            df_omr_final.loc[current_subject, new_name] = df_omr.loc[row, colNames[i]]
            df_omr_final = df_omr_final.copy()
            break # leave for loop as the rest of the columns will be 0 for this row
    prev_subject = current_subject
    

In [None]:
df_omr_final.fillna(0, inplace=True)
df_omr_final

In [None]:
#  df_omr_final.columns.tolist()

In [None]:
# Convert all values to numbers 

df_omr_final = df_omr_final.astype(str)

# Function to convert fraction string to decimal
def fraction_to_decimal(fraction_str):
    try:
        numerator, denominator = map(int, fraction_str.split('/'))
        return numerator / denominator
    except ValueError:
        return fraction_str  # Return unchanged if not a fraction

# Apply the function to the entire DataFrame
df_omr_final = df_omr_final.applymap(fraction_to_decimal)

In [None]:
df_omr_final = df_omr_final.astype(float)
# df_omr_final.info()

In [None]:
# Reset the index and convert it to a column
df_omr_final.reset_index(inplace=True)
df_omr_final.rename(columns={'index': 'subject_id'}, inplace=True)

In [None]:
# Merge the DataFrames based on the ID column
df_omr_final = df_omr_final.merge(df_los_subject, on='subject_id', how='left')

In [None]:
# What does this show?
# Each patient (subject_id is the index of the df) has measurements showing type_sequence_date
# sequence starts from 1 and it is used when the same measurement was taken more than once in a single day
# date starts from 0 and is used when the same measurement for the same patient was taken on a different day
# Note that they were NOT taken on the same date for each patient but the bigger the date integer, the later the measurement
# was taken, relative to that patient's admission  

# Weight (Lbs)_1_0 is the first time the patient was weighed, Weight (Lbs)_3_0 is the third time they were weighed on that 
# same day as they were first weighed
# Weight (Lbs)_1_1 is from a separate (later) date where the patient was weighed again, this is the first measurement 
# from this day 
# Any non applicable measurements are imputed with 0 

Decide which ones to keep all measurements of per patient and which to just take the average and keep as one record for patient (that aren’t likely to change):

Remove blood pressure sitting, lying and standing as too few samples 
Take average for height 

Could probably drop a few of the features that are really empty ?

In [None]:
# drop subject_id
df_omr_final = df_omr_final.drop(columns=['subject_id'])

In [None]:
df_omr_final # target variable los

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_omr_final.to_csv('df_omr.csv', index=False)

#### Learner target feature

Given a patients omr data (grouped by subject_id), predict their length of stay (take an average for multiple admissions)

#### Split into train and test

In [None]:
data = df_omr_final.drop(columns=['los'])
target = df_omr_final['los']

# Split the dataset into training and testing sets
omr_data_train, omr_data_test, omr_label_train, omr_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", omr_data_train.shape, omr_label_train.shape)
print("Testing set shape:", omr_data_test.shape, omr_label_test.shape)

In [None]:
# uncomment and run if changes are made

# omr_data_train.to_csv('omr_data_train.csv', index=False)
# omr_data_test.to_csv('omr_data_test.csv', index=False)

# omr_label_train.to_csv('omr_label_train.csv', index=False)
# omr_label_test.to_csv('omr_label_test.csv', index=False)

#### Dimensionality reduction 

In [11]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "omr_data_train.csv"
full_path = path + file

omr_data_train = pd.read_csv(full_path)

file = "omr_data_test.csv"
full_path = path + file

omr_data_test = pd.read_csv(full_path)

file = "omr_label_train.csv"
full_path = path + file

omr_label_train = pd.read_csv(full_path)

file = "omr_label_test.csv"
full_path = path + file

omr_label_test = pd.read_csv(full_path)

In [12]:
omr_data_train

Unnamed: 0,BMI (kg/m2)_1_0,Weight (Lbs)_1_0,Blood Pressure_1_0,BMI (kg/m2)_1_1,Weight (Lbs)_1_1,Height (Inches)_1_0,Weight (Lbs)_17_0,Weight (Lbs)_5_0,Weight (Lbs)_10_0,Weight (Lbs)_12_0,...,Weight (Lbs)_14_1,Weight (Lbs)_20_0,Weight (Lbs)_27_0,Weight (Lbs)_23_0,Weight (Lbs)_19_0,Weight (Lbs)_24_0,Weight (Lbs)_16_1,Weight (Lbs)_11_1,Weight (Lbs)_25_0,Weight (Lbs)_22_0
0,0.0,0.0,0.000000,25.6,0.00,62.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,168.0,0.000000,25.5,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,167.0,0.000000,25.8,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,2.038462,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,138.0,0.000000,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,0.0,0.0,0.000000,25.9,0.00,73.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59,0.0,0.0,0.000000,0.0,165.00,64.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
60,0.0,0.0,0.000000,33.3,0.00,62.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61,0.0,0.0,0.000000,30.5,0.00,68.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Need to reduce to 12 features, this is an especially sparse matrix

#### Truncated Singular Value Decomposition (SVD)

In [18]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 12

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(omr_data_train)
transformed_matrix = svd.transform(omr_data_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))


Explained Variance Ratio:
[0.33342246 0.10604019 0.08112751 0.04986414 0.0442775  0.03792258
 0.03462321 0.03055395 0.02590118 0.02647111 0.02490924 0.02347857]

 Amount of original variance conserved: 0.8185916541599938


#### Non-negative Matrix Factorization (NMF)

In [26]:
from sklearn.decomposition import NMF


# Number of desired features (components)
n_components = 12

# Initialize NMF with the desired number of components
nmf = NMF(n_components=n_components)

# Fit the NMF model to the sparse matrix and transform the data
W = nmf.fit_transform(omr_data_train)
H = nmf.components_

# Reconstruct the original matrix
reconstructed_matrix = np.dot(W, H)

# Print the factorized matrices (W and H) and the reconstructed matrix
# print("Factorized Matrix W:")
# print(W)
# print("\nFactorized Matrix H:")
# print(H)
# print("\nReconstructed Matrix:")
# print(reconstructed_matrix)

In [23]:
# Calculate the Frobenius norm of the reconstruction error
# which measures the element-wise difference between the original and reconstructed matrices
reconstruction_error = np.linalg.norm(omr_data_train - reconstructed_matrix, 'fro')

# Calculate the Frobenius norm of the original matrix
original_norm = np.linalg.norm(omr_data_train, 'fro')

# Calculate the percentage of variance conserved
variance_conserved = (1 - reconstruction_error / original_norm) * 100

print("Reconstruction Error (Frobenius norm):", reconstruction_error)
print("Original Matrix Norm (Frobenius norm):", original_norm)
print("Percentage of Variance Conserved:", variance_conserved)

# By evaluating the reconstruction error and the percentage of variance conserved, you can assess how well the NMF 
# factorization captures the structure of the original data. 
# Lower reconstruction error and higher percentage of variance conserved indicate a better quality of factorization

Reconstruction Error (Frobenius norm): 2111.202976270996
Original Matrix Norm (Frobenius norm): 4965.416241595012
Percentage of Variance Conserved: 57.48185300991351


#### Sparse PCA - takes too long

In [39]:
from sklearn.decomposition import SparsePCA

# Number of desired components
n_components = 12

# Initialize Sparse PCA with the desired number of components and sparsity level
sparse_pca = SparsePCA(n_components=n_components, alpha=0.1)  # Adjust alpha as needed

# Fit Sparse PCA to the sparse matrix and transform the data
sparse_pca.fit(omr_data_train)
transformed_matrix = sparse_pca.transform(omr_data_train)

# Get the loading matrix (sparse components)
loading_matrix = sparse_pca.components_

# Print the transformed matrix and loading matrix
print("Transformed Matrix:")
print(transformed_matrix)
print("\nLoading Matrix (Sparse Components):")
print(loading_matrix)

In [30]:
# Compute the squared norms of the components
squared_norms = np.linalg.norm(loading_matrix, axis=1)**2

# Calculate the proportion of explained variance for each component
total_squared_norm = np.linalg.norm(omr_data_train)**2
explained_variance_ratio = squared_norms / total_squared_norm

# Print the explained variance ratio of each component
for i, evr in enumerate(explained_variance_ratio):
    print(f"Explained Variance Ratio of Component {i + 1}: {evr:.4f}")

# Calculate the total explained variance ratio
total_explained_variance_ratio = np.sum(explained_variance_ratio)
print("\nTotal Explained Variance Ratio:", total_explained_variance_ratio)

Explained Variance Ratio of Component 1: 0.0000
Explained Variance Ratio of Component 2: 0.0000
Explained Variance Ratio of Component 3: 0.0000
Explained Variance Ratio of Component 4: 0.0000
Explained Variance Ratio of Component 5: 0.0000
Explained Variance Ratio of Component 6: 0.0000
Explained Variance Ratio of Component 7: 0.0000
Explained Variance Ratio of Component 8: 0.0000
Explained Variance Ratio of Component 9: 0.0000
Explained Variance Ratio of Component 10: 0.0000
Explained Variance Ratio of Component 11: 0.0000
Explained Variance Ratio of Component 12: 0.0000

Total Explained Variance Ratio: 4.867096141886648e-07


#### Random Projection

In [38]:
from sklearn.random_projection import SparseRandomProjection

# Number of desired features (components)
n_components = 12

# Initialize Random Projection with the desired number of components
random_projection = SparseRandomProjection(n_components=n_components)

# Fit Random Projection to the sparse matrix and transform the data
transformed_matrix = random_projection.fit_transform(omr_data_train)

# Print the transformed matrix
print("Transformed Matrix:")
print(transformed_matrix)

Assess the quality of the dimensionality reduction performed by Random Projection by examining the pairwise distances between points in the original and projected spaces. A well-preserved variance will result in similar pairwise distances between points in both spaces.

In [33]:
# Calculate the pairwise distances in the original space
original_distances = np.linalg.norm(omr_data_train - omr_data_train.mean(axis=0), axis=1)

# Calculate the pairwise distances in the projected space
projected_distances = np.linalg.norm(transformed_matrix - transformed_matrix.mean(axis=0), axis=1)

# Compute the distortion in pairwise distances
distortion = np.abs(original_distances - projected_distances)

# Calculate the maximum and average distortion
max_distortion = np.max(distortion)
average_distortion = np.mean(distortion)

print("Maximum Distortion:", max_distortion)
print("Average Distortion:", average_distortion)

Maximum Distortion: 287.1814575402386
Average Distortion: 107.38740883522641


In [None]:
# High distortion values suggest low preservation of variance 

In [48]:
!pip install umap-learn

Collecting umap-learn
  Downloading umap-learn-0.5.5.tar.gz (90 kB)
     ---------------------------------------- 0.0/90.9 kB ? eta -:--:--
     ---------------------------------------- 90.9/90.9 kB 5.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pynndescent>=0.5 (from umap-learn)
  Obtaining dependency information for pynndescent>=0.5 from https://files.pythonhosted.org/packages/4e/82/0b9851a2fd4da9b57d7931446f5ebab92a98f1f35d3dc0dae5f9ed50a462/pynndescent-0.5.11-py3-none-any.whl.metadata
  Downloading pynndescent-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Downloading pynndescent-0.5.11-py3-none-any.whl (55 kB)
   ---------------------------------------- 0.0/55.8 kB ? eta -:--:--
   ---------------------------------------- 55.8/55.8 kB ? eta 0:00:00
Building wheels for collected packages: umap-learn
  Building wheel for umap-learn (setup.py): started
  Building wheel for umap-learn (setup.py): finishe

#### Uniform Manifold Approximation and Projection (UMAP)

In [52]:
import umap
import matplotlib.pyplot as plt

# Initialize UMAP with desired parameters
umap_model = umap.UMAP(n_neighbors=5, min_dist=0.3, n_components=2)

# Fit UMAP to the data and transform it to the lower-dimensional space
umap_result = umap_model.fit_transform(omr_data_train)

# Plot the results
plt.scatter(umap_result[:, 0], umap_result[:, 1], c='b', marker='o')
plt.title('UMAP Projection of Data')
plt.xlabel('UMAP 1')
plt.ylabel('UMAP 2')
plt.show()

AttributeError: module 'umap' has no attribute 'UMAP'

In [41]:
!pip install umap

Collecting umap
  Downloading umap-0.1.1.tar.gz (3.2 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: umap
  Building wheel for umap (setup.py): started
  Building wheel for umap (setup.py): finished with status 'done'
  Created wheel for umap: filename=umap-0.1.1-py3-none-any.whl size=3550 sha256=d03b2e81ba4d6136e2a1a8575fb42f4e267be78ad66cb2494e6328c50f7c6bff
  Stored in directory: c:\users\jenni\appdata\local\pip\cache\wheels\82\d8\73\e9eb3334baaad795ff0278363ff1aca7568bdf2793e452a527
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


### admissions

In [None]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

In [None]:
# df_admissions['subject_id'].value_counts().head(50)

In [None]:
df_admissions.head() # admittime and dischtime for los

To drop: subject_id, admittime, dischtime, deathtime, hospital_expire_flag

In [None]:
# Make an ed_duration feature for edouttime - edregtime (how long the patient stayed in the emergency department)

# Convert to datetime
df_admissions['edouttime'] = pd.to_datetime(df_admissions['edouttime'], format='%d/%m/%Y %H:%M')
df_admissions['edregtime'] = pd.to_datetime(df_admissions['edregtime'], format='%d/%m/%Y %H:%M')

df_admissions['ed_duration'] = df_admissions['edouttime'] - df_admissions['edregtime']

# Fill any non time values
df_admissions['ed_duration'] = df_admissions['ed_duration'].fillna(pd.Timedelta(0))

In [None]:
df_admissions = df_admissions.drop(columns=['subject_id', 'admittime', 'dischtime', 'deathtime', 'hospital_expire_flag'
                            , 'edregtime', 'edouttime', 'admit_provider_id','discharge_location'])

# discharge_location is an outcome feature, should not be used to predict LOS as not known beforehand

In [None]:
# Fill Null with N/A and then one hot encode
df_admissions['marital_status'] = df_admissions['marital_status'].fillna('N/A')
df_admissions = pd.get_dummies(df_admissions, columns=['admission_type', 'admission_location', 
                                                      'insurance','language', 'marital_status','race'])

In [None]:
# df_admissions

#### Learner target feature

Given patient data given at time of admission, predict their length of stay (dischtime - admittime)

In [None]:
df_admissions = df_admissions.merge(df_los_hadm, on='hadm_id', how='left')
df_admissions = df_admissions.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_admissions.to_csv('df_admissions.csv', index=False)

#### Split into train and test

In [None]:
data = df_admissions.drop(columns=['los'])
target = df_admissions['los']

# Split the dataset into training and testing sets
admissions_data_train, admissions_data_test, admissions_label_train, admissions_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", admissions_data_train.shape, admissions_label_train.shape)
print("Testing set shape:", admissions_data_test.shape, admissions_label_test.shape)

In [None]:
# uncomment and run if changes are made

admissions_data_train.to_csv('admissions_data_train.csv', index=False)
admissions_data_test.to_csv('admissions_data_test.csv', index=False)

admissions_label_train.to_csv('admissions_label_train.csv', index=False)
admissions_label_test.to_csv('admissions_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### diagnoses - Do not use

In [None]:
# Diagnoses are recorded upon discharge so don't use to predice LOS

In [None]:
file = "hosp/diagnoses_icd.csv"
full_path = path + file

df_diagnoses = pd.read_csv(full_path)

In [None]:
df_diagnoses.info()
# Note that Icd_code and icd_version together relate to a particular diagnosis 

In [None]:
df_diagnoses = df_diagnoses.drop(columns=['subject_id'])

In [None]:
df_diagnoses = pd.get_dummies(df_diagnoses, columns=['icd_code'])

In [None]:
df_diagnoses #Currently has too many features 

#### Learner target feature

Given information about a billed diagnosis for a patient, predict their length of stay (based on hadm_id)

In [None]:
df_diagnoses = df_diagnoses.merge(df_los_hadm, on='hadm_id', how='left')
df_diagnoses = df_diagnoses.drop(columns=['hadm_id'])

In [None]:
# uncomment and run if changes are made

# Save DataFrame to CSV file
# df_diagnoses.to_csv('df_diagnoses.csv', index=False)

#### Split into train and test

In [None]:
# data = df_diagnoses.drop(columns=['los'])
# target = df_diagnoses['los']

# # Split the dataset into training and testing sets
# diagnoses_data_train, diagnoses_data_test, diagnoses_label_train, diagnoses_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# # Print the shapes of the resulting training and testing sets
# print("Training set shape:", diagnoses_data_train.shape, diagnoses_label_train.shape)
# print("Testing set shape:", diagnoses_data_test.shape, diagnoses_label_test.shape)

In [None]:
# uncomment and run if changes are made

# diagnoses_data_train.to_csv('diagnoses_data_train.csv', index=False)
# diagnoses_data_test.to_csv('diagnoses_data_test.csv', index=False)

# diagnoses_label_train.to_csv('diagnoses_label_train.csv', index=False)
# diagnoses_label_test.to_csv('diagnoses_label_test.csv', index=False)

### emar

In [None]:
# records for 65 different patients 
# 181 unique admissions

In [None]:
file = "hosp/emar.csv"
full_path = path + file

df_emar = pd.read_csv(full_path)

In [None]:
df_emar.info()
# print(df_emar.columns.tolist())

In [None]:
df_emar.head()

In [None]:
# df_emar['emar_seq'].value_counts()

Impute with N/A and encode: enter_provider_id, medication

Drop: subject_id, emar_id, poe_id, pharmacy_id, event_txt, storetime

poe_id is an identifier which links administrations in emar to orders in poe and prescriptions
storetime is when it was recorded in the table

In [None]:
# Make a feature called delay using scheduletime - charttime

# Convert to datetime
df_emar['scheduletime'] = pd.to_datetime(df_emar['scheduletime'], format='%Y/%m/%d %H:%M')
df_emar['charttime'] = pd.to_datetime(df_emar['charttime'], format='%Y/%m/%d %H:%M')

df_emar['delay'] = df_emar['charttime'] - df_emar['scheduletime']

# Fill any non time values
df_emar['delay'] = df_emar['delay'].fillna(pd.Timedelta(0))

In [None]:
df_emar = df_emar.drop(columns=['subject_id','emar_id','poe_id','pharmacy_id',
                               'event_txt','charttime','scheduletime','storetime'])

In [None]:
# Fill Null with N/A and then one hot encode
df_emar['enter_provider_id'] = df_emar['enter_provider_id'].fillna('N/A')
df_emar['medication'] = df_emar['medication'].fillna('N/A')
df_emar = pd.get_dummies(df_emar, columns=['enter_provider_id', 'medication'])

In [None]:
df_emar.info()

In [None]:
df_emar['delay'].value_counts()

#### Learner target feature

Relationship between particular medications, uniquely given by emar_id (and some other info regarding it) and length of stay (change hadm_id to LOS)

In [None]:
df_emar = df_emar.merge(df_los_hadm, on='hadm_id', how='left')
df_emar = df_emar.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_emar.to_csv('df_emar.csv', index=False)

#### Split into train and test

In [None]:
data = df_emar.drop(columns=['los'])
target = df_emar['los']

# Split the dataset into training and testing sets
emar_data_train, emar_data_test, emar_label_train, emar_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_data_train.shape, emar_label_train.shape)
print("Testing set shape:", emar_data_test.shape, emar_label_test.shape)

In [None]:
# uncomment and run if changes are made

emar_data_train.to_csv('emar_data_train.csv', index=False)
emar_data_test.to_csv('emar_data_test.csv', index=False)

emar_label_train.to_csv('emar_label_train.csv', index=False)
emar_label_test.to_csv('emar_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### emar_detail

In [None]:
file = "hosp/emar_detail.csv"
full_path = path + file

df_emar_detail = pd.read_csv(full_path,low_memory=False)

Fields that have lots of null values:
reason_for_no_barcode: drop
prior_infusion_rate: impute with zeroes
infusion_rate: impute with zeroes
infusion_rate_adjustment: impute with 'N/A', then one hot encoding
infusion_rate_adjustment_amount: impute with zeroes
infusion_rate_unit: impute with 'N/A', then one hot encoding
infusion_complete: impute with 'N/A', then one hot encoding
completion_interval: impute with 0, then ordinal encoding 
new_iv_bag_hung: impute with N, then binary encoding 

Text data to remove but maybe consider later:
product_description, product_description_other

In [None]:
df_emar_detail.columns

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['reason_for_no_barcode']) # Too hard to encode, adds not much value

In [None]:
# Impute with 0s
df_emar_detail['prior_infusion_rate'] = df_emar_detail['prior_infusion_rate'].fillna(0)
df_emar_detail['infusion_rate'] = df_emar_detail['infusion_rate'].fillna(0)
df_emar_detail['infusion_rate_adjustment_amount'] = df_emar_detail['infusion_rate_adjustment_amount'].fillna(0)

In [None]:
# Impute with N/A and encode
df_emar_detail['infusion_rate_adjustment'] = df_emar_detail['infusion_rate_adjustment'].fillna('N/A')
df_emar_detail['infusion_rate_unit'] = df_emar_detail['infusion_rate_unit'].fillna('N/A')
df_emar_detail['infusion_complete'] = df_emar_detail['infusion_complete'].fillna('N/A')
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['infusion_rate_adjustment','infusion_complete',
                                                         'infusion_rate_unit'])

In [None]:
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].fillna(0)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('PRN', 0)
#Converting all the intervals to minutes
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 2 hours', 120)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 4 hours', 240)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1 hour', 60)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1.5 hours', 90)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 8 hours', 480)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 15 minutes', 15)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 12 hours', 720)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 30 minutes', 30)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 24 hours', 1140)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 1 minutes', 1)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 14 hours', 840)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 7 hours', 420)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 5 hours', 300)
df_emar_detail['completion_interval'] = df_emar_detail['completion_interval'].replace('within 3 hours', 180)

In [None]:
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].fillna('N')

In [None]:
# Binary encoding
df_emar_detail['new_iv_bag_hung'] = df_emar_detail['new_iv_bag_hung'].map({'Y': 1, 'N': 0})

In [None]:
# Impute with N/A and one hot encode:
# administration_type
# barcode_type
# complete_dose_not_given
# dose_due_unit
# dose_given_unit
# will_remainder_of_dose_be_given
# product_unit
# product_code
# route
# side
# site

In [None]:
df_emar_detail['administration_type'] = df_emar_detail['administration_type'].fillna('N/A')
df_emar_detail['barcode_type'] = df_emar_detail['barcode_type'].fillna('N/A')
df_emar_detail['complete_dose_not_given'] = df_emar_detail['complete_dose_not_given'].fillna('N/A')
df_emar_detail['dose_due_unit'] = df_emar_detail['dose_due_unit'].fillna('N/A')
df_emar_detail['dose_given_unit'] = df_emar_detail['dose_given_unit'].fillna('N/A')
df_emar_detail['will_remainder_of_dose_be_given'] = df_emar_detail['will_remainder_of_dose_be_given'].fillna('N/A')
df_emar_detail['product_unit'] = df_emar_detail['product_unit'].fillna('N/A')
df_emar_detail['product_code'] = df_emar_detail['product_code'].fillna('N/A')
df_emar_detail['route'] = df_emar_detail['route'].fillna('N/A')
df_emar_detail['side'] = df_emar_detail['side'].fillna('N/A')
df_emar_detail['site'] = df_emar_detail['site'].fillna('N/A')
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['administration_type','barcode_type','complete_dose_not_given',
                                                        'dose_due_unit','dose_given_unit',
                                                        'will_remainder_of_dose_be_given','product_unit','product_code',
                                                        'route','side','site'])

In [None]:
# Impute with zeroes:
# dose_due and dose_given, but also need to deal with some of them being ranges
# product_amount_given
# restart_interval, then ordinal encoding

In [None]:
df_emar_detail['product_amount_given'] = df_emar_detail['product_amount_given'].fillna(0)
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].fillna(0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].fillna(0)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].fillna(0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(str)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(str)

In [None]:
def find_middle_value(range_string):
    if '-' in range_string:
        start, end = map(float, range_string.split('-'))
        return (start + end) / 2
    else:
        return range_string

df_emar_detail['dose_due'] = df_emar_detail['dose_due'].apply(find_middle_value)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].apply(find_middle_value)

In [None]:
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('PRN', 0)
#Converting all the intervals to minutes
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 2 hours', 120)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 4 hours', 240)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 1 hour', 60)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 30 minutes', 30)
df_emar_detail['restart_interval'] = df_emar_detail['restart_interval'].replace('within 24 hours', 1140)

In [None]:
# Impute with N and map to binary encoding:
# continued_infusion_in_other_location
# non_formulary_visual_verification

In [None]:
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].fillna('N')
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].fillna('N')
# Binary encoding
df_emar_detail['continued_infusion_in_other_location'] = df_emar_detail['continued_infusion_in_other_location'].map({'Y': 1, 'N': 0})
df_emar_detail['non_formulary_visual_verification'] = df_emar_detail['non_formulary_visual_verification'].map({'Y': 1, 'N': 0})

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['pharmacy_id']) # Contains NaN values 

In [None]:
df_emar_detail = df_emar_detail.drop(columns=['emar_id']) # Practically unique

In [None]:
# Replace blanks with zero
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].replace('___', 0)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].replace('___', 0)

In [None]:
df_emar_detail['dose_due'] = df_emar_detail['dose_due'].astype(float)
df_emar_detail['dose_given'] = df_emar_detail['dose_given'].astype(float)

In [None]:
# Impute with N/A or 0
# One hot encode the categorical features 

df_emar_detail['product_description'] = df_emar_detail['product_description'].fillna('N/A')
df_emar_detail['product_description_other'] = df_emar_detail['product_description_other'].fillna('N/A')
df_emar_detail['parent_field_ordinal'] = df_emar_detail['parent_field_ordinal'].fillna(0)
df_emar_detail = pd.get_dummies(df_emar_detail, columns=['product_description_other','product_description'])

In [None]:
df_emar_detail

#### Learner target feature

Given information about an administered medication, predict LOS (use subject_id mapped to average LOS frame)

In [None]:
df_emar_detail = df_emar_detail.merge(df_los_subject, on='subject_id', how='left')
df_emar_detail = df_emar_detail.drop(columns=['subject_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_emar_detail.to_csv('df_emar_detail.csv', index=False)

#### Split into train and test

In [None]:
data = df_emar_detail.drop(columns=['los'])
target = df_emar_detail['los']

# Split the dataset into training and testing sets
emar_detail_data_train, emar_detail_data_test, emar_detail_label_train, emar_detail_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", emar_detail_data_train.shape, emar_detail_label_train.shape)
print("Testing set shape:", emar_detail_data_test.shape, emar_detail_label_test.shape)

In [None]:
# uncomment and run if changes are made

# emar_detail_data_train.to_csv('emar_detail_data_train.csv', index=False)
# emar_detail_data_test.to_csv('emar_detail_data_test.csv', index=False)

# emar_detail_label_train.to_csv('emar_detail_label_train.csv', index=False)
# emar_detail_label_test.to_csv('emar_detail_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### hcpcsevents

Contains info for 18 different patients 

In [None]:
# d_hcpcs has longer descriptions (connected by code) but no other useful info 

In [None]:
file = "hosp/hcpcsevents.csv"
full_path = path + file

df_hcpcsevents = pd.read_csv(full_path)

In [None]:
df_hcpcsevents.head()

In [None]:
# patient, admission, date, uniquely identifying billed code, sequence number, description

In [None]:
df_hcpcsevents['seq_num'].value_counts()

To drop: subject_id, chartdate, hcpcs_cd (code that links to longer description in d_hcpcs)

In [None]:
# Make a feature for days_since_admission using chartdate - admittime

# Convert to datetime
df_hcpcsevents['chartdate'] = pd.to_datetime(df_hcpcsevents['chartdate'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_hcpcsevents = df_hcpcsevents.merge(df_admittime, on='hadm_id', how='left')

# Discard the time part and keep only the date
df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_hcpcsevents['days_since_admission'] = df_hcpcsevents['chartdate'] - df_hcpcsevents['admittime']

# Fill any non time values
df_hcpcsevents['days_since_admission'] = df_hcpcsevents['days_since_admission'].fillna(pd.Timedelta(0))

In [None]:
df_hcpcsevents['days_since_admission'].value_counts()

In [None]:
df_hcpcsevents = df_hcpcsevents.drop(columns=['subject_id','chartdate','hcpcs_cd'])
# Not enough samples to include code as after encoding there would be a lot more features 

In [None]:
df_hcpcsevents = pd.get_dummies(df_hcpcsevents, columns=['short_description'])

In [None]:
df_hcpcsevents

#### Learner target feature

Given (brief) information about billed events, predict length of stay (based on hadm_id)

In [None]:
df_hcpcsevents = df_hcpcsevents.merge(df_los_hadm, on='hadm_id', how='left')
df_hcpcsevents = df_hcpcsevents.drop(columns=['hadm_id', 'admittime'])

In [None]:
# df_hcpcsevents.describe()

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_hcpcsevents.to_csv('df_hcpcsevents.csv', index=False)

#### Split into train and test

In [None]:
data = df_hcpcsevents.drop(columns=['los'])
target = df_hcpcsevents['los']

# Split the dataset into training and testing sets
hcpcsevents_data_train, hcpcsevents_data_test, hcpcsevents_label_train, hcpcsevents_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", hcpcsevents_data_train.shape, hcpcsevents_label_train.shape)
print("Testing set shape:", hcpcsevents_data_test.shape, hcpcsevents_label_test.shape)

In [None]:
# uncomment and run if changes are made

hcpcsevents_data_train.to_csv('hcpcsevents_data_train.csv', index=False)
hcpcsevents_data_test.to_csv('hcpcsevents_data_test.csv', index=False)

hcpcsevents_label_train.to_csv('hcpcsevents_label_train.csv', index=False)
hcpcsevents_label_test.to_csv('hcpcsevents_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 13 to 9

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "hcpcsevents_data_train.csv"
full_path = path + file

hcpcsevents_data_train = pd.read_csv(full_path)

file = "hcpcsevents_data_test.csv"
full_path = path + file

hcpcsevents_data_test = pd.read_csv(full_path)

file = "hcpcsevents_label_train.csv"
full_path = path + file

hcpcsevents_label_train = pd.read_csv(full_path)

file = "hcpcsevents_label_test.csv"
full_path = path + file

hcpcsevents_label_test = pd.read_csv(full_path)

### labevents

In [None]:
# Information regarding 252 different admissions

In [None]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

In [None]:
# df_labevents['hadm_id'].value_counts()

In [None]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [None]:
df_labevents.info()

In [None]:
df_labevents['storetime']

In [None]:
# Make a feature for days_since_admission using charttime - admittime

# Convert to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# Fill any non time values
df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [None]:
# Add storetime - charttime feature called delay

# Convert to datetime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# Fill any non time values
df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

Drop: labevent_id, subject_id, order_provider_id (too many Null), charttime, storetime, comments

In [None]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [None]:
# For flag make abnormal = 1 and fill Null with 0
df_labevents['flag'] = df_labevents['flag'].fillna(0)
df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [None]:
# For priority fill Null with N/A and then one hot encode
df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [None]:
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [None]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()
# Reduced from 107727 rows to 66660

#### Learner target feature

Given information about a laboratory event (from a patient specimen) - These include haematology measurements, blood gases, chemistry panels, and less common tests such as genetic assays.
Predict that patient's eventual LOS (based on hadm_id)

In [None]:
df_labevents = df_labevents.merge(df_los_hadm, on='hadm_id', how='left')
df_labevents = df_labevents.drop(columns=['hadm_id', 'admittime'])

In [None]:
# df_labevents.columns.tolist()

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_labevents.to_csv('df_labevents.csv', index=False)

#### Split into train and test

In [None]:
data = df_labevents.drop(columns=['los'])
target = df_labevents['los']

# Split the dataset into training and testing sets
labevents_data_train, labevents_data_test, labevents_label_train, labevents_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", labevents_data_train.shape, labevents_label_train.shape)
print("Testing set shape:", labevents_data_test.shape, labevents_label_test.shape)

In [None]:
# uncomment and run if changes are made

labevents_data_train.to_csv('labevents_data_train.csv', index=False)
labevents_data_test.to_csv('labevents_data_test.csv', index=False)

labevents_label_train.to_csv('labevents_label_train.csv', index=False)
labevents_label_test.to_csv('labevents_label_test.csv', index=False)

In [None]:
labevents_data_train

#### Dimensionality reduction

In [None]:
# Fine

### microbiologyevents

In [None]:
file = "hosp/microbiologyevents.csv"
full_path = path + file

df_microbio = pd.read_csv(full_path)

In [None]:
# df_microbio['micro_specimen_id'].value_counts()

In [None]:
df_microbio.head(5)

In [None]:
# df_microbio['comments'].value_counts()

In [None]:
# make days_since_admission using charttime 

# Convert to datetime
df_microbio['charttime'] = pd.to_datetime(df_microbio['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_microbio = df_microbio.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_microbio['days_since_admission'] = df_microbio['charttime'] - df_microbio['admittime']

# Fill any non time values
df_microbio['days_since_admission'] = df_microbio['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_microbio = df_microbio.drop(columns=['admittime'])

In [None]:
# Add storetime - charttime feature (call it delay)

# Convert to datetime
df_microbio['storetime'] = pd.to_datetime(df_microbio['storetime'], format='%Y/%m/%d %H:%M')

df_microbio['delay'] = df_microbio['storetime'] - df_microbio['charttime']

# Fill any non time values
df_microbio['delay'] = df_microbio['delay'].fillna(pd.Timedelta(0))

Drop: microevent_id, subject_id, chartdate, charttime, test_seq, storedate, storetime, test_name and org_itemid (since info in name), quantity, ab_name, comments, micro_specimen_id (unique identifier for sample as some measurements are made on the same sample)
Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
Impute with N/A and then one hot encode: interpretation

In [None]:
# Drop
# spec_itemid , test_itemid
df_microbio = df_microbio.drop(columns=['microevent_id','subject_id','chartdate','charttime','test_seq','storedate',
                                       'storetime','quantity','comments','ab_itemid',
                                       'spec_itemid','test_itemid','org_itemid','micro_specimen_id'])

In [None]:
# Impute null with 0: order_provider_id, org_itemid, isolate_num, ab_itemid, dilution_value
df_microbio['order_provider_id'] = df_microbio['order_provider_id'].fillna(0)
df_microbio['isolate_num'] = df_microbio['isolate_num'].fillna(0)
df_microbio['dilution_value'] = df_microbio['dilution_value'].fillna(0)

In [None]:
# Impute with N/A and then one hot encode: interpretation
# encode test_name, ab_name

df_microbio['interpretation'] = df_microbio['interpretation'].fillna('N/A')
df_microbio['test_name'] = df_microbio['test_name'].fillna('N/A')
df_microbio['ab_name'] = df_microbio['ab_name'].fillna('N/A')
df_microbio['org_name'] = df_microbio['org_name'].fillna('None')
df_microbio = pd.get_dummies(df_microbio, columns=['org_name','interpretation','ab_name','test_name'])

In [None]:
# Keep but categorical: order_provider_id, spec_type_desc, dilution_text, dilution_comparison
df_microbio = pd.get_dummies(df_microbio, columns=['order_provider_id','spec_type_desc','dilution_text',
                                                  'dilution_comparison'])

In [None]:
df_microbio = df_microbio.dropna()

In [None]:
df_microbio

#### Learner target feature

Given information about a microbiology measurement, predict that patient's eventual LOS (based on hadm_id)

In [None]:
df_microbio = df_microbio.merge(df_los_hadm, on='hadm_id', how='left')
df_microbio = df_microbio.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_microbio.to_csv('df_microbio.csv', index=False)

#### Split into train and test

In [None]:
data = df_microbio.drop(columns=['los'])
target = df_microbio['los']

# Split the dataset into training and testing sets
microbio_data_train, microbio_data_test, microbio_label_train, microbio_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", microbio_data_train.shape, microbio_label_train.shape)
print("Testing set shape:", microbio_data_test.shape, microbio_label_test.shape)

In [None]:
# uncomment and run if changes are made

# microbio_data_train.to_csv('microbio_data_train.csv', index=False)
# microbio_data_test.to_csv('microbio_data_test.csv', index=False)

# microbio_label_train.to_csv('microbio_label_train.csv', index=False)
# microbio_label_test.to_csv('microbio_label_test.csv', index=False)

In [None]:
microbio_data_train

#### Dimensionality reduction

In [None]:
# Fine

### patients

In [None]:
file = "hosp/patients.csv"
full_path = path + file

df_patients = pd.read_csv(full_path)

In [None]:
df_patients

In [None]:
df_patients['anchor_age'].value_counts

Drop: anchor_year
Encode: gender (M to 0 and F to 1), dod (change all to 1 and nulls to 0)
Dummies: anchor_year_group  

In [None]:
# Drop
df_patients = df_patients.drop(columns=['anchor_year','dod']) 
# Since this is the shifted year and dod is an outcome value

In [None]:
# Encode: gender (M to 0 and F to 1), dod (change all to 1 and nulls to 0)
df_patients['gender'] = df_patients['gender'].replace('M', 0)
df_patients['gender'] = df_patients['gender'].replace('F', 1)

In [None]:
# Dummies: anchor_year_group  
df_patients = pd.get_dummies(df_patients, columns=['anchor_year_group'])

In [None]:
df_patients

#### Learner target feature

Based on the patient's gender, age and whether their year group was either 2011-2013 or 2014-2016, predict LOS (take an average based on subject_id)

In [None]:
df_patients = df_patients.merge(df_los_subject, on='subject_id', how='left')
df_patients = df_patients.drop(columns=['subject_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_patients.to_csv('df_patients.csv', index=False)

#### Split into train and test

In [None]:
data = df_patients.drop(columns=['los'])
target = df_patients['los']

# Split the dataset into training and testing sets
patients_data_train, patients_data_test, patients_label_train, patients_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", patients_data_train.shape, patients_label_train.shape)
print("Testing set shape:", patients_data_test.shape, patients_label_test.shape)

In [None]:
# uncomment and run if changes are made

patients_data_train.to_csv('patients_data_train.csv', index=False)
patients_data_test.to_csv('patients_data_test.csv', index=False)

patients_label_train.to_csv('patients_label_train.csv', index=False)
patients_label_test.to_csv('patients_label_test.csv', index=False)

In [None]:
# patients_data_train

#### Dimensionality reduction

In [None]:
# Fine

### pharmacy

In [None]:
file = "hosp/pharmacy.csv"
full_path = path + file

df_pharmacy = pd.read_csv(full_path)

In [None]:
df_pharmacy.head(2)

In [None]:
# object_columns = df_pharmacy.select_dtypes(include=['object'])
# print(object_columns.columns)

In [None]:
df_pharmacy.columns

drop: subject_id, pharmacy_id, poe_id, starttime, stoptime, entertime, verifiedtime, disp_sched, basal_rate, one_hr_max,
expirationdate, fill_quantity
Encode: proc_type, status
Impute with N/A and encode: infusion_type, sliding_scale, duration_interval, expiration_unit, dispensation, medication, route, frequency
Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value

In [None]:
# stoptime-starttime for a duration feature

# Convert to datetime
df_pharmacy['stoptime'] = pd.to_datetime(df_pharmacy['stoptime'], format='%Y/%m/%d %H:%M')
df_pharmacy['starttime'] = pd.to_datetime(df_pharmacy['starttime'], format='%Y/%m/%d %H:%M')


df_pharmacy['medication_duration'] = df_pharmacy['stoptime'] - df_pharmacy['starttime']

# Fill any non time values
df_pharmacy['medication_duration'] = df_pharmacy['medication_duration'].fillna(pd.Timedelta(0))

In [None]:
# verifiedtime - entertime for verification_delay feature 

# Convert to datetime
df_pharmacy['verifiedtime'] = pd.to_datetime(df_pharmacy['verifiedtime'], format='%Y/%m/%d %H:%M')
df_pharmacy['entertime'] = pd.to_datetime(df_pharmacy['entertime'], format='%Y/%m/%d %H:%M')

df_pharmacy['verification_delay'] = df_pharmacy['verifiedtime'] - df_pharmacy['entertime']

# Fill any non time values
df_pharmacy['verification_delay'] = df_pharmacy['verification_delay'].fillna(pd.Timedelta(0))

In [None]:
fill_value = [0] 

# Fill null values with the list
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].fillna(pd.Series([fill_value]*len(df_pharmacy)))

In [None]:
# Convert all categories to strings
df_pharmacy['disp_sched'] = df_pharmacy['disp_sched'].apply(lambda x: [str(item) for item in x])

mlb = MultiLabelBinarizer()

encoded_feature = pd.DataFrame(mlb.fit_transform(df_pharmacy['disp_sched']),
                               columns=mlb.classes_,
                               index=df_pharmacy.index)

df_pharmacy = pd.concat([df_pharmacy, encoded_feature], axis=1)

In [None]:
# df_pharmacy

In [None]:
# Drop 
df_pharmacy = df_pharmacy.drop(columns=['subject_id','pharmacy_id','poe_id','starttime','stoptime','entertime',
                                       'verifiedtime','expirationdate', 'fill_quantity','disp_sched'])
# expiration date and fill quantity are all empty

In [None]:
# Encode: proc_type, status
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['proc_type','status'])

In [None]:
# Impute with N/A and encode
df_pharmacy['infusion_type'] = df_pharmacy['infusion_type'].fillna('N/A')
df_pharmacy['sliding_scale'] = df_pharmacy['sliding_scale'].fillna('N/A')
df_pharmacy['duration_interval'] = df_pharmacy['duration_interval'].fillna('N/A')
df_pharmacy['expiration_unit'] = df_pharmacy['expiration_unit'].fillna('N/A')
df_pharmacy['dispensation'] = df_pharmacy['dispensation'].fillna('N/A')
df_pharmacy['medication'] = df_pharmacy['medication'].fillna('N/A')
df_pharmacy['route'] = df_pharmacy['route'].fillna('N/A')
df_pharmacy['frequency'] = df_pharmacy['frequency'].fillna('N/A')
df_pharmacy = pd.get_dummies(df_pharmacy, columns=['infusion_type','sliding_scale','duration_interval','expiration_unit',
                                                  'dispensation','medication','route','frequency'])

In [None]:
# Impute with 0: lockout_interval, doses_per_24_hrs, duration, expiration_value
df_pharmacy['lockout_interval'] = df_pharmacy['lockout_interval'].fillna(0)
df_pharmacy['doses_per_24_hrs'] = df_pharmacy['doses_per_24_hrs'].fillna(0)
df_pharmacy['expiration_value'] = df_pharmacy['expiration_value'].fillna(0)
df_pharmacy['basal_rate'] = df_pharmacy['basal_rate'].fillna(0)
df_pharmacy['one_hr_max'] = df_pharmacy['one_hr_max'].fillna(0)

In [None]:
# df_pharmacy

#### Learner target feature

Given information about a particular prescribed medication, predict the LOS for that patient (based on hadm_id)

In [None]:
df_pharmacy = df_pharmacy.merge(df_los_hadm, on='hadm_id', how='left')
df_pharmacy = df_pharmacy.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_pharmacy.to_csv('df_pharmacy.csv', index=False)

#### Split into train and test

In [None]:
data = df_pharmacy.drop(columns=['los'])
target = df_pharmacy['los']

# Split the dataset into training and testing sets
pharmacy_data_train, pharmacy_data_test, pharmacy_label_train, pharmacy_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", pharmacy_data_train.shape, pharmacy_label_train.shape)
print("Testing set shape:", pharmacy_data_test.shape, pharmacy_label_test.shape)

In [None]:
# uncomment and run if changes are made

pharmacy_data_train.to_csv('pharmacy_data_train.csv', index=False)
pharmacy_data_test.to_csv('pharmacy_data_test.csv', index=False)

pharmacy_label_train.to_csv('pharmacy_label_train.csv', index=False)
pharmacy_label_test.to_csv('pharmacy_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### poe

In [None]:
file = "hosp/poe.csv"
full_path = path + file

df_poe = pd.read_csv(full_path)

In [None]:
# object_columns = df_poe.select_dtypes(include=['object'])
# print(object_columns.columns)

In [None]:
df_poe.head()

In [None]:
# df_poe['order_status'].value_counts()

To drop: poe_id, subject_id, ordertime, discontinue_of_poe_id, discontinued_by_poe_id (all unique), order_status (all inactive)
Encode: order_type, transaction_type
Impute with N/A and then encode: order_subtype, order_provider_id

In [None]:
# make a feature of ordertime - admittime for days_since_admission

# Convert to datetime
df_poe['ordertime'] = pd.to_datetime(df_poe['ordertime'], format='%Y/%m/%d %H:%M:%S')

# Add admittime column from other dataframe
df_poe = df_poe.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_poe['days_since_admission'] = df_poe['ordertime'] - df_poe['admittime']

# Fill any non time values
df_poe['days_since_admission'] = df_poe['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_poe = df_poe.drop(columns=['admittime'])

In [None]:
# Drop 
df_poe = df_poe.drop(columns=['poe_id','subject_id','ordertime','discontinue_of_poe_id','discontinued_by_poe_id',
                                       'order_status'])

In [None]:
# Encode
df_poe = pd.get_dummies(df_poe, columns=['order_type','transaction_type'])

In [None]:
# Impute with N/A and encode
df_poe['order_subtype'] = df_poe['order_subtype'].fillna('N/A')
df_poe['order_provider_id'] = df_poe['order_provider_id'].fillna('N/A')
df_poe = pd.get_dummies(df_poe, columns=['order_subtype','order_provider_id'])

In [None]:
df_poe

#### Learner target feature

Given information about a particular order made by a provider, predict the LOS for that patient (based on hadm_id)

In [None]:
df_poe = df_poe.merge(df_los_hadm, on='hadm_id', how='left')
df_poe = df_poe.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_poe.to_csv('df_poe.csv', index=False)

#### Split into train and test

In [None]:
data = df_poe.drop(columns=['los'])
target = df_poe['los']

# Split the dataset into training and testing sets
poe_data_train, poe_data_test, poe_label_train, poe_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", poe_data_train.shape, poe_label_train.shape)
print("Testing set shape:", poe_data_test.shape, poe_label_test.shape)

In [None]:
# uncomment and run if changes are made

poe_data_train.to_csv('poe_data_train.csv', index=False)
poe_data_test.to_csv('poe_data_test.csv', index=False)

poe_label_train.to_csv('poe_label_train.csv', index=False)
poe_label_test.to_csv('poe_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### prescriptions

In [None]:
file = "hosp/prescriptions.csv"
full_path = path + file

df_prescriptions = pd.read_csv(full_path)

In [None]:
df_prescriptions.head()

In [None]:
# df_prescriptions['ndc'].value_counts()

Drop na and encode: dose_val_rx, form_val_disp, order_provider_id
Drop: subject_id, pharmacy_id, starttime, stoptime, form_rx (mostly null), poe_id
Impute with N/A and encode: formulary_drug_cd, gsn, prod_strength, route
Encode: drug_type, drug, dose_unit_rx, form_unit_disp
Impute with 0: doses_per_24_hrs

Drop rows with na

order_provider_id
Was going to impute with N/A and encode but going to drop as too many features 

In [None]:
# Make a feature of stoptime-starttime called duration 

# Convert to datetime
df_prescriptions['stoptime'] = pd.to_datetime(df_prescriptions['stoptime'], format='%Y/%m/%d %H:%M')
df_prescriptions['starttime'] = pd.to_datetime(df_prescriptions['starttime'], format='%Y/%m/%d %H:%M')

df_prescriptions['duration'] = df_prescriptions['stoptime'] - df_prescriptions['starttime']

# Fill any non time values
df_prescriptions['duration'] = df_prescriptions['duration'].fillna(pd.Timedelta(0))

In [None]:
# Drop na
df_prescriptions.dropna(subset=['dose_val_rx', 'form_val_disp'], inplace=True)

In [None]:
# Drop 
df_prescriptions = df_prescriptions.drop(columns=['subject_id','pharmacy_id','starttime','stoptime','form_rx','poe_id',
                                                 'order_provider_id'])

In [None]:
# Impute with N/A and encode
df_prescriptions['formulary_drug_cd'] = df_prescriptions['formulary_drug_cd'].fillna('N/A')
df_prescriptions['gsn'] = df_prescriptions['gsn'].fillna('N/A')
df_prescriptions['prod_strength'] = df_prescriptions['prod_strength'].fillna('N/A')
df_prescriptions['route'] = df_prescriptions['route'].fillna('N/A')

# Impute with 0
df_prescriptions['ndc'] = df_prescriptions['ndc'].fillna(0)

df_prescriptions = pd.get_dummies(df_prescriptions, columns=['formulary_drug_cd','gsn','prod_strength',
                                                            'route','drug_type','drug','dose_unit_rx','form_unit_disp',
                                                            'dose_val_rx','form_val_disp','ndc'])

In [None]:
df_prescriptions['doses_per_24_hrs'] = df_prescriptions['doses_per_24_hrs'].fillna(0)

In [None]:
# df_prescriptions

In [None]:
# Drop any rows with null values 
df_prescriptions = df_prescriptions.dropna()

In [None]:
# object_columns = df_prescriptions.select_dtypes(include=['object'])
# print(object_columns.columns)

In [None]:
df_prescriptions

#### Learner target feature

Given information about a particular prescribed medication, predict the LOS for that patient (based on hadm_id)

In [None]:
df_prescriptions = df_prescriptions.merge(df_los_hadm, on='hadm_id', how='left')
df_prescriptions = df_prescriptions.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_prescriptions.to_csv('df_prescriptions.csv', index=False)

#### Split into train and test

In [None]:
data = df_prescriptions.drop(columns=['los'])
target = df_prescriptions['los']

# Split the dataset into training and testing sets
prescriptions_data_train, prescriptions_data_test, prescriptions_label_train, prescriptions_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", prescriptions_data_train.shape, prescriptions_label_train.shape)
print("Testing set shape:", prescriptions_data_test.shape, prescriptions_label_test.shape)

In [None]:
# uncomment and run if changes are made

prescriptions_data_train.to_csv('prescriptions_data_train.csv', index=False)
prescriptions_data_test.to_csv('prescriptions_data_test.csv', index=False)

prescriptions_label_train.to_csv('prescriptions_label_train.csv', index=False)
prescriptions_label_test.to_csv('prescriptions_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 4890 to 2874 or less

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "prescriptions_data_train.csv"
full_path = path + file

prescriptions_data_train = pd.read_csv(full_path)

file = "prescriptions_data_test.csv"
full_path = path + file

prescriptions_data_test = pd.read_csv(full_path)

file = "prescriptions_label_train.csv"
full_path = path + file

prescriptions_label_train = pd.read_csv(full_path)

file = "prescriptions_label_test.csv"
full_path = path + file

prescriptions_label_test = pd.read_csv(full_path)

### procedures_icd

In [None]:
file = "hosp/procedures_icd.csv"
full_path = path + file

df_procedures = pd.read_csv(full_path)

In [None]:
df_procedures['icd_code'].value_counts()

In [None]:
# df_procedures['icd_version'].value_counts()

Drop: subject_id, chartdate
Encode: icd_code

In [None]:
# make a feature called days_since_admission of chartdate - admitdate

# Convert to datetime
df_procedures['chartdate'] = pd.to_datetime(df_procedures['chartdate'], format='%Y-%m-%d')

# Add admittime column from other dataframe
df_procedures = df_procedures.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
df_procedures['admittime'] = df_procedures['admittime'].dt.date
df_procedures['chartdate'] = df_procedures['chartdate'].dt.date

df_procedures['days_since_admission'] = df_procedures['chartdate'] - df_procedures['admittime']

# Fill any non time values
df_procedures['days_since_admission'] = df_procedures['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_procedures = df_procedures.drop(columns=['admittime'])

In [None]:
# Drop 
df_procedures = df_procedures.drop(columns=['subject_id','chartdate'])

In [None]:
# Encode
df_procedures = pd.get_dummies(df_procedures, columns=['icd_code'])

#### Learner target feature

Given information about billed procedures for patients during their hospital stay, predict the LOS for that patient (based on hadm_id)

In [None]:
df_procedures = df_procedures.merge(df_los_hadm, on='hadm_id', how='left')
df_procedures = df_procedures.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_procedures.to_csv('df_procedures.csv', index=False)

#### Split into train and test

In [None]:
data = df_procedures.drop(columns=['los'])
target = df_procedures['los']

# Split the dataset into training and testing sets
procedures_data_train, procedures_data_test, procedures_label_train, procedures_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedures_data_train.shape, procedures_label_train.shape)
print("Testing set shape:", procedures_data_test.shape, procedures_label_test.shape)

In [None]:
# uncomment and run if changes are made

# procedures_data_train.to_csv('procedures_data_train.csv', index=False)
# procedures_data_test.to_csv('procedures_data_test.csv', index=False)

# procedures_label_train.to_csv('procedures_label_train.csv', index=False)
# procedures_label_test.to_csv('procedures_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 355 to 115

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "procedures_data_train.csv"
full_path = path + file

procedures_data_train = pd.read_csv(full_path)

file = "procedures_data_test.csv"
full_path = path + file

procedures_data_test = pd.read_csv(full_path)

file = "procedures_label_train.csv"
full_path = path + file

procedures_label_train = pd.read_csv(full_path)

file = "procedures_label_test.csv"
full_path = path + file

procedures_label_test = pd.read_csv(full_path)

### services

In [None]:
file = "hosp/services.csv"
full_path = path + file

df_services = pd.read_csv(full_path)

In [None]:
df_services.head()

In [None]:
# df_services['curr_service'].value_counts()

Drop: subject_id, transfertime
Impute with N/A and encode: prev_service
Encode: curr_service

In [None]:
# Make a feature called days_since_admission using transfertime-admittime 

# Convert to datetime
df_services['transfertime'] = pd.to_datetime(df_services['transfertime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_services = df_services.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_services['days_since_admission'] = df_services['transfertime'] - df_services['admittime']

# Fill any non time values
df_services['days_since_admission'] = df_services['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_services = df_services.drop(columns=['admittime'])

In [None]:
# Drop 
df_services = df_services.drop(columns=['subject_id','transfertime'])

In [None]:
# Impute with N/A and encode
df_services['prev_service'] = df_services['prev_service'].fillna('N/A')
df_services = pd.get_dummies(df_services, columns=['prev_service','curr_service'])

#### Learner target feature

Given information about the hospital service(s) which cared for the patient during their hospitalization, predict their eventual LOS (based on hadm_id)

In [None]:
df_services = df_services.merge(df_los_hadm, on='hadm_id', how='left')
df_services = df_services.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_services.to_csv('df_services.csv', index=False)

#### Split into train and test

In [None]:
data = df_services.drop(columns=['los'])
target = df_services['los']

# Split the dataset into training and testing sets
services_data_train, services_data_test, services_label_train, services_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", services_data_train.shape, services_label_train.shape)
print("Testing set shape:", services_data_test.shape, services_label_test.shape)

In [None]:
# uncomment and run if changes are made

# services_data_train.to_csv('services_data_train.csv', index=False)
# services_data_test.to_csv('services_data_test.csv', index=False)

# services_label_train.to_csv('services_label_train.csv', index=False)
# services_label_test.to_csv('services_label_test.csv', index=False)

In [None]:
services_data_train

#### Dimensionality reduction

In [None]:
# Fine

### transfers

In [None]:
file = "hosp/transfers.csv"
full_path = path + file

df_transfers = pd.read_csv(full_path)

In [None]:
df_transfers.head()

In [None]:
# df_transfers['intime'].value_counts()

Drop: subject_id, transfer_id, intime, outtime
Encode: eventtype
Impute with N/A and encode: careunit

In [None]:
# Make a days_since_admission feature of intime-admittime

# Convert to datetime
df_transfers['intime'] = pd.to_datetime(df_transfers['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_transfers = df_transfers.merge(df_admittime, on='hadm_id', how='left')

df_transfers['days_since_admission'] = df_transfers['intime'] - df_transfers['admittime']

# Fill any non time values
df_transfers['days_since_admission'] = df_transfers['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_transfers = df_transfers.drop(columns=['admittime'])

In [None]:
# Make a duration feature of outtime-intime 

# Convert to datetime
df_transfers['outtime'] = pd.to_datetime(df_transfers['outtime'], format='%Y-%m-%d %H:%M:%S')

df_transfers['duration'] = df_transfers['outtime'] - df_transfers['intime']

# Fill any non time values
df_transfers['duration'] = df_transfers['duration'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_transfers = df_transfers.drop(columns=['subject_id','transfer_id','intime','outtime'])

In [None]:
# Impute with N/A and encode
df_transfers['careunit'] = df_transfers['careunit'].fillna('N/A')
df_transfers = pd.get_dummies(df_transfers, columns=['eventtype','careunit'])

In [None]:
# df_transfers.info()

#### Learner target feature

Given information about patients' unit transfers, predict their eventual LOS (based on hadm_id)

In [None]:
df_transfers = df_transfers.merge(df_los_hadm, on='hadm_id', how='left')
df_transfers = df_transfers.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_transfers.to_csv('df_transfers.csv', index=False)

#### Split into train and test

In [None]:
data = df_transfers.drop(columns=['los'])
target = df_transfers['los']

# Split the dataset into training and testing sets
transfers_data_train, transfers_data_test, transfers_label_train, transfers_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", transfers_data_train.shape, transfers_label_train.shape)
print("Testing set shape:", transfers_data_test.shape, transfers_label_test.shape)

In [None]:
# uncomment and run if changes are made

# transfers_data_train.to_csv('transfers_data_train.csv', index=False)
# transfers_data_test.to_csv('transfers_data_test.csv', index=False)

# transfers_label_train.to_csv('transfers_label_train.csv', index=False)
# transfers_label_test.to_csv('transfers_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### chartevents

In [None]:
file = "icu/chartevents.csv"
full_path = path + file

df_chart = pd.read_csv(full_path)

In [None]:
df_chart['itemid'].value_counts()

In [None]:
df_chart.head()

In [None]:
# object_columns = df_chart.select_dtypes(include=['object'])
# print(object_columns.columns)

In [None]:
# df_chart['warning'].value_counts()

Drop: subject_id, charttime, storetime, stay_id, caregiver_id (the person who documented the data)
Encode: value,itemid
Impute with 0: valuenum, warning
Impute with N/A and encode: valueuom

In [None]:
# Make a days_since_admission feature of charttime-admittime 

# Convert to datetime
df_chart['charttime'] = pd.to_datetime(df_chart['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_chart = df_chart.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_chart['days_since_admission'] = df_chart['charttime'] - df_chart['admittime']

# Fill any non time values
df_chart['days_since_admission'] = df_chart['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_chart = df_chart.drop(columns=['admittime'])

In [None]:
# Make a delay feature of storetime-charttime

# Convert to datetime
df_chart['storetime'] = pd.to_datetime(df_chart['storetime'], format='%Y-%m-%d %H:%M:%S')

df_chart['delay'] = df_chart['storetime'] - df_chart['charttime']

# Fill any non time values
df_chart['delay'] = df_chart['delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_chart = df_chart.drop(columns=['subject_id','charttime','storetime', 'stay_id','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_chart['valueuom'] = df_chart['valueuom'].fillna('N/A')
df_chart = pd.get_dummies(df_chart, columns=['valueuom','value','itemid'])

In [None]:
# Impute with 0
df_chart['valuenum'] = df_chart['valuenum'].fillna(0)
df_chart['warning'] = df_chart['warning'].fillna(0)

In [None]:
# df_chart.describe()

#### Learner target feature

Given a piece of charted data during their ICU stay, predict their eventual LOS (based on hadm_id)

In [None]:
df_chart = df_chart.merge(df_los_hadm, on='hadm_id', how='left')
df_chart = df_chart.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file
# uncomment and run if changes are made (it takes ages btw)
# df_chart.to_csv('df_chart.csv', index=False)

#### Split into train and test

In [None]:
data = df_chart.drop(columns=['los'])
target = df_chart['los']

# Split the dataset into training and testing sets
chart_data_train, chart_data_test, chart_label_train, chart_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", chart_data_train.shape, chart_label_train.shape)
print("Testing set shape:", chart_data_test.shape, chart_label_test.shape)

In [None]:
# uncomment and run if changes are made

# chart_data_train.to_csv('chart_data_train.csv', index=False)
# chart_data_test.to_csv('chart_data_test.csv', index=False)

# chart_label_train.to_csv('chart_label_train.csv', index=False)
# chart_label_test.to_csv('chart_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### icustays

In [None]:
file = "icu/icustays.csv"
full_path = path + file

df_icustays = pd.read_csv(full_path)

In [None]:
df_icustays.head()

In [None]:
df_icustays['outtime'].value_counts()

Drop: subject_id, stay_id, intime, outtime
Encode: first_careunit, last_careunit

In [None]:
# make a feature called days_since_admission using intime-admittime

# Convert to datetime
df_icustays['intime'] = pd.to_datetime(df_icustays['intime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_icustays = df_icustays.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_icustays['days_since_admission'] = df_icustays['intime'] - df_icustays['admittime']

# Fill any non time values
df_icustays['days_since_admission'] = df_icustays['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_icustays = df_icustays.drop(columns=['admittime'])

In [None]:
# Drop 
df_icustays = df_icustays.drop(columns=['subject_id','stay_id','intime','outtime'])

# Rename los to icu_los
df_icustays = df_icustays.rename(columns={'los': 'icu_los'})

In [None]:
# Encode
df_icustays = pd.get_dummies(df_icustays, columns=['first_careunit','last_careunit'])

#### Learner target feature

Given tracking information for ICU stays including admission and discharge times, predict the patient's eventual LOS (based on hadm_id)

In [None]:
df_icustays = df_icustays.merge(df_los_hadm, on='hadm_id', how='left')
df_icustays = df_icustays.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
# df_icustays.to_csv('df_icustays.csv', index=False)

#### Split into train and test

In [None]:
data = df_icustays.drop(columns=['los'])
target = df_icustays['los']

# Split the dataset into training and testing sets
icustays_data_train, icustays_data_test, icustays_label_train, icustays_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", icustays_data_train.shape, icustays_label_train.shape)
print("Testing set shape:", icustays_data_test.shape, icustays_label_test.shape)

In [None]:
# uncomment and run if changes are made

# icustays_data_train.to_csv('icustays_data_train.csv', index=False)
# icustays_data_test.to_csv('icustays_data_test.csv', index=False)

# icustays_label_train.to_csv('icustays_label_train.csv', index=False)
# icustays_label_test.to_csv('icustays_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Fine

### ingredientevents

In [None]:
file = "icu/ingredientevents.csv"
full_path = path + file

df_ingredient = pd.read_csv(full_path)

In [None]:
df_ingredient.info()

In [None]:
df_ingredient['storetime']

Drop: subject_id, starttime, endtime, storetime, orderid, originalamount, stay_id, caregiver_id
Encode: amountuom, statusdescription, itemid
Impute with 0: rate
Impute with N/A and encode: rateuom, linkorderid

In [None]:
# Make a duration feature of endtime-starttime 

# Convert to datetime
df_ingredient['endtime'] = pd.to_datetime(df_ingredient['endtime'], format='%Y-%m-%d %H:%M:%S')
df_ingredient['starttime'] = pd.to_datetime(df_ingredient['starttime'], format='%Y-%m-%d %H:%M:%S')


df_ingredient['duration'] = df_ingredient['endtime'] - df_ingredient['starttime']

# Fill any non time values
df_ingredient['duration'] = df_ingredient['duration'].fillna(pd.Timedelta(0))

In [None]:
# make a recording_delay feature of storetime-endtime

# Convert to datetime
df_ingredient['storetime'] = pd.to_datetime(df_ingredient['storetime'], format='%Y-%m-%d %H:%M:%S')

df_ingredient['recording_delay'] = df_ingredient['storetime'] - df_ingredient['endtime']

# Fill any non time values
df_ingredient['recording_delay'] = df_ingredient['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_ingredient = df_ingredient.drop(columns=['subject_id','starttime','endtime','storetime','orderid','originalamount',
                                           'stay_id','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_ingredient['rateuom'] = df_ingredient['rateuom'].fillna('N/A')
df_ingredient['linkorderid'] = df_ingredient['linkorderid'].fillna('N/A')
df_ingredient = pd.get_dummies(df_ingredient, columns=['rateuom','amountuom','statusdescription','itemid','linkorderid'])

In [None]:
# Impute with 0
df_ingredient['rate'] = df_ingredient['rate'].fillna(0)

#### Learner target feature

Given information on ingredients of continuous or intermittent administrations including nutritional and water content, predict the patient's eventual LOS (based on hadm_id)

In [None]:
df_ingredient = df_ingredient.merge(df_los_hadm, on='hadm_id', how='left')
df_ingredient = df_ingredient.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_ingredient.to_csv('df_ingredient.csv', index=False)

#### Split into train and test

In [None]:
data = df_ingredient.drop(columns=['los'])
target = df_ingredient['los']

# Split the dataset into training and testing sets
ingredient_data_train, ingredient_data_test, ingredient_label_train, ingredient_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", ingredient_data_train.shape, ingredient_label_train.shape)
print("Testing set shape:", ingredient_data_test.shape, ingredient_label_test.shape)

In [None]:
# uncomment and run if changes are made

ingredient_data_train.to_csv('ingredient_data_train.csv', index=False)
ingredient_data_test.to_csv('ingredient_data_test.csv', index=False)

ingredient_label_train.to_csv('ingredient_label_train.csv', index=False)
ingredient_label_test.to_csv('ingredient_label_test.csv', index=False)

#### Dimensionality reduction

In [None]:
# Need to reduce from 7727 to 4116

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"
file = "ingredient_data_train.csv"
full_path = path + file

ingredient_data_train = pd.read_csv(full_path)

file = "ingredient_data_test.csv"
full_path = path + file

ingredient_data_test = pd.read_csv(full_path)

file = "ingredient_label_train.csv"
full_path = path + file

ingredient_label_train = pd.read_csv(full_path)

file = "ingredient_label_test.csv"
full_path = path + file

ingredient_label_test = pd.read_csv(full_path)

### inputevents

In [None]:
file = "icu/inputevents.csv"
full_path = path + file

df_input = pd.read_csv(full_path)

In [None]:
# object_columns = df_input.select_dtypes(include=['object'])
# print(object_columns.columns)

In [None]:
df_input.info()

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id,
totalamountuom
Encode: amountuom, ordercategoryname, ordercomponenttypedescription, ordercategorydescription, statusdescription, itemid
Impute with 0: rate, totalamount
Impute with N/A and encode: rateuom, secondaryordercategoryname

In [None]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_input['endtime'] = pd.to_datetime(df_input['endtime'], format='%Y-%m-%d %H:%M:%S')
df_input['starttime'] = pd.to_datetime(df_input['starttime'], format='%Y-%m-%d %H:%M:%S')


df_input['duration'] = df_input['endtime'] - df_input['starttime']

# Fill any non time values
df_input['duration'] = df_input['duration'].fillna(pd.Timedelta(0))

In [None]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_input['storetime'] = pd.to_datetime(df_input['storetime'], format='%Y-%m-%d %H:%M:%S')

df_input['recording_delay'] = df_input['storetime'] - df_input['endtime']

# Fill any non time values
df_input['recording_delay'] = df_input['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_input = df_input.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid','linkorderid',
                                  'continueinnextdept','totalamountuom', 'stay_id','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_input['rateuom'] = df_input['rateuom'].fillna('N/A')
df_input['secondaryordercategoryname'] = df_input['secondaryordercategoryname'].fillna('N/A')
df_input = pd.get_dummies(df_input, columns=['rateuom','secondaryordercategoryname','amountuom','ordercategoryname',
                                            'ordercomponenttypedescription','ordercategorydescription','statusdescription',
                                            'itemid'])

In [None]:
# Impute with 0
df_input['rate'] = df_input['rate'].fillna(0)
df_input['totalamount'] = df_input['totalamount'].fillna(0)

In [None]:
df_input = df_input.dropna()

In [None]:
df_input

#### Learner target feature

Given information documented regarding continuous infusions or intermittent administrations, predict the patient's eventual LOS (based on hadm_id)

In [None]:
df_input = df_input.merge(df_los_hadm, on='hadm_id', how='left')
df_input = df_input.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_input.to_csv('df_input.csv', index=False)

#### Split into train and test

In [None]:
data = df_input.drop(columns=['los'])
target = df_input['los']

# Split the dataset into training and testing sets
input_data_train, input_data_test, input_label_train, input_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", input_data_train.shape, input_label_train.shape)
print("Testing set shape:", input_data_test.shape, input_label_test.shape)

In [None]:
# uncomment and run if changes are made

input_data_train.to_csv('input_data_train.csv', index=False)
input_data_test.to_csv('input_data_test.csv', index=False)

input_label_train.to_csv('input_label_train.csv', index=False)
input_label_test.to_csv('input_label_test.csv', index=False)

In [None]:
input_data_train

#### Dimensionality reduction

In [None]:
# Fine

### outputevents

In [None]:
file = "icu/outputevents.csv"
full_path = path + file

df_output = pd.read_csv(full_path)

In [None]:
df_output.columns

In [None]:
# df_output['value'].value_counts()

Drop: subject_id, charttime, storetime, valueuom, stay_id, caregiver_id'
Encode: itemid

In [None]:
# Make a days_since_admission feature using charttime-admittime 

# Convert to datetime
df_output['charttime'] = pd.to_datetime(df_output['charttime'], format='%Y-%m-%d %H:%M:%S')

# Add admittime column from other dataframe
df_output = df_output.merge(df_admittime, on='hadm_id', how='left')

df_output['days_since_admission'] = df_output['charttime'] - df_output['admittime']

# Fill any non time values
df_output['days_since_admission'] = df_output['days_since_admission'].fillna(pd.Timedelta(0))

# Drop the admission time column
df_output = df_output.drop(columns=['admittime'])

In [None]:
# Make a recording_delay feature using storetime-charttime

# Convert to datetime
df_output['storetime'] = pd.to_datetime(df_output['storetime'], format='%Y-%m-%d %H:%M:%S')

df_output['recording_delay'] = df_output['storetime'] - df_output['charttime']

# Fill any non time values
df_output['recording_delay'] = df_output['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_output = df_output.drop(columns=['subject_id','stay_id','charttime','storetime','storetime','valueuom','caregiver_id'])

In [None]:
#Encode
df_output = pd.get_dummies(df_output, columns=['itemid'])

#### Learner target feature

Given information regarding patient outputs including urine, drainage, and so on, predict the patient's eventual LOS (based on hadm_id)

In [None]:
df_output = df_output.merge(df_los_hadm, on='hadm_id', how='left')
df_output = df_output.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_output.to_csv('df_output.csv', index=False)

#### Split into train and test

In [None]:
data = df_output.drop(columns=['los'])
target = df_output['los']

# Split the dataset into training and testing sets
output_data_train, output_data_test, output_label_train, output_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", output_data_train.shape, output_label_train.shape)
print("Testing set shape:", output_data_test.shape, output_label_test.shape)

In [None]:
# uncomment and run if changes are made

output_data_train.to_csv('output_data_train.csv', index=False)
output_data_test.to_csv('output_data_test.csv', index=False)

output_label_train.to_csv('output_label_train.csv', index=False)
output_label_test.to_csv('output_label_test.csv', index=False)

In [None]:
output_data_train

#### Dimensionality reduction

In [None]:
# Fine

### procedureevents

In [None]:
file = "icu/procedureevents.csv"
full_path = path + file

df_procedure_events = pd.read_csv(full_path)

In [None]:
df_procedure_events.info()

In [None]:
# df_procedure_events['value'].value_counts()

Drop: subject_id, starttime, endtime, storetime, orderid, linkorderid, continueinnextdept, stay_id, caregiver_id
Encode: valueuom, ordercategoryname, ordercategorydescription, statusdescription, itemid
Impute with N/A and encode: location, locationcategory
MAKE DURATION FEATURE 

In [None]:
# Make a duration feature using endtime-starttime

# Convert to datetime
df_procedure_events['endtime'] = pd.to_datetime(df_procedure_events['endtime'], format='%Y-%m-%d %H:%M:%S')
df_procedure_events['starttime'] = pd.to_datetime(df_procedure_events['starttime'], format='%Y-%m-%d %H:%M:%S')


df_procedure_events['duration'] = df_procedure_events['endtime'] - df_procedure_events['starttime']

# Fill any non time values
df_procedure_events['duration'] = df_procedure_events['duration'].fillna(pd.Timedelta(0))

In [None]:
# Make a recording_delay feature using storetime-endtime

# Convert to datetime
df_procedure_events['storetime'] = pd.to_datetime(df_procedure_events['storetime'], format='%Y-%m-%d %H:%M:%S')

df_procedure_events['recording_delay'] = df_procedure_events['storetime'] - df_procedure_events['endtime']

# Fill any non time values
df_procedure_events['recording_delay'] = df_procedure_events['recording_delay'].fillna(pd.Timedelta(0))

In [None]:
# Drop 
df_procedure_events = df_procedure_events.drop(columns=['subject_id','stay_id','starttime','endtime','storetime','orderid',
                                                        'linkorderid','continueinnextdept','caregiver_id'])

In [None]:
# Impute with N/A and encode
df_procedure_events['location'] = df_procedure_events['location'].fillna('N/A')
df_procedure_events['locationcategory'] = df_procedure_events['locationcategory'].fillna('N/A')
df_procedure_events = pd.get_dummies(df_procedure_events, columns=['location','locationcategory','valueuom',
                                                                   'ordercategoryname','ordercategorydescription',
                                                                   'statusdescription','itemid'])

In [None]:
df_procedure_events

#### Learner target feature

Given information regarding patient outputs including urine, drainage, and so on, predict the patient's eventual LOS (based on hadm_id)

In [None]:
df_procedure_events = df_procedure_events.merge(df_los_hadm, on='hadm_id', how='left')
df_procedure_events = df_procedure_events.drop(columns=['hadm_id'])

In [None]:
# Save DataFrame to CSV file

# uncomment and run if changes are made
df_procedure_events.to_csv('df_procedure_events.csv', index=False)

#### Split into train and test

In [None]:
data = df_procedure_events.drop(columns=['los'])
target = df_procedure_events['los']

# Split the dataset into training and testing sets
procedure_events_data_train, procedure_events_data_test, procedure_events_label_train, procedure_events_label_test = train_test_split(data, target, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", procedure_events_data_train.shape, procedure_events_label_train.shape)
print("Testing set shape:", procedure_events_data_test.shape, procedure_events_label_test.shape)

In [None]:
# uncomment and run if changes are made

procedure_events_data_train.to_csv('procedure_events_data_train.csv', index=False)
procedure_events_data_test.to_csv('procedure_events_data_test.csv', index=False)

procedure_events_label_train.to_csv('procedure_events_label_train.csv', index=False)
procedure_events_label_test.to_csv('procedure_events_label_test.csv', index=False)

In [None]:
procedure_events_data_train

#### Dimensionality reduction

In [None]:
# Fine