#### Notes from phase 1

In [None]:
# Data for each table was split into train and test
# To get the target labels, search and filter the diagnoses dataframes (saved) for the hadm_ids present in the train / test
# set

In [106]:
# External libraries for data processing
import numpy as np
import pandas as pd
import sklearn as sk
#To render graphs within notebook
%matplotlib inline
import matplotlib.pyplot as plt
import joblib 
import os
from tqdm import tqdm

# Versions of libraries
print("Numpy version: {}".format(np.__version__))
print("Pandas version: {}".format(pd.__version__))
print("Scikit version: {}".format(sk.__version__))

Numpy version: 1.24.3
Pandas version: 1.5.3
Scikit version: 1.3.0


In [107]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [108]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [109]:
def convert_to_days(duration_str):
    parts = duration_str.split(' days ')  # Split string into form ['22', '20:55:00']
    days = float(parts[0])  # Extract number of days and convert to float
    time_parts = parts[1].split(':')  # Split time part (hh:mm:ss) ['20', '55', '00']
    hours = float(time_parts[0])  # Extract hours and convert to float
    minutes = float(time_parts[1])  # Extract minutes and convert to float
    seconds = float(time_parts[2])  # Extract seconds and convert to float
    total_days = days + (hours / 24) + (minutes / (24 * 60)) + (seconds / (24 * 3600))  # Calculate total days
    return total_days

In [110]:
path = "C:/Project/Data/"

In [111]:
file = "hosp/admissions.csv"
full_path = path + file

df_admissions = pd.read_csv(full_path)

df_admissions['dischtime'] = pd.to_datetime(df_admissions['dischtime'], format='%d/%m/%Y %H:%M')
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'], format='%d/%m/%Y %H:%M')

df_admittime= pd.DataFrame()
df_admittime['hadm_id'] = df_admissions['hadm_id']
df_admittime['admittime'] = df_admissions['admittime']

In [112]:
file = "hosp/diagnoses_icd.csv"
full_path = path + file

df_diagnoses = pd.read_csv(full_path)

In [113]:
df_diagnoses = df_diagnoses.drop(columns=['subject_id','seq_num','icd_version'])

In [114]:
one_hot_encoded = pd.get_dummies(df_diagnoses['icd_code'])

df_encoded = pd.concat([df_diagnoses[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [115]:
df_diagnoses = df_aggregated
df_diagnoses

Unnamed: 0,hadm_id,00845,0088,0380,0383,03842,03843,03849,0388,0389,...,Z95810,Z95820,Z961,Z96651,Z980,Z981,Z9884,Z9911,Z992,Z9981
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,29820177,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
271,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
273,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [116]:
file = "hosp/drgcodes.csv"
full_path = path + file

df_drgcodes = pd.read_csv(full_path)

In [117]:
df_drgcodes['hadm_id'].value_counts()

22187210    2
27505812    2
25926192    2
27089790    2
24490144    2
           ..
22539296    1
20385771    1
20199380    1
20973395    1
23559586    1
Name: hadm_id, Length: 233, dtype: int64

In [118]:
df_drgcodes = df_drgcodes.drop(columns=['subject_id','drg_type','description','drg_severity','drg_mortality'])

In [119]:
one_hot_encoded = pd.get_dummies(df_drgcodes['drg_code'])

df_encoded = pd.concat([df_drgcodes[['hadm_id']], one_hot_encoded], axis=1)

df_aggregated = df_encoded.groupby('hadm_id').sum().reset_index()

# If you want to replace NaN values (where the category did not appear for a particular ID) with 0
df_aggregated = df_aggregated.fillna(0)

In [120]:
df_drgcodes = df_aggregated
df_drgcodes

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20093566,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20192635,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20199380,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20214994,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,29802992,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
229,29839885,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
230,29842315,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
231,29858644,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### labevents 

In [121]:
# Can't allocate memory this way 

# path = "C:/Users/jenni/OneDrive/Desktop/IP/"

# file = "labevents_data_train.csv"
# full_path = path + file
# X_train = pd.read_csv(full_path)

# file = "labevents_data_test.csv"
# full_path = path + file
# X_test = pd.read_csv(full_path)

In [122]:
path = "C:/Project/Data/"

In [123]:
file = "hosp/labevents.csv"
full_path = path + file

df_labevents = pd.read_csv(full_path)

In [124]:
df_labevents['value'] = pd.to_numeric(df_labevents['value'], errors='coerce').fillna(0)

In [125]:
# Make a feature for days_since_admission using charttime - admittime

# Convert to datetime
df_labevents['charttime'] = pd.to_datetime(df_labevents['charttime'], format='%Y/%m/%d %H:%M')

# Add admittime column from other dataframe
df_labevents = df_labevents.merge(df_admittime, on='hadm_id', how='left')

# # Discard the time part and keep only the date
# df_hcpcsevents['admittime'] = df_hcpcsevents['admittime'].dt.date
# df_hcpcsevents['chartdate'] = df_hcpcsevents['chartdate'].dt.date

df_labevents['days_since_admission'] = df_labevents['charttime'] - df_labevents['admittime']

# Fill any non time values
df_labevents['days_since_admission'] = df_labevents['days_since_admission'].fillna(pd.Timedelta(0))

In [126]:
# Add storetime - charttime feature called delay

# Convert to datetime
df_labevents['storetime'] = pd.to_datetime(df_labevents['storetime'], format='%Y/%m/%d %H:%M')

df_labevents['delay'] = df_labevents['storetime'] - df_labevents['charttime']

# Fill any non time values
df_labevents['delay'] = df_labevents['delay'].fillna(pd.Timedelta(0))

In [127]:
df_labevents = df_labevents.drop(columns=['labevent_id','subject_id','order_provider_id','charttime','storetime','comments'])

In [128]:
# For flag make abnormal = 1 and fill Null with 0
df_labevents['flag'] = df_labevents['flag'].fillna(0)
df_labevents['flag'] = df_labevents['flag'].replace('abnormal', 1)

In [129]:
# For priority fill Null with N/A and then one hot encode
df_labevents['priority'] = df_labevents['priority'].fillna('N/A')
df_labevents = pd.get_dummies(df_labevents, columns=['priority'])

In [130]:
df_labevents = pd.get_dummies(df_labevents, columns=['valueuom','specimen_id','itemid'])

In [131]:
# Drop any rows with null values 
df_labevents = df_labevents.dropna()
# Reduced from 107727 rows to 66660

In [132]:
df_labevents = df_labevents.drop(columns=['admittime'])

In [133]:
df_labevents.head()

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
0,29600294.0,15.4,15.4,10.5,15.5,0,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
1,29600294.0,3.35,3.35,4.6,6.1,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
2,29600294.0,49.7,49.7,35.1,46.3,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
3,29600294.0,20.3,20.3,4.0,10.0,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0
4,29600294.0,31.1,31.1,32.0,37.0,1,1 days 01:03:00,0 days 01:30:00,0,1,...,0,0,0,0,0,0,0,0,0,0


#### Split into train and test

In [134]:
data = df_labevents

# Split the dataset into training and testing sets
labevents_data_train, labevents_data_test = train_test_split(data, test_size=0.2, random_state=42)

# Print the shapes of the resulting training and testing sets
print("Training set shape:", labevents_data_train.shape)
print("Testing set shape:", labevents_data_test.shape)

Training set shape: (53328, 11680)
Testing set shape: (13332, 11680)


In [135]:
data = labevents_data_train.drop(columns=['hadm_id'])
data = data.reset_index(drop=True)

In [136]:
labevents_data_train.head()

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
99267,26793610.0,19.0,19.0,22.0,32.0,1,0 days 04:36:00,0 days 02:53:00,0,1,...,0,0,0,0,0,0,0,0,0,0
106026,28998349.0,17.7,17.7,10.5,15.5,1,7 days 03:35:00,0 days 00:10:00,0,1,...,0,0,0,0,0,0,0,0,0,0
44791,22490490.0,0.0,134.0,70.0,100.0,1,5 days 15:25:00,0 days 02:06:00,0,1,...,0,0,0,0,0,0,0,0,0,0
83275,28258130.0,34.0,34.0,35.0,45.0,1,10 days 23:24:00,0 days 00:10:00,1,0,...,0,0,0,0,0,0,0,0,0,0
100330,22205327.0,2.4,2.4,1.6,2.6,0,9 days 09:55:00,0 days 01:58:00,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
# # target = df_diagnoses (rows where hadm_id is in train data)
# # Make sure the order is the same

# # Extract the unique IDs from the column 
# train_ids = labevents_data_train['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_train = df_diagnoses[df_diagnoses['hadm_id'].isin(train_ids)]

# test_ids = labevents_data_test['hadm_id'].unique()

# # Filter to keep only rows where the id column is in train_ids
# y_test = df_diagnoses[df_diagnoses['hadm_id'].isin(test_ids)]

In [139]:
# target = df_diagnoses (rows where hadm_id is in train data)
# Make sure the order is the same

# Extract the unique IDs from the column 
train_ids = labevents_data_train['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_train = df_drgcodes[df_drgcodes['hadm_id'].isin(train_ids)]

test_ids = labevents_data_test['hadm_id'].unique()

# Filter to keep only rows where the id column is in train_ids
y_test = df_drgcodes[df_drgcodes['hadm_id'].isin(test_ids)]

In [140]:
merged_df = pd.merge(y_train, labevents_data_train['hadm_id'], on='hadm_id')

In [141]:
y_train = merged_df
y_train

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52756,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52757,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52758,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52759,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
merged_df = pd.merge(y_test, labevents_data_test['hadm_id'], on='hadm_id')

y_test = merged_df
y_test

Unnamed: 0,hadm_id,14,20,21,22,23,24,25,26,27,...,950,951,952,956,957,981,982,983,987,988
0,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20044587,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13194,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13195,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13196,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13197,29974575,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [143]:
X_train = labevents_data_train
X_test = labevents_data_test

In [144]:
# X_train['delay']= X_train['delay'].astype(str)
# X_train['delay']= X_train['delay'].apply(convert_to_days)
# X_train['days_since_admission'] = X_train['days_since_admission'].astype(str)
# X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)

# X_train = X_train.drop(columns=['hadm_id'])
# X_train = X_train.reset_index(drop=True)

# y_train = y_train.drop(columns=['hadm_id'])
# y_train = y_train.reset_index(drop=True)

In [145]:
X_train

Unnamed: 0,hadm_id,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,...,itemid_52286,itemid_52312,itemid_52369,itemid_52391,itemid_52419,itemid_52425,itemid_52427,itemid_52769,itemid_52955,itemid_53153
99267,26793610.0,19.0,19.0,22.0,32.0,1,0 days 04:36:00,0 days 02:53:00,0,1,...,0,0,0,0,0,0,0,0,0,0
106026,28998349.0,17.7,17.7,10.5,15.5,1,7 days 03:35:00,0 days 00:10:00,0,1,...,0,0,0,0,0,0,0,0,0,0
44791,22490490.0,0.0,134.0,70.0,100.0,1,5 days 15:25:00,0 days 02:06:00,0,1,...,0,0,0,0,0,0,0,0,0,0
83275,28258130.0,34.0,34.0,35.0,45.0,1,10 days 23:24:00,0 days 00:10:00,1,0,...,0,0,0,0,0,0,0,0,0,0
100330,22205327.0,2.4,2.4,1.6,2.6,0,9 days 09:55:00,0 days 01:58:00,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58478,28872262.0,3.9,3.9,3.3,5.1,0,8 days 18:53:00,0 days 01:00:00,0,1,...,0,0,0,0,0,0,0,0,0,0
9921,21476294.0,100.0,100.0,96.0,108.0,0,21 days 10:06:00,0 days 01:08:00,0,0,...,0,0,0,0,0,0,0,0,0,0
89565,26467376.0,68.5,68.5,9.4,12.5,1,0 days 14:02:00,0 days 01:28:00,0,1,...,0,0,0,0,0,0,0,0,0,0
1419,21476294.0,0.0,0.0,0.0,0.0,0,0 days 12:06:00,0 days 02:15:00,0,0,...,0,0,0,0,0,0,0,0,0,0


In [146]:
X_train['delay']= X_train['delay'].astype(str)
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_train['days_since_admission'] = X_train['days_since_admission'].astype(str)
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)

X_train = X_train.drop(columns=['hadm_id'])
X_train = X_train.reset_index(drop=True)

y_train = y_train.drop(columns=['hadm_id'])
y_train = y_train.reset_index(drop=True)

In [147]:
X_test['delay']= X_test['delay'].astype(str)
X_test['delay']= X_test['delay'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].astype(str)
X_test['days_since_admission'] = X_test['days_since_admission'].str.split().str[0].astype(int)

X_test = X_test.drop(columns=['hadm_id'])
X_test = X_test.reset_index(drop=True)

y_test = y_test.drop(columns=['hadm_id'])
y_test = y_test.reset_index(drop=True)

In [148]:
# sparse matrices to address memory issue

from scipy.sparse import csr_matrix

X_train_sparse = csr_matrix(X_train)
X_test_sparse = csr_matrix(X_test)

#### Dimensionality reduction - not enough memory

In [39]:
# Need to reduce from 11681 to 10665

In [40]:
# data['delay']= data['delay'].astype(str)
# data['delay']= data['delay'].apply(convert_to_days)
# data['days_since_admission'] = data['days_since_admission'].astype(str)
# data['days_since_admission'] = data['days_since_admission'].str.split().str[0].astype(int)

In [41]:
# from sklearn.decomposition import TruncatedSVD

# # Number of desired features (components)
# n_components = 10665

# # Initialize Truncated SVD with the desired number of components
# svd = TruncatedSVD(n_components=n_components)

# # Fit the Truncated SVD model to the sparse matrix and transform the data
# svd.fit(data)
# data = svd.transform(data)

# # Get the explained variance ratio (how much variance is explained by each component)
# explained_variance_ratio = svd.explained_variance_ratio_

# # Print the transformed matrix and explained variance ratio
# # print("Transformed Matrix:")
# # print(transformed_matrix)
# print("\nExplained Variance Ratio:")
# print(explained_variance_ratio)

# print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Multi-output decision tree

In [48]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

# Initialize the decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42, max_depth=50)

# Initialize the multi-output classifier with the decision tree as the base estimator
multi_output_tree = MultiOutputClassifier(decision_tree, n_jobs=-1)


In [49]:
# Train the multi-output classifier
multi_output_tree.fit(X_train_sparse, y_train)

# Predict the outputs for the test set
y_pred = multi_output_tree.predict(X_test_sparse)

# # Evaluate the accuracy of the multi-output classifier
# accuracy = accuracy_score(y_test, y_pred)
# print("Accuracy:", accuracy)

In [50]:
from sklearn.metrics import classification_report

# Assuming y_true and y_pred are your true and predicted labels
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.05      0.00      0.01       370
           1       0.00      0.00      0.00       112
           2       0.00      0.00      0.00       101
           3       0.00      0.00      0.00       137
           4       0.08      0.00      0.01       504
           5       0.00      0.00      0.00        85
           6       0.00      0.00      0.00       184
           7       0.00      0.00      0.00        36
           8       0.00      0.00      0.00       322
           9       0.00      0.00      0.00        27
          10       0.00      0.00      0.00       123
          11       0.00      0.00      0.00       352
          12       0.00      0.00      0.00        72
          13       0.00      0.00      0.00        30
          14       0.00      0.00      0.00        46
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00        32
          17       0.03    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 	Multilabel KNN

Training time is fine but testing would take like 16 hours to predict 10 samples

In [73]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.multioutput import MultiOutputClassifier

knn_classifier = KNeighborsClassifier(n_neighbors=5)

# Create a MultiOutputClassifier to handle multi-output classification
multi_output_knn = MultiOutputClassifier(knn_classifier)

for i in tqdm(range(10)):
    multi_output_knn.fit(X_train_sparse[:500, :100], y_train.iloc[:500, :])


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.21it/s]


In [77]:
multi_output_knn

In [79]:
for i in tqdm(range(1000)):
    y_pred = multi_output_knn.predict(X_test_sparse[:10, :100])

In [None]:
# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

In [67]:
X_test.iloc[:100, :100]

Unnamed: 0,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,priority_STAT,...,specimen_id_321525,specimen_id_324268,specimen_id_330995,specimen_id_331179,specimen_id_335177,specimen_id_341449,specimen_id_342098,specimen_id_356984,specimen_id_358710,specimen_id_368713
0,10.00,10.00,6.0,20.0,0,2,0.114583,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,3.76,3.76,4.2,5.4,1,1,0.018056,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,4.40,4.40,3.3,5.1,0,4,0.134028,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,101.00,101.00,96.0,108.0,0,2,0.068056,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1.00,1.00,0.5,1.2,0,2,0.053472,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2.30,2.30,2.7,4.5,1,9,0.063194,0,0,1,...,0,0,0,0,0,0,0,0,0,0
96,4.10,4.10,3.5,5.2,0,0,0.084722,0,1,0,...,0,0,0,0,0,0,0,0,0,0
97,27.30,27.30,40.0,52.0,1,3,0.025694,0,0,1,...,0,0,0,0,0,0,0,0,0,0
98,91.00,91.00,40.0,130.0,0,42,0.054167,0,1,0,...,0,0,0,0,0,0,0,0,0,0


#### SVM

In [61]:
X_train.iloc[:200, :100]

Unnamed: 0,value,valuenum,ref_range_lower,ref_range_upper,flag,days_since_admission,delay,priority_N/A,priority_ROUTINE,priority_STAT,...,specimen_id_321525,specimen_id_324268,specimen_id_330995,specimen_id_331179,specimen_id_335177,specimen_id_341449,specimen_id_342098,specimen_id_356984,specimen_id_358710,specimen_id_368713
0,19.000,19.000,22.000,32.000,1,0,0.12013888888888888,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,17.700,17.700,10.500,15.500,1,7,0.006944444444444444,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0.000,134.000,70.000,100.000,1,5,0.0875,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,34.000,34.000,35.000,45.000,1,10,0.006944444444444444,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.400,2.400,1.600,2.600,0,9,0.08194444444444444,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.000,134.000,70.000,100.000,1,1,0.07222222222222222,0,1,0,...,0,0,0,0,0,0,0,0,0,0
196,0.000,5.500,4.800,5.900,0,0,0.5895833333333333,0,0,1,...,0,0,0,0,0,0,0,0,0,0
197,138.000,138.000,133.000,145.000,0,2,0.09444444444444444,0,1,0,...,0,0,0,0,0,0,0,0,0,0
198,1.032,1.032,1.001,1.035,0,4,0.03680555555555556,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [63]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Initialize multi-label SVM classifier
svm_clf = MultiOutputClassifier(SVC(kernel='linear'))

for i in tqdm(range(100)):
    svm_clf.fit(X_train_sparse[:3000, :10], y_train.iloc[:3000, :])



  0%|                                                                                          | 0/100 [13:40<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Predict the labels for the test set
y_pred = svm_clf.predict(X_test_sparse[:100, :100])

# Evaluate the performance of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

#### Classifier Chain

In [150]:
X_train_sparse.shape

(53328, 11679)

In [151]:
y_train.shape

(52761, 240)

In [105]:
from sklearn.multioutput import ClassifierChain
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Create a chain of logistic regression classifiers
classifier_chain = ClassifierChain(LogisticRegression())

# Train the classifier chain
classifier_chain.fit(X_train_sparse, y_train)

# Predict labels for test data
y_pred = classifier_chain.predict(X_test_sparse)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


ValueError: Found input variables with inconsistent numbers of samples: [53328, 52761]

#### Random forest for multi-output

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Create a multi-output Random Forest classifier
forest_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the classifier
forest_classifier.fit(X_train, y_train)

# Predict labels for test data
y_pred = forest_classifier.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


#### 	Probabilistic graphical models

In [None]:
# joint inference of multiple diagnoses (Bayesian Networks or Markov Random Fields)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

### emar

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "emar_data_train.csv"
full_path = path + file
emar_data_train = pd.read_csv(full_path)

file = "emar_data_test.csv"
full_path = path + file
emar_data_test = pd.read_csv(full_path)

file = "emar_label_train.csv"
full_path = path + file
emar_label_train = pd.read_csv(full_path)

file = "emar_label_test.csv"
full_path = path + file
emar_label_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

emar_label_train['los'] = emar_label_train['los'].astype(str)
emar_label_train.fillna(0, inplace=True)
emar_label_test['los'] = emar_label_test['los'].astype(str)
emar_label_test.fillna(0, inplace=True)
emar_label_train.loc[~emar_label_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
emar_label_test.loc[~emar_label_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

In [None]:
emar_label_train['los'] = emar_label_train['los'].apply(convert_to_days)
emar_label_test['los'] = emar_label_test['los'].apply(convert_to_days)

In [None]:
emar_data_train['delay']= emar_data_train['delay'].apply(convert_to_days)
emar_data_test['delay']= emar_data_test['delay'].apply(convert_to_days)

In [None]:
emar_label_test['los'].describe()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### emar_detail

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "emar_detail_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "emar_detail_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "emar_detail_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "emar_detail_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

#### Random forest

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

In [None]:
# y_test['los'].describe()

In [None]:
# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### omr

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "omr_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "omr_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "omr_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "omr_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Need to reduce to 12 features, this is an especially sparse matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 12

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
transformed_matrix = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### Admissions

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "admissions_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "admissions_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "admissions_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "admissions_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['ed_duration']= X_train['ed_duration'].apply(convert_to_days)
X_test['ed_duration']= X_test['ed_duration'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### hcpcsevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "hcpcsevents_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "hcpcsevents_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "hcpcsevents_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "hcpcsevents_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Need to reduce from 13 to 9

In [None]:
# Convert strings to integers
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)
X_test['days_since_admission'] = X_test['days_since_admission'].str.split().str[0].astype(int)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 9

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
transformed_matrix = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

#### Random forest

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### microbiologyevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "microbio_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "microbio_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "microbio_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "microbio_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### patients

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "patients_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "patients_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "patients_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "patients_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### pharmacy

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "pharmacy_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "pharmacy_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "pharmacy_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "pharmacy_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['medication_duration']= X_train['medication_duration'].apply(convert_to_days)
X_test['medication_duration']= X_test['medication_duration'].apply(convert_to_days)
# Convert strings to integers
X_train['verification_delay'] = X_train['verification_delay'].str.split().str[0].astype(int)
X_test['verification_delay'] = X_test['verification_delay'].str.split().str[0].astype(int)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
X_train.fillna(0, inplace=True)

In [None]:
X_test.fillna(0, inplace=True)

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### poe

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "poe_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "poe_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "poe_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "poe_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

###  prescriptions

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "prescriptions_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "prescriptions_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "prescriptions_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "prescriptions_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
# Need to reduce from 4890 to 2874 or less

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 2874

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
transformed_matrix = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### procedures_icd

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "procedures_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "procedures_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "procedures_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "procedures_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
# Convert strings to integers
X_train['days_since_admission'] = X_train['days_since_admission'].str.split().str[0].astype(int)
X_test['days_since_admission'] = X_test['days_since_admission'].str.split().str[0].astype(int)

In [None]:
# Need to reduce from 355 to 115

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 115

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
transformed_matrix = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### services

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "services_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "services_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "services_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "services_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### transfers

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "transfers_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "transfers_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "transfers_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "transfers_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### chartevents - unable to allocate memory

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "chart_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "chart_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "chart_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "chart_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['delay']= X_train['delay'].apply(convert_to_days)
X_test['delay']= X_test['delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

### icustays

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "icustays_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "icustays_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "icustays_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "icustays_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
# Need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### ingredientevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "ingredient_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "ingredient_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "ingredient_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "ingredient_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# Need to reduce from 7727 to 4116

In [None]:
from sklearn.decomposition import TruncatedSVD

# Number of desired features (components)
n_components = 4116

# Initialize Truncated SVD with the desired number of components
svd = TruncatedSVD(n_components=n_components)

# Fit the Truncated SVD model to the sparse matrix and transform the data
svd.fit(X_train)
transformed_matrix = svd.transform(X_train)

# Get the explained variance ratio (how much variance is explained by each component)
explained_variance_ratio = svd.explained_variance_ratio_

# Print the transformed matrix and explained variance ratio
# print("Transformed Matrix:")
# print(transformed_matrix)
print("\nExplained Variance Ratio:")
print(explained_variance_ratio)

print("\n Amount of original variance conserved:", np.sum(svd.explained_variance_ratio_))

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### inputevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "input_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "input_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "input_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "input_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### outputevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "output_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "output_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "output_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "output_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

In [None]:
# Also need to deal with days_since_admission
X_train['days_since_admission'] = X_train['days_since_admission'].apply(convert_to_days)
X_test['days_since_admission'] = X_test['days_since_admission'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)

### procedureevents

In [None]:
path = "C:/Users/jenni/OneDrive/Desktop/IP/"

file = "procedure_events_data_train.csv"
full_path = path + file
X_train = pd.read_csv(full_path)

file = "procedure_events_data_test.csv"
full_path = path + file
X_test = pd.read_csv(full_path)

file = "procedure_events_label_train.csv"
full_path = path + file
y_train = pd.read_csv(full_path)

file = "procedure_events_label_test.csv"
full_path = path + file
y_test = pd.read_csv(full_path)

In [None]:
# Converting duration strings to floats

y_train['los'] = y_train['los'].astype(str)
y_train.fillna(0, inplace=True)
y_test['los'] = y_test['los'].astype(str)
y_test.fillna(0, inplace=True)
y_train.loc[~y_train['los'].str.contains('days'), 'los'] = '0 days 00:00:00'
y_test.loc[~y_test['los'].str.contains('days'), 'los'] = '0 days 00:00:00'

y_train['los'] = y_train['los'].apply(convert_to_days)
y_test['los'] = y_test['los'].apply(convert_to_days)

In [None]:
X_train['duration']= X_train['duration'].apply(convert_to_days)
X_test['duration']= X_test['duration'].apply(convert_to_days)

In [None]:
X_train['recording_delay']= X_train['recording_delay'].apply(convert_to_days)
X_test['recording_delay']= X_test['recording_delay'].apply(convert_to_days)

#### Random forest

In [None]:
y_test['los'].describe()

In [None]:
# Convert DataFrame to 1D array using ravel()
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

# Initialize and fit the Random Forest regression model
random_forest = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

# Plot true vs predicted values
plt.scatter(y_test, y_pred)
plt.xlabel("True Values")
plt.ylabel("Predicted Values")
plt.title("True vs Predicted Values (Random Forest Regression)")
plt.show()

#### Compare to MSE of always guessing average

In [None]:
# Calculate the mean value of training set target variable
mean_prediction = np.mean(y_train)

# Generate an array of the same length as pred_values with the mean value
mean_guesses = np.full_like(y_test, mean_prediction)

# Calculate the Mean Squared Error (MSE) between the predicted values and the mean guesses
mse = np.mean((y_pred - mean_guesses) ** 2)

print("Mean Squared Error (MSE):", mse)