In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the dataset (path is resolved relative to the current working directory).
df = pd.read_csv('healthcare_dataset.csv')

# Step 1: Parse dates and compute Length of Stay in whole days
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'])
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'])
df['Length_of_Stay'] = (df['Discharge Date'] - df['Date of Admission']).dt.days

# Step 2: Select relevant features
features = [
    'Age', 'Gender', 'Admission Type', 'Medical Condition',
    'Medication', 'Room Number'
]
target = 'Length_of_Stay'

# A missing admission or discharge date parses to NaT, which makes the target
# NaN for that row. The regressor cannot be fitted on NaN targets, so keep only
# rows with a known length of stay. `df` itself is left untouched; when no
# targets are missing this selects every row and the results are unchanged.
has_target = df[target].notna()
X = df.loc[has_target, features]
y = df.loc[has_target, target]

# Step 3: Preprocessing pipeline
# Categorical columns are one-hot encoded; categories unseen during fitting are
# encoded as all zeros at prediction time (handle_unknown='ignore').
categorical_cols = ['Gender', 'Admission Type', 'Medical Condition', 'Medication']
# NOTE(review): 'Room Number' is passed through as a numeric quantity; it looks
# like a nominal identifier rather than a measurement - confirm it belongs here.
numerical_cols = ['Age', 'Room Number']

preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    # Missing numeric values are filled with the training-set median.
    ('num', SimpleImputer(strategy='median'), numerical_cols)
])

# Step 4: Create the pipeline with a Random Forest
# Keeping preprocessing inside the pipeline means it is fitted on the training
# split only and then re-applied unchanged to the test split.
model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Step 5: Train-test split and model fitting
# 80/20 split; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)

# Step 6: Evaluate the model on the held-out 20%
y_pred = model.predict(X_test)

# Both metrics are expressed in days, the unit of the target.
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, y_pred))
print("Root Mean Squared Error (RMSE):", np.sqrt(mean_squared_error(y_test, y_pred)))


Mean Absolute Error (MAE): 7.36675824002574
Root Mean Squared Error (RMSE): 8.718736966664478


### Model Performance
Mean Absolute Error (MAE): 7.37 days

Root Mean Squared Error (RMSE): 8.72 days

What these numbers mean:
Metric	Value	Interpretation
MAE	7.37 days	On average, the model's predictions are off by about 7.4 days. MAE weights every error in proportion to its size, so no single large miss dominates it.
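RMSE	8.72 days	The square root of the mean squared error; squaring gives more weight to larger errors. RMSE is always at least as large as MAE, and the modest gap here (8.72 vs 7.37 days) suggests the errors are fairly similar in size rather than driven by a few extreme outliers.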
