In [3]:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Data Loading
df = pd.read_csv('/content/Dataset.csv')

# 2. Data Cleaning and Preprocessing
# a. Handle Missing Values
df['All Bed Occupancy Rate'] = df['All Bed Occupancy Rate'].fillna(df['All Bed Occupancy Rate'].median())
if 'ICU Bed Source Last Updated' in df.columns:
    df = df.drop('ICU Bed Source Last Updated', axis=1)

# Fill missing numerical values
numerical_cols_with_missing_values = [
    'Staffed All Beds', 'Staffed ICU Beds', 'Licensed All Beds',
    'ICU Bed Occupancy Rate', 'Population', 'Population (20+)', 'Population (65+)'
]
for col in numerical_cols_with_missing_values:
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

# Fill missing categorical values
categorical_cols_with_missing_values = ['ICU Bed Source']
for col in categorical_cols_with_missing_values:
    if col in df.columns and df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# b. Handle Outliers using IQR method
numerical_cols_for_outlier_handling = [
    'Staffed All Beds', 'Staffed ICU Beds', 'Licensed All Beds',
    'All Bed Occupancy Rate', 'ICU Bed Occupancy Rate',
    'Population', 'Population (20+)', 'Population (65+)'
]
for col in numerical_cols_for_outlier_handling:
    if col in df.columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 1.5 * IQR
        upper = Q3 + 1.5 * IQR
        df = df[(df[col] >= lower) & (df[col] <= upper)]

# 3. Feature Engineering
df['Staffed_Beds_Population'] = df['Staffed All Beds'] * df['Population']
df['Licensed_Beds_Population'] = df['Licensed All Beds'] * df['Population']
df['Staffed_Beds_Squared'] = df['Staffed All Beds']**2
df['Population_Squared'] = df['Population']**2

# Encode Categorical Features
cat_cols = ['State', 'County Name', 'ICU Bed Source']
cat_cols = [col for col in cat_cols if col in df.columns]

encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

encoded_features = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out(cat_cols), index=df.index)

df = pd.concat([df.drop(cat_cols, axis=1), encoded_df], axis=1)

# 4. Data Splitting
target_col = 'All Bed Occupancy Rate'
X = df.drop(target_col, axis=1)
y = df[target_col]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# 5. Scale Numerical Features
numerical_cols = [
    'Staffed All Beds', 'Staffed ICU Beds', 'Licensed All Beds',
    'ICU Bed Occupancy Rate', 'Population', 'Population (20+)', 'Population (65+)',
    'Staffed_Beds_Population', 'Licensed_Beds_Population', 'Staffed_Beds_Squared', 'Population_Squared'
]
numerical_cols = [col for col in numerical_cols if col in X_train.columns]

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = scaler.transform(X_val[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

# 6. Imputation (for Linear Regression)
imputer = SimpleImputer(strategy='median')
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# 7. Model Training
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
}

for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    print(f"{name} trained successfully.")

# 8. Evaluation
epsilon = 1e-10
for name, model in models.items():
    y_pred = model.predict(X_test_imputed)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape = np.mean(np.abs((y_test - y_pred) / (y_test + epsilon))) * 100

    print(f"\n{name} Evaluation Metrics:")
    print(f"  MAE  : {mae:.3f}")
    print(f"  RMSE : {rmse:.3f}")
    print(f"  R²   : {r2:.3f}")
    print(f"  MAPE : {mape:.3f}%")


Linear Regression trained successfully.
Random Forest trained successfully.
Gradient Boosting trained successfully.

Linear Regression Evaluation Metrics:
  MAE  : 0.158
  RMSE : 0.200
  R²   : -0.531
  MAPE : 73.156%

Random Forest Evaluation Metrics:
  MAE  : 0.119
  RMSE : 0.160
  R²   : 0.023
  MAPE : 53.888%

Gradient Boosting Evaluation Metrics:
  MAE  : 0.118
  RMSE : 0.158
  R²   : 0.039
  MAPE : 55.945%
