In [1]:
import sys
from pathlib import Path
import joblib

# Add project root to the Python path so the `src.*` package imports below resolve.
# The root is taken to be two directory levels above the current working directory.
project_root = Path.cwd().parent.parent
# Only append when missing: re-running this cell would otherwise keep adding
# duplicate entries to sys.path for the lifetime of the kernel.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

import pandas as pd
from sklearn.model_selection import train_test_split
from src.data_management.data_preprocessing_pipeline import DataPreprocessingPipeline
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, cross_validate
from catboost import CatBoostRegressor

# Loading dataset
# Resolve the data location from the project root computed at the top of this
# cell rather than a machine-specific absolute path, so the notebook runs on
# any checkout. The SAS file is expected directly under the project root.
path = str(project_root)
emergency_df = pd.read_sas(f"{path}/nhamcs14.sas7bdat")
target = "LOV"

# Instantiate the data preprocessing pipeline (70/15/15 train/validation/test
# split, no stratification).
pipeline = DataPreprocessingPipeline(
    emergency_df=emergency_df,
    target=target,
    percent_train=0.7,
    percent_val=0.15,
    percent_test=0.15,
    path=path,
    stratify=False,
)

# Run the pipeline
pipeline.run()

# Raw feature splits
X_train = pipeline.X_train
X_validation = pipeline.X_validation
X_test = pipeline.X_test

# Target splits. y_test previously pointed at pipeline.X_test, which made the
# test "target" a feature matrix; it must mirror y_train / y_validation.
y_train = pipeline.y_train
y_validation = pipeline.y_validation
y_test = pipeline.y_test

# Feature splits after the pipeline's transformation step
X_train_preprocessed = pipeline.X_train_preprocessed
X_validation_preprocessed = pipeline.X_validation_preprocessed
X_test_preprocessed = pipeline.X_test_preprocessed

# MODEL TRAINING BELOW#

# Candidate regressors keyed by display name. Every estimator is left at its
# library defaults apart from the seed / verbosity arguments shown here, and
# each is wrapped as {'model': estimator} for the evaluation loop.
models_with_defaults = {
    name: {'model': estimator}
    for name, estimator in (
        ('LinearRegression', LinearRegression()),
        ('Ridge', Ridge()),
        ('Lasso', Lasso()),
        ('GradientBoostingRegressor', GradientBoostingRegressor(random_state=42)),
        ('XGBRegressor', XGBRegressor(random_state=42)),
        ('LGBMRegressor', LGBMRegressor(random_state=42, force_col_wise=True, verbosity=-1)),
        ('CatBoostRegressor', CatBoostRegressor(verbose=0, random_seed=1)),
    )
}

# Initialize variables to track best model performance.
# best_model_name starts as None so it is always defined, even when no
# candidate ends up being selected.
best_model = None
best_model_name = None
best_mae = float('inf')  # Initialize with infinity
best_r2 = -float('inf')   # Initialize with negative infinity

# Define scoring metrics for cross-validation
scoring = ['neg_mean_absolute_error', 'r2']

# Loop over each model and evaluate their performance using cross-validation
for model_name, model_info in models_with_defaults.items():
    model = model_info['model']
    print(f"Cross-validating model {model_name}...")

    # Perform 5-fold cross-validation on the preprocessed training split
    cv_results = cross_validate(model, X_train_preprocessed, y_train, cv=5, scoring=scoring)

    # Calculate the average of the cross-validation scores
    mean_mae_validation = -1 * cv_results['test_neg_mean_absolute_error'].mean()  # Make MAE positive
    mean_r2_validation = cv_results['test_r2'].mean()

    # Print cross-validation metrics
    print(f"Model: {model_name}")
    print(f"Cross-Validation MAE: {mean_mae_validation:.2f}")
    print(f"Cross-Validation R-squared: {mean_r2_validation:.2f}")
    print()

    # Select on MAE; R-squared only breaks exact MAE ties. Requiring BOTH
    # metrics to improve (the previous condition) could reject the model with
    # the lowest MAE whenever its R-squared was not also the highest so far.
    # NaN scores compare False on every branch, so failed fits are never chosen.
    improves_mae = mean_mae_validation < best_mae
    ties_mae_with_better_r2 = (
        mean_mae_validation == best_mae and mean_r2_validation > best_r2
    )
    if improves_mae or ties_mae_with_better_r2:
        best_model = model
        best_model_name = model_name
        best_mae = mean_mae_validation
        best_r2 = mean_r2_validation

# Fail with a clear message instead of reporting a non-existent winner.
if best_model is None:
    raise RuntimeError("No model produced a valid cross-validation MAE; cannot select a best model.")

# Print the best model and its cross-validation performance
print("Best Model Based on Cross-Validation:")
print(f"Model Name: {best_model_name}")
print(f"Best Cross-Validation MAE: {best_mae:.2f}")
print(f"Best Cross-Validation R-squared: {best_r2:.2f}")

# Create a directory to save the trained model (relative to the notebook's
# working directory).
model_train_dir = Path("model_train")
model_train_dir.mkdir(exist_ok=True)

# cross_validate fits internal clones of each estimator, so best_model itself
# has not been trained yet. Fit it on the training split before persisting;
# otherwise the pickle would hold an unfitted estimator that cannot predict.
best_model.fit(X_train_preprocessed, y_train)

# Save the best model using joblib
joblib.dump(best_model, model_train_dir / f"{best_model_name}.pkl")
print(f"Best model saved as {best_model_name}.pkl in the 'model_train' directory.")

 
Cleaning data...
Some columns have missing values:
Initial emergency dataset: (23844, 1012)
Cleaned emergency dataset: (22308, 870)
Splitting data...
Sizes of the split datasets:
train_df: (15615, 870)
test_df: (3346, 870)
validation_df: (3347, 870)
train_df has been saved with success
test_df has been saved with success
validation_df has been saved with success
Loading data...
train_df size: (15615, 870)
X_train size: (15615, 869)
y_train size: (15615,)

validation_df size: (3347, 870)
X_validation size: (3347, 869)
y_validation size: (3347,)

test_df size: (3346, 870)
X_test size: (3346, 869)
y_test size: (3346,)
Transforming data...
Data preprocessing pipeline completed.
 
Cross-validating model LinearRegression...
Model: LinearRegression
Cross-Validation MAE: 96.35
Cross-Validation R-squared: -2.38

Cross-validating model Ridge...
Model: Ridge
Cross-Validation MAE: 66.03
Cross-Validation R-squared: 0.17

Cross-validating model Lasso...
Model: Lasso
Cross-Validation MAE: 57.59
Cro

In [None]:
# NOTE(review): presumably the location where the best model was written on
# the author's machine — confirm. Kept as a comment because a bare filesystem
# path is not valid Python and would make this cell fail on Run All.
# /Users/Macbook/Desktop/emergency-dept-optimization/src/model_training/model_train/LGBMRegressor.pkl