In [1]:
import joblib
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
import pandas as pd
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer
# Resolve the project root as the directory two levels above the current working directory.
# NOTE(review): this depends on where the kernel was started — confirm the notebook's location.
project_root = Path.cwd().parents[1]
# Put the project root first on the import path; presumably this is what lets the
# `config` and `utils` imports below resolve — verify against the repository layout.
sys.path.insert(0, str(project_root))

# Import the project configuration settings.
import config
from config import RANDOM_SEED
# NOTE(review): presumably seeds the random number generators for reproducibility — confirm in config.set_seed.
config.set_seed()
# Import our custom utility functions
from utils import DataPreprocessingPipeline

In [7]:
# Load the raw 'nhamcs14.sas7bdat' file through the project's config.load_data helper.
# NOTE(review): the 'raw' argument presumably selects the data/raw directory — confirm in config.load_data.
emergency_df = config.load_data('nhamcs14.sas7bdat', 'raw')

In [8]:
# Build the train / validation / test inputs for the regression target with the
# project's DataPreprocessingPipeline (imported from utils).
target = "LOV"
# Additional columns handed to the pipeline through its `target_to_drop` argument
target_to_drop = ['LOV_BINARY', 'WAITTIME_BINARY']

# Configure the pipeline: 70% / 15% / 15% split fractions, stratification disabled
pipeline = DataPreprocessingPipeline(
    emergency_df=emergency_df,
    target=target,
    target_to_drop=target_to_drop,
    percent_train=0.70,
    percent_val=0.15,
    percent_test=0.15,
    stratify=False,
)

# Execute the preprocessing steps
pipeline.run()

# Pull the preprocessed feature matrices off the pipeline, one per split
X_train_preprocessed = pipeline.X_train_preprocessed
X_validation_preprocessed = pipeline.X_validation_preprocessed
X_test_preprocessed = pipeline.X_test_preprocessed

# Matching target vectors for each split
y_train = pipeline.y_train
y_validation = pipeline.y_validation
y_test = pipeline.y_test

# Intermediate frames exposed by the pipeline
cleaned_emergency_df = pipeline.cleaned_emergency_df
transformed_emergency_df = pipeline.transformed_emergency_df

# Feature names as reported by the pipeline, then with the 'num__' / 'cat__'
# markers removed so the saved names are easier to read
feature_names = pipeline.feature_names
feature_names = [
    raw_name.replace('num__', '').replace('cat__', '')
    for raw_name in feature_names
]
feature_names_list = list(feature_names)

# Persist the cleaned feature names for future reference
config.save_data(feature_names, f"features_{target}.csv", 'features')

1-Cleaning data...
Data cleaning completed
Size of Initial dataset:(23844, 1012)
Size of cleaned dataset:(17959, 370)

2-Applying feature engineering...
Feature engineering completed
Size of the dataset after feature engineering:(17959, 387)

3-Splitting data...
self.stratify: False
Splitting data completed

4-Loading data...
train_df size: (12571, 387)
X_train size: (12571, 386)
y_train size: (12571,)

validation_df size: (2693, 387)
X_validation size: (2693, 386)
y_validation size: (2693,)

test_df size: (2695, 387)
X_test size: (2695, 386)
y_test size: (2695,)
Loading data completed

5-Preprocessing data...
Preprocessing data completed.
Processor saved successfully


In [9]:
# Load the trained CatBoost model file that the hyperparameter tuning below starts from.
# NOTE(review): this is a machine-specific absolute path; consider deriving it from
# project_root so the notebook runs on other machines.
model_train_dir = "/Users/Macbook/Desktop/EDPredictiveEfficiency/scripts/models_training_and_selection/model_train/"
model_filename = "CatBoostRegressor.joblib"
model_file_path = os.path.join(model_train_dir, model_filename)

# Fail fast when the file is missing: binding None here would only surface later
# as a confusing error inside the tuning cell.
if not os.path.exists(model_file_path):
    raise FileNotFoundError(f"No such file found: {model_file_path}")

# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
best_trained_model = joblib.load(model_file_path)

In [10]:
# Implement a ModelTuner class that tunes the hyperparameters of the best model using GridSearchCV.
# Define the hyperparameter grid and the scoring metric used for tuning.
# The tuner tunes the hyperparameters, evaluates the best model on the validation set, and saves the tuned model.
class ModelTuner:
    """Tune an estimator with GridSearchCV, evaluate the winner, and save it.

    Parameters
    ----------
    model : estimator
        Estimator handed to GridSearchCV as ``estimator``.
    param_grid : dict
        Hyperparameter grid handed to GridSearchCV as ``param_grid``.
    scoring : str or callable
        Scoring handed to GridSearchCV. ``tune_hyperparameters`` prints the
        negated best score labelled as MAE, so that printout is only
        meaningful for a negated-MAE scorer.
    cv : int, default 5
        Cross-validation setting forwarded to GridSearchCV.
    n_jobs : int, default -1
        Parallelism setting forwarded to GridSearchCV.
    verbose : int, default 2
        Verbosity forwarded to GridSearchCV.

    Attributes
    ----------
    grid_search : GridSearchCV or None
        The fitted search object; None until ``tune_hyperparameters`` has run.
    best_model : estimator or None
        ``grid_search.best_estimator_``; None until ``tune_hyperparameters`` has run.
    """

    def __init__(self, model, param_grid, scoring, cv=5, n_jobs=-1, verbose=2):
        self.model = model
        self.param_grid = param_grid
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.grid_search = None
        self.best_model = None

    def _require_best_model(self):
        """Return the tuned model, or raise RuntimeError if tuning has not run yet."""
        if self.best_model is None:
            raise RuntimeError(
                "No tuned model available: call tune_hyperparameters() first."
            )
        return self.best_model

    def tune_hyperparameters(self, X_train, y_train):
        """Tune hyperparameters using GridSearchCV.

        Fits the grid search (with ``refit=True``) on the given training data,
        stores the search object and its best estimator on the instance, and
        prints the best parameters and the negated best score.

        Raises
        ------
        ValueError
            If ``self.model`` is None (e.g. the trained model failed to load).
        """
        if self.model is None:
            raise ValueError(
                "No model to tune: `model` is None (was the trained model loaded?)."
            )
        self.grid_search = GridSearchCV(
            estimator=self.model,
            param_grid=self.param_grid,
            cv=self.cv,
            scoring=self.scoring,
            refit=True,
            verbose=self.verbose,
            n_jobs=self.n_jobs
        )
        self.grid_search.fit(X_train, y_train)
        self.best_model = self.grid_search.best_estimator_
        print("Best Hyperparameters:", self.grid_search.best_params_)
        # The sign flip assumes the scorer reports negated MAE.
        print("Best Cross-Validation MAE:", -self.grid_search.best_score_)

    def evaluate(self, X_val, y_val):
        """Evaluate the best model on the validation set.

        Prints the mean absolute error and R-squared of the tuned model's
        predictions for ``X_val`` against ``y_val``.

        Raises
        ------
        RuntimeError
            If called before ``tune_hyperparameters``.
        """
        best_model = self._require_best_model()
        y_pred_validation = best_model.predict(X_val)
        mae_validation = mean_absolute_error(y_val, y_pred_validation)
        r2_validation = r2_score(y_val, y_pred_validation)
        print(f"Validation Set Performance:\nMAE: {mae_validation}\nR-squared: {r2_validation}")

    def save_model(self, model_directory, model_name):
        """Save the best model to the specified directory.

        The directory is created if it does not exist yet.

        Raises
        ------
        RuntimeError
            If called before ``tune_hyperparameters`` (otherwise None would be
            written to disk as the "tuned model").
        """
        best_model = self._require_best_model()
        # An empty directory string means "current directory"; nothing to create then.
        if model_directory:
            os.makedirs(model_directory, exist_ok=True)
        joblib.dump(best_model, os.path.join(model_directory, model_name))

# Candidate hyperparameter values explored by the grid search
param_grid = dict(
    depth=[6, 8],
    learning_rate=[0.01, 0.03, 0.1],
    iterations=[1000],
    l2_leaf_reg=[1, 3, 4],
)

# Scorer built from mean_absolute_error with greater_is_better=False,
# i.e. the score handed to the search is the negated MAE
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)


In [14]:
# Run the grid search on the training split, report validation metrics,
# then persist the best model.
model_directory = "/Users/Macbook/Desktop/EDPredictiveEfficiency/notebooks/model_train"

tuner = ModelTuner(model=best_trained_model, param_grid=param_grid, scoring=mae_scorer)
tuner.tune_hyperparameters(X_train=X_train_preprocessed, y_train=y_train)
tuner.evaluate(X_val=X_validation_preprocessed, y_val=y_validation)
tuner.save_model(model_directory=model_directory, model_name="CatBoostRegressor_tuned.joblib")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.03; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.03; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.03; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.2min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.01; total time= 1.2min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.03; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1, learning_rate=0.03; total time= 1.1min
[CV] END depth=6, iterations=1000, l2_leaf_reg=1,