In [1]:
import numpy as np
import pandas as pd
import os
import joblib
import shap  # Import SHAP library
import seaborn as sns  # Import Seaborn for visualization
import matplotlib.pyplot as plt  # Import matplotlib for plotting
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from scipy import sparse as sp
from lightgbm import LGBMRegressor

import sys
from pathlib import Path

project_root = Path.cwd().parents[1]
sys.path.insert(0, str(project_root))

# Import the project configuration settings.
import config

# Loading training data
X_train = config.load_data('X_train_regression_WAITTIME.csv', 'train')
y_train = config.load_data('y_train_regression_WAITTIME.csv', 'train')

# Loading validation data, if percent_val > 0
X_validation = config.load_data('X_validation_regression_WAITTIME.csv', 'validation')
y_validation = config.load_data('y_validation_regression_WAITTIME.csv', 'validation')

# Loading preprocessed data
X_train_preprocessed = config.load_data('X_train_preprocessed_regression_WAITTIME.csv', 'processed')
X_validation_preprocessed = config.load_data('X_validation_preprocessed_regression_WAITTIME.csv', 'processed')
X_test_preprocessed = config.load_data('X_test_preprocessed_regression_WAITTIME.csv', 'processed')

feature_names = config.load_data('features_WAITTIME.csv', 'features')['0'].tolist()
top_features = config.load_data('top_features_WAITTIME.csv', 'features')['0'].tolist()
top_feature_indices = config.load_data('feature_indices_WAITTIME.csv', 'features')['0'].tolist()

X_train_selected_features = config.load_data('X_train_selected_features_WAITTIME.csv', 'train').sort_index(axis=1)
X_validation_selected_features = config.load_data('X_validation_selected_features_WAITTIME.csv', 'validation').sort_index(axis=1)
X_test_selected_features = config.load_data('X_test_selected_features_WAITTIME.csv', 'test').sort_index(axis=1)


In [2]:

# Define a range of hyperparameters for CatBoostRegressor
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
    'l2_leaf_reg': [1, 3, 5]
}

# Initialize CatBoostRegressor and GridSearchCV
# catboost_model = CatBoostRegressor(random_state=42, verbose=0)
catboost_model= LGBMRegressor(random_state=42, force_col_wise=True, verbosity=-1)
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose=1)

# Perform hyperparameter tuning
print("Starting hyperparameter tuning...")
# grid_search.fit(X_train_selected_features, y_train)
grid_search.fit(X_train_preprocessed, y_train)

# Retrieve and report the best hyperparameters
best_hyperparams = grid_search.best_params_
# print(f"Best hyperparameters: {best_hyperparams}")

# Training the final model with best hyperparameters on the entire training data
best_model = grid_search.best_estimator_

# Optional: Evaluate on validation set
# y_validation_pred = best_model.predict(X_validation_selected_features)
y_validation_pred = best_model.predict(X_validation_preprocessed)
mae_validation = mean_absolute_error(y_validation, y_validation_pred)
r2_validation = r2_score(y_validation, y_validation_pred)
print(f"Validation MAE: {mae_validation:.2f}, R2: {r2_validation:.2f}")

# # Save the best trained model 
model_filename = f"best_waittime_regression_model_with_all_features.joblib"
config.save_model(best_model, model_filename)
# Print confirmation
print(f"{best_model} saved")



Starting hyperparameter tuning...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Validation MAE: 13.86, R2: 0.37
LGBMRegressor(depth=4, force_col_wise=True, iterations=30, l2_leaf_reg=1,
              random_state=42, verbosity=-1) saved


In [3]:

# Retrain the model using the best hyperparameters and only the top 60 features
best_model.fit(X_train_selected_features, y_train)

# Predict on the validation set using the retrained model
y_validation_pred = best_model.predict(X_validation_selected_features)

# Evaluate the model's performance
mae_validation = mean_absolute_error(y_validation, y_validation_pred)
r2_validation = r2_score(y_validation, y_validation_pred)

# Output the performance metrics
print(f"Validation MAE: {mae_validation:.2f}, R2: {r2_validation:.2f}")

# Save the best trained model
# Define the model name
model_name = 'best_waittime_regression_model_with_top_features.joblib'
# Save the model using the utility function from config.py
config.save_model(best_model, model_name)
# Print confirmation
print(f"{model_name} saved")


Validation MAE: 13.87, R2: 0.36
best_waittime_regression_model_with_top_features.joblib saved
