In [1]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import sys

# ML Models
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Configuration and Utilities

from utils import DataPreprocessingPipeline

# Set up project root and import configurations
project_root = Path.cwd().parents[1]
sys.path.insert(0, str(project_root))

import config
config.set_seed()

# Load and preprocess data
emergency_df = config.load_data('nhamcs14.sas7bdat', 'raw')
pipeline = DataPreprocessingPipeline(emergency_df=emergency_df, target='WAITTIME',
                                     target_to_drop=['WAITTIME_BINARY','LOV_BINARY'],
                                     percent_train=0.70, percent_val=0.15, percent_test=0.15,
                                     stratify=False)
pipeline.run()

# Feature selection
feature_names = [name.replace('num__', '').replace('cat__', '') for name in pipeline.feature_names]
feature_names = feature_names[:60]  # Select top 60 features based on previous steps not shown here

# Models for selection
models_with_defaults = {
    'CatBoostRegressor': CatBoostRegressor(random_state=config.RANDOM_SEED, verbose=0),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=config.RANDOM_SEED),
    'XGBRegressor': XGBRegressor(random_state=config.RANDOM_SEED),
    'LGBMRegressor': LGBMRegressor(random_state=config.RANDOM_SEED, force_col_wise=True, verbosity=-1),
}

# Model selection and tuning
best_model, best_hyperparams = None, None
best_mae, best_r2 = float('inf'), -float('inf')
scoring = ['neg_mean_absolute_error', 'r2']

for model_name, model in models_with_defaults.items():
    print(f"Cross-validating model: {model_name}...")
    cv_results = cross_validate(model, pipeline.X_train_preprocessed[:, :60], pipeline.y_train, 
                                cv=5, scoring=scoring)
    mean_mae_validation = -1 * np.mean(cv_results['test_neg_mean_absolute_error'])
    mean_r2_validation = np.mean(cv_results['test_r2'])
    
    if mean_mae_validation < best_mae and mean_r2_validation > best_r2:
        best_model = model
        best_mae, best_r2 = mean_mae_validation, mean_r2_validation

# Assuming CatBoostRegressor is the best model based on cross-validation
param_grid = {'depth': [4, 6, 8], 'learning_rate': [0.01, 0.05, 0.1], 'iterations': [30, 50, 100]}
grid_search = GridSearchCV(best_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(pipeline.X_train_preprocessed[:, :60], pipeline.y_train)

# Evaluate on validation data
y_validation_pred = grid_search.best_estimator_.predict(pipeline.X_validation_preprocessed[:, :60])
mae_validation, r2_validation = mean_absolute_error(pipeline.y_validation, y_validation_pred), r2_score(pipeline.y_validation, y_validation_pred)
print(f"Validation MAE: {mae_validation:.2f}, R2: {r2_validation:.2f}")

# Save the tuned model
model_filename = 'best_waittime_regression_model.joblib'
joblib.dump(grid_search.best_estimator_, model_filename)

# Test data evaluation
y_test_pred = grid_search.best_estimator_.predict(pipeline.X_test_preprocessed[:, :60])
mae_test, r2_test = mean_absolute_error(pipeline.y_test, y_test_pred), r2_score(pipeline.y_test, y_test_pred)
print(f"Test MAE: {mae_test:.2f}, R2: {r2_test:.2f}")


ModuleNotFoundError: No module named 'utils'