In [11]:
import os
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns
import shap
import numpy as np
import scipy.stats as stats
# from sklearn.inspection import plot_partial_dependence
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import mean_absolute_error, r2_score
# Loading dataset

import sys
from pathlib import Path

# Take the directory two levels above the current working directory as the
# project root and put it first on sys.path, presumably so the project-local
# `config` and `utils` modules imported below can be resolved.
# NOTE(review): this depends on the directory the kernel was started in;
# Path.parents[1] raises IndexError when the working directory has fewer
# than two ancestors.
project_root = Path.cwd().parents[1]
sys.path.insert(0, str(project_root))

# Import the project configuration settings.
import config
from config import RANDOM_SEED
# NOTE(review): presumably seeds the random number generators with
# RANDOM_SEED for reproducibility — confirm in config.set_seed.
config.set_seed()
# Import our custom utility functions
from utils import DataPreprocessingPipeline

In [2]:
# Load the raw data file 'nhamcs14.sas7bdat' from the data/raw directory
# using the load_data helper of the project's config module.
# NOTE(review): the 'raw' argument presumably selects the data/raw
# sub-directory — confirm against config.load_data.
emergency_df = config.load_data('nhamcs14.sas7bdat', 'raw')

In [3]:
# Preprocess the raw data with the custom DataPreprocessingPipeline from the
# utils module and expose the resulting splits as notebook-level variables.

# Target variable to predict and the other target columns to drop
target = "LOV"
target_to_drop = ['LOV_BINARY', 'WAITTIME_BINARY']

# Initialize the pipeline with a 70/15/15 train/validation/test split,
# without stratification
pipeline = DataPreprocessingPipeline(emergency_df=emergency_df, target=target,
                                     target_to_drop=target_to_drop, percent_train=0.70,
                                     percent_val=0.15, percent_test=0.15, stratify=False)

# Run the data preprocessing steps
pipeline.run()

# Extract the preprocessed training, validation, and test sets
X_train_preprocessed = pipeline.X_train_preprocessed
X_validation_preprocessed = pipeline.X_validation_preprocessed
X_test_preprocessed = pipeline.X_test_preprocessed
y_train = pipeline.y_train
y_validation = pipeline.y_validation
y_test = pipeline.y_test

# Intermediate frames kept for inspection
cleaned_emergency_df = pipeline.cleaned_emergency_df
transformed_emergency_df = pipeline.transformed_emergency_df


def _strip_transformer_prefix(name, prefixes=('num__', 'cat__')):
    """Return `name` without its leading transformer prefix, if it has one.

    Only a prefix at the very start of the name is removed. A plain
    ``str.replace`` would also delete matching text inside the name
    (e.g. 'cat__enum__x' would be mangled to 'ex').
    """
    for prefix in prefixes:
        if name.startswith(prefix):
            return name[len(prefix):]
    return name


# Save the feature names (without the 'num__' / 'cat__' prefixes) for future reference
feature_names = [_strip_transformer_prefix(name) for name in pipeline.feature_names]
feature_names_list = list(feature_names)
config.save_data(feature_names, f"features_{target}.csv", 'features')

1-Cleaning data...
Data cleaning completed
Size of Initial dataset:(23844, 1012)
Size of cleaned dataset:(17959, 370)

2-Applying feature engineering...
Feature engineering completed
Size of the dataset after feature engineering:(17959, 387)

3-Splitting data...
self.stratify: False
Splitting data completed

4-Loading data...
train_df size: (12571, 387)
X_train size: (12571, 386)
y_train size: (12571,)

validation_df size: (2693, 387)
X_validation size: (2693, 386)
y_validation size: (2693,)

test_df size: (2695, 387)
X_test size: (2695, 386)
y_test size: (2695,)
Loading data completed

5-Preprocessing data...
Preprocessing data completed.
Processor saved successfully


In [4]:
# Load the tuned model that is evaluated on the test set in the next cell.
# NOTE(review): this absolute path is machine-specific; consider deriving it
# from project_root so the notebook runs on other machines.
model_train_dir = "/Users/Macbook/Desktop/EDPredictiveEfficiency/notebooks/model_train/"
model_filename = "CatBoostRegressor_tuned.joblib"
model_file_path = os.path.join(model_train_dir, model_filename)

# Fail fast when the model file is missing. Otherwise trained_model would be
# None and the error would only surface later as an AttributeError on
# .predict(), after a misleading success message.
if not os.path.exists(model_file_path):
    raise FileNotFoundError(f"No such file found: {model_file_path}")

# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that come from a trusted source.
trained_model = joblib.load(model_file_path)
print("Model loaded successfully.")

Model loaded successfully.


In [7]:
# Score the test split with the loaded model, using all features
y_pred_test = trained_model.predict(X_test_preprocessed)

# Compute both test-set metrics by applying each metric function to the
# same (true, predicted) pair: mean absolute error first, then R-squared
mae_test, r2_test = (
    metric(y_test, y_pred_test) for metric in (mean_absolute_error, r2_score)
)

# Report the metrics, rounded to two decimals, one line each
test_report_lines = [
    "Test Set Performance (All features):",
    f"Mean Absolute Error (MAE) on Test Data: {round(mae_test, 2)}",
    f"R-squared on Test Data: {round(r2_test, 2)}",
]
print("\n".join(test_report_lines))

Test Set Performance (All features):
Mean Absolute Error (MAE) on Test Data: 51.58
R-squared on Test Data: 0.55
