In [1]:
import sys
import numpy as np
import pandas as pd
import os
import joblib
import shap  # Import SHAP library
import seaborn as sns  # Import Seaborn for visualization
import matplotlib.pyplot as plt  # Import matplotlib for plotting
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score,make_scorer, classification_report, precision_recall_fscore_support
from sklearn.metrics import classification_report
from scipy import sparse as sp
from pathlib import Path

import sys
from pathlib import Path

# Make the project package importable: the root is taken to be the
# grandparent of the current working directory.
project_root = Path.cwd().parents[1]
sys.path.insert(0, str(project_root))

# Project configuration module (data loading, model saving, seeding).
import config
from config import RANDOM_SEED

config.set_seed()


def _load_feature_list(filename):
    """Load `filename` from the 'features' location and return its '0' column as a list."""
    return config.load_data(filename, 'features')['0'].tolist()


def _load_with_sorted_columns(filename, location):
    """Load `filename` from `location` and return it with columns sorted by label."""
    return config.load_data(filename, location).sort_index(axis=1)


# --- Raw splits -------------------------------------------------------------
# Training split
X_train, y_train = (
    config.load_data('X_train_classification_WAITTIME_BINARY.csv', 'train'),
    config.load_data('y_train_classification_WAITTIME_BINARY.csv', 'train'),
)

# Test split, plus the accompanying test data frame
X_test, y_test, test_df = (
    config.load_data('X_test_classification_WAITTIME_BINARY.csv', 'test'),
    config.load_data('y_test_classification_WAITTIME_BINARY.csv', 'test'),
    config.load_data('test_df_classification_WAITTIME_BINARY.csv', 'test'),
)

# Validation split.
# NOTE(review): the original comment says this exists only "if percent_val > 0",
# but the load below is unconditional — confirm the files are always written.
X_validation, y_validation = (
    config.load_data('X_validation_classification_WAITTIME_BINARY.csv', 'validation'),
    config.load_data('y_validation_classification_WAITTIME_BINARY.csv', 'validation'),
)

# --- Preprocessed feature matrices ------------------------------------------
X_train_preprocessed, X_validation_preprocessed, X_test_preprocessed = (
    config.load_data('X_train_preprocessed_classification_WAITTIME_BINARY.csv', 'processed'),
    config.load_data('X_validation_preprocessed_classification_WAITTIME_BINARY.csv', 'processed'),
    config.load_data('X_test_preprocessed_classification_WAITTIME_BINARY.csv', 'processed'),
)

# --- Feature metadata (each stored under a column named '0') -----------------
feature_names = _load_feature_list('features_WAITTIME_BINARY.csv')
top_features = _load_feature_list('top_features_WAITTIME_BINARY.csv')
top_feature_indices = _load_feature_list('feature_indices_WAITTIME_BINARY.csv')

# --- Selected-feature matrices ------------------------------------------------
# Columns are sorted by label so all three splits share the same column order.
X_train_selected_features = _load_with_sorted_columns(
    'X_train_selected_features_WAITTIME_BINARY.csv', 'train'
)
X_validation_selected_features = _load_with_sorted_columns(
    'X_validation_selected_features_WAITTIME_BINARY.csv', 'validation'
)
X_test_selected_features = _load_with_sorted_columns(
    'X_test_selected_features_WAITTIME_BINARY.csv', 'test'
)

In [2]:

# Hyperparameter search space for CatBoostClassifier
# (3 values per parameter -> 81 candidate combinations).
param_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
    'l2_leaf_reg': [1, 3, 5]
}

# Base estimator. auto_class_weights='Balanced' lets CatBoost weight the
# classes automatically. The seed comes from the project-wide RANDOM_SEED
# (instead of a hard-coded 42) so this cell stays consistent with the rest
# of the notebook.
catboost_model = CatBoostClassifier(
    random_state=RANDOM_SEED,
    verbose=0,
    auto_class_weights='Balanced',
)

# Exhaustive 5-fold grid search optimising weighted F1.
# 'f1_weighted' is scikit-learn's built-in scorer name for
# f1_score(..., average='weighted'); using the name keeps the search
# independent of whatever the global name `f1_score` is bound to.
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring='f1_weighted',
    verbose=1
)

# Perform hyperparameter tuning
print("Starting hyperparameter tuning...")
grid_search.fit(X_train_selected_features, y_train)

# Retrieve and report the best hyperparameters
best_hyperparams = grid_search.best_params_
print(f"Best hyperparameters: {best_hyperparams}")

# With GridSearchCV's default refit=True, best_estimator_ is the winning
# configuration refit on the whole training set.
best_model = grid_search.best_estimator_

# Evaluate on the validation set: hard labels for F1/precision/recall,
# probability of the second class column for ROC AUC.
y_validation_pred = best_model.predict(X_validation_selected_features)
y_validation_proba = best_model.predict_proba(X_validation_selected_features)[:, 1]

# Aggregate (support-weighted) metrics
f1_validation = f1_score(y_validation, y_validation_pred, average='weighted')
roc_auc_validation = roc_auc_score(y_validation, y_validation_proba)
precision_validation = precision_score(y_validation, y_validation_pred, average='weighted')
recall_validation = recall_score(y_validation, y_validation_pred, average='weighted')

print(f"Validation Metrics:\nF1 (Weighted): {f1_validation:.2f}, ROC AUC: {roc_auc_validation:.2f}, Precision (Weighted): {precision_validation:.2f}, Recall (Weighted): {recall_validation:.2f}")

# Per-class metrics: dictionary form for programmatic access ...
report = classification_report(y_validation, y_validation_pred, output_dict=True)

# ... and the formatted text table for the reader.
print(classification_report(y_validation, y_validation_pred))

# Custom one-line summary per class; the aggregate rows of the report are skipped.
aggregate_rows = ('accuracy', 'macro avg', 'weighted avg')
for label, class_metrics in report.items():
    if label in aggregate_rows:
        continue
    print(f"Class {label} - Precision: {class_metrics['precision']:.2f}, Recall: {class_metrics['recall']:.2f}, F1-score: {class_metrics['f1-score']:.2f}")

# Persist the tuned model through the project helper.
model_filename = "best_waittime_classification_model_tuned.joblib"
config.save_model(best_model, model_filename)

# Confirm with the file name; printing the estimator itself only shows an
# uninformative "<... object at 0x...>" repr.
print(f"{model_filename} saved")



Starting hyperparameter tuning...
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best hyperparameters: {'depth': 8, 'iterations': 100, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Validation Metrics:
F1 (Weighted): 0.72, ROC AUC: 0.81, Precision (Weighted): 0.73, Recall (Weighted): 0.72
              precision    recall  f1-score   support

         0.0       0.81      0.72      0.76      1776
         1.0       0.62      0.72      0.66      1103

    accuracy                           0.72      2879
   macro avg       0.71      0.72      0.71      2879
weighted avg       0.73      0.72      0.72      2879

Class 0.0 - Precision: 0.81, Recall: 0.72, F1-score: 0.76
Class 1.0 - Precision: 0.62, Recall: 0.72, F1-score: 0.66
<catboost.core.CatBoostClassifier object at 0x179c5b050> saved


In [3]:

# Stratified, shuffled 5-fold splitter seeded with the project seed.
# NOTE(review): `cv` is not used in this cell; kept in case later cells rely on it.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Acceptance band for the per-hospital metrics (both bounds exclusive).
# The upper bound excludes perfect scores of exactly 1.00.
METRIC_LOWER_BOUND = 0.70
METRIC_UPPER_BOUND = 1.00


def _all_within_band(values, lower=METRIC_LOWER_BOUND, upper=METRIC_UPPER_BOUND):
    """Return True when every value lies strictly between `lower` and `upper`."""
    return all(lower < value < upper for value in values)


# Fit a class-balanced CatBoost model on the selected training features.
# NOTE(review): this rebinds `best_model` to a classifier with DEFAULT
# hyperparameters, replacing the tuned estimator produced by the grid search
# cell — confirm this is intended, or pass the tuned parameters here.
best_model = CatBoostClassifier(random_state=RANDOM_SEED, verbose=0, auto_class_weights='Balanced')
best_model.fit(X_train_selected_features, y_train)

# Probability of the second class column for every test row.
y_test_pred_proba = best_model.predict_proba(X_test_selected_features)[:, 1]

# Decision threshold; adjust to trade precision against recall.
threshold = 0.5
y_test_pred = (y_test_pred_proba >= threshold).astype(int)

# Flatten the labels to a 1D array so they can be used as a DataFrame column.
y_test = np.ravel(y_test)

# Hospital codes of the test rows, normalised to integer-like strings.
HOSPCODEs_test = np.array(test_df['HOSPCODE'].tolist()).astype(int).astype(str)

# One row per test sample: hospital code, true label, predicted label, score.
predictions_df = pd.DataFrame({
    'HOSPCODE': HOSPCODEs_test,
    'y_true': y_test,
    'y_pred': y_test_pred,
    'y_proba': y_test_pred_proba
})

# Collect the hospitals whose per-class precision, recall and F1 all fall
# inside the acceptance band.
# NOTE(review): groups can be very small (the recorded output includes
# hospitals with 8 and 12 test rows), so these per-hospital metrics are noisy.
valid_hospcode_list = []

for code, group in predictions_df.groupby('HOSPCODE'):
    # Per-class arrays (average=None). The results are bound to *_per_class
    # names on purpose: unpacking into `f1_score` would overwrite the imported
    # sklearn.metrics.f1_score function and break any cell that calls it
    # afterwards (e.g. re-running the tuning cell).
    precision_per_class, recall_per_class, f1_per_class, _ = precision_recall_fscore_support(
        group['y_true'], group['y_pred'], average=None, zero_division=0
    )

    if not (
        _all_within_band(precision_per_class)
        and _all_within_band(recall_per_class)
        and _all_within_band(f1_per_class)
    ):
        continue

    valid_hospcode_list.append(code)
    print(f"HOSPCODE: {code} - All metrics met the criteria (>{METRIC_LOWER_BOUND:.2f} and <{METRIC_UPPER_BOUND:.2f}).")
    print(f"Classification Report for {code}:")
    print(classification_report(group['y_true'], group['y_pred'], zero_division=0))
    print("\n")

# Output valid HOSPCODEs
print("List of HOSPCODEs meeting the criteria:", valid_hospcode_list)


HOSPCODE: 120 - All metrics met the criteria (>0.70 and <1.00).
Classification Report for 120:
              precision    recall  f1-score   support

         0.0       0.75      0.75      0.75         4
         1.0       0.75      0.75      0.75         4

    accuracy                           0.75         8
   macro avg       0.75      0.75      0.75         8
weighted avg       0.75      0.75      0.75         8



HOSPCODE: 139 - All metrics met the criteria (>0.70 and <1.00).
Classification Report for 139:
              precision    recall  f1-score   support

         0.0       0.86      0.86      0.86         7
         1.0       0.80      0.80      0.80         5

    accuracy                           0.83        12
   macro avg       0.83      0.83      0.83        12
weighted avg       0.83      0.83      0.83        12



HOSPCODE: 162 - All metrics met the criteria (>0.70 and <1.00).
Classification Report for 162:
              precision    recall  f1-score   support

  