In [1]:
import sys
from pathlib import Path
import joblib

# Add project root to the Python path
project_root = Path.cwd().parent.parent
sys.path.append(str(project_root))
import shap

import os
import re
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from scipy import stats
from scipy.sparse import issparse
from sklearn.base import clone

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import ADASYN, SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.feature_selection import (SelectFromModel, SelectKBest,
                                       RFECV, RFE, mutual_info_classif,
                                       mutual_info_regression)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.metrics import (classification_report, roc_auc_score, precision_recall_curve, auc,
                             accuracy_score, confusion_matrix, explained_variance_score,
                             f1_score, mean_absolute_error, mean_squared_error, precision_recall_fscore_support,
                             precision_score, r2_score, recall_score, silhouette_score)
from sklearn.model_selection import (train_test_split, RandomizedSearchCV, GridSearchCV, StratifiedKFold)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (StandardScaler, OneHotEncoder, MinMaxScaler, FunctionTransformer, LabelEncoder, OrdinalEncoder)
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from umap import UMAP
from xgboost import XGBClassifier
from tempfile import gettempdir
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, auc, matthews_corrcoef
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import StratifiedKFold
import numpy as np

from sklearn.model_selection import train_test_split
from src.data_management.data_preprocessing_pipeline import DataPreprocessingPipeline

import scipy.sparse
import config
from config import RANDOM_SEED
config.set_seed()

In [2]:
# Loading dataset
path = "/Users/jeancherizol/MADS/699/all_data"
path_models = "/Users/jeancherizol/MADS/699/SIADS/emergency-dept-optimization/models"
emergency_df = pd.read_sas(f"{path}/nhamcs14.sas7bdat")

target = 'WAITTIME_BINARY'
target_to_drop = ['WAITTIME','LOV_BINARY']

In [3]:
# Instantiate the data preprocessing pipeline
pipeline = DataPreprocessingPipeline(emergency_df=emergency_df,target=target,target_to_drop=target_to_drop, percent_train=0.80,percent_val=0.10,percent_test=0.10,path=path,stratify=True)

# Run the pipeline
pipeline.run()

X_train = pipeline.X_train
X_validation = pipeline.X_validation
X_test = pipeline.X_test

y_train = pipeline.y_train
y_validation = pipeline.y_validation
y_test = pipeline.y_test
print("Length y_test",len(y_test))

X_train_preprocessed = pipeline.X_train_preprocessed
X_validation_preprocessed = pipeline.X_validation_preprocessed
X_test_preprocessed = pipeline.X_test_preprocessed

feature_names = pipeline.feature_names

emergency_df = pipeline.emergency_df 
cleaned_emergency_df = pipeline.cleaned_emergency_df 
transformed_emergency_df = pipeline.transformed_emergency_df

print("Count number of rows per classes of", cleaned_emergency_df[target].value_counts())

preprocessor = pipeline.processor

# Ensure feature_names is treated as a list
feature_names = [name.replace('num__', '').replace('cat__', '') for name in feature_names]
feature_names_list = list(feature_names)

1-Cleaning data...
Data cleaning completed
Size of Initial dataset:(23844, 1012)
Size of cleaned dataset:(23844, 408)

2-Applying feature engineering...
Feature engineering completed
Size of the dataset after feature engineering:(23844, 445)

3-Splitting data...
self.stratify: True
Splitting data completed

4-Loading data...
train_df size: (19075, 445)
X_train size: (19075, 444)
y_train size: (19075,)

validation_df size: (2384, 445)
X_validation size: (2384, 444)
y_validation size: (2384,)

test_df size: (2385, 445)
X_test size: (2385, 444)
y_test size: (2385,)
Loading data completed

5-Preprocessing data...
Preprocessing data completed.
Processor saved successfully
Length y_test 2385
Count number of rows per classes of WAITTIME_BINARY
0    17079
1     6765
Name: count, dtype: int64


In [4]:
# Define the model and parameters for RandomizedSearchCV
models_params = {
     'LGBMClassifier': {
        'model': LGBMClassifier(is_unbalance=True,verbose=-1),
        'params': {
            'classifier__max_depth': [3, 4, 5], 
            'classifier__learning_rate': [0.01, 0.1], 
            'classifier__n_estimators': [100, 200]
        }
    },
     'CatBoostClassifier': {
        'model': CatBoostClassifier(verbose=0),
        'params': {
            'classifier__depth': [3, 4, 5], 
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__iterations': [100, 200],
            'classifier__train_dir': [gettempdir()] 
        }
    },
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'classifier__n_estimators': [100, 200],
            'classifier__max_depth': [None, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4]
        }
    },
    'XGBClassifier': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'classifier__max_depth': [3, 5, 7],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__n_estimators': [100, 500],
            'classifier__subsample': [0.6, 0.8],
            'classifier__colsample_bytree': [0.6, 0.8]
        }
    },
    'Random Forest Classifier': {
        'model': RandomForestClassifier(random_state=RANDOM_SEED),
        'params': {
            'classifier__n_estimators': [100, 200], 
            'classifier__max_depth': [10, 20, None]
        }
    },
    'Gradient Boosting Classifier': {
        'model': GradientBoostingClassifier(random_state=RANDOM_SEED),
        'params': {
            'classifier__n_estimators': [100, 200], 
            'classifier__learning_rate': [0.01, 0.1, 0.2], 
            'classifier__max_depth': [3, 5, 7]
        }
    }
    ,
    'XGBClassifier (Updated)': {
        'model': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
        'params': {
            'classifier__max_depth': [3, 4, 5], 
            'classifier__learning_rate': [0.01, 0.1], 
            'classifier__n_estimators': [100, 200]
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=100),
        'params': {
            'classifier__alpha': [0.0001, 0.001, 0.01], 
            'classifier__hidden_layer_sizes': [(100,), (50, 50), (100, 50)]
        }
    }
 
}


In [5]:
# cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# from sklearn.model_selection import cross_validate

# # Define scoring metrics
# scoring_metrics = {
#     'f1_weighted': 'f1_weighted',
#     'roc_auc': 'roc_auc',
#     'precision': 'precision_weighted',
#     'recall': 'recall_weighted'
# }

# model_scores = {}

# for model_name, mp in models_params.items():
#     print(f"Evaluating model: {model_name}")
#     pipeline = ImbPipeline([
#         ('preprocessor', preprocessor),  # Ensure this is correctly defined
#         ('smote', SMOTE(random_state=RANDOM_SEED)),
#         ('classifier', mp['model'])
#     ])
    
#     # Calculate cross-validated scores for multiple metrics
#     scores = cross_validate(pipeline, X_train, y_train, scoring=scoring_metrics, cv=cv, n_jobs=-1, return_train_score=False)
    
#     # Compute the mean for each metric
#     mean_scores = {metric: np.mean(scores[f'test_{metric}']) for metric in scoring_metrics}
    
#     model_scores[model_name] = mean_scores
    
#     print(f"Model: {model_name} - Scores: {mean_scores}")

# # We prioritize F1, but also consider ROC-AUC, Precision, and Recall as secondary criteria
# best_model_name = max(model_scores, key=lambda k: (
#     model_scores[k]['f1_weighted'],
#     model_scores[k]['roc_auc'],
#     model_scores[k]['precision'],
#     model_scores[k]['recall']
# ))
# print(f"Best model based on comprehensive evaluation: {best_model_name}")


In [6]:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.model_selection import StratifiedKFold, cross_validate
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
import numpy as np

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Define scoring metrics
scoring_metrics = {
    'f1_weighted': 'f1_weighted',
    'roc_auc': 'roc_auc',
    'precision': 'precision_weighted',
    'recall': 'recall_weighted'
}

model_scores = {}


for model_name, mp in models_params.items():
    print(f"Evaluating model: {model_name}")
    
    # Adjust the resampling technique or class weight here based on the model
    if model_name in ['ModelNameSupportingClassWeight']:  # Placeholder for actual model name
        classifier = mp['model']
        classifier.set_params(**{'class_weight': 'balanced'})
        pipeline_steps = [('preprocessor', preprocessor), ('classifier', classifier)]
    else:
        resampling = SMOTE(random_state=RANDOM_SEED)  # Or any other resampling strategy
        pipeline_steps = [('preprocessor', preprocessor), ('resampling', resampling), ('classifier', mp['model'])]
    
    pipeline = ImbPipeline(steps=pipeline_steps)
    
    # Calculate cross-validated scores for multiple metrics
    scores = cross_validate(pipeline, X_train, y_train, scoring=scoring_metrics, cv=cv, n_jobs=-1, return_train_score=False)
    
    # Compute the mean for each metric
    mean_scores = {metric: np.mean(scores[f'test_{metric}']) for metric in scoring_metrics}
    
    model_scores[model_name] = mean_scores
    
    print(f"Model: {model_name} - Scores: {mean_scores}")

# Determine the best model based on F1 score and secondary criteria
best_model_name = max(model_scores, key=lambda k: (
    model_scores[k]['f1_weighted'],
    model_scores[k]['roc_auc'],
    model_scores[k]['precision'],
    model_scores[k]['recall']
))
print(f"Best model based on comprehensive evaluation: {best_model_name}")


Evaluating model: LGBMClassifier
Model: LGBMClassifier - Scores: {'f1_weighted': 0.6578240638545965, 'roc_auc': 0.694243591649756, 'precision': 0.6948898913385138, 'recall': 0.6422018348623852}
Evaluating model: CatBoostClassifier
Model: CatBoostClassifier - Scores: {'f1_weighted': 0.7663315032717346, 'roc_auc': 0.8294752911313983, 'precision': 0.768435948361739, 'recall': 0.7811795543905637}
Evaluating model: RandomForestClassifier
Model: RandomForestClassifier - Scores: {'f1_weighted': 0.6679801716251152, 'roc_auc': 0.7514411836852045, 'precision': 0.7222507952442855, 'recall': 0.7368807339449541}
Evaluating model: XGBClassifier
Model: XGBClassifier - Scores: {'f1_weighted': 0.7627181703045925, 'roc_auc': 0.8186460364550688, 'precision': 0.7617791121751609, 'recall': 0.7747313237221495}
Evaluating model: Random Forest Classifier
Model: Random Forest Classifier - Scores: {'f1_weighted': 0.6668856235393296, 'roc_auc': 0.7487316839163538, 'precision': 0.7232643621166495, 'recall': 0.736