# Final Model Selection - Naïve Bayes

## Setup

In [1]:
#import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, recall_score, roc_auc_score, make_scorer, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [2]:
df = pd.read_csv("../data/df_final.csv")

## Randomized Search Cross-Validation

I will now use `RandomizedSearchCV` to find the best hyperparameters for my Naïve Bayes model. I will use `StratifiedKFold` for cross-validation and `SMOTE` for oversampling, to account for the imbalance in the dataset.

In [3]:
#get features and target
X = df.drop("is_child", axis=1)
y = df["is_child"]

In [4]:
#make pipeline
pipe = Pipeline([
    ("smote", SMOTE(random_state=42)),
    ("clf", GaussianNB())
])

#set hyperparameters
params = {
        "clf__var_smoothing": np.logspace(-11, -7, 10), 
        "smote__k_neighbors": [3, 5, 7],
        "smote__sampling_strategy": [0.6, 0.7, 0.8, 0.9, "auto"],
         }

#instantiate folds for cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#set scoring metrics
scoring = {
    "recall": make_scorer(recall_score),
    "f1": make_scorer(f1_score),
    "roc_auc": make_scorer(roc_auc_score)
}

#instantiate cross-validation
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=params,
    n_iter=25,
    cv=cv,
    scoring=scoring,
    n_jobs=-1,
    random_state=42,
    refit="recall",
    verbose=2
)

In [5]:
#split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [6]:
#fit model
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV] END clf__var_smoothing=2.7825594022071258e-11, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time= 2.0min
[CV] END clf__var_smoothing=2.7825594022071258e-11, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time= 2.0min
[CV] END clf__var_smoothing=5.994842503189421e-10, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time= 2.1min
[CV] END clf__var_smoothing=5.994842503189421e-10, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time= 2.1min
[CV] END clf__var_smoothing=5.994842503189421e-10, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time= 2.2min
[CV] END clf__var_smoothing=2.7825594022071258e-11, smote__k_neighbors=3, smote__sampling_strategy=0.9; total time= 2.2min
[CV] END clf__var_smoothing=5.994842503189421e-10, smote__k_neighbors=7, smote__sampling_strategy=0.9; total time= 2.4min
[CV] END clf__var_smoothing=5.994842503189421e-10, smote__k_neighbors=7, smote__s

In [7]:
#get predictions
y_pred = random_search.predict(X_test)

#find best parameters from cross-validation
print("Best parameters:", random_search.best_params_)
print("Best recall score:", random_search.best_score_)

Best parameters: {'smote__sampling_strategy': 'auto', 'smote__k_neighbors': 3, 'clf__var_smoothing': np.float64(5.994842503189421e-10)}
Best recall score: 0.8163688592433905


In [8]:
#classification report and ROC AUC
print(classification_report(y_test, y_pred))
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.2f}")

              precision    recall  f1-score   support

           0       0.98      0.63      0.77    463402
           1       0.11      0.82      0.19     25674

    accuracy                           0.64    489076
   macro avg       0.55      0.72      0.48    489076
weighted avg       0.94      0.64      0.74    489076

ROC AUC: 0.72
