# Machine Learning: Naive Bayes

## 0 Imports

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn import linear_model
from sklearn.metrics import confusion_matrix, recall_score, matthews_corrcoef
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [32]:
data_change = pd.read_csv("../dat/dips/DIPS_Data_cleaned_change.csv", sep = ",", low_memory = False)

## 1 Naive Bayes Model

In [37]:
# Predictors
X = data_change
X = X.drop(["hpi"], axis = 1)

# Target
y = data_change["hpi"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

print("Naive Bayes \n",
      "X_train shape: ", X_train.shape, round(X_train.shape[0]/len(X), 2), "\n",
      "X_test shape: ", X_test.shape, round(X_test.shape[0]/len(X), 2),  "\n",
      "y_train shape: ", y_train.shape, round(y_train.shape[0]/len(y), 2), "\n",
      "y_test shape: ", y_test.shape, round(y_test.shape[0]/len(y), 2), "\n")

Naive Bayes 
 X_train shape:  (983, 22) 0.8 
 X_test shape:  (246, 22) 0.2 
 y_train shape:  (983,) 0.8 
 y_test shape:  (246,) 0.2 



In [34]:
def sklearn_naive_bayes(x_test_nb, x_train_nb, y_train_nb, y_test_nb):

    # 1: GRID SEARCH
    gnb_model = GaussianNB()
    param_grid = {
        'var_smoothing': np.logspace(0, -10, 50)
    }

    grid_search = GridSearchCV(
        estimator = gnb_model,
        param_grid = param_grid,
        cv = 10
    )
    grid_search.fit(x_train_nb, y_train_nb)
    best_model = grid_search.best_estimator_
    print("Best parameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    # 2: FITTING THE MODEL
    model = GaussianNB(
        var_smoothing = best_model.var_smoothing
    )
    model.fit(x_train_nb, y_train_nb)

    # 3: ESTIMATING WEIGHTS
    weights_nb = model.theta_
    features_nb = model.feature_names_in_

    # 4: COMPUTE TEST SET PREDICTIONS
    y_pred_nb = model.predict(x_test_nb)
    y_pred_proba_nb = model.predict_proba(x_test_nb)

    # 5: COMPUTE METRICS
    accuracy_nb = model.score(x_test_nb, y_test_nb)
    macro_f1_nb = recall_score(y_test_nb, y_pred_nb, average = "macro")
    micro_f1_nb = recall_score(y_test_nb, y_pred_nb, average = "micro")
    mcc_nb = matthews_corrcoef(y_test_nb, y_pred_nb)

    cm_nb = confusion_matrix(y_test_nb, y_pred_nb)
    precision_nb = cm_nb[1][1] / (cm_nb[1][1] + cm_nb[0][1])
    recall_nb = cm_nb[1][1] / (cm_nb[1][1] + cm_nb[1][0])

    # store metrics in a dictionary
    metrics_nb = {
        "accuracy": round(accuracy_nb, 4),
        "macro_f1": round(macro_f1_nb, 4),
        "micro_f1": round(micro_f1_nb, 4),
        "mcc": round(mcc_nb, 4),
        "precision": round(precision_nb, 4),
        "recall": round(recall_nb, 4),
        "confusion_matrix": cm_nb
    }

    return weights_nb, y_pred_nb, features_nb, metrics_nb

In [35]:
start = time.time()

weights_naive, y_pred_naive, features_naive, metrics_naive = sklearn_naive_bayes(X_test, X_train, y_train, y_test)

end = time.time()
print("Execution time: ", {end - start}, "seconds")
time_nb = end - start
time_nb_df = pd.DataFrame({"time": [time_nb]})
time_nb_df.to_csv("../exp/times_ML/time_nb.csv", sep = ",", index = False)

# save weights_ML and y_pred_ML
# weights_naive = pd.DataFrame([weights_naive], columns = features_naive)
# weights_naive.to_csv("../exp/weights_ML/weights_nb.csv", sep = ",", index = False)

y_pred_naive = pd.DataFrame(y_pred_naive, columns = ["y_pred"])
y_pred_naive.to_csv("../exp/y_pred_ML/y_pred_nb.csv", sep = ",", index = False)

Best parameters:  {'var_smoothing': np.float64(1.0)}
Best score:  0.7242733457019171
Execution time:  {2.729166030883789} seconds


In [36]:
metrics_naive

{'accuracy': 0.7398,
 'macro_f1': 0.5804,
 'micro_f1': 0.7398,
 'mcc': np.float64(0.2392),
 'precision': np.float64(0.6),
 'recall': np.float64(0.2174),
 'confusion_matrix': array([[167,  10],
        [ 54,  15]])}