In [1]:
import time

import numpy as np 
import pandas as pd

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [2]:
# Load the df_working pickle object for use in a Naive Bayes classifier
df = pd.read_pickle("df_one_hot.pkl")

In [29]:
# Instantiate the model and store in a variable 
bnb = BernoulliNB()

# Define the model features 
X = df.iloc[:,20:]

# Define the target variable 
y = df["income"]

# Set a timer to see how long the model takes to run
start = time.time()

# Run the model and score
cross_val = cross_val_score(bnb,X,y,cv=5)

# Determine the amount of time it takes the run the model and score
runtime = f"{(time.time() - start):0.2f}"

# Print the cross validation results
print(cross_val)

# Print the run time
print(f"\nModel run time: {runtime} secs.")

[0.79287579 0.79299754 0.78716216 0.79929361 0.79944717]

Model run time: 1.27 secs.


In [37]:
# Naive Bayes train and test
def nb_train_and_test(X,y,splits):
    # Instantiate the model and store in a variable 
    bnb = BernoulliNB()

    # Set the number of folds (training and test samples)
    kf = KFold(n_splits=splits)

    # Create a parent list to hold test and train scores
    score_lists = list()

    # Train and test using KFolds
    for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
        # Create the train samples for the fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]

        # Create the test samples for the fold
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Locate the appropriate rows in the target series
        train_ref = y.iloc[train_index]

        # Set a timer to see how long fitting took
        start = time.time()
        
        # Train the model using the train and test data
        y_pred_train = bnb.fit(X_train,y_train).predict(X_train)

        # Check to see how long the fitting took
        fit_time = f"{(time.time() - start):0.2f}"

        # Convert the confusion matrix to a list for reporting output
        scores = list(metrics.confusion_matrix(y_train, y_pred_train).ravel()) 

        # Add the fold iteration to the scores list
        scores.insert(0,i)

        # Score the folds accuracy
        accuracy = metrics.accuracy_score(y_train, y_pred_train)

        # Append the accuracy to the scores list
        scores.append(accuracy)

        # Append the fit time to the scores list
        scores.append(fit_time)

        # Append the scores to the score_lists parent
        score_lists.append(scores)

        # Precision, recall and f1-score
        precision = pd.DataFrame.from_dict(
            metrics.classification_report(y_train, y_pred_train,output_dict=True)
        )

    # Create a dataframe of training and test results
    df_out = pd.DataFrame(score_lists, columns=["fold", "true_negative", "false_positive", "false_negative", "true_positive", "accuracy", "fit_time (secs.)"])

    # List of output objects
    resluts_output = [df_out,precision]

    return resluts_output

# Model features
X = df.iloc[:, 20:]

# Model target
y = df["income"]

# Define the number groups the data will be split into
splits = 5

# Get the report dataframes from the function return
nb_results, nb_precision = nb_train_and_test(X,y,splits)

# Show the results dataframe
display(nb_results)

# Show the second results dataframe
display(nb_precision)

Unnamed: 0,fold,true_negative,false_positive,false_negative,true_positive,accuracy,fit_time (secs.)
0,1,16012,3767,1529,4740,0.796683,0.22
1,2,15937,3813,1503,4796,0.795923,0.21
2,3,15977,3780,1503,4789,0.79719,0.2
3,4,15940,3846,1525,4738,0.793812,0.22
4,5,16012,3796,1527,4714,0.795654,0.21


Unnamed: 0,<=50K,>50K,accuracy,macro avg,weighted avg
precision,0.912937,0.553937,0.795654,0.733437,0.826925
recall,0.80836,0.755328,0.795654,0.781844,0.795654
f1-score,0.857472,0.639143,0.795654,0.748307,0.805163
support,19808.0,6241.0,0.795654,26049.0,26049.0
