In [20]:
# Import necessary libraries
import pandas as pd  # For data manipulation and analysis
from sklearn.naive_bayes import MultinomialNB, GaussianNB  # Import Naive Bayes algorithms for multinomial and Gaussian models
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.metrics import accuracy_score, confusion_matrix  # For evaluating model performance

In [21]:
# Load and preprocess the dataset
bank_df = pd.read_csv('bank.csv', delimiter=';')  # Load the dataset from a CSV file, specifying the delimiter as ';'
# The delimiter ';' refers to the character used to separate values in the dataset file. When reading the CSV file, specifying delimiter=';'
# tells the pandas read_csv function to recognize the semicolon ';' as the boundary between columns in each row of data. This is essential for
# correctly parsing the file into a DataFrame where each column represents a distinct variable, especially in cases where the default delimiter
# (usually a comma ',') is not used.

In [22]:
# Convert all categorical variables to numerical format for model compatibility
for col in bank_df.columns:
    labels, uniques = pd.factorize(bank_df[col])  # Factorize transforms categorical variables into a numerical format
    bank_df[col] = labels  # Replace the original column values with their numerical encodings

In [23]:
# Prepare the dataset for training/testing
y = bank_df['y']  # Define the target variable (y) which we aim to predict
X = bank_df.drop(columns='y')  # Define the feature variables (X) used for prediction, excluding the target column

In [24]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)  # Split data, allocating 70% for training and 30% for testing

In [25]:
# Model training and evaluation using Multinomial Naive Bayes
nb_multi = MultinomialNB()  # Initialize the Multinomial Naive Bayes classifier
nb_multi.fit(X, y)  # Train the classifier using the entire dataset (X, y)

In [26]:
# Model training and evaluation using Gaussian Naive Bayes
nb_gauss = GaussianNB()  # Initialize the Gaussian Naive Bayes classifier
nb_gauss.fit(X, y)  # Train the classifier using the entire dataset (X, y)

In [27]:
# Predictions and performance evaluation for Multinomial Naive Bayes
multi_preds = nb_multi.predict(X_test)  # Predict the target variable for the test set using Multinomial Naive Bayes
print("Results for multinomial distribution assumption:")  # Print results header for Multinomial Naive Bayes
print(accuracy_score(y_test, multi_preds))  # Calculate and print the accuracy of the Multinomial Naive Bayes model
print(confusion_matrix(y_test, multi_preds))  # Calculate and print the confusion matrix to evaluate model performance

Results for multinomial distribution assumption:
0.5386882829771555
[[620 573]
 [ 53 111]]


In [28]:
# Predictions and performance evaluation for Gaussian Naive Bayes
gauss_preds = nb_gauss.predict(X_test)  # Predict the target variable for the test set using Gaussian Naive Bayes
print("Results for Gaussian distribution assumption:")  # Print results header for Gaussian Naive Bayes
print(accuracy_score(y_test, gauss_preds))  # Calculate and print the accuracy of the Gaussian Naive Bayes model
print(confusion_matrix(y_test, gauss_preds))  # Calculate and print the confusion matrix to evaluate model performance

Results for Gaussian distribution assumption:
0.8268238761974944
[[1066  127]
 [ 108   56]]


In [29]:
# This code snippet demonstrates how to implement and evaluate two types of Naive Bayes classifiers (Multinomial and Gaussian) on the bank dataset.
# It includes data preprocessing (converting categorical variables to numerical), model training, predictions, and evaluation (using accuracy and
# confusion matrix). The choice between Multinomial and Gaussian Naive Bayes depends on the distribution of input features; Multinomial is used for
# discrete data, and Gaussian for data that follows a normal distribution.