# Implementation of Gaussian Naive Bayes classifier using scikit-learn

# Example 1: Gaussian Naive Bayes with Synthetic Dataset

In [None]:
#Classifier Building in Scikit-learn

#In the first example, we will generate synthetic data using scikit-learn and train and evaluate 
#the Gaussian Naive Bayes algorithm. 



In [None]:
#Generating the Dataset
#Scikit-learn provides us with a machine learning ecosystem so that you can generate the dataset and 
#evaluate various machine learning algorithms. 

#In this example we create a dataset with six features, three classes, and 800 samples (observations) 
#using the `make_classification` function. 

In [1]:
from sklearn.datasets import make_classification

X, y = make_classification(
    n_features=6,
    n_classes=3,
    n_samples=800,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=1,
)

In [2]:
y

array([1, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 1, 2, 0, 0,
       0, 1, 1, 2, 1, 0, 0, 0, 2, 2, 2, 2, 1, 0, 2, 2, 0, 1, 2, 0, 0, 0,
       2, 1, 2, 1, 1, 0, 2, 2, 2, 1, 2, 2, 1, 2, 0, 0, 1, 2, 1, 2, 0, 2,
       2, 2, 0, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 0, 1, 2, 1,
       2, 0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 1, 2, 2, 1, 0, 0, 1, 2, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 2, 0, 1, 2, 1, 2, 0, 2, 1, 0, 0, 1, 2, 2, 0, 2,
       1, 0, 2, 1, 1, 1, 0, 0, 2, 2, 1, 2, 2, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 2, 0, 1, 0, 2, 1, 0, 0, 0, 1, 1, 2, 2, 1, 0, 2, 1, 0, 2, 2, 0,
       2, 2, 1, 0, 1, 1, 1, 0, 2, 0, 2, 0, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 2, 2, 1, 2, 0, 1, 0, 2, 2, 0, 1, 2, 2, 0, 1,
       0, 1, 1, 2, 2, 0, 1, 1, 0, 2, 1, 0, 1, 2, 1, 1, 1, 1, 0, 0, 2, 2,
       1, 2, 2, 2, 0, 2, 1, 1, 0, 1, 2, 1, 0, 0, 1, 2, 0, 2, 1, 1, 1, 1,
       1, 2, 1, 0, 2, 1, 1, 1, 2, 0, 0, 2, 2, 0, 0, 1, 1, 2, 0, 0, 1, 0,
       0, 0, 0, 2, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2, 1,

In [None]:
# Test Split
#Before we start the training process, we need to split the dataset into training and testing for model evaluation. 

In [3]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=125)


In [None]:
#Model Building and Training 
#Build a generic Gaussian Naive Bayes and train it on a training dataset. 
#After that, feed a random test sample to the model to get a predicted value.

In [8]:
from sklearn.naive_bayes import GaussianNB

# Build a Gaussian Classifier
model = GaussianNB()

# Model training
model.fit(X_train, y_train)

# Predict Output
predicted = model.predict([X_test[6]])

print("Actual Value:", y_test[6])
print("Predicted Value:", predicted[0])

Actual Value: 0
Predicted Value: 0


In [None]:
#Both actual and predicted values are the same. 

In [5]:
#Model Evaluation

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

y_pred = model.predict(X_test)
accuray = accuracy_score(y_pred, y_test)
f1 = f1_score(y_pred, y_test, average="weighted")

print("Accuracy:", accuray)
print("F1 Score:", f1)

Accuracy: 0.8484848484848485
F1 Score: 0.8491119695890328


In [None]:
#Our model performs fairly well with default hyperparameters. 

In [None]:
#To visualize the Confusion matrix, we will use `confusion_matrix` to calculate the 
#true positives and true negatives and `ConfusionMatrixDisplay` to display the confusion matrix with the labels.

In [None]:
labels = [0,1,2]
cm = confusion_matrix(y_test, y_pred, labels=labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot();

In [None]:
#Our model has performed quite well, and we can improve model performance by scaling, preprocessing cross-validations, and hyperparameter optimization. 

# Example 2: Naive Bayes Using Iris Dataset

In [9]:
# load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
  

In [10]:
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target

In [11]:
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

In [12]:
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

GaussianNB()

In [13]:
# making predictions on the testing set
y_pred = gnb.predict(X_test)

In [14]:
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)

Gaussian Naive Bayes model accuracy(in %): 95.0


In [None]:
# load the iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
  
# store the feature matrix (X) and response vector (y)
X = iris.data
y = iris.target
  
# splitting X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
  
# training the model on training set
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
  
# making predictions on the testing set
y_pred = gnb.predict(X_test)
  
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test, y_pred)*100)
