In [2]:
# import required libraries

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import silhouette_score
from collections import Counter

In [3]:
# Load the digits dataset
digits = load_digits()
X = digits.data
y = digits.target
print(X.shape, y.shape)

(1797, 64) (1797,)


In [10]:
# normalize the data, so values are between 0 and 1
X = X / 16.0

# setting a random seed so we get same results everytime
np.random.seed(42)

# list of indices for all samples
indices = np.arange(len(X))

# shuffle indices randomly
np.random.shuffle(indices)

# separate the 80% of dataset into training
split = int(0.8 * len(X))

# first 80% of the sample used for training
train_indices = indices[:split]
# remaining 20% of the samples for testing
test_indices = indices[split:]

# create training and testing datasets
X_train = X[train_indices]
y_train = y[train_indices]
X_test = X[test_indices]
y_test = y[test_indices]


print(f"Number of samples in train data {train_indices.shape}")
print(f"Number of samples in test data {test_indices.shape}\n")
print(f"Shape of training data {X_train.shape}")
print(f"Shape of testing data {X_test.shape}")

Number of samples in train data (1437,)
Number of samples in test data (360,)

Shape of training data (1437, 64)
Shape of testing data (360, 64)


In [12]:
import numpy as np

classes = np.unique(y_train)

means = {}
variances = {}
priors = {}

for c in classes:
    X_c = X_train[y_train == c]
    means[c] = X_c.mean(axis=0)
    variances[c] = X_c.var(axis=0)
    priors[c] = len(X_c) / len(X_train)
def gaussian_pdf(x, mean, var):
    eps = 1e-9  # to avoid division by zero
    coeff = 1.0 / np.sqrt(2.0 * np.pi * var + eps)
    exponent = np.exp(-((x - mean)**2) / (2 * var + eps))
    return coeff * exponent
def predict_one(x):
    posteriors = []
    for c in classes:
        # log prior
        log_prior = np.log(priors[c])
        # log likelihoods
        log_likelihood = np.sum(np.log(gaussian_pdf(x, means[c], variances[c])))
        # posterior
        posterior = log_prior + log_likelihood
        posteriors.append(posterior)
    return classes[np.argmax(posteriors)]

# Predict for all test samples
y_pred = np.array([predict_one(x) for x in X_test])
accuracy = np.mean(y_pred == y_test)
print(f"Naive Bayes Test Accuracy: {accuracy * 100:.2f}%")


Naive Bayes Test Accuracy: 84.17%


  log_likelihood = np.sum(np.log(gaussian_pdf(x, means[c], variances[c])))


In [11]:
import numpy as np

class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)


    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx, x)))
            posterior = posterior + prior
            posteriors.append(posterior)

        # return class with the highest posterior
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

# create the model
nb = NaiveBayes()
# train the model using training data
nb.fit(X_train, y_train)
# predict labels for test data
predictions = nb.predict(X_test)

# compute and print accuracy
accuracy = np.sum(predictions == y_test) / len(y_test) * 100
print(f"Accuracy of test data: {accuracy:.2f} %")


Accuracy of test data: 12.22 %


  numerator = np.exp(-((x - mean) ** 2) / (2 * var))
  numerator = np.exp(-((x - mean) ** 2) / (2 * var))
  return numerator / denominator
  posterior = np.sum(np.log(self._pdf(idx, x)))
