In [5]:
import numpy as np
from numpy.linalg import eig
from numpy.linalg import inv,pinv
import pandas as pd
import csv
from collections import defaultdict
from functools import partial
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from IPython.display import display
import math
%run common_functions.ipynb

In [71]:
class naive_bayes_classifier:
    def __init__(self):
        self.class_means, self.class_stds, self.class_priors = {}, {}, {}
        self.N = None
    

    # model a multi-variate Gaussian distribution for each class’ likelihood distribution P(x|Ck)
    def fit(self, x, y):
        dataset = get_data_grouped_by_class(x, y)
        self.N = y.shape[0]
        
        for class_id, features_k in dataset.items():
            features_k = np.asarray(features_k)
            self.class_means[class_id] = np.mean(features_k, axis=0)
            self.class_stds[class_id] = np.std(features_k, axis=0)
            self.class_priors[class_id] = features_k.shape[0] / self.N
#             print(features_k.shape[0])
            
#         print(self.class_priors)
            
  
    def calculate_probability(self, x, mean, std):
#         np.divide(a, b, out=np.zeros_like(a), where=b!=0)
        exponent = np.exp(-(np.divide(np.square(x - mean) / 2, np.square(std))))
        feature_probabilities = np.multiply(np.divide(1, (np.sqrt(2 * np.pi) * std)), exponent)
        feature_probabilities[std == 0] = 1
        return np.prod(feature_probabilities)
    

    def predict(self, x, y):
        likelihoods = []
        classes = np.asarray(list(self.class_means.keys()))
        for x_i in x:
            row = []
            for class_id in classes:  # iterate through all the classes to get probabilities for each class
                probabilities = self.calculate_probability(x_i, self.class_means[class_id], self.class_stds[class_id]) 
                res = self.class_priors[class_id] * probabilities
                # Compute the posterios P(Ck|x) prob of a class k given a point x
                row.append(res)
            likelihoods.append(row)

        likelihoods = np.asarray(likelihoods)
#         print(likelihoods)

        # assign x to the class with the largest posterior probability
        predictions = classes[np.argmax(likelihoods, axis=1)]
        return np.sum(predictions == y[:, 0]) / len(y), predictions

In [72]:
with open('digits.csv', 'r') as csvfile:
    digitDataset = np.asarray(list(csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)))
    
x = digitDataset[:, :-1]
y = digitDataset[:, -1:]
print("x: ", x.shape)
print("y: ", y.shape)
print(np.unique(y))

nb = naive_bayes_classifier()

train_indices, test_indices = get_train_test_indices_by_train_percentage(80, y.shape[0], shuffle=True)
train_x = x[train_indices]
test_x = x[test_indices]
train_y = y[train_indices]
test_y = y[test_indices]

nb.fit(train_x, train_y)

split_train_accuracy, split_train_predictions = nb.predict(train_x, train_y)
split_test_accuracy, split_test_predictions = nb.predict(test_x, test_y)
split_train_accuracy, split_test_accuracy

x:  (1797, 64)
y:  (1797, 1)
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]




(0.9068150208623088, 0.9108635097493036)