In [47]:
import numpy as np
from numpy.linalg import eig
from numpy.linalg import inv,pinv
import pandas as pd
import csv
from collections import defaultdict
from functools import partial
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from IPython.display import display
import math
import random
%run common_functions.ipynb

In [105]:
def get_train_test_indices_by_test_percentage(test_percentage, n):
#     possible_indices = random.shuffle(list(range(n))) 
    possible_indices = random.sample(range(n), k=n)
    split_index = n * test_percentage//100 + 1
        
    return possible_indices[:-split_index], possible_indices[-split_index:]


class naive_bayes_classifier:
    def __init__(self):
        self.means, self.std_devs, self.priors = None, None, None
        self.training_size = None
    

    # model a multi-variate Gaussian distribution for each class’ likelihood distribution P(x|Ck)
    def fit(self, x, y):
        dataset = get_data_grouped_by_class(x, y)
        self.training_size = x.shape[0]
        means = {}
        std_devs = {}
        priors = {}
        
        for k, features_in_class_k in dataset.items():
            features_in_class_k = np.asarray(features_in_class_k)
            samples_in_class = features_in_class_k.shape[0]
            
            means[k] = np.mean(features_in_class_k, axis=0)
            std_devs[k] = np.std(features_in_class_k, axis=0)
            priors[k] = samples_in_class / self.training_size
            
        self.means = means
        self.std_devs = std_devs
        self.priors = priors
            
  
    def class_probability(self, x, class_mean, class_std_devs):
#         np.divide(a, b, out=np.zeros_like(a), where=b!=0)
        exponent = np.exp(np.square(x - class_mean) / 2 / np.square(class_std_devs) * -1)
        feature_probabilities = 1 / np.sqrt(2 * np.pi) / class_std_devs * exponent
        broadcasted_std_devs = np.broadcast_to(class_std_devs, (x.shape[0], class_std_devs.shape[0]))
        feature_probabilities[broadcasted_std_devs == 0] = 1
        return np.prod(feature_probabilities, axis=1)
    

    def make_predictions(self, x, y):
        likelihoods = []
        classes = np.asarray(list(self.means.keys()))
        
        class_probabilities = []
        for class_id in classes:  # iterate through all the classes to get probabilities for each class
            probabilities = self.class_probability(x, self.means[class_id], self.std_devs[class_id]) 
            res = self.priors[class_id] * probabilities
            # Compute the posterios P(Ck|x) prob of a class k given a point x
            class_probabilities.append(res)

        class_likelihoods = np.asarray(class_probabilities).T
#         print(class_likelihoods)

        # assign x to the class with the largest posterior probability
        predicted_classes = classes[np.argmax(class_likelihoods, axis=1)]
        error_percentage = 100 - np.mean(predicted_classes == y[:, 0]) * 100
        return error_percentage, predicted_classes

In [108]:
with open('digits.csv', 'r') as csvfile:
    digitDataset = np.asarray(list(csv.reader(csvfile, quoting=csv.QUOTE_NONNUMERIC)))
    
x = digitDataset[:, :-1]
y = digitDataset[:, -1:]
print("x: ", x.shape)
print("y: ", y.shape)
print(np.unique(y))

nb = naive_bayes_classifier()

train_indices, test_indices = get_train_test_indices_by_test_percentage(20, y.shape[0])

train_x = x[train_indices]
test_x = x[test_indices]
train_y = y[train_indices]
test_y = y[test_indices]

nb.fit(train_x, train_y)

split_train_error, split_train_predictions = nb.make_predictions(train_x, train_y)
split_test_error, split_test_predictions = nb.make_predictions(test_x, test_y)
print(split_train_error, split_test_error)

k = 10
k_folded_train_indices, k_folded_test_indices = get_k_fold_indices(k, y.shape[0], shuffle=True)

folds_errors = []

for fold, train_indices in enumerate(k_folded_train_indices):
    print(f'Fold#{fold + 1} of {k}:')
    test_indices = k_folded_test_indices[fold]
    
    train_x = x[train_indices]
    test_x = x[test_indices]
    train_y = y[train_indices]
    test_y = y[test_indices]
    
    nb.fit(train_x, train_y)

    train_error, train_predictions = nb.make_predictions(train_x, train_y)
    test_error, test_predictions = nb.make_predictions(test_x, test_y)
#     print(train_accuracy)
    folds_errors.append([train_error, test_error])

# print(all_projected_test_features)
errors_df = pd.DataFrame(folds_errors, columns = ["Train errors(%)", "Test errors(%)"])
print(errors_df)

x:  (1797, 64)
y:  (1797, 1)
[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
1437
8.907446068197629 10.0
Fold#1 of 10:
Fold#2 of 10:
Fold#3 of 10:
Fold#4 of 10:
Fold#5 of 10:




Fold#6 of 10:
Fold#7 of 10:
Fold#8 of 10:
Fold#9 of 10:
Fold#10 of 10:
   Train errors(%)  Test errors(%)
0         8.905380        9.444444
1         8.967223       10.000000
2         8.658009       12.777778
3         9.029066       11.111111
4         8.348794       10.000000
5         9.585652        8.333333
6         8.843537       14.444444
7         8.961681        8.938547
8         9.456119        5.586592
9         9.147095       11.173184
