# NB with less data

In [1]:
# https://deeplearningcourses.com/c/data-science-supervised-machine-learning-in-python
# https://www.udemy.com/data-science-supervised-machine-learning-in-python
# This is an example of a Naive Bayes classifier on MNIST data.
from __future__ import print_function, division
from future.utils import iteritems
from builtins import range, input
# Note: you may need to update your version of future
# sudo pip install -U future


import numpy as np
from util import get_data
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

In [2]:
class NaiveBayes(object):
    def fit(self, X, Y, smoothing=1e-2):
        self.gaussians = dict()
        self.priors = dict()
        labels = set(Y)
        for c in labels:
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(len(Y[Y == c])) / len(Y)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        N, D = X.shape
        K = len(self.gaussians)
        P = np.zeros((N, K))
        for c, g in iteritems(self.gaussians):
            mean, var = g['mean'], g['var']
            P[:,c] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])
        return np.argmax(P, axis=1)

In [3]:
from scipy.io import loadmat
mnist_raw = loadmat("./data/mnist-original.mat")
mnist = {
    "data": mnist_raw["data"].T,
    "label": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.labelorg dataset: mnist-original",
}
data=mnist["data"]
label=mnist["label`"]

In [5]:
from numpy import random
random.shuffle(data)
random.shuffle(label)


In [20]:
# %debug
if __name__ == '__main__':
    X, Y = data, np.array(list(map(int,label)))
    Ntrain = len(Y) // 2
    Xtrain, Ytrain = X[:Ntrain], Y[:Ntrain]
    Xtest, Ytest = X[Ntrain:], Y[Ntrain:]

    model = NaiveBayes()
    t0 = datetime.now()
    
#     import ipdb 
#     ipdb.set_trace()
    
    model.fit(Xtrain, Ytrain)
    print("Training time:", (datetime.now() - t0))

    t0 = datetime.now()
    print("Train accuracy:", model.score(Xtrain, Ytrain))
    print("Time to compute train accuracy:", (datetime.now() - t0), "Train size:", len(Ytrain))

    t0 = datetime.now()
    print("Test accuracy:", model.score(Xtest, Ytest))
    print("Time to compute test accuracy:", (datetime.now() - t0), "Test size:", len(Ytest))

None
> [0;32m<ipython-input-20-93b1caab84dc>[0m(14)[0;36m<module>[0;34m()[0m
[0;32m     13 [0;31m[0;34m[0m[0m
[0m[0;32m---> 14 [0;31m    [0mmodel[0m[0;34m.[0m[0mfit[0m[0;34m([0m[0mXtrain[0m[0;34m,[0m [0mYtrain[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m     15 [0;31m    [0mprint[0m[0;34m([0m[0;34m"Training time:"[0m[0;34m,[0m [0;34m([0m[0mdatetime[0m[0;34m.[0m[0mnow[0m[0;34m([0m[0;34m)[0m [0;34m-[0m [0mt0[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0m
[0m
ipdb> c
Training time: 0:00:04.086186
Train accuracy: 0.11208571428571429
Time to compute train accuracy: 0:00:10.583402 Train size: 35000
Test accuracy: 0.10011428571428571
Time to compute test accuracy: 0:00:10.451310 Test size: 35000
