In [2]:
import numpy as np 
import matplotlib.pyplot as plt
from datetime import datetime 
from scipy.stats import norm 
from scipy.stats import multivariate_normal as mvn


In [4]:
from util import get_data

# Naive Bayes 
This example uses a Gaussian (Normal) distribution to model the likelihood.  
For each target class, the mean and variance of every pixel (i.e., every dimension of the multi-dimensional feature vector) are computed, and these define the likelihood.  
During prediction, the posterior is computed from the likelihood and the prior (the relative frequency of each class in the training data).

> The tricky part of the code is the use of `multivariate_normal.logpdf(X, mean=mean, cov=var)`.  
> A multivariate normal computes the joint probability density of all features.  
> The covariance passed in is a 1D vector, which is interpreted as the diagonal of the covariance matrix (i.e., the features are treated as independent).  
> The joint PDF is therefore the product of the per-feature PDFs, i.e., the joint log-PDF is the sum of the per-feature log-PDFs.  
> This independence assumption is the essential part of the **Naive-ness**.  

In [14]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    For every class, each feature is modelled by an independent Gaussian
    (per-feature mean and variance). Prediction picks the class with the
    largest log-likelihood plus log-prior.

    Attributes set by ``fit``:
        classes_:  sorted 1-D array of the unique labels seen in ``Y``.
        gaussians: dict mapping label -> {'mean': (D,), 'var': (D,)}.
        priors:    dict mapping label -> fraction of training rows with that label.
    """

    def fit(self, X, Y, smoothing=1e-2):
        """Estimate per-class feature means/variances and class priors.

        Args:
            X: array-like of shape (N, D), one row per training sample.
            Y: array-like of shape (N,), the label of each row of ``X``.
            smoothing: constant added to every variance so that features
                with zero variance do not produce a singular covariance.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)

        # Sorted unique labels; their position defines the column order used
        # in predict(), so labels need not be the integers 0..K-1.
        self.classes_ = np.unique(Y)
        self.gaussians = dict()
        self.priors = dict()
        for c in self.classes_:
            mask = (Y == c)
            current_x = X[mask]  # training rows labelled 'c'

            # Mean and (smoothed) variance of each feature over those rows
            # (axis=0); both are 1-D vectors of length D.
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(mask.sum()) / len(Y)

    def score(self, X, Y):
        """Return the fraction of rows of ``X`` whose prediction equals ``Y``."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        Args:
            X: array-like of shape (N, D).

        Returns:
            1-D array of length N holding labels taken from ``classes_``.
        """
        X = np.asarray(X)
        N, _ = X.shape
        K = len(self.classes_)  # the number of target labels
        P = np.zeros((N, K))    # unnormalised log-posterior of each class per row
        for k, c in enumerate(self.classes_):
            g = self.gaussians[c]
            # A 1-D `cov` is treated by scipy as the diagonal of the covariance
            # matrix, so logpdf is the sum of the per-feature Gaussian log-pdfs.
            P[:, k] = mvn.logpdf(X, mean=g['mean'], cov=g['var']) + np.log(self.priors[c])
        # Map the winning column index back to the actual class label.
        return self.classes_[np.argmax(P, axis=1)]


In [10]:
# Load 10000 samples, then cut the arrays in half by position:
# the first half is used for training, the second half for testing.
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]


Reading in and transforming data...


In [15]:
model = NaiveBayes()

# Time the fit on the training half.
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
print("Training Time", (datetime.now() - t0))

# Evaluate on each split in turn, reporting accuracy and wall-clock time.
scores = {}
for split_name, features, targets in (("Train", Xtrain, Ytrain), ("Test", Xtest, Ytest)):
    t0 = datetime.now()
    scores[split_name] = model.score(features, targets)
    print(split_name + " Accuracy: ", scores[split_name])
    print("Time to compute " + split_name.lower() + " accuracy", (datetime.now() - t0),
          split_name + " size:", len(targets))

# Keep the per-split results under their original names.
train_score, test_score = scores["Train"], scores["Test"]

Training Time 0:00:00.068000
Train Accuracy:  0.8064
Time to compute train accuracy 0:00:01.413000 Train size: 5000
Test Accuracy:  0.798
Time to compute test accuracy 0:00:01.329000 Test size: 5000
