In [8]:
import numpy as np 
import pandas as pd
from datetime import datetime
from scipy.stats import norm
from scipy.stats import multivariate_normal as mvn

### 1. Get Data

In [32]:
def get_data(limit=None, path='./mnist_train.csv'):
    """Load a labelled CSV, shuffle its rows and split it into features/labels.

    Column 0 of the CSV is returned as the label vector and every remaining
    column is divided by 255.0 and returned as the feature matrix
    (presumably 0-255 pixel intensities -- confirm against the data file).

    Parameters
    ----------
    limit : int or None
        If given, only the first ``limit`` rows (after shuffling) are returned.
    path : str
        Location of the CSV file. Defaults to the path the notebook has
        always used, so existing calls are unaffected.

    Returns
    -------
    X : ndarray, shape (n_rows, n_columns - 1)
        Feature columns divided by 255.0.
    Y : ndarray, shape (n_rows,)
        Values of the first column.
    """
    df = pd.read_csv(path)
    # DataFrame.as_matrix() was removed in pandas 1.0. to_numpy(copy=True)
    # is the replacement and guarantees a writable array, which the in-place
    # shuffle below requires.
    data = df.to_numpy(copy=True)
    # Shuffles rows in place using NumPy's global RNG; call np.random.seed()
    # beforehand for a reproducible order.
    np.random.shuffle(data)

    X = data[:, 1:] / 255.0
    Y = data[:, 0]

    if limit is not None:
        X, Y = X[:limit], Y[:limit]
    return X, Y

### 2. Naive Bayes

In [63]:
from datetime import datetime

In [101]:
class NaiveBayes(object):
    """Gaussian Naive Bayes: one diagonal-covariance Gaussian per class."""

    def fit(self, X, Y, smoothing=10e-3):
        """Estimate per-class feature means, variances and class priors.

        Parameters
        ----------
        X : array-like, shape (N, D)
            Training features.
        Y : array-like, shape (N,)
            Training labels. Any label values that NumPy can compare and
            sort are accepted (they need not be 0..K-1).
        smoothing : float
            Constant added to every per-feature variance so that features
            with zero variance do not produce a singular Gaussian.
            The default ``10e-3`` equals 0.01.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        self.gaussians = dict()
        self.priors = dict()
        # Sorted unique labels; column k of the score matrix built in
        # predict() corresponds to self.classes[k].
        self.classes = np.unique(Y)

        for c in self.classes:
            # Samples belonging to the current class only.
            current_x = X[Y == c]
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),  # shape (D,)
                # Per-feature variance, shape (D,): features are treated as
                # independent given the class.
                'var': current_x.var(axis=0) + smoothing,
            }
            self.priors[c] = float(len(current_x)) / len(Y)

    def score(self, X, Y):
        """Return the fraction of rows in X whose prediction equals Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for each row of X.

        Parameters
        ----------
        X : ndarray, shape (N, D)

        Returns
        -------
        ndarray, shape (N,)
            Predicted labels, taken from the labels seen in fit().
        """
        N, D = X.shape
        K = len(self.classes)
        P = np.zeros((N, K))

        # Index columns by position rather than by label value, so labels do
        # not have to be the integers 0..K-1.
        for k, c in enumerate(self.classes):
            g = self.gaussians[c]
            mean, var = g['mean'], g['var']

            # Unnormalized log-posterior: log-likelihood under the fitted
            # Gaussian (a 1-D `cov` is interpreted by SciPy as a diagonal
            # covariance) plus the log-prior of the class.
            P[:, k] = mvn.logpdf(X, mean=mean, cov=var) + np.log(self.priors[c])

        # Map the winning column index back to the actual label.
        return self.classes[np.argmax(P, axis=1)]

In [102]:
# Load up to 10,000 shuffled samples and split them in half:
# the first half is used for training, the second half for testing.
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]

In [103]:
# Fit the Naive Bayes model and report wall-clock time for each stage.
model = NaiveBayes()
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
print("Training Time: ", (datetime.now() - t0))

t0 = datetime.now()
print("Train accuracy: ", model.score(Xtrain, Ytrain))
print("Time to compute train accuracy: ", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy: ", model.score(Xtest, Ytest))
# Report the size of the test split (previously printed len(Ytrain) here).
print("Time to compute test accuracy: ", (datetime.now() - t0), "Test size:", len(Ytest))

Training Time:  0:00:00.060050
Train accuracy:  0.8088
Time to compute train accuracy:  0:00:04.334622 Train size: 5000
Test accuracy:  0.7956
Time to compute test accuracy:  0:00:01.069893 Test size: 5000


### 3. Bayes Classifier

In [113]:
class Bayes(object):
    """Gaussian Bayes classifier: one full-covariance Gaussian per class."""

    def fit(self, X, Y, smoothing=10e-3):
        """Estimate per-class means, full covariance matrices and priors.

        Parameters
        ----------
        X : array-like, shape (N, D)
            Training features.
        Y : array-like, shape (N,)
            Training labels. Any label values that NumPy can compare and
            sort are accepted (they need not be 0..K-1).
        smoothing : float
            Multiple of the identity added to every covariance matrix to
            keep it well-conditioned. The default ``10e-3`` equals 0.01.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        D = X.shape[1]
        self.gaussians = dict()
        self.priors = dict()
        # Sorted unique labels; column k of the score matrix built in
        # predict() corresponds to self.classes[k].
        self.classes = np.unique(Y)

        for c in self.classes:
            # Samples belonging to the current class only.
            current_x = X[Y == c]
            if len(current_x) > 1:
                cov = np.cov(current_x.T)
            else:
                # np.cov yields NaN for a single sample; fall back to a zero
                # covariance so that only the smoothing term remains.
                cov = np.zeros((D, D))
            self.gaussians[c] = {
                'mean': current_x.mean(axis=0),  # shape (D,)
                # Full (D, D) covariance: feature correlations are modelled,
                # unlike the diagonal Naive Bayes variant.
                'cov': cov + np.eye(D) * smoothing,
            }
            self.priors[c] = float(len(current_x)) / len(Y)

    def score(self, X, Y):
        """Return the fraction of rows in X whose prediction equals Y."""
        P = self.predict(X)
        return np.mean(P == Y)

    def predict(self, X):
        """Return the most probable class label for each row of X.

        Parameters
        ----------
        X : ndarray, shape (N, D)

        Returns
        -------
        ndarray, shape (N,)
            Predicted labels, taken from the labels seen in fit().
        """
        N, D = X.shape
        K = len(self.classes)
        P = np.zeros((N, K))

        # Index columns by position rather than by label value, so labels do
        # not have to be the integers 0..K-1.
        for k, c in enumerate(self.classes):
            g = self.gaussians[c]
            mean, cov = g['mean'], g['cov']

            # Unnormalized log-posterior: log-likelihood under the fitted
            # Gaussian plus the log-prior of the class.
            P[:, k] = mvn.logpdf(X, mean=mean, cov=cov) + np.log(self.priors[c])

        # Map the winning column index back to the actual label.
        return self.classes[np.argmax(P, axis=1)]

In [114]:
# Draw a fresh shuffled sample of up to 10,000 rows for the full-covariance
# model; the first half is the training set and the remainder the test set.
X, Y = get_data(10000)
Ntrain = len(Y) // 2
Xtrain, Xtest = X[:Ntrain], X[Ntrain:]
Ytrain, Ytest = Y[:Ntrain], Y[Ntrain:]

In [115]:
# Fit the full-covariance Bayes model and report wall-clock time per stage.
model = Bayes()
t0 = datetime.now()
model.fit(Xtrain, Ytrain)
print("Training Time: ", (datetime.now() - t0))

t0 = datetime.now()
print("Train accuracy: ", model.score(Xtrain, Ytrain))
print("Time to compute train accuracy: ", (datetime.now() - t0), "Train size:", len(Ytrain))

t0 = datetime.now()
print("Test accuracy: ", model.score(Xtest, Ytest))
# Report the size of the test split (previously printed len(Ytrain) here).
print("Time to compute test accuracy: ", (datetime.now() - t0), "Test size:", len(Ytest))

Training Time:  0:00:00.152126
Train accuracy:  0.9984
Time to compute train accuracy:  0:00:01.099919 Train size: 5000
Test accuracy:  0.9394
Time to compute test accuracy:  0:00:01.075899 Test size: 5000
