In [1]:
import math
import os
import re
from collections import defaultdict

import numpy as np

class BernoulliNB(object):
    """Two-class Bernoulli naive Bayes classifier with Laplace (add-one) smoothing.

    Labels are expected to be 0 / 1. Features are treated as binary: any
    non-zero entry counts as "feature present".
    """

    # p_y[c]         -- prior probability of class c, shape (2,)
    # p_x_y[c][v][j] -- smoothed estimate of P(X_j = v | Y = c), shape (2, 2, n_features)
    # Both stay None until fit() has been called.
    p_y, p_x_y = None, None

    def fit(self, x, y):
        """Estimate class priors and per-feature conditional probabilities.

        Args:
            x: dense 2-D array-like of shape (n_samples, n_features); non-zero
                entries are treated as 1.
            y: array-like of n_samples labels, each 0 or 1.

        Returns:
            self, so that calls can be chained.
        """
        present = np.asarray(x) != 0
        labels = np.asarray(y).ravel()

        # Number of training samples observed for each class.
        class_counts = np.array(
            [(labels == 0.0).sum(), (labels == 1.0).sum()], dtype=np.float64
        )

        # ones[c][j]: samples of class c in which feature j is present.
        ones = np.stack(
            [present[labels == 0.0].sum(axis=0), present[labels == 1.0].sum(axis=0)]
        ).astype(np.float64)
        # zeros[c][j]: samples of class c in which feature j is absent.
        zeros = class_counts[:, None] - ones

        # counts[c][v][j], with v = 0 (absent) / 1 (present).
        counts = np.stack([zeros, ones], axis=1)

        # Laplace smoothing: +1 per (class, value) cell, +2 in the denominator.
        # The denominator must vary along the *class* axis (axis 0), hence the
        # (2, 1, 1) shape; a (2, 1) shape would broadcast along the value axis.
        self.p_x_y = (counts + 1.0) / (class_counts[:, None, None] + 2.0)
        self.p_y = class_counts / class_counts.sum()

        return self

    def predict(self, x):
        """Predict a 0 / 1 label for every row of ``x``.

        Args:
            x: dense 2-D array-like of shape (n_samples, n_features); non-zero
                entries are treated as 1.

        Returns:
            float64 array of shape (n_samples,) holding 1.0 where the log-odds
            of class 1 versus class 0 is >= 0, and 0.0 elsewhere.
        """
        present = np.asarray(x) != 0

        # A class that never appeared in training has prior 0; log2(0) = -inf
        # then simply forces the prediction to the other class.
        with np.errstate(divide='ignore'):
            prior_log_odds = np.log2(self.p_y[1]) - np.log2(self.p_y[0])

        # feature_log_odds[v][j] = log2 P(X_j=v | Y=1) - log2 P(X_j=v | Y=0)
        feature_log_odds = np.log2(self.p_x_y[1]) - np.log2(self.p_x_y[0])

        # Pick the "present" or "absent" contribution per feature and sum per row.
        log_odds = prior_log_odds + np.where(
            present, feature_log_odds[1], feature_log_odds[0]
        ).sum(axis=1)

        return (log_odds >= 0).astype(np.float64)

In [2]:
import logging
from datetime import datetime

import numpy as np

logger = logging.getLogger(__name__)
np.random.seed(2019)  # fixed seed for NumPy's global random generator

# One timestamped log file per run, under ./logs.
log_path = datetime.now().strftime('./logs/%Y-%m-%d-%H-%M-%S.log')
# basicConfig(filename=...) opens the file straight away, so the directory has
# to exist beforehand or the cell fails on a fresh checkout.
os.makedirs(os.path.dirname(log_path), exist_ok=True)
logging.basicConfig(filename=log_path, level=logging.INFO)
# Also send log records to the console (stderr) via the root logger.
logging.getLogger().addHandler(logging.StreamHandler())

In [3]:
import os

import numpy as np

def load_data(dp, encoding=None):
    """Read every file in a directory, ordered by the integer in its file name.

    File names must consist of an integer followed by an extension
    (e.g. ``12.txt``); the integer determines the order of the result.

    Args:
        dp: path of the directory to read, with or without a trailing separator.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        list of str, the full contents of each file in numeric file-name order.

    Raises:
        ValueError: if a file name does not start with an integer stem.
    """
    names = sorted(os.listdir(dp), key=lambda fn: int(os.path.splitext(fn)[0]))
    texts = []
    for fn in names:
        # os.path.join works whether or not dp ends with a path separator.
        with open(os.path.join(dp, fn), 'r', encoding=encoding) as f:
            texts.append(f.read())
    return texts

# Use the builtin `str` as dtype: the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x_tr_pos = np.array(load_data('./dataset/train/pos/'), dtype=str)
x_tr_neg = np.array(load_data('./dataset/train/neg/'), dtype=str)

# Training documents: positives first, then negatives; labels follow the same
# order (1.0 for every positive document, 0.0 for every negative one).
x_tr = np.concatenate((x_tr_pos, x_tr_neg), axis=0)
y_tr = np.concatenate((np.ones_like(x_tr_pos, dtype=np.float64), np.zeros_like(x_tr_neg, dtype=np.float64)), axis=0)
x_ts = np.array(load_data('./dataset/test/'), dtype=str)

# The per-class arrays are no longer needed once concatenated.
del x_tr_pos
del x_tr_neg

In [4]:
import string

from sklearn.feature_extraction.text import CountVectorizer

# A token is either a run of word characters or one punctuation character.
# The punctuation is escaped before being placed in the character class:
# unescaped, the `\]` pair inside string.punctuation is parsed as an escaped
# `]`, so a backslash would never be matched as a token.
token_pattern = r'\w+|[%s]' % re.escape(string.punctuation)
cnt = CountVectorizer(token_pattern=token_pattern,
                      ngram_range=(1, 1),
                      binary=True)
# NOTE: x_tr / x_ts are rebound from raw text to feature matrices here, so this
# cell must not be re-run without re-running the data-loading cell first.
x_tr = cnt.fit_transform(x_tr)
x_ts = cnt.transform(x_ts)

In [7]:
# Train the hand-written classifier on the densified training features, then
# label the first 100 test rows.
cl_bnb = BernoulliNB()
cl_bnb.fit(x_tr.toarray(), y_tr)
cl_bnb_prd = cl_bnb.predict(x_ts[:100].toarray())

In [8]:
from sklearn import naive_bayes

# Reference model: scikit-learn's BernoulliNB fitted on the same x_tr / y_tr,
# predicting the same first 100 test rows as the hand-written classifier.
sk_bnb = naive_bayes.BernoulliNB()
sk_bnb.fit(x_tr, y_tr)
sk_bnb_prd = sk_bnb.predict(x_ts[:100].toarray())

In [9]:
# The hand-written model must agree with scikit-learn on every prediction.
# assert_array_equal also fails on a length mismatch (zip() would silently
# truncate) and reports which entries differ.
np.testing.assert_array_equal(cl_bnb_prd, sk_bnb_prd)