## Load data

In [3]:
pos = open('data/prep_pos.txt').read().split('\n')
neg = open('data/prep_neg.txt').read().split('\n')
pos_test = open('data/prep_pos_test.txt').read().split('\n')
neg_test = open('data/prep_neg_test.txt').read().split('\n')
pos_val = open('data/prep_pos_val.txt').read().split('\n')
neg_val = open('data/prep_neg_val.txt').read().split('\n')

## Random

In [4]:
import random
def randomData(pos, neg):
  countPos = 0
  countNeg = 0
  X = []
  Y = []

  while countPos < len(pos) or countNeg < len(neg):
    if countNeg == len(neg):
      X.append(pos[countPos])
      Y.append(1)
      countPos += 1
      continue
    if countPos == len(pos):
      X.append(neg[countNeg])
      countNeg += 1
      Y.append(0)
      continue
    if (random.randint(0, 1) == 0):
      X.append(pos[countPos])
      Y.append(1)
      countPos += 1
    else:
      X.append(neg[countNeg])
      countNeg += 1
      Y.append(0)
      continue
  
  return X, Y

X_train, Y_train = randomData(pos, neg)
X_test, Y_test = randomData(pos_test, neg_test)
X_val, Y_val = randomData(pos_val, neg_val)

In [5]:
# Convert to np.ndarray
import numpy as np
X_train = np.array(X_train)
X_test = np.array(X_test)
X_val = np.array(X_val)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)
Y_val = np.array(Y_val)
print(X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape)

(29999,) (29999,) (10000,) (10000,) (10002,) (10002,)


## Create vocabs

In [6]:
import json
vocabs = {}
def add_to_vocab(X):
  for sample in X:
    words = sample.split(' ')
    for word in words:
      if word not in vocabs.keys():
        vocabs[word] = len(vocabs)

add_to_vocab(X_train)
#print(vocabs)
#print(len(vocabs))
fileVocabs = open("model/vocabs.json", "w", encoding="utf-8")
json.dump(vocabs, fileVocabs, indent=4)
fileVocabs.close()
fileVocabs = open("model/vocabs.json", "r", encoding="utf-8")
vocabs = json.load(fileVocabs)
print(vocabs)
print(len(vocabs))

35977


## Feature extraction

In [7]:
# Change samples to vector
# parameter single: True when X is a single sample else False
def featureExtract(X, single=False):
  if single:
    X_new = np.zeros((len(vocabs), ))
    words = X.split(' ')
    for word in words:
      if word in vocabs.keys():
        X_new[vocabs[word]] += 1
  else:
    X_new = np.zeros((X.shape[0], len(vocabs)))
    for i, sample in enumerate(X):
      words = sample.split(' ')
      for word in words:
        if word in vocabs.keys():
          X_new[i][vocabs[word]] += 1

  words = None
  return X_new

## Naive Bayes

In [8]:
import numpy as np
class multinomialNB:
  def __init__(self, feature_extract_func, alpha=1):
      self.alpha = alpha 
      self.feature_extract_func = feature_extract_func

  def load(self, priors, likelihoods, classes):
      self._priors = priors
      self._likelihoods = likelihoods
      self._classes = classes

  def fit(self, X_train, y_train, n, classes, batch_size):
      m = X_train.shape[0]
      self._classes = classes
      n_classes = len(self._classes)

      # init: Prior & Likelihood
      self._priors = np.zeros(n_classes)
      self._likelihoods = np.zeros((n_classes, n))

      # process by batch
      start_idx = 0
      while start_idx < m:
          # Get Prior and Likelihood
          for idx, c in enumerate(self._classes):
              X_train_c_slice = X_train[c == y_train][start_idx:(start_idx + batch_size)]
              # Feature extraction
              X_train_c = self.feature_extract_func(X_train_c_slice)
              # Calculation
              self._priors[idx] += X_train_c.shape[0]
              self._likelihoods[idx, :] += X_train_c.sum(axis=0)
              # Free memory
              X_train_c = None
              X_train_c_slice = None
          start_idx += batch_size
    
      for idx in range(0, n_classes):
          self._priors[idx] /= m
          self._likelihoods[idx, :] = (self._likelihoods[idx, :] + self.alpha) / (np.sum(self._likelihoods[idx, :]) + self.alpha)

  def predict(self, X_test):
      return [self._predict(x_test) for x_test in X_test]

  def _predict(self, x_test_inp):
      # Feature extraction
      x_test = self.feature_extract_func(x_test_inp, single=True)

      # Calculate posterior for each class
      posteriors = []
      for idx, c in enumerate(self._classes):
          prior_c = np.log(self._priors[idx])
          likelihoods_c = self.calc_likelihood(self._likelihoods[idx,:], x_test)
          posteriors_c = np.sum(likelihoods_c) + prior_c
          posteriors.append(posteriors_c)

          prior_c = None
          likelihoods_c = None

      pred_idx = np.argmax(posteriors)

      x_test = None
      posteriors = None

      return self._classes[pred_idx]

  def calc_likelihood(self, cls_likeli, x_test):
      return np.log(cls_likeli) * x_test

  def score(self, X_test, y_test):
      y_pred = self.predict(X_test)
      return np.sum(y_pred == y_test)/len(y_test)

  def get_info(self):
      return {'priors': self._priors, 'likelihoods': self._likelihoods}

## Trainning

In [9]:
model = multinomialNB(featureExtract)
model.fit(X_train, Y_train, len(vocabs), np.array([0, 1]), X_train.shape[0])

In [10]:
model_score = model.score(X_train, Y_train)
print(model_score)

0.8723624120804027


In [11]:
model_score = model.score(X_val, Y_val)
print(model_score)

0.8527


In [12]:
model_score = model.score(X_test, Y_test)
print(model_score)

0.8514297140571886


In [13]:
model_info = model.get_info()
print(model_info)

{'priors': array([0.49998333, 0.50001667]), 'likelihoods': array([[2.13937961e-03, 1.56971324e-02, 9.73546991e-04, ...,
        8.07922814e-07, 8.07922814e-07, 8.07922814e-07],
       [1.45786258e-03, 1.49389964e-02, 1.22489578e-03, ...,
        1.82005316e-06, 2.73007975e-06, 1.82005316e-06]])}


## Save model

In [14]:
np.savetxt('model/NB_priors.csv', model_info['priors'], delimiter=',')
np.savetxt('model/NB_likelihoods.csv', model_info['likelihoods'], delimiter=',')

In [15]:
# Load model example
priors = np.loadtxt('model/NB_priors.csv', delimiter=',')
likelihoods = np.loadtxt('model/NB_likelihoods.csv', delimiter=',')
print(priors)
print(likelihoods)
model = multinomialNB(featureExtract)
model.load(priors, likelihoods, np.array([0, 1]))

[0.49998333 0.50001667]
[[2.13937961e-03 1.56971324e-02 9.73546991e-04 ... 8.07922814e-07
  8.07922814e-07 8.07922814e-07]
 [1.45786258e-03 1.49389964e-02 1.22489578e-03 ... 1.82005316e-06
  2.73007975e-06 1.82005316e-06]]
