<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/NB_latest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Bernoulli NB model

In [None]:
import time

import numpy as np
import sklearn.preprocessing as skpre
# from sklearn import metrics
# from sklearn.model_selection import GridSearchCV

In [None]:
class Bernoulli_NB():
  """Multi-class Bernoulli naive Bayes classifier trained one-vs-rest.

  For every class a binary "this class vs. everything else" model is
  estimated and rewritten as a linear log-odds function
  ``a(x) = w0 + w . x``.  Prediction picks the class with the largest
  log-odds.

  Attributes set by ``fit``:
    Prob_Y:   (n_class, 2) array, row i is [P(not class i), P(class i)].
    Prob_X_Y: (n_class, 2, n_features) array,
              [i, 0, j] = P(xj=1 | not class i), [i, 1, j] = P(xj=1 | class i).
    n_class:  number of distinct labels seen during fit.
    w0:       (1, n_class) array of intercepts.
    w:        (n_class, n_features) array of feature weights.
    Ytarget:  sorted array of the distinct labels; row i of ``w`` belongs
              to label ``Ytarget[i]``.
  """

  def __init__(self, LaplaceSmoothing = True):
    """Create an unfitted model.

    Args:
      LaplaceSmoothing: when true, add 1 to every feature count and 2 to
        every class count so no estimated probability is exactly 0 or 1.
    """
    self.LaplaceSmoothing = LaplaceSmoothing
    self.Prob_Y = None       # P(Y)
    self.Prob_X_Y = None     # P(xj|Y)
    self.n_class = 0
    self.w0 = None
    self.w = None
    self.Ytarget = None      # distinct labels, filled in by fit()

  def ProbY(self, Y):
    """Return [[P(Y=0), P(Y=1)]] for a 0/1 indicator vector ``Y``.

    Args:
      Y: array-like of 0/1 values.

    Returns:
      A (1, 2) float array holding P(Y=0) and P(Y=1).
    """
    prob = np.zeros((1,2))
    # np.size gives a scalar sample count; dividing by np.shape(Y) (a tuple)
    # would produce a 1-element array instead of a number.
    prob[0,1] = np.sum(Y) / np.size(Y)
    prob[0,0] = 1 - prob[0,1]
    return prob

  def ProbX_Yi(self, X, Y, label):
    """Estimate P(xj=1 | Y=label) for every feature column j.

    Args:
      X: 2-D 0/1 feature matrix, one row per sample.
      Y: label vector with one entry per row of ``X``.
      label: the label value whose rows are counted.

    Returns:
      A (1, n_features) float array of conditional probabilities.  Without
      Laplace smoothing the result contains nan when no row has ``label``.
    """
    cols = X.shape[1]
    selected = np.asarray(Y).ravel() == label

    # Per-feature count of "xj=1 and Y=label" in one vectorised sum.
    numerator = np.asarray(X[selected].sum(axis=0), dtype=np.float64).reshape(1, cols)
    denominator = np.count_nonzero(selected)

    # Laplace Smoothing
    if self.LaplaceSmoothing:
      numerator = numerator + 1
      denominator += 2

    return numerator / denominator

  def fit(self, X, Y):
    """Estimate the per-class probabilities and linear weights.

    Labels may be any values ``np.unique`` can sort (they need not be
    0..n_class-1); they are stored in ``self.Ytarget``.

    Args:
      X: 2-D 0/1 feature matrix of shape (n_samples, n_features).
      Y: label vector with n_samples entries.

    Raises:
      ValueError: if ``X`` and ``Y`` disagree on the number of samples.
    """
    print('---------------------- start fitting ---------------------')
    t1 = time.time()

    Y = np.asarray(Y).ravel()
    rows,cols = X.shape              # feature shape
    if Y.shape[0] != rows:
      raise ValueError('X has %d rows but Y has %d labels' % (rows, Y.shape[0]))

    self.Ytarget = np.unique(Y)             # distinct labels, sorted
    self.n_class = len(self.Ytarget)        # number of classes
    self.Prob_Y = np.zeros((self.n_class,2))         # P(Y)
    self.Prob_X_Y = np.zeros((self.n_class,2,cols))  # P(x|Y)
    self.w0 = np.zeros((1,self.n_class))    # [w0Y1,w0Y2,...]
    self.w = np.zeros((self.n_class,cols))  # [(w1,w2,...)Y1; (w1,w2,...)Y2]

    for Yi, label in enumerate(self.Ytarget):
      # Compare against the actual label value, not its position, so
      # label sets other than 0..n_class-1 are handled correctly.
      Y_onevsall = np.where(Y == label, 1, 0)   # 1: this class, 0: any other
      self.Prob_Y[Yi,:] = self.ProbY(Y_onevsall)               # [P(notYi), P(Yi)]
      self.Prob_X_Y[Yi,0,:] = self.ProbX_Yi(X,Y_onevsall,0)    # P(xj|notYi)
      self.Prob_X_Y[Yi,1,:] = self.ProbX_Yi(X,Y_onevsall,1)    # P(xj|Yi)

      p_in = self.Prob_X_Y[Yi,1,:]
      p_out = self.Prob_X_Y[Yi,0,:]
      log_on = np.log10(p_in/p_out)              # log(P(xj|Y=1)/P(xj|Y=0))
      log_off = np.log10((1-p_in)/(1-p_out))     # log((1-P(xj|Y=1))/(1-P(xj|Y=0)))

      # Log-odds a(x) = w0 + sum_j wj*xj with the "feature off" terms
      # folded into the intercept.
      self.w0[0,Yi] = np.log10(self.Prob_Y[Yi,1]/self.Prob_Y[Yi,0]) + np.sum(log_off)
      self.w[Yi,:] = log_on - log_off

    print('------ fit done, total time: ',time.time()-t1,' -----')

  def predict(self, X):
    """Predict a label for every row of ``X``.

    The class with the largest one-vs-rest log-odds wins.  The logistic
    function is strictly increasing, so ranking by log-odds gives the same
    winner as ranking by logistic output, without saturation-induced ties.
    Exact ties are resolved in favour of the first class in ``Ytarget``.

    Args:
      X: 2-D 0/1 feature matrix of shape (n_samples, n_features).

    Returns:
      1-D array of n_samples predicted labels taken from ``Ytarget``.

    Raises:
      ValueError: if the model is not fitted or the number of feature
        columns differs from the one seen in ``fit``.
    """
    if self.w is None or self.w0 is None:
      raise ValueError('Bernoulli_NB is not fitted yet; call fit() first')

    print('---------------------- start predict ---------------------')
    t1 = time.time()

    rows,cols = np.shape(X)       # feature shape
    if cols != self.w.shape[1]:
      raise ValueError('X has %d features but the model was fitted with %d'
                       % (cols, self.w.shape[1]))

    # (rows, n_class) matrix of log-odds for all samples and classes at once.
    LogOddsRatio = np.asarray(X @ self.w.T, dtype=np.float64) + self.w0
    best = np.argmax(LogOddsRatio, axis=1)
    # Ytarget is only missing if the weights were assigned by hand; fall
    # back to class indices in that case.
    PreY = best if self.Ytarget is None else self.Ytarget[best]

    print('------ predict done, total time: ',time.time()-t1,' -----')
    return PreY

  def score(self,X,Y):
    """Return the mean accuracy of ``predict(X)`` against labels ``Y``.

    Args:
      X: 2-D 0/1 feature matrix of shape (n_samples, n_features).
      Y: true labels, one per row of ``X``.

    Returns:
      Fraction of rows whose predicted label equals the true label.

    Raises:
      ValueError: if ``Y`` does not have one label per row of ``X``.
      ZeroDivisionError: if ``Y`` is empty.
    """
    PreY = self.predict(X)
    Y = np.asarray(Y).ravel()
    rows = Y.shape[0]
    if PreY.shape[0] != rows:
      raise ValueError('X has %d rows but Y has %d labels' % (PreY.shape[0], rows))
    n_correct = int(np.count_nonzero(PreY == Y))
    accuracy = n_correct / rows
    print('------------------ accuracy:',accuracy,' -----------------')
    return accuracy


In [None]:
# Toy data set: five training samples with three binary features each,
# labelled with four distinct classes.
x = np.array([
    [1, 1, 1],
    [1, 1, 0],
    [1, 1, 1],
    [1, 0, 1],
    [0, 0, 1],
])
y = np.array([0, 1, 0, 2, 3])

# Held-out samples and their expected labels.
xtest = np.array([
    [0, 1, 0],
    [1, 1, 0],
    [1, 1, 0],
    [1, 0, 1],
    [0, 0, 1],
])
ytest = np.array([0, 1, 1, 2, 3])

# Train on the toy data, then report accuracy on the held-out samples.
# The score call stays last so the notebook displays its return value.
nb = Bernoulli_NB()
nb.fit(x, y)
nb.score(xtest, ytest)

---------------------- start fitting ---------------------
------ fit done, total time:  0.0012984275817871094  -----
---------------------- start predict ---------------------
------ predict done, total time:  0.00272369384765625  -----
------------------ accuracy: 0.8  -----------------


0.8