<a href="https://colab.research.google.com/github/jwang44/crispy-fiesta/blob/main/Bernoulli_NB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import numpy as np
import time
import scipy

In [13]:
class Bernoulli_NB():
  def __init__(self, LaplaceSmoothing = True):
    self.LaplaceSmoothing = LaplaceSmoothing
    self.Prob_Y = None      # P(Y)
    self.Prob_X_Y = None     # P(xj|Y)
    self.Prob_X_Y = None     # P(xj|Y)
    self.n_class = 0
    self.w0 = None
    self.w = None
    self.Ytarget = None


  def ProbY(self, Y):
    # calculate P(Y=1) and P(Y=0)
    ProbY = np.zeros((1,2))
    #print('shape',np.shape(Y))
    ProbY[0,1] = np.sum(Y)/np.shape(Y)
    ProbY[0,0] = 1 - ProbY[0,1]
    return ProbY

  def ProbX_Yi(self, X, Y, label):
    # calculte P(xj=1|Yi=1)
    rows,cols = X.shape       # feature shape
    numerator = np.zeros(cols)    # initialize numerator

    # xj=1 and Yi=1
    for n in range(rows):
      if Y[n] == label:
        numerator += X[n,:]
    # Yi=1
    denominator = np.count_nonzero(Y == label)
    # Laplace Smoothing
    if(self.LaplaceSmoothing):
      numerator += 1
      denominator += 2
    # P(xj=1|Yi=1)
    prob = numerator/denominator
    return prob    

  def fit(self, X, Y):
    print('----------------------start fitting---------------------')
    t1 = time.time()

    rows,cols = X.shape              # feature shape
    self.n_class = len(np.unique(Y))       # number of classes
    self.Prob_Y = np.zeros((self.n_class,2))   # initialize P(Y)
    self.Prob_X_Y = np.zeros((self.n_class,2,cols)) # initialize P(x|Y)
    c = np.zeros((self.n_class,cols))       # rows:class cols:xj
    d = np.zeros((self.n_class,cols))       # rows:class cols:xj
    self.w0 = np.zeros((1,self.n_class))     # [w0Y1,w0Y2,...]
    self.w = np.zeros((self.n_class,cols))    # [(w1,w2,...)Y1;
                            # (w1,w2,...)Y2]
    self.Ytarget = np.unique(Y)
    Y_index = 0
    for Yi in self.Ytarget:
      Y_onevsall = np.where(Y == Yi, 1, 0)    # only have 2 classes: Yi(1) & notYi(0)
      self.Prob_Y[Y_index,:] = self.ProbY(Y_onevsall)   # [P(notYi), P(Yi)]
      self.Prob_X_Y[Y_index,0,:] = self.ProbX_Yi(X,Y_onevsall,0)  # [P(x1|notYi), P(x2|notYi),...]
      self.Prob_X_Y[Y_index,1,:] = self.ProbX_Yi(X,Y_onevsall,1)  # [P(x1|Yi), P(x2|Yi),...]
      c[Y_index,:] = np.log10(self.Prob_X_Y[Y_index,1,:]/self.Prob_X_Y[Y_index,0,:])     # log(P(xj|Y=1)/P(xj|Y=0))
      d[Y_index,:] = np.log10((1-self.Prob_X_Y[Y_index,1,:])/(1-self.Prob_X_Y[Y_index,0,:])) # log((1-P(xj|Y=1))/(1-P(xj|Y=0)))
      self.w0[0,Y_index] = np.log10(self.Prob_Y[Y_index,1]/self.Prob_Y[Y_index,0]) + np.sum(d[Y_index,:])
      self.w[Y_index,:] = c[Y_index,:] - d[Y_index,:]
      Y_index += 1

    print('------fit done, total time:',time.time()-t1,' -----')
    # return self.Prob_Y,self.Prob_X_Y

  def predict(self, X):
    print('----------------------start predict---------------------')
    t1 = time.time()

    rows,cols = X.shape       # feature shape
    PreY = np.empty((rows,1),dtype='S') # initialize Y
    print('type',type(PreY))
    LogOddsRatio = np.zeros((1,self.n_class))  # initialize log odds ratio a(x) 
    Logistic = np.zeros((1,self.n_class))    # initialize logistic function
    Y_index = 0
    for obs in range(rows):
      for Yi in self.Ytarget:
        LogOddsRatio[0,Y_index] = self.w0[0,Y_index] + np.sum((self.w[Y_index,:]*X[obs,:])) 
        Logistic[0,Y_index] = 1/(1+np.exp(-LogOddsRatio[0,Y_index]))
        Y_index += 1
      print('np',np.where(Logistic == np.amax(Logistic))[1])
      PreY[obs] = self.Ytarget[np.where(Logistic == np.amax(Logistic))[1]]
    print('------predict done, total time:',time.time()-t1,' -----')
    return PreY

  def Accu_eval(self,PreY,TrueY):
    #y_predict = self.predict(X_test)
    rows,cols = np.shape(PreY)
    TP = 0;FP = 0;TN = 0;FN = 0
    # count TP,TN,FP,FN in validation set
    for obs in range(rows):
      if  TrueY[obs]==1 and PreY[obs]==1:
        TP = TP+1
      elif TrueY[obs]==0 and PreY[obs]==0:
        TN = TN+1
      elif TrueY[obs]==0 and PreY[obs]==1:
        FP = FP+1
      elif TrueY[obs]==1 and PreY[obs]==0:
        FN = FN+1    
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    F = 2*precision*recall/(precision+recall)
    specificity = TN/(FP+TN)
    FPR = FP/(FP+TN)
    print("---------------------accuracy:",accuracy,'-------------------')
    return accuracy


In [14]:
B.predict(Y_train)

----------------------start predict---------------------
np [0]


ValueError: ignored

In [50]:
target = np.array(['apple','banana','orange'])

In [51]:
target[0]

'apple'

In [3]:
from sklearn.datasets import fetch_20newsgroups_vectorized
newsgroups = fetch_20newsgroups_vectorized(subset='test')

In [None]:
newsgroups.target_names

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, train_size=0.8, test_size=0.2,random_state = 0)

In [6]:
B = Bernoulli_NB()
B.fit(X_train,X_test)

----------------------start fitting---------------------
------fit done, total time: 0.00417327880859375  -----


In [4]:
X_test = np.array(['apple','banana','orange'])
X_train = np.array([[0,0,0],
              [0,1,0],
              [1,1,1]])

In [5]:
Y_test = np.array(['banana','orange'])
Y_train = np.array([[0,1,0],
              [1,1,1]])

In [None]:
Y = np.array([0,1,1,1,2])
X = np.array([[0,1],
              [1,0],
              [1,1],
              [1,1],
              [0,0]])
B = Bernoulli_NB()
B.fit(X,Y)
Xtest = np.array([[0,1],
              [1,0],
              [0,0]])
Ytest = np.array([0,1,2])
PreY = B.predict(Xtest)
B.Accu_eval(PreY,Ytest)