In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import math 
import seaborn as sns 
from tqdm import tqdm 
from sklearn.model_selection import train_test_split

In [2]:
a = np.random.randint(0, 2, 3000)
a = a.reshape(1000,3)
df_naive = pd.DataFrame(a, columns=['x1','x2','y'])
df=df_naive

In [3]:
from sklearn.datasets import make_blobs
from scipy.stats import norm

In [18]:
class NaiveBayes:
    
    def __init__(self, df, discrete = False, laplace_smoothing = False):


        self.df = df
        self.discrete = discrete
        self.laplace_smoothing = laplace_smoothing
        self.df_train, self.df_test = train_test_split(self.df, random_state=0, test_size=0.3)
        self.X_gauss ,self.y_gauss = make_blobs(n_samples = 10000, centers = 2 , n_features = 2, random_state=1)
        self.X_train_gauss, self.X_test_gauss,self.y_train_gauss,  self.y_test_gauss = train_test_split(self.X_gauss, self.y_gauss, random_state=0, test_size=0.3)
        
        
    def fit_dist(self,data):
        mu = np.mean(data)
        sigma = np.std(data)
        dist = norm(mu, sigma)
        return dist
        
    def probability(self,X,prior, dist1,dist2 ):
        return prior * dist1.pdf(X[0]) * dist2.pdf(X[1])
    
    
    def run_model(self):

      self.X_train = self.df_train.values[:,0:2]
      self.y_train = self.df_train.values[:,2]
      self.X_test = self.df_test.values[:,0:2]
      self.y_test = self.df_test.values[:,2]
      self.X0_train = self.df_train[self.y_train==0]
      self.X1_train = self.df_train[self.y_train==1]
  

      if self.discrete and not self.laplace_smoothing:

          print('Solving Using Discrete Data Without Laplace smoothing by fitting into PDF')
  
          self.prior_y0_discrete = len(self.X0_train) / len(self.X_train)
          self.prior_y1_discrete = len(self.X1_train) / len(self.X_train)

          self.dist_X0y0 = self.fit_dist(self.X0_train.values[:,0])
          self.dist_X1y0 = self.fit_dist(self.X0_train.values[:,1])

          self.dist_X0y1 = self.fit_dist(self.X1_train.values[:,0])
          self.dist_X1y1 = self.fit_dist(self.X1_train.values[:,1])

          self.prior_y0_discrete = len(self.X0_train) / len(self.X_train)
          self.prior_y1_discrete = len(self.X1_train) / len(self.X_train)


      elif self.discrete and self.laplace_smoothing:
        
          print("Solving using manual method with laplace smoothing")

          lam = 1 

          self.prior_y0_discrete = (len(self.X0_train)+lam )/ (len(self.X_train) + self.df_train['y'].nunique()*lam)
          self.prior_y1_discrete = (len(self.X1_train)+lam )/(len(self.X_train)+ self.df_train['y'].nunique()*lam)
  
          self.X10 = self.X0_train.values[:,0]
          self.X20 = self.X0_train.values[:,1]
          self.X11 = self.X1_train.values[:,0]
          self.X21 = self.X1_train.values[:,1]

          #Liklihoods for (x1|y)
          self.LX10y0 = (len(self.X10[self.X10 == 0])+lam)/(len(self.X0_train)+self.df_train['x1'].nunique()*lam)
          self.LX10y1 = (len(self.X11[self.X11 == 0])+lam)/(len(self.X1_train)+self.df_train['x1'].nunique()*lam)
          self.LX11y0 = (len(self.X10[self.X10 == 1])+lam)/(len(self.X0_train)+self.df_train['x1'].nunique()*lam)
          self.LX11y1 = (len(self.X11[self.X11 == 1])+lam)/(len(self.X1_train)+self.df_train['x1'].nunique()*lam)

          #Likelihoods for (x2|y)
          self.LX20y0 = (len(self.X20[self.X20 == 0])+lam)/(len(self.X0_train)+self.df_train['x2'].nunique()*lam)
          self.LX20y1 = (len(self.X21[self.X21 == 0])+lam)/(len(self.X1_train)+self.df_train['x2'].nunique()*lam)
          self.LX21y0 = (len(self.X20[self.X20 == 1])+lam)/(len(self.X0_train)+self.df_train['x2'].nunique()*lam)
          self.LX21y1 = (len(self.X21[self.X21 == 1])+lam)/(len(self.X1_train)+self.df_train['x2'].nunique()*lam)
      

      else: 

          print('Solving Using Gaussian Naive Bayes')
          
          self.X0_train = self.X_train_gauss[self.y_train_gauss == 0]
          self.X1_train = self.X_train_gauss[self.y_train_gauss == 1]
          #prior
          self.prior_y0 = len(self.X0_train) / len(self.X_train)
          self.prior_y1 = len(self.X1_train) / len(self.X_train)
        
          #pdf for class 0
          self.dist_X0y0 = self.fit_dist(self.X0_train[:,0])
          self.dist_X1y0 = self.fit_dist(self.X0_train[:,1])
        
         #pdf for class 1
          self.dist_X0y1 = self.fit_dist(self.X1_train[:,0])
          self.dist_X1y1 = self.fit_dist(self.X1_train[:,1])
  
        
        
    def predict(self):
     

      if self.discrete and not self.laplace_smoothing:

        right2 = 0
        wrong2 = 0

        for sample, target in zip(self.X_test, self.y_test):
            
            py0 = self.probability(sample, self.prior_y0_discrete, self.dist_X0y0, self.dist_X1y0)
            py1 = self.probability(sample, self.prior_y1_discrete, self.dist_X0y1, self.dist_X1y1)
            
            print('p(y=0 | %s) = %.3f' % (sample, py0*100))
            print('p(y=1 | %s) = %.3f' % (sample, py1*100))

            print('model predicted class {} and the truth was {} \n\n' .format(np.argmax([py0*100,py1*100]), target))
            if (np.argmax([py0*100,py1*100]) == target):
              print('Right\n')
              right2+=1
            else:
              print('Wrong\n')
              wrong2+=1

            print("Total Correct Classifications are :",right2)
            print("Total Wrong Classifications are :", wrong2)


      elif self.discrete and self.laplace_smoothing:
        right = 0
        wrong = 0


        for sample, target in zip(self.X_test, self.y_test):
           
           
           if sample[0]==0 and sample[1] == 0 and target==0:
             
             py0 = self.prior_y0_discrete*self.LX10y0*self.LX20y0
             py1 = self.prior_y1_discrete*self.LX10y1*self.LX20y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0]==0 and sample[1] == 0 and target==1:

             py0 = self.prior_y0_discrete*self.LX10y0*self.LX20y0
             py1 = self.prior_y1_discrete*self.LX10y1*self.LX20y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0] == 0 and sample[1] == 1 and target == 0:


             py0 = self.prior_y0_discrete*self.LX10y0*self.LX21y0
             py1 = self.prior_y1_discrete*self.LX10y1*self.LX21y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))

             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0] == 0 and sample[1] == 1 and target == 1:

             py0 = self.prior_y0_discrete*self.LX10y0*self.LX21y0
             py1 = self.prior_y1_discrete*self.LX10y1*self.LX21y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0] == 1 and sample[1] == 0 and target == 0:

             py0 = self.prior_y0_discrete*self.LX11y0*self.LX20y0
             py1 = self.prior_y1_discrete*self.LX11y1*self.LX20y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))

             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0] == 1 and sample[1] == 0 and target == 1:

             py0 = self.prior_y0_discrete*self.LX11y0*self.LX20y0
             py1 = self.prior_y1_discrete*self.LX11y1*self.LX20y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):

               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             

           elif sample[0] == 1 and sample[1] == 1 and target == 0:

             py0 = self.prior_y0_discrete*self.LX11y0*self.LX21y0
             py1 = self.prior_y1_discrete*self.LX11y1*self.LX21y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             
           elif sample[0] == 1 and sample[1] == 1 and target == 1:

             py0 = self.prior_y0_discrete*self.LX11y0*self.LX21y0
             py1 = self.prior_y1_discrete*self.LX11y1*self.LX21y1
             print('p(y=0 | %s) = %.3f' % (sample, py0*100))
             print('p(y=1 | %s) = %.3f' % (sample, py1*100))
             print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
             if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right+=1
             else:
               print('Wrong\n')
               wrong+=1
             

        print("Total Correct Classifications are :",right)
        print("Total Wrong Classifications are :", wrong)

     
      else:

        right3 = 0
        wrong3 = 0
        for sample, target in zip(self.X_test_gauss, self.y_test_gauss):
            
            py0 = self.probability(sample, self.prior_y0, self.dist_X0y0, self.dist_X1y0)
            py1 = self.probability(sample, self.prior_y1, self.dist_X0y1, self.dist_X1y1)
            
            print('p(y=0 | %s) = %.3f' % (sample, py0*100))
            print('p(y=1 | %s) = %.3f' % (sample, py1*100))
            
            print('model predicted class {} and the truth was {} \n' .format(np.argmax([py0*100,py1*100]), target))
            if (np.argmax([py0*100,py1*100]) == target):
               print('Right\n')
               right3+=1
            else:
               print('Wrong\n')
               wrong3+=1
             

        print("Total Correct Classifications are :",right3)
        print("Total Wrong Classifications are :", wrong3)



        
        

In [12]:
nb = NaiveBayes(df,discrete = True)
nb.run_model()
nb.predict()

Solving Using Discrete Data Without Laplace smoothing by fitting into PDF

Solving using discrete data with pdf function

p(y=0 | [1 0]) = 11.331
p(y=1 | [1 0]) = 12.973
model predicted class 1 and the truth was 1 


Right

Total Correct Classifications are : 1
Total Wrong Classifications are : 0
p(y=0 | [0 1]) = 12.816
p(y=1 | [0 1]) = 9.747
model predicted class 0 and the truth was 0 


Right

Total Correct Classifications are : 2
Total Wrong Classifications are : 0
p(y=0 | [1 0]) = 11.331
p(y=1 | [1 0]) = 12.973
model predicted class 1 and the truth was 1 


Right

Total Correct Classifications are : 3
Total Wrong Classifications are : 0
p(y=0 | [1 1]) = 13.696
p(y=1 | [1 1]) = 10.717
model predicted class 0 and the truth was 1 


Wrong

Total Correct Classifications are : 3
Total Wrong Classifications are : 1
p(y=0 | [1 1]) = 13.696
p(y=1 | [1 1]) = 10.717
model predicted class 0 and the truth was 0 


Right

Total Correct Classifications are : 4
Total Wrong Classifications are : 1

In [9]:
nb = NaiveBayes(df,discrete = True, laplace_smoothing=True)
nb.run_model()
nb.predict()

Solving using manual method with laplace smoothing
p(y=0 | [1 0]) = 12.106
p(y=1 | [1 0]) = 13.830
model predicted class 1 and the truth was 1 

Right

p(y=0 | [0 1]) = 13.669
p(y=1 | [0 1]) = 10.422
model predicted class 0 and the truth was 0 

Right

p(y=0 | [1 0]) = 12.106
p(y=1 | [1 0]) = 13.830
model predicted class 1 and the truth was 1 

Right

p(y=0 | [1 1]) = 14.601
p(y=1 | [1 1]) = 11.451
model predicted class 0 and the truth was 1 

Wrong

p(y=0 | [1 1]) = 14.601
p(y=1 | [1 1]) = 11.451
model predicted class 0 and the truth was 0 

Right

p(y=0 | [0 0]) = 11.333
p(y=1 | [0 0]) = 12.587
model predicted class 1 and the truth was 1 

Right

p(y=0 | [1 0]) = 12.106
p(y=1 | [1 0]) = 13.830
model predicted class 1 and the truth was 1 

Right

p(y=0 | [0 1]) = 13.669
p(y=1 | [0 1]) = 10.422
model predicted class 0 and the truth was 1 

Wrong

p(y=0 | [1 1]) = 14.601
p(y=1 | [1 1]) = 11.451
model predicted class 0 and the truth was 0 

Right

p(y=0 | [0 0]) = 11.333
p(y=1 | [0 0]) =

In [19]:
nb = NaiveBayes(df)
nb.run_model()
nb.predict()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
p(y=0 | [-0.31935223  5.05798477]) = 27.055
p(y=1 | [-0.31935223  5.05798477]) = 0.000
model predicted class 0 and the truth was 0 

Right

p(y=0 | [-9.35561304 -4.45060481]) = 0.000
p(y=1 | [-9.35561304 -4.45060481]) = 58.251
model predicted class 1 and the truth was 1 

Right

p(y=0 | [-10.21901341  -5.47776599]) = 0.000
p(y=1 | [-10.21901341  -5.47776599]) = 24.318
model predicted class 1 and the truth was 1 

Right

p(y=0 | [-11.01396112  -4.26191722]) = 0.000
p(y=1 | [-11.01396112  -4.26191722]) = 44.585
model predicted class 1 and the truth was 1 

Right

p(y=0 | [-0.7206192   4.98118509]) = 44.245
p(y=1 | [-0.7206192   4.98118509]) = 0.000
model predicted class 0 and the truth was 0 

Right

p(y=0 | [-1.79593242  3.03720946]) = 30.118
p(y=1 | [-1.79593242  3.03720946]) = 0.000
model predicted class 0 and the truth was 0 

Right

p(y=0 | [-9.69220703 -3.92051815]) = 0.000
p(y=1 | [-9.69220703 -3.92051815]) = 76.127
