In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Dataset preparation and spliting

In [2]:
dataframe = pd.read_csv("../dataset/mushrooms.csv")

In [3]:
dataframe.head(3)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m


In [4]:
np.unique(dataframe["cap_color"]).shape

(10,)

In [7]:
le = LabelEncoder()
df = dataframe.apply(le.fit_transform)

In [6]:
df.head(3)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3


In [8]:
df = df.values

In [9]:
X = df[:,1:]
Y = df[:,0]

In [10]:
print(X.shape,Y.shape)

(8124, 22) (8124,)


In [11]:
x_train , x_test , y_train , y_test = train_test_split(X,Y)

In [12]:
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(6093, 22) (6093,) (2031, 22) (2031,)


# Implemenation

In [13]:
class Naive_Bayes:
    
    def __init__(self, X ,Y):
        self.x = X
        self.y = Y
        
    def prior_probability(self , klass):
        return np.sum(self.y == klass) / self.y.shape[0]
    
    def condition_probability(self,f_col,f_val,cls):
        x_dsr = self.x[self.y==cls]
        
        num = np.sum(x_dsr[:,f_col] == f_val)
        den = x_dsr.shape[0]
        
        return num/float(den)
    
    def posterior_prob(self,x):
        
        posterior = list()
        
        for cls in np.unique(self.y):
            likelihood = 1
            for f in range(self.x.shape[1]):
                likelihood *= self.condition_probability(f,x[f],cls)
            
            prior = self.prior_probability(cls)
            
            posterior.append(likelihood * prior)
            
        return np.array(posterior)
    
    def predict(self, x):
        return np.argmax(self.posterior_prob(x))
    
    def get_pred(self,x_test):
        pred = list()
        
        for x in x_test:
            pred.append(self.predict(x))
            
        return pred
    
    def score(self,pred,y_test):
        return np.mean(pred == y_test)
        

## MNist Gausssian vs Multinomial

In [14]:
nb = Naive_Bayes(x_train , y_train)

In [15]:
predlist = nb.get_pred(x_test)

In [16]:
nb.score(predlist,y_test)

0.9965534219596258

In [17]:
from sklearn.datasets import load_digits

In [18]:
digit = load_digits()
print(digit.data.shape)

(1797, 64)


In [19]:
data = digit.data
target = digit.target
print(data.shape,target.shape)

(1797, 64) (1797,)


In [20]:
import matplotlib.pyplot as plt
print(target[0])
plt.imshow(data[0].reshape(8,8),cmap="gray")
plt.show()

0


<Figure size 640x480 with 1 Axes>

In [21]:
from sklearn.naive_bayes import MultinomialNB , GaussianNB

In [22]:
mb = MultinomialNB()
gb =GaussianNB()

In [23]:
x_1train ,x_1test,y_1train,y1_test = train_test_split(data,target) 

In [24]:
print(x_1train.shape,y_1train.shape)

(1347, 64) (1347,)


In [25]:
x_test[1] #discreet value 0-15

array([2, 0, 2, 1, 5, 1, 0, 0, 5, 1, 1, 2, 2, 3, 3, 0, 2, 1, 4, 2, 4, 0])

In [26]:
mb.fit(x_1train,y_1train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [27]:
mb.score(x_1test,y1_test)

0.9111111111111111

In [28]:
gb.fit(x_1train,y_1train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [29]:
gb.score(x_1test,y1_test)

0.86

In [30]:
## Here discreet value of dataset the mb score more and gb is less because bcz it is more for contious value