In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the mushroom dataset (a CSV hosted on GitHub) into a DataFrame.
df = pd.read_csv(
    r"https://raw.githubusercontent.com/codeforcauseorg/ML-Bootcamp-July/master/datasets/mushrooms.csv"
)

In [3]:
# Preview the first five rows of the raw (letter-coded) data.
df.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [4]:
# (number of rows, number of columns) of the raw frame.
df.shape

(8124, 23)

In [5]:
# The data is categorical (single-letter codes in the preview), so it is
# converted to integer codes with LabelEncoder before modelling.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encoder instance whose fit_transform is applied to the DataFrame columns.
le = LabelEncoder()

In [6]:
# Encode the whole frame (target included): DataFrame.apply passes each column
# to le.fit_transform, which replaces the category labels with integer codes.
# Note: the same encoder object is refit for every column, so afterwards it
# only holds the mapping of the last column processed.
ds = df.apply(le.fit_transform)

In [7]:
# Preview the first five rows of the integer-encoded data.
ds.head(n=5)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [8]:
# In the encoded frame the target ("type") is column 0; every remaining
# column is a feature.
data = ds.values  # DataFrame -> NumPy array
X, y = data[:, 1:], data[:, 0]  # features, target

In [9]:
# Sanity check: feature matrix and target vector must have the same row count.
(X.shape, y.shape)

((8124, 22), (8124,))

In [11]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Naive Bayes with scikit-learn (GaussianNB)

In [12]:
from sklearn.naive_bayes import GaussianNB
# NOTE(review): GaussianNB models each feature as a continuous, normally
# distributed value, while the features here are integer codes of categories
# (the code order is arbitrary) — keep this in mind when reading its score.
gnb = GaussianNB()

In [13]:
# Train the sklearn model on the training split, then predict the held-out rows.
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [15]:
# Accuracy of the sklearn model on the held-out split.
gnb.score(X_test, y_test)

0.9261469600895188

# Custom Naive Bayes (categorical features, written from scratch)

In [23]:
class CustomNB:
    """Naive Bayes classifier for categorical (discrete-valued) features.

    All probabilities are estimated by counting in the stored training data:
    the prior P(label) from the label frequencies, and the conditional
    P(feature == value | label) from the rows that belong to that label.
    No smoothing is applied, so a feature value that never occurs together
    with a label in the training data gives that label a posterior of 0.
    """

    def __repr__(self):
        # Short repr so that `model.fit(...)` as the last line of a notebook
        # cell displays something readable.
        return f"{type(self).__name__}()"

    def fit(self, X, y):
        """Store the training data and the set of class labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (discrete values).
        y : array-like of shape (n_samples,)
            Class label of every training row.

        Returns
        -------
        self : CustomNB
            The fitted estimator, so calls can be chained.

        Raises
        ------
        ValueError
            If X is not 2-D or X and y have a different number of rows.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be 2-D: (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must have the same number of rows")

        self.X_train = X
        self.y_train = y
        # Sorted unique labels; computed once here instead of per prediction.
        self.classes_ = np.unique(y)
        return self

    def prior_prob(self, label):
        """Return P(label): the fraction of training rows carrying `label`."""
        total = self.y_train.shape[0]
        if total == 0:
            return 0.0  # no training data: avoid 0/0 -> NaN
        examples_containing_label = np.sum(self.y_train == label)
        return examples_containing_label / total

    def conditional_prob(self, feature_col, feature_val, label):
        """Return P(feature_col == feature_val | label).

        Parameters
        ----------
        feature_col : int
            Column index of the feature in the training matrix.
        feature_val : scalar
            Value of that feature to look for.
        label : scalar
            Class the probability is conditioned on.

        Returns
        -------
        float
            Fraction of the training rows of class `label` whose feature
            equals `feature_val`; 0.0 when the class has no training rows.
        """
        # Training rows that belong to the requested class.
        X_required = self.X_train[self.y_train == label]
        denominator = len(X_required)
        if denominator == 0:
            return 0.0  # unseen class: avoid 0/0 -> NaN
        numerator = np.sum(X_required[:, feature_col] == feature_val)
        return numerator / denominator

    def predict_point(self, X_test):
        """Predict the class label of a single example.

        Parameters
        ----------
        X_test : sequence of length n_features
            Feature values of one example.

        Returns
        -------
        scalar
            The label (an element of the training labels) with the largest
            posterior; ties resolve to the first label in sorted order.
        """
        classes = self.classes_
        n_features = self.X_train.shape[1]

        posterior_prob = []
        for label in classes:
            # posterior is proportional to prior * product of per-feature conditionals
            likelihood = 1.0
            for feature in range(n_features):
                likelihood *= self.conditional_prob(feature, X_test[feature], label)
                if likelihood == 0.0:
                    break  # the product cannot recover from zero
            posterior_prob.append(self.prior_prob(label) * likelihood)

        # argmax is a position in `classes`; map it back to the actual label
        # so that labels other than 0..k-1 (e.g. strings) are returned correctly.
        return classes[np.argmax(posterior_prob)]

    def predict(self, X_test):
        """Predict the class label of every row in `X_test`.

        Returns
        -------
        numpy.ndarray
            One predicted label per input row.
        """
        return np.array([self.predict_point(point) for point in X_test])

    def score(self, X_test, y_test):
        """Return the accuracy: the fraction of rows in `X_test` whose
        predicted label equals the corresponding entry of `y_test`."""
        return (self.predict(X_test) == np.asarray(y_test)).mean()

In [24]:
# Instantiate the hand-written Naive Bayes classifier defined above.
model = CustomNB()

In [25]:
# Train on the same split that was used for the sklearn model.
model.fit(X_train, y_train)

In [26]:
# Predict the held-out rows with the custom model.
# Note: this rebinds y_pred, which previously held the sklearn predictions.
y_pred = model.predict(X_test)

In [27]:
# Accuracy of the custom model on the held-out split
# (fraction of predictions equal to y_test).
model.score(X_test, y_test)

0.9973890339425587