## Naive Bayes classifier code from scratch

In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [84]:
def prior(Y_train, label):
    '''
        Estimate the class prior P(Y=label).

        Y_train : numpy array of training labels
        label   : the class whose prior is wanted

        Returns the fraction of entries in Y_train that equal label.
    '''
    total_samples = Y_train.shape[0]
    matching_samples = np.sum(Y_train == label)
    return matching_samples / total_samples
    

In [85]:
def likelihood(X_train, Y_train, Xquery, label):
    '''
        Return P(Xquery|Y=label) under the naive independence assumption:
        P(Xquery0|Y=label) * P(Xquery1|Y=label) * P(Xquery2|Y=label) * ...

        X_train : numpy array of training features, either 1-D (a single
                  feature) or 2-D (one row per sample, one column per feature)
        Y_train : numpy array of training labels, aligned with X_train
        Xquery  : the sample to score; a single value when X_train is 1-D,
                  otherwise a 1-D array with one entry per feature
        label   : the class to condition on

        Each per-feature probability is the raw relative frequency of the
        query value among training samples of the given class (no smoothing),
        so a value never seen with that class drives the product to 0.
    '''
    # Counts are taken from the training labels passed in, never from a
    # notebook-level global, so the function only sees the training split.
    class_mask = Y_train == label
    class_count = np.sum(class_mask)

    if X_train.ndim == 1:  # only one feature was given
        matches = np.sum(X_train[class_mask] == Xquery)
        return matches / class_count

    prod = 1
    for i in range(Xquery.shape[0]):  # one factor per feature
        matches = np.sum(X_train[class_mask, i] == Xquery[i])
        prod *= matches / class_count

    return prod

In [86]:
def posterior_proportional(X_train, Y_train, Xquery, label) :
    '''
        Return the unnormalised posterior score for one class.

        The posterior is proportional to likelihood times prior:
            P(Y=label|Xquery)  ~  P(Xquery|Y=label) * P(Y=label)
        The evidence term P(Xquery) is not divided out here.
    '''
    class_prior = prior(Y_train, label)
    query_likelihood = likelihood(X_train, Y_train, Xquery, label)
    return query_likelihood * class_prior

In [87]:
def NBClassifier(X_train, Y_train, Xquery) :
    '''
        Classify a single sample with Naive Bayes.

        X_train : numpy array of training features
        Y_train : numpy array of training labels
        Xquery  : the sample to classify

        Returns (max_prob_class, prob_list):
          max_prob_class - the label with the largest posterior score, or
                           None when no class scores above 0
          prob_list      - numpy array with one entry per class, in the order
                           given by np.unique(Y_train), holding the scores
                           divided by their sum
    '''
    # Candidate classes come from the training labels passed in, not from a
    # notebook-level global, so only the training split is consulted.
    total_class = np.unique(Y_train)

    total_prob = 0       # sum of the unnormalised scores, used to normalise
    prob_list = []       # unnormalised score per class
    max_prob_class = None
    max_prob = 0
    for label in total_class :
        prob = posterior_proportional(X_train, Y_train, Xquery, label)
        total_prob += prob
        prob_list.append(prob)
        # Strict '>' keeps the first class seen when scores tie.
        if prob > max_prob :
            max_prob_class = label
            max_prob = prob

    prob_list = np.array(prob_list)
    prob_list /= total_prob  # scale the scores so they sum to 1
    return max_prob_class, prob_list

In [88]:
def predict(X_train, Y_train, X_test):
    '''
        Predict a label for every sample in X_test.

        Each sample is classified independently with NBClassifier; only the
        winning label is kept and the per-class probabilities are discarded.
        Returns the predictions as a numpy array, in the order of X_test.
    '''
    predicted_labels = [
        NBClassifier(X_train, Y_train, sample)[0] for sample in X_test
    ]
    return np.array(predicted_labels)

In [89]:
def accuracy(Y_pred, Y_test):
    '''
        Return the percentage (0-100) of predictions that match Y_test.

        Y_pred and Y_test are numpy arrays compared element-wise.
    '''
    n_correct = np.sum(Y_pred == Y_test)
    n_total = Y_pred.shape[0]
    return n_correct * 100 / n_total

# Dataset Loading

In [90]:
# Dataset source: https://www.kaggle.com/ymotonskillupai/mushroomscsv#mushrooms.csv
# The CSV has been downloaded locally and is read from the Datasets/ folder.
import pandas as pd
df = pd.read_csv('Datasets/mushrooms.csv')
df.head(n=5) 

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [91]:
df.shape

(8124, 23)

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
type                        8124 non-null object
cap_shape                   8124 non-null object
cap_surface                 8124 non-null object
cap_color                   8124 non-null object
bruises                     8124 non-null object
odor                        8124 non-null object
gill_attachment             8124 non-null object
gill_spacing                8124 non-null object
gill_size                   8124 non-null object
gill_color                  8124 non-null object
stalk_shape                 8124 non-null object
stalk_root                  8124 non-null object
stalk_surface_above_ring    8124 non-null object
stalk_surface_below_ring    8124 non-null object
stalk_color_above_ring      8124 non-null object
stalk_color_below_ring      8124 non-null object
veil_type                   8124 non-null object
veil_color                  8124 non-null object
ring_number

In [93]:
df.describe()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [94]:
df.shape

(8124, 23)

# Encoding - preprocessing Step

In [95]:
# Every column is categorical (single-letter codes), but the classifier
# compares numeric arrays, so each category is mapped to an integer code.
from sklearn.preprocessing import LabelEncoder # encodes labels with values between 0 and n_classes-1
l = LabelEncoder() # works on one array-like of shape (n_samples,) at a time
# apply(..., axis=0) calls fit_transform once per column, so each column gets
# its own encoding; the same encoder object is re-fitted for every column.
df = df.apply(l.fit_transform,axis = 0)

In [96]:
# The data is now integer-encoded and ready for the algorithm; preview the first rows.
df.head(n = 10)

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1
5,0,5,3,9,1,0,1,0,0,5,...,2,7,7,0,2,1,4,2,2,1
6,0,0,2,8,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,2,3
7,0,0,3,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,3,3
8,1,5,3,8,1,6,1,0,1,7,...,2,7,7,0,2,1,4,2,4,1
9,0,0,2,9,1,0,1,0,0,2,...,2,7,7,0,2,1,4,2,3,3


# Generating data for prediction

In [97]:

X = df[df.columns[1:]] # features of the mushrooms (every column after the first)
Y = df['type'] # target: the first column of the dataset is the type of mushroom

In [98]:
X.shape

(8124, 22)

In [99]:
Y.shape

(8124,)

In [100]:
# Convert from pandas objects to plain numpy arrays; the classifier functions
# index with X_train[:, i] and boolean masks, which expect numpy arrays.
X = X.values
Y = Y.values

In [101]:
X

array([[5, 2, 4, ..., 2, 3, 5],
       [5, 2, 9, ..., 3, 2, 1],
       [0, 2, 8, ..., 3, 2, 3],
       ...,
       [2, 2, 4, ..., 0, 1, 2],
       [3, 3, 4, ..., 7, 4, 2],
       [5, 2, 4, ..., 4, 1, 2]])

In [102]:
Y

array([1, 0, 0, ..., 0, 1, 0])

# Splitting Data

In [103]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 101)

In [104]:
X_train.shape

(7311, 22)

In [105]:
X_test.shape

(813, 22)

In [106]:
# applying our Algorithm

In [107]:
# Classify every test row with the from-scratch classifier, using the training split as its data.
Y_pred = predict(X_train, Y_train , X_test)

In [109]:
Y_pred

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,

In [108]:
accuracy(Y_pred,Y_test)

99.6309963099631

In [81]:
# using Sklearn