In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [38]:
# Load the mushroom table from the comma-separated text file and preview the first rows.
df = pd.read_csv('mushrooms.txt')
df.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


### Encode the Categorical Data

In [39]:
# Replace each column's category values with integer codes, one column at a time.
# NOTE: the same LabelEncoder is re-fitted for every column, so after this cell
# `encoder` only holds the mapping of the last column that was processed.
encoder = LabelEncoder()
df_encoded = df.apply(lambda column: encoder.fit_transform(column))
data = df_encoded.values  # plain 2-D numpy array of the integer codes

In [40]:
# Separate the encoded table into labels Y (column 0) and features X (all remaining columns)

Y = data[:, 0]
X = data[:, 1:]
print(X.shape, Y.shape)
np.unique(Y)   # distinct label values that occur in Y

(8124, 22) (8124,)


array([0, 1])

In [41]:
# Split data into Training and Test sets

# 70-30 split for training and testing data.
# random_state pins the shuffle so the split -- and every score computed from it --
# is identical on each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(5686, 22) (5686,)
(2438, 22) (2438,)


### Building the Naive Bayes Classifier

In [42]:
def prior_prob(Y_train, label):
    """Estimate the class prior P(Y = label) from the training labels.

    Parameters
    ----------
    Y_train : numpy.ndarray
        Training labels.
    label : scalar
        Class whose relative frequency is wanted.

    Returns
    -------
    float
        Count of entries in ``Y_train`` equal to ``label`` divided by
        ``Y_train.shape[0]``.
    """
    n_total = float(Y_train.shape[0])
    n_matching = np.sum(Y_train == label)
    return n_matching / n_total

def conditional_prob(X_train, Y_train, feature_col, feature_value, label):
    """Estimate P(X[feature_col] = feature_value | Y = label) by counting.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D training feature matrix (rows are samples).
    Y_train : numpy.ndarray
        Training labels, one per row of ``X_train``.
    feature_col : int
        Column index of the feature being examined.
    feature_value : scalar
        Value of that feature whose conditional frequency is wanted.
    label : scalar
        Class being conditioned on.

    Returns
    -------
    float
        Among the training rows whose label equals ``label``, the fraction
        whose ``feature_col`` entry equals ``feature_value``.
    """
    in_class = (Y_train == label)
    class_size = np.sum(in_class)

    # Restrict to the rows of the requested class, then look at one column only.
    class_rows = X_train[in_class]
    hits = np.sum(class_rows[:, feature_col] == feature_value)

    return hits / float(class_size)

### Compute Posterior Probabilities for Test Samples and Make Predictions

In [43]:
def predict(X_train, Y_train, X_point):
    """Predict the class label of one sample with a categorical Naive Bayes rule.

    For every class ``c`` present in ``Y_train`` the unnormalised log-posterior

        log P(Y=c) + sum_f log P(X[f] = X_point[f] | Y=c)

    is built from ``prior_prob`` and ``conditional_prob``; the class with the
    largest score wins.  Scores are accumulated in log space because a plain
    product of many probabilities underflows to 0.0 for every class once the
    number of features grows, which would make the first class always win.

    Parameters
    ----------
    X_train : numpy.ndarray
        2-D training feature matrix (rows are samples).
    Y_train : numpy.ndarray
        Training labels, one per row of ``X_train``.
    X_point : sequence
        Feature values of the sample to classify, indexed by feature column.

    Returns
    -------
    scalar
        The winning class *label* (an element of ``np.unique(Y_train)``), not
        its position in that array.  On ties the smallest label is returned,
        since ``np.argmax`` reports the first maximum of the sorted classes.
    """
    classes = np.unique(Y_train)
    n_features = X_train.shape[1]

    log_posteriors = []   # one unnormalised log-posterior per entry of `classes`

    # A feature value never seen with a class has probability 0; log(0) = -inf
    # is exactly the score such a class should get, so silence the warning.
    with np.errstate(divide='ignore'):
        for label in classes:
            log_post = np.log(prior_prob(Y_train, label))
            for f in range(n_features):
                cond_prob = conditional_prob(X_train, Y_train, f, X_point[f], label)
                log_post += np.log(cond_prob)
            log_posteriors.append(log_post)

    # Map the winning position back to the actual label value.
    return classes[np.argmax(log_posteriors)]

In [44]:
def accuracy(X_test, Y_test, X_train, Y_train):
    """Return the fraction of test samples that ``predict`` labels correctly.

    Note the argument order: the test set comes first, the training set second.

    Parameters
    ----------
    X_test : numpy.ndarray
        2-D feature matrix of the samples to score (rows are samples).
    Y_test : sequence
        True labels, indexable by the row number of ``X_test``.
    X_train, Y_train : numpy.ndarray
        Training data handed unchanged to ``predict`` for every test row.

    Returns
    -------
    float
        Number of rows whose prediction equals the true label, divided by
        ``X_test.shape[0]``.

    Raises
    ------
    ZeroDivisionError
        If ``X_test`` has no rows.
    """
    n_samples = X_test.shape[0]
    n_correct = 0

    for i in range(n_samples):
        if predict(X_train, Y_train, X_test[i]) == Y_test[i]:
            n_correct += 1

    return n_correct / float(n_samples)

In [45]:
# accuracy() takes the test set before the training set; keyword arguments keep that explicit.
acc = accuracy(X_test=X_test, Y_test=Y_test, X_train=X_train, Y_train=Y_train)

In [46]:
# Fraction of test samples whose predicted label matched the true label in Y_test
print('Score = ',acc)

Score =  0.9983593109105825
