## Naive Bayes - Mushroom Dataset 
- The goal is to predict the class of each mushroom given some of its features. We will use a Naive Bayes model for this classification.

### Load the Dataset

In [1]:
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Read the mushroom CSV into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
df = pd.read_csv("../NLTK/bayes/mushrooms.csv")

In [3]:
# A cell only renders its *last* expression, so the shape is printed
# explicitly and the 10-row preview is left as the final expression to get
# the rich table display.
print(df.shape)
df.head(n=10)

(8124, 23)

### Encode the Categorical Data into Numerical Data

In [4]:
# Integer-encode every column of the frame, one column at a time.
# NOTE: the same encoder instance is refit for each column, so after this
# cell `le` only holds the fitted state of the last column it processed.
le = LabelEncoder()
ds = df.apply(lambda column: le.fit_transform(column))

In [5]:
# Preview the first rows of the integer-encoded frame.
ds.head()

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,1,5,2,4,1,6,1,0,1,4,...,2,7,7,0,2,1,4,2,3,5
1,0,5,2,9,1,0,1,0,0,4,...,2,7,7,0,2,1,4,3,2,1
2,0,0,2,8,1,3,1,0,0,5,...,2,7,7,0,2,1,4,3,2,3
3,1,5,3,8,1,6,1,0,1,5,...,2,7,7,0,2,1,4,2,3,5
4,0,5,2,3,0,5,1,1,0,4,...,2,7,7,0,2,1,0,3,0,1


In [6]:
# Drop down from the DataFrame to its underlying array, then report the
# array's shape and Python type on separate lines.
data = ds.values
print(data.shape, type(data), sep="\n")

(8124, 23)
<class 'numpy.ndarray'>


In [7]:
# Show the first five encoded rows.
print(data[:5])

# Column 0 is used as the target; every remaining column is a feature.
X, Y = data[:, 1:], data[:, 0]

[[1 5 2 4 1 6 1 0 1 4 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 9 1 0 1 0 0 4 0 2 2 2 7 7 0 2 1 4 3 2 1]
 [0 0 2 8 1 3 1 0 0 5 0 2 2 2 7 7 0 2 1 4 3 2 3]
 [1 5 3 8 1 6 1 0 1 5 0 3 2 2 7 7 0 2 1 4 2 3 5]
 [0 5 2 3 0 5 1 1 0 4 1 3 2 2 7 7 0 2 1 0 3 0 1]]


### Split the data into train and test sets

In [8]:
# Seed the shuffle so the 80/20 split is identical on every
# Restart & Run All; without it the split (and every metric derived from it)
# changes from run to run.
RANDOM_STATE = 42
x_train , x_test , y_train , y_test = train_test_split(
    X , Y , test_size = 0.2 , random_state = RANDOM_STATE
)

In [9]:
# Sanity-check the split: feature/label shapes for train, then for test.
print(f"{x_train.shape} {y_train.shape}")
print(f"{x_test.shape} {y_test.shape}")

(6499, 22) (6499,)
(1625, 22) (1625,)


In [10]:
# Distinct class labels present in the training split.
np.unique(y_train)

array([0, 1])

## Building Our Classifier!

In [11]:
# Scratch demo of the counting trick used by the classifier below:
# comparing an array to a value gives a boolean mask, and summing the mask
# counts the matches.
a = np.sum(np.array([0,1,0,0,1,1,1,0]) == 1)
print(a)

4


In [12]:
def prior_prob(y_train , label):
    """Return the prior P(y = label) estimated from the training labels.

    The prior is the fraction of entries in ``y_train`` that equal ``label``.

    Parameters
    ----------
    y_train : array of class labels (its first dimension is the sample count).
    label : the class whose prior is wanted.

    Returns
    -------
    float
        Count of matching labels divided by the total number of labels.
    """
    is_label = (y_train == label)
    n_total = float(y_train.shape[0])
    return np.sum(is_label) / n_total






In [13]:
# Prior probability of class 1 in the training split.
prior_prob(y_train , 1)

0.47930450838590555

In [14]:
def conditional_property(x_train , y_train , feature_col  , feature_val ,label, alpha=0.0):
    """Estimate P(x[feature_col] == feature_val | y == label) from training data.

    Parameters
    ----------
    x_train : 2-D array, one row per training example, one column per feature.
    y_train : 1-D array of class labels aligned with the rows of ``x_train``.
    feature_col : int
        Column index of the feature being conditioned on.
    feature_val : the feature value whose conditional probability is wanted.
    label : the class being conditioned on.
    alpha : float, optional
        Additive (Laplace) smoothing strength. With the default ``0.0`` the
        result is the plain relative frequency. With ``alpha > 0`` the
        estimate becomes ``(count + alpha) / (n_label + alpha * n_values)``,
        where ``n_values`` is the number of distinct values the feature takes
        in ``x_train``, so unseen values no longer get probability zero.

    Returns
    -------
    float
        The estimated conditional probability. ``0.0`` is returned when the
        denominator is zero (no training row has ``label``) rather than NaN.

    Raises
    ------
    ValueError
        If ``alpha`` is negative.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    # Keep only the training rows that belong to the requested class.
    x_filtered = x_train[y_train == label]
    n_label = x_filtered.shape[0]
    numerator = np.sum(x_filtered[: , feature_col] == feature_val)

    if alpha:
        # Spread `alpha` pseudo-counts over every value this feature takes.
        n_values = np.unique(x_train[: , feature_col]).size
        numerator = numerator + alpha
        denominator = n_label + alpha * n_values
    else:
        denominator = n_label

    # Without this guard an absent class would yield 0/0 -> NaN.
    if denominator == 0:
        return 0.0
    return numerator / denominator


In [15]:
def predict(x_train , y_train , xtest):
    """Classify a single sample with a categorical Naive Bayes rule.

    For every class found in ``y_train`` the (unnormalised) posterior is
    computed as ``prior * product of per-feature conditional probabilities``
    and the class with the largest posterior is returned.

    Parameters
    ----------
    x_train : 2-D array of training features (rows = examples).
    y_train : 1-D array of training labels aligned with ``x_train``.
    xtest : 1-D sequence holding one value per feature column of ``x_train``.

    Returns
    -------
    The predicted class label, i.e. an element of ``np.unique(y_train)``.
    """
    classes = np.unique(y_train)
    n_features = x_train.shape[1]

    post_prob = []
    for label in classes:
        # Naive independence assumption: the likelihood of the sample is the
        # product of the individual per-feature conditionals.
        likelihood = 1.0
        for f in range(n_features):
            likelihood *= conditional_property(x_train , y_train , f , xtest[f] , label)
        post_prob.append(likelihood * prior_prob(y_train , label))

    # argmax gives a *position* in `classes`, not a label; map it back so the
    # result is correct even when the labels are not exactly 0..k-1.
    return classes[np.argmax(post_prob)]
    
    

In [16]:
# Classify a single held-out sample (row 1 of the test split).
output = predict(x_train , y_train , x_test[1])

In [17]:
# Show the prediction next to the true label of the same test row.
print(output , y_test[1])

0 0


In [18]:
def score(x_train , y_train , x_test , y_test):
    """Return the accuracy of ``predict`` on a labelled evaluation set.

    Each row of ``x_test`` is classified independently with ``predict``
    (using ``x_train`` / ``y_train`` as the training data), and the fraction
    of predictions equal to the matching entry of ``y_test`` is returned.
    """
    n_samples = x_test.shape[0]
    pred = np.array([predict(x_train , y_train , x_test[i]) for i in range(n_samples)])
    n_correct = np.sum(pred == y_test)
    return n_correct / y_test.shape[0]
    

In [19]:
# Accuracy of the classifier on the held-out test split.
print(score(x_train , y_train , x_test , y_test))

0.9981538461538462
