# Naive Bayes Classifier

Naive Bayes is an algorithm that builds a probabilistic model from training data. For a novel instance, the model determines its class by calculating the probability of the instance under each learned class distribution and choosing the most probable class.

In [1]:
import csv
import random
import math
import pandas as pd
import numpy as np

In [2]:
# Load the data
def loadCSV(filename, **read_csv_kwargs):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    **read_csv_kwargs
        Extra keyword arguments forwarded unchanged to ``pandas.read_csv``.
        Pass ``header=None`` for a file without a header row; with the
        default, pandas uses the first line as column names, so that line is
        not part of the returned rows.

    Returns
    -------
    pandas.DataFrame
        The parsed file.
    """
    return pd.read_csv(filename, **read_csv_kwargs)

# Split the data into a train/test set
def split_dataset(dataset, train_percent=.66, seed=None):
    """Shuffle the rows of ``dataset`` and split them into train/test arrays.

    Parameters
    ----------
    dataset : array-like or pandas.DataFrame
        Data with one instance per row (shuffling happens along axis 0).
    train_percent : float, default 0.66
        Fraction of rows placed in the training split. The row count is
        ``int(train_percent * len(dataset))``, i.e. truncated.
    seed : int or None, default None
        When given, the shuffle is drawn from a private
        ``numpy.random.RandomState(seed)`` so the split is reproducible and
        the global random state is left untouched. When ``None`` the global
        NumPy random state is used.

    Returns
    -------
    (train, test) : tuple of numpy.ndarray
        The first ``train_end`` shuffled rows and the remaining rows.
    """
    # Work on a plain array so DataFrames and nested lists behave the same.
    rows = np.asarray(dataset)

    if seed is None:
        shuffled = np.random.permutation(rows)
    else:
        shuffled = np.random.RandomState(seed).permutation(rows)

    # Index of the first test row.
    train_end = int(train_percent * len(rows))
    return shuffled[:train_end], shuffled[train_end:]

In [4]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    ``train`` groups the training rows by label and stores, for every label,
    the per-feature mean and population standard deviation. ``predict``
    scores each instance against every label by combining the per-feature
    Gaussian densities (features are treated as independent) and returns the
    best-scoring label.

    Class priors are not part of the score, i.e. every class is treated as
    equally likely a priori.

    Attributes
    ----------
    groupClass : dict or None
        ``{label: [row, ...]}`` built by ``splitClasses``.
    stats : dict or None
        ``{label: [mean, std]}`` built by ``getStats``; ``mean`` and ``std``
        hold one value per feature.
    """

    # Stand-in for a zero standard deviation (a feature that is constant
    # within a class) so that the Gaussian density stays finite.
    MIN_STD = 0.00001

    def __init__(self):
        self.groupClass = None
        self.stats = None

    def _safeStd(self, std):
        """Return a float copy of ``std`` with zeros replaced by ``MIN_STD``."""
        std = np.asarray(std, dtype=float)
        # np.where builds a new array, so the caller's data is never modified.
        return np.where(std == 0, self.MIN_STD, std)

    def _logGaussian(self, x, mean, std):
        """Element-wise natural log of the Gaussian density of ``x``."""
        std = self._safeStd(std)
        return (-0.5 * np.log(2 * np.pi)
                - np.log(std)
                - np.power(x - mean, 2) / (2 * np.power(std, 2)))

    def calculateGaussian(self, x, mean, std):
        """Evaluate the Gaussian density N(mean, std**2) at ``x``, element-wise.

        Zero entries of ``std`` are replaced by ``MIN_STD`` *before* any
        division takes place, and ``std`` itself is left unmodified.

        Parameters
        ----------
        x, mean, std : scalar or array-like
            Broadcast against each other.

        Returns
        -------
        numpy.ndarray
            The density values.
        """
        std = self._safeStd(std)
        exponent = np.exp(-1 * (np.power(x - mean, 2) / (2 * np.power(std, 2))))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

    def predict(self, x):
        """Predict a label for every row of ``x``.

        Scores are accumulated as sums of log-densities, which ranks classes
        the same way as the product of densities but does not underflow when
        there are many features.

        Parameters
        ----------
        x : array-like, shape (n_instances, n_features)

        Returns
        -------
        numpy.ndarray, shape (n_instances,)
            The predicted labels, taken from the labels seen in ``train``.

        Raises
        ------
        RuntimeError
            If the classifier has not been trained.
        ValueError
            If ``x`` is non-empty and not two-dimensional.
        """
        if not self.stats:
            raise RuntimeError("NaiveBayes.predict called before train")

        # Sorted labels give a stable column order that does not depend on
        # which label happened to appear first in the training data.
        classes = sorted(self.stats)
        class_array = np.asarray(classes)

        x = np.asarray(x, dtype=float)
        if x.size == 0:
            return class_array[:0]
        if x.ndim != 2:
            raise ValueError("x must be 2-D: (n_instances, n_features)")

        scores = np.empty((len(x), len(classes)))
        for col, label in enumerate(classes):
            mean, std = self.stats[label]
            scores[:, col] = np.sum(self._logGaussian(x, mean, std), axis=1)

        # Map the winning column back to its label instead of assuming that
        # labels are the integers 0..K-1.
        return class_array[np.argmax(scores, axis=1)]

    def score(self, x, y):
        """Return the accuracy of ``predict(x)`` against the true labels ``y``.

        Accuracy is the fraction of rows whose predicted label equals the
        true label; this holds for any number of classes.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` describe a different number of instances.
        """
        pred = self.predict(x)
        y = np.asarray(y).ravel()
        if len(y) != len(pred):
            raise ValueError("x and y must contain the same number of instances")
        return np.mean(pred == y)

    def train(self, x, y):
        """Fit the per-class feature statistics.

        Parameters
        ----------
        x : array-like, shape (n_instances, n_features)
        y : array-like, shape (n_instances,)
            One hashable label per row of ``x``.

        Raises
        ------
        ValueError
            If ``x`` is not two-dimensional, is empty, or its length differs
            from that of ``y``.
        """
        x = np.asarray(x, dtype=float)
        y = np.asarray(y).ravel()
        if x.ndim != 2:
            raise ValueError("x must be 2-D: (n_instances, n_features)")
        if len(x) == 0:
            raise ValueError("cannot train on an empty data set")
        if len(x) != len(y):
            # zip() would otherwise drop the surplus rows silently.
            raise ValueError("x and y must contain the same number of instances")

        self.splitClasses(x, y)
        self.getStats()

    def splitClasses(self, x, y):
        """Group the rows of ``x`` by their label in ``y`` into ``groupClass``."""
        groupClass = {}
        for instance, label in zip(x, y):
            groupClass.setdefault(label, []).append(instance)
        self.groupClass = groupClass

    def getStats(self):
        """Compute per-feature mean and std for every class into ``stats``.

        The standard deviation is NumPy's default population estimate
        (``ddof=0``).

        Raises
        ------
        RuntimeError
            If ``splitClasses`` has not been called yet.
        """
        if self.groupClass is None:
            raise RuntimeError("NaiveBayes.getStats called before splitClasses")

        stats = {}
        for label, instances in self.groupClass.items():
            rows = np.asarray(instances)
            stats[label] = [np.mean(rows, 0), np.std(rows, 0)]
        self.stats = stats

Test the Naive Bayes algorithm using the **indians.csv** data.

In [8]:
# Show the per-class [mean, std] feature statistics stored by NaiveBayes.getStats.
# NOTE(review): `clf` is only created in the training cell further down, so on a
# fresh kernel (Restart & Run All) this cell raises NameError unless that cell
# has been run first.
clf.stats

{0.0: [array([  3.39233038, 110.86725664,  69.16519174,  20.04129794,
          70.61356932,  30.44100295,   0.42989086,  31.4159292 ]),
  array([  2.99104862,  27.58612294,  16.80977638,  14.74557299,
         102.14343518,   7.73148947,   0.3000292 ,  11.73180559])],
 1.0: [array([  4.82035928, 140.92215569,  71.7005988 ,  22.85628743,
          94.22155689,  34.68802395,   0.546     ,  37.19161677]),
  array([  3.71323154,  31.42446395,  20.86928665,  17.7986999 ,
         132.09941825,   7.03869376,   0.37407529,  11.06211863])]}

In [9]:
filename = 'indians.csv'
dataset = loadCSV(filename)

# NOTE(review): the recorded output shows 767 rows. If indians.csv has no
# header row, read_csv used the first record as column names and that record
# is missing here -- confirm, and read the file with header=None if so.
print(dataset.shape)

# Seed NumPy's global generator, which split_dataset uses for its shuffle, so
# the split and the scores below are the same on every run.
SEED = 42
np.random.seed(SEED)
train, test = split_dataset(dataset)

# The last column is used as the class label; all other columns are features.
x_train, y_train = train[:, :-1], train[:, -1]
x_test, y_test = test[:, :-1], test[:, -1]

for part in (x_train, y_train, x_test, y_test):
    print(part.shape)

clf = NaiveBayes()
clf.train(x_train, y_train)

# Score on the rows the model was trained on.
clf.score(x_train, y_train)


(767, 9)
(506, 8)
(506,)
(261, 8)
(261,)


0.7608695652173914

In [10]:
# Score the classifier trained above on the held-out test rows.
clf.score(x_test, y_test)

0.7126436781609196