# Naive Bayes Classifier

Naive Bayes is an algorithm that builds a probabilistic model by estimating its parameters from training data. For a novel instance, the model then determines the class by calculating how probable the instance is under each of the learned class distributions and choosing the most likely one.

This model makes the assumption that all features are conditionally independent given the class, i.e. that the measurement of one feature is independent of the other features. This assumption is the reason it is called naive.

In [3]:
import csv
import random
import math
import pandas as pd
import numpy as np

In [4]:
# Load the data
def loadCSV(filename, header="infer"):
    """Read a CSV file into a pandas DataFrame.

    Args:
        filename: Path (or file-like object) passed straight to
            ``pd.read_csv``.
        header: Forwarded to ``pd.read_csv``. The default ``"infer"`` keeps
            the previous behaviour and treats the first line as column
            names. Pass ``None`` for files without a header row; otherwise
            the first data row is consumed as the header and lost.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    # NOTE(review): verify whether the data file actually has a header row;
    # if it does not, callers should pass header=None.
    return pd.read_csv(filename, header=header)

# Split the data into a train/test set
def split_dataset(dataset, train_percent=.66, seed=None):
    """Randomly shuffle the rows of ``dataset`` and split them in two.

    Args:
        dataset: Array-like of rows (e.g. a DataFrame or 2-D array). It is
            converted with ``np.asarray`` and is not modified.
        train_percent: Fraction of rows, in ``[0, 1]``, that go to the
            training set. The count is truncated towards zero.
        seed: Optional seed for a private ``np.random.RandomState`` so the
            split is reproducible. When ``None`` the global NumPy random
            state is used, as before.

    Returns:
        A ``(train, test)`` tuple of NumPy arrays holding the shuffled rows.

    Raises:
        ValueError: If ``train_percent`` is outside ``[0, 1]``.
    """
    if not 0 <= train_percent <= 1:
        raise ValueError(
            "train_percent must be between 0 and 1, got %r" % (train_percent,)
        )

    # Convert explicitly so the callers' positional slicing (arr[:, -1])
    # does not depend on an implicit conversion inside the permutation call.
    rows = np.asarray(dataset)

    if seed is None:
        perm = np.random.permutation(rows)
    else:
        perm = np.random.RandomState(seed).permutation(rows)

    # Get the split index
    train_end = int(train_percent * len(perm))
    return perm[:train_end], perm[train_end:]

In [5]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    Training groups the instances by label and stores, for every label, the
    per-feature mean and standard deviation. Prediction evaluates a Gaussian
    density per feature, combines the features as if they were independent
    and returns the label with the highest likelihood.

    NOTE: class priors are not applied; every class is weighted equally, so
    the prediction is the maximum-likelihood class.

    Attributes:
        groupClass: dict mapping label -> list of training instances, or
            ``None`` before training.
        stats: dict mapping label -> ``[mean, std]`` (per-feature arrays),
            or ``None`` before training.
    """

    # Replacement for a standard deviation of exactly zero (constant
    # feature), which would otherwise cause a division by zero.
    _MIN_STD = 0.00001

    def __init__(self):
        self.groupClass = None
        self.stats = None

    def _safeStd(self, std):
        """Return a float copy of ``std`` with zeros replaced by ``_MIN_STD``."""
        std = np.asarray(std, dtype=float)
        # np.where builds a new array, so the stored statistics are not
        # modified in place.
        return np.where(std == 0, self._MIN_STD, std)

    def _logLikelihood(self, x, mean, std):
        """Sum the per-feature Gaussian log-densities for each row of ``x``."""
        std = self._safeStd(std)
        z = (x - mean) / std
        log_pdf = -0.5 * np.power(z, 2) - np.log(std) - 0.5 * np.log(2 * np.pi)
        # Reduce over every axis except the first (the instance axis).
        feature_axes = tuple(range(1, log_pdf.ndim))
        return np.sum(log_pdf, axis=feature_axes)

    def calculateGaussian(self, x, mean, std):
        """Evaluate the Gaussian probability density element-wise.

        Args:
            x: Value(s) at which to evaluate the density.
            mean: Mean(s) of the distribution, broadcastable against ``x``.
            std: Standard deviation(s). Zeros are replaced by ``_MIN_STD``
                before any division; the argument itself is left untouched.

        Returns:
            Array of density values with the broadcast shape of the inputs.
        """
        # Guard the standard deviation *before* it is used as a divisor;
        # otherwise x == mean with std == 0 evaluates 0/0 and yields NaN.
        std = self._safeStd(std)
        exponent = np.exp(-np.power(x - mean, 2) / (2 * np.power(std, 2)))
        return (1 / (np.sqrt(2 * np.pi) * std)) * exponent

    def predict(self, x):
        """Predict a label for every instance in ``x``.

        Args:
            x: Sequence/array of instances, one instance per row.

        Returns:
            NumPy array with one predicted label per instance. The values
            are the labels seen during training (not column indices), so
            labels need not be the integers ``0..K-1``.
        """
        labels = list(self.stats)
        if len(x) == 0:
            return np.asarray([], dtype=np.asarray(labels).dtype)

        x = np.asarray(x, dtype=float)
        # Work in log space: a sum of log-densities has the same argmax as
        # the product of densities but cannot underflow to zero.
        log_probs = np.empty((len(x), len(labels)))
        for label_ix, label in enumerate(labels):
            mean, std = self.stats[label]
            log_probs[:, label_ix] = self._logLikelihood(x, mean, std)

        # Map the winning column back to the label it stands for.
        return np.asarray(labels)[np.argmax(log_probs, axis=1)]

    def score(self, x, y):
        """Return the accuracy of ``predict(x)`` against the labels ``y``.

        Accuracy is the fraction of exact label matches, which is valid for
        any number of classes.
        """
        pred = self.predict(x)
        return float(np.mean(pred == np.asarray(y)))

    def train(self, x, y):
        """Fit the model: group instances by label, then compute statistics.

        Args:
            x: Sequence/array of training instances, one per row.
            y: Sequence/array of labels, same length as ``x``.

        Raises:
            ValueError: If ``x`` and ``y`` have different lengths.
        """
        self.splitClasses(x, y)
        self.getStats()

    def splitClasses(self, x, y):
        """Group the instances of ``x`` by label and store them in ``groupClass``.

        Raises:
            ValueError: If ``x`` and ``y`` have different lengths (``zip``
                would otherwise silently drop the surplus entries).
        """
        if len(x) != len(y):
            raise ValueError(
                "x and y must have the same length, got %d and %d"
                % (len(x), len(y))
            )

        groupClass = {}
        for instance, label in zip(x, y):
            groupClass.setdefault(label, []).append(instance)
        self.groupClass = groupClass

    def getStats(self):
        """Compute per-label ``[mean, std]`` over features and store in ``stats``."""
        stats = {}
        for label, instances in self.groupClass.items():
            data = np.asarray(instances, dtype=float)
            # Axis 0 runs over instances, so the result is one value per
            # feature.
            stats[label] = [np.mean(data, 0), np.std(data, 0)]
        self.stats = stats

Test the Naive Bayes algorithm using the **indians.csv** data.

In [14]:
# Read the data set and report its (rows, columns) shape.
filename = 'indians.csv'
dataset = loadCSV(filename)
print(dataset.shape)

# Shuffle the rows and divide them into training and test portions.
train, test = split_dataset(dataset)

# Every column except the last is a feature; the last column is the label.
x_train = train[:, :-1]
y_train = train[:, -1]
x_test = test[:, :-1]
y_test = test[:, -1]

# Fit the classifier on the training portion.
clf = NaiveBayes()
clf.train(x_train, y_train)

# Accuracy on the data the model was trained on.
clf.score(x_train, y_train)

(767, 9)


0.7569169960474308

In [15]:
# Show the learned per-class statistics: label -> [feature means, feature stds].
clf.stats

{0.0: [array([  3.37993921, 108.50759878,  69.11550152,  20.24924012,
          68.72948328,  30.42522796,   0.42635866,  31.2006079 ]),
  array([ 3.15144951, 25.63730132, 17.16853797, 14.89227399, 93.94612897,
          7.02101072,  0.30010135, 11.25860156])],
 1.0: [array([  4.62146893, 141.63276836,  70.68361582,  22.32768362,
          96.91525424,  35.61412429,   0.55610169,  36.83615819]),
  array([  3.6029166 ,  32.66099887,  21.02732719,  17.74314451,
         126.81161188,   6.87406273,   0.3957149 ,  10.76094859])]}

In [13]:
# Accuracy on the held-out test split.
clf.score(x_test, y_test)

0.735632183908046