In [1]:
import csv
import random
import math
import pandas as pd
import numpy as np

In [2]:
# Load the data
def loadCSV(filename):
    """Read a CSV file into a pandas DataFrame.

    Args:
        - filename: path (or any source accepted by pd.read_csv) of the
          CSV file to load

    Returns:
        The DataFrame produced by pd.read_csv with its default options.
    """
    return pd.read_csv(filename)

# Split the data into a train/test set
def split_dataset(dataset, train_percent=.66, seed=None):
    """Randomly split a dataset into a training part and a test part.

    The rows of the dataset are shuffled and the first
    int(train_percent * len(dataset)) shuffled rows become the training
    set; the remaining rows become the test set.

    Args:
        - dataset: the data to split (anything np.random.permutation
          accepts that also supports len(), e.g. a 2-D array or DataFrame)
        - train_percent: fraction of the rows placed in the training set
        - seed: optional seed for a reproducible split. When None (the
          default) the global NumPy random state is used, exactly as
          before. When given, a private RandomState is used so the global
          random state is left untouched.

    Returns:
        A (train, test) tuple of NumPy arrays.
    """
    if seed is None:
        permute = np.random.permutation
    else:
        permute = np.random.RandomState(seed).permutation

    # permutation() returns a shuffled copy; the input is not modified.
    shuffled = permute(dataset)

    # int() truncates, so the training set is rounded down.
    train_end = int(train_percent * len(dataset))
    return shuffled[:train_end], shuffled[train_end:]

In [19]:
class NaiveBayes(object):
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent Gaussian whose
    mean and standard deviation are estimated from the training data. A
    datapoint is assigned to the class with the highest log-likelihood.

    Args:
        - use_priors: if True, the log of each class's training frequency
          is added to its score. Defaults to False, in which case every
          class is treated as equally likely beforehand.

    Attributes (populated by train()):
        - groupClass: dict mapping label -> list of training instances
        - stats: dict mapping label -> [mean, std] per-feature arrays
        - priors: dict mapping label -> fraction of training instances
    """

    # Lower bound applied to standard deviations when evaluating
    # log-densities, so a constant feature (std == 0) does not produce
    # a division by zero / NaN likelihood.
    _MIN_STD = 1e-9

    def __init__(self, use_priors=False):
        self.groupClass = None
        self.stats = None
        self.priors = None
        self.use_priors = use_priors

    ################################################
    ########## Training Methods ####################
    ################################################
    def train(self, x, y):
        """ Fit the model.

        Groups the datapoints by label and then computes the
        per-class statistics.

        Args:
            - x: the dataset
            - y: the labels
        """
        self.splitClasses(x, y)
        self.getStats()

    def splitClasses(self, x, y):
        """ Split the classes by their labels

        Create a dictionary which will contain the data for each
        label set. The result is stored in self.groupClass.

        Args:
            - x: the dataset
            - y: the labels
        """
        # Dictionary which will hold all the groups
        # of data for each label.
        groupClass = {}

        # For each datapoint place it in the correct
        # dictionary bin.
        for instance, label in zip(x, y):
            groupClass.setdefault(label, []).append(instance)
        self.groupClass = groupClass

    def getStats(self):
        """ Get the statistics.

        For each label group in self.groupClass compute the mean and
        the standard deviation of every feature (the parameters of
        the Gaussian we assume for it), plus the fraction of the
        training instances that carry that label.

        The results are stored in self.stats (label -> [mean, std])
        and self.priors (label -> fraction).
        """
        stats = {}
        priors = {}
        total = sum(len(group) for group in self.groupClass.values())

        # For each label in the dataset
        for label, group in self.groupClass.items():
            data = np.asarray(group)
            # Axis 0 runs over the datapoints, so we get one value
            # per feature.
            stats[label] = [np.mean(data, 0), np.std(data, 0)]
            priors[label] = len(group) / float(total)
        self.stats = stats
        self.priors = priors

    ################################################
    ########## Predicting Methods ##################
    ################################################

    def score(self, x, y):
        """ Get the fraction of correctly labeled values.

        For a dataset x and labels y, determine the fraction of
        labels correctly identified by our trained model. Works for
        any set of labels, not only 0/1.

        Args:
            - x: the dataset
            - y: the labels
        """
        pred = self.predict(x)
        return np.mean(pred == np.asarray(y))

    def predict(self, x):
        """ Predict the labels for a dataset

        Args:
            - x: the dataset

        Returns:
            An array with one predicted label per datapoint. The
            values are the training labels themselves; for labels
            0..K-1 these are numerically the same as the column
            indices that used to be returned.
        """
        # Fix a column order for the labels. Sorting keeps ties
        # resolving towards the smallest label.
        classes = sorted(self.stats)

        x = np.asarray(x, dtype=float)
        if x.ndim == 1:
            # A flat input is a sequence of scalar datapoints; each
            # scalar is broadcast against every feature.
            x = x[:, None]

        # We will identify the log-likelihood of each datapoint
        # belonging to each different label class.
        # scores shape = (#datapoints X #labels)
        scores = np.empty((len(x), len(classes)))
        for col, label in enumerate(classes):
            mean, std = self.stats[label]
            # The joint density of independent Gaussians is their
            # product; summing logs instead avoids underflow to 0
            # when there are many features.
            scores[:, col] = np.sum(self._logGaussian(x, mean, std), axis=-1)
            if self.use_priors:
                scores[:, col] += np.log(self.priors[label])
        return np.asarray(classes)[np.argmax(scores, 1)]

    def _logGaussian(self, x, mean, std):
        """ Log of the Gaussian density, with std floored at _MIN_STD. """
        std = np.maximum(std, self._MIN_STD)
        return (-0.5 * np.log(2 * math.pi) - np.log(std)
                - np.power(x - mean, 2) / (2 * np.power(std, 2)))

    def calculateGaussian(self, x, mean, std):
        """ Calculate the probability density of a Gaussian

        Calculate p(x) of a Gaussian with a given
        mean and standard deviation. No guard is applied here:
        a std of 0 results in a division by zero.

        Args:
            - x: Input value
            - mean: The mean of the Gaussian
            - std: The standard deviation of the Gaussian
        """
        exponent = np.exp(-1*(np.power(x-mean,2)/(2*np.power(std,2))))
        return (1 / (np.sqrt(2*math.pi) * std)) * exponent

In [20]:
# Read the raw data.
# NOTE(review): read_csv is called with its defaults, so the first line of
# the file is taken as the header row - confirm the CSV really has one,
# otherwise the first record is lost.
filename = 'indians.csv'
dataset = loadCSV(filename)
print(dataset.shape)

# Shuffle and split; the last column holds the label, everything before
# it is a feature.
train, test = split_dataset(dataset)
x_train, x_test = train[:, :-1], test[:, :-1]
y_train, y_test = train[:, -1], test[:, -1]

print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

# Fit the classifier on the training split.
clf = NaiveBayes()
clf.train(x_train, y_train)

# Accuracy on the data the model was trained on.
clf.score(x_train, y_train)


(767, 9)
(506, 8)
(506,)
(261, 8)
(261,)


0.73122529644268774

In [21]:
# Accuracy on the held-out test split (rows not used for training).
clf.score(x_test, y_test)

0.77011494252873558