In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal
from sklearn.metrics import accuracy_score
from collections import defaultdict
from scipy.stats import norm
import operator
import time

In [14]:
# Generate Data
mean1 = [0,0]
cov1 = [[1,0],[0,1]]
dataSize1 = 20
gauss1 = np.random.multivariate_normal(mean1, cov1, dataSize1)
labels1 = np.full(dataSize1, 1)

mean2 = [2,2]
cov2 = [[1,0],[0,1]]
dataSize2 = 20
gauss2 = np.random.multivariate_normal(mean2, cov2, dataSize2)
labels2 = np.full(dataSize1, 2)

dataset = np.concatenate((gauss1, gauss2), axis=0)
labels = np.concatenate((labels1, labels2), axis=0)

In [3]:
# Split into test/train
data_train, data_test, labels_train, labels_test = train_test_split(dataset, labels, test_size=0.20, random_state=42)

In [5]:
class FeatureDistribution:
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma

In [6]:
class TrainingDataDistributions:
    def __init__(self):
        self.labelsDistributions = {}
        
    def addLabelDistribution(self, label, distributions):
        self.labelsDistributions[label] = distributions

    def getLabelProbabilities(self, datapoint):
        posteriorProbabilities = {}
        for label in self.labelsDistributions.keys():
            posteriorProbabilities[label] = CalculatePointProbabilityGivenDistributions(datapoint, self.labelsDistributions[label])
        return posteriorProbabilities

In [7]:
class NaiveBayesModel:

    def __init__(self):
        self.trainingDataDistributions = None
    
    def train(self, trainData, trainLabels):
        start_time = time.time()
        self.trainingDataDistributions = CalculateTrainingDataDistributions(trainData, trainLabels)
        return time.time() - start_time
        
    def test(self, testData, testLabels):
        if self.trainingDataDistributions is None:
            raise RunTimeError('You must train the model before testing')
            
        start_time = time.time()
        predictions = list(map(lambda testDatapoint:  NaiveBayes(self.trainingDataDistributions, testDatapoint), testData))
        accuracy = accuracy_score(testLabels.tolist(), predictions)
        timing = time.time() - start_time
        return accuracy, timing

In [8]:
def CalculateTrainingDataDistributions(data, labels):
    labelsDistributions = TrainingDataDistributions()
    for label in set(labels):
        labelData = data[np.where(labels == label)] 
        labelDistributions = CalculateLabelDistributions(labelData)
        labelsDistributions.addLabelDistribution(label, labelDistributions)
    return labelsDistributions

In [9]:
def CalculateLabelDistributions(labelData):
    labelDistributions = []
    for featureIndex in range(np.shape(labelData)[1]): # For each feature
        featureDistribution = CalculateDistributionForFeature(labelData[:,featureIndex])
        labelDistributions.append(featureDistribution)
    return labelDistributions

In [10]:
def CalculateDistributionForFeature(data):
    mu = np.mean(data)
    sigma = np.cov(data)
    return FeatureDistribution(mu, sigma)

In [11]:
def CalculatePointProbabilityGivenDistributions(datapoint, distributions):
    totalProbability = 1
    for featureIndex, feature in enumerate(datapoint):
        distribution = distributions[featureIndex]
        featureProbability = norm.pdf(datapoint[featureIndex], distribution.mu, distribution.sigma)
        totalProbability = totalProbability * featureProbability
    return totalProbability

In [12]:
def NaiveBayes(labelsDistributions, datapoint):
    posteriorProbabilities = labelsDistributions.getLabelProbabilities(datapoint)
    posteriorProbabilitiesSorted = sorted(posteriorProbabilities.items(), key=operator.itemgetter(1), reverse=True)
    biggestLabelAndProbability = posteriorProbabilitiesSorted[0]
    bestLabel = biggestLabelAndProbability[0]
    return bestLabel

In [13]:
model = NaiveBayesModel()
print(model.train(data_train, labels_train))
print(model.test(data_test, labels_test))

0.001001596450805664
(0.375, 0.014000177383422852)
