In [136]:
import math
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [97]:
# Random binary dataset with n = 1000 data points and d = 2 feature dimensions.
# A seeded generator is used so the data, and everything fitted to it, is
# reproduced exactly after a kernel restart.
SEED = 0
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)
x1 = rng.integers(2, size=N_SAMPLES)
x2 = rng.integers(2, size=N_SAMPLES)
y = rng.integers(2, size=N_SAMPLES)

In [137]:
# Place the two 1-D feature vectors side by side: one row per data point,
# one column per feature.
X = np.column_stack((x1, x2))

In [138]:
class NaiveBayes:
    """Naive Bayes classifier for categorical features with on-demand Laplace smoothing.

    Fitted state (every structure is keyed by target category first):

    * ``prior_dict[target]`` -- prior probability of the target category.
    * ``dictOfLiklihoods[target][col][value]`` -- likelihood of seeing ``value``
      in column ``col`` given ``target``.
    * ``dictOfLiklihoodsNumerator`` / ``dictOfLiklihoodsDenominator`` -- the
      counts behind each likelihood, in the same layout. They are kept because
      Laplace smoothing has to be recomputed from counts, not from ratios.

    Caveat: smoothing is applied lazily. The first time ``predict`` meets a value
    that some target category has no likelihood for, the affected column is
    smoothed in place and stays smoothed, so later predictions use the smoothed
    table for that column.
    """

    def __init__(self, X, y, t):
        """Store the data and the smoothing strength.

        Args:
            X: 2-D array-like of categorical features, one row per data point.
            y: 1-D array-like of target categories, one per row of ``X``.
            t: pseudo-count added to every category count when a column is
                Laplace-smoothed.
        """
        self.X = X
        self.y = y
        self.t = t

    def splitData(self):
        """Split ``X``/``y`` 70/30 into train and test attributes (fixed seed)."""
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=0.3, random_state=15
        )

    def probability(self, X, prior, dist1, dist2):
        """Return ``prior * dist1 * dist2``; ``X`` is accepted but not used."""
        return prior * dist1 * dist2

    def computeLiklihoodArrayForCol(self, series):
        """Count each distinct value of ``series`` and turn the counts into ratios.

        Args:
            series: pandas Series holding one feature column, already restricted
                to the rows of a single target category.

        Returns:
            Tuple ``(numerators, denominators, probabilities)`` of dicts keyed by
            distinct value: the value's count, the series length, and their ratio.
        """
        liklihood_probability_dict = {}
        liklihood_numerator_dict = {}
        liklihood_denominator_dict = {}
        total = series.shape[0]
        for unique_value in series.unique():
            count = series[series == unique_value].shape[0]
            liklihood_numerator_dict[unique_value] = count
            liklihood_denominator_dict[unique_value] = total
            liklihood_probability_dict[unique_value] = count / total
        return liklihood_numerator_dict, liklihood_denominator_dict, liklihood_probability_dict

    def computeLiklihoodArrayForColSmoothing(self, series):
        """Return ``{value: relative frequency}`` for ``series``.

        Despite the name no smoothing is applied here; smoothing is done by
        ``laplaceSmoothing`` from the stored counts.
        """
        liklihood_dict = {}
        for unique_value in series.unique():
            liklihood_dict[unique_value] = (series[series == unique_value].shape[0]) / series.shape[0]
        return liklihood_dict

    # Likelihoods are stored in a dictionary of lists of dictionaries.
    # The outer key is the target category; for each target category there is a
    # list with one entry per column; each entry is a dictionary whose key is a
    # column category and whose value is the likelihood of seeing that category
    # in that column given the target category.
    # The numerators and denominators use the same layout; they are needed for
    # Laplace smoothing.
    def printLiklihoods(self):
        """Print every stored likelihood, grouped by target category and column."""
        for unique_value, perColumn in self.dictOfLiklihoods.items():
            for col_index, colLiklihoodDict in enumerate(perColumn):
                for colCategory, liklihood in colLiklihoodDict.items():
                    print('Liklihood of column {} having {} given target category {} is : {}'.format(col_index+1,colCategory,unique_value,liklihood))

    def fit(self):
        """Split the data, then estimate and print priors and likelihoods.

        Inputs are converted with ``np.asarray`` first so that pandas objects
        (whose index is shuffled by the split) cannot misalign features and
        targets.
        """
        self.splitData()
        target = pd.Series(np.asarray(self.y_train))
        targetCategories = target.unique()
        self.prior_dict = {}
        for unique_value in targetCategories:
            self.prior_dict[unique_value] = (target[target == unique_value].shape[0]) / target.shape[0]
        for unique_value in targetCategories:
            print('Prior probability for seeing {} is : {}'.format(unique_value,self.prior_dict[unique_value]))
        features = np.asarray(self.X_train)
        cols = ['X_' + str(i) for i in range(features.shape[1])]
        predictors = pd.DataFrame(features, columns=cols)
        self.dictOfLiklihoodsNumerator = {}
        self.dictOfLiklihoodsDenominator = {}
        self.dictOfLiklihoods = {}
        for unique_value in targetCategories:
            liklihoods = []
            numeratorLiklihood = []
            denominatorLiklihood = []
            for col in cols:
                liklihood_numerator, liklihood_denominator, liklihood_probability = self.computeLiklihoodArrayForCol(
                    predictors.loc[target == unique_value, col]
                )
                numeratorLiklihood.append(liklihood_numerator)
                denominatorLiklihood.append(liklihood_denominator)
                liklihoods.append(liklihood_probability)
            self.dictOfLiklihoodsNumerator[unique_value] = numeratorLiklihood
            self.dictOfLiklihoodsDenominator[unique_value] = denominatorLiklihood
            self.dictOfLiklihoods[unique_value] = liklihoods
        # Remember the unsmoothed counts so smoothing can always start from them.
        self._snapshotRawCounts()
        self.printLiklihoods()

    def _snapshotRawCounts(self):
        """Copy the current counts and per-column totals as the unsmoothed baseline."""
        self._rawCounts = {}
        self._rawTotals = {}
        for category, numeratorsPerCol in self.dictOfLiklihoodsNumerator.items():
            self._rawCounts[category] = [dict(numerators) for numerators in numeratorsPerCol]
            # Every entry of a denominator dict holds the same total; a column
            # with no entries has a total of 0.
            self._rawTotals[category] = [
                next(iter(denominators.values()), 0)
                for denominators in self.dictOfLiklihoodsDenominator[category]
            ]

    def isNewCategory(self, col_index, colCategory):
        """Return True when no target category has a likelihood for ``colCategory`` in the column."""
        for perColumn in self.dictOfLiklihoods.values():
            if colCategory in perColumn[col_index]:
                return False
        return True

    def _isUnseenForAnyTarget(self, col_index, colCategory):
        """Return True when at least one target category lacks a likelihood for ``colCategory``."""
        return any(
            colCategory not in perColumn[col_index]
            for perColumn in self.dictOfLiklihoods.values()
        )

    def predict(self, X):
        """Predict a target category for every row of ``X``.

        Before a row is scored, each of its values is checked against every
        target category. If any target category has no likelihood for the value,
        the column is Laplace-smoothed (see ``laplaceSmoothing``) so that all
        target categories are scored with a real likelihood. Skipping the factor
        for the target that never saw the value would treat it as probability 1
        and favour that target.

        Args:
            X: 2-D array-like with the same column order as the training data.

        Returns:
            List with one predicted target category per row.
        """
        X = np.asarray(X)
        targetCategories = list(self.dictOfLiklihoods.keys())
        numCols = len(next(iter(self.dictOfLiklihoods.values()), []))
        predictions = []
        for i in range(X.shape[0]):
            testPoint = X[i, :]
            for index in range(numCols):
                colCategory = testPoint[index]
                if not self._isUnseenForAnyTarget(index, colCategory):
                    continue
                brandNew = self.isNewCategory(index, colCategory)
                self.laplaceSmoothing(index, colCategory)
                if brandNew:
                    print('{} is a new category for column {}. Hence we are doing Laplace Smoothing.'.format(colCategory,index+1))
                else:
                    print('{} has not been seen in column {} for every target category. Hence we are doing Laplace Smoothing.'.format(colCategory,index+1))
                print('Liklihoods after Lapace Smoothing are as follows:')
                self.printLiklihoods()
            listLogProbabilities = []
            for category in targetCategories:
                logProbability = 0
                liklihoods = self.dictOfLiklihoods[category]
                for index in range(len(liklihoods)):
                    # With t == 0 a smoothed likelihood can be 0, giving -inf here.
                    logProbability += np.log(liklihoods[index][testPoint[index]])
                logProbability += np.log(self.prior_dict[category])
                listLogProbabilities.append(logProbability)
            predictionCateoryIndex = np.argmax(listLogProbabilities)
            predictions.append(targetCategories[predictionCateoryIndex])
        return predictions

    def laplaceSmoothing(self, col_index, newColCategory):
        """Smooth one column in place so ``newColCategory`` has a likelihood everywhere.

        For every target category and every category known for the column
        (across all target categories, plus ``newColCategory``)::

            likelihood = (raw count + t) / (raw total + K * t)

        where ``K`` is the number of known categories. The values are always
        recomputed from the unsmoothed counts, so calling this again for another
        new category does not stack ``t`` on top of counts that were already
        smoothed.

        Args:
            col_index: zero-based index of the column to smooth.
            newColCategory: the value that triggered smoothing.
        """
        t = self.t
        if not (hasattr(self, '_rawCounts') and hasattr(self, '_rawTotals')):
            # State was not produced by fit(); treat the current counts as raw.
            self._snapshotRawCounts()
        # Ordered set of every category known for this column.
        knownCategories = {}
        for perColumn in self.dictOfLiklihoods.values():
            knownCategories.update(dict.fromkeys(perColumn[col_index]))
        knownCategories.setdefault(newColCategory, None)
        numCategories = len(knownCategories)
        for category in self.dictOfLiklihoods:
            rawCounts = self._rawCounts[category][col_index]
            numerators = self.dictOfLiklihoodsNumerator[category][col_index]
            denominators = self.dictOfLiklihoodsDenominator[category][col_index]
            liklihoods = self.dictOfLiklihoods[category][col_index]
            denominator = self._rawTotals[category][col_index] + numCategories * t
            # Keep the existing display order and append categories that are new here.
            orderedCategories = list(liklihoods.keys())
            orderedCategories += [c for c in knownCategories if c not in liklihoods]
            for colCategory in orderedCategories:
                numerator = rawCounts.get(colCategory, 0) + t
                numerators[colCategory] = numerator
                denominators[colCategory] = denominator
                # denominator is 0 only for an empty column with t == 0.
                liklihoods[colCategory] = numerator / denominator if denominator else 0.0


In [139]:
# t = 20 is the amount added to each category count when a column is Laplace-smoothed.
clf = NaiveBayes(X=X, y=y, t=20)

In [140]:
# Same test_size and random_state as NaiveBayes.splitData, so this X_test should
# match the rows the classifier itself holds out.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=15,
)

In [141]:
# Splits the data internally, estimates the priors and per-column likelihoods,
# and prints them.
clf.fit()

Prior probability for seeing 1 is : 0.5028571428571429
Prior probability for seeing 0 is : 0.49714285714285716
Liklihood of column 1 having 0 given target category 1 is : 0.48295454545454547
Liklihood of column 1 having 1 given target category 1 is : 0.5170454545454546
Liklihood of column 2 having 0 given target category 1 is : 0.5227272727272727
Liklihood of column 2 having 1 given target category 1 is : 0.4772727272727273
Liklihood of column 1 having 0 given target category 0 is : 0.5373563218390804
Liklihood of column 1 having 1 given target category 0 is : 0.46264367816091956
Liklihood of column 2 having 0 given target category 0 is : 0.45689655172413796
Liklihood of column 2 having 1 given target category 0 is : 0.5431034482758621


In [142]:
# Predict labels for the held-out rows with the likelihoods as fitted above.
clf.predict(X_test)

[0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,


### Laplace Smoothing

In [144]:
# The features were generated from {0, 1} only, so the value 2 in the second
# column is unseen and sends predict() down its Laplace-smoothing path.
clf.predict(np.array([[1,2],[0,2]]))

2 is a new category for column 2. Hence we are doing Laplace Smoothing.
Liklihoods after Lapace Smoothing are as follows:
Liklihood of column 1 having 0 given target category 1 is : 0.48295454545454547
Liklihood of column 1 having 1 given target category 1 is : 0.5170454545454546
Liklihood of column 2 having 0 given target category 1 is : 0.49514563106796117
Liklihood of column 2 having 1 given target category 1 is : 0.4563106796116505
Liklihood of column 2 having 2 given target category 1 is : 0.04854368932038835
Liklihood of column 1 having 0 given target category 0 is : 0.5373563218390804
Liklihood of column 1 having 1 given target category 0 is : 0.46264367816091956
Liklihood of column 2 having 0 given target category 0 is : 0.4387254901960784
Liklihood of column 2 having 1 given target category 0 is : 0.5122549019607843
Liklihood of column 2 having 2 given target category 0 is : 0.049019607843137254


[1, 0]

In [145]:
# Same call as before the smoothing demo. Smoothing rewrote the stored
# likelihoods for column 2 in place, so these predictions can differ from the
# first run on X_test.
clf.predict(X_test)

[0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
