In [41]:
import pandas as pd 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
import numpy as np
from random import randrange
from math import sqrt
from math import exp
from math import pi
%matplotlib inline

# Load the raw table. With header=None pandas labels the columns with
# consecutive integers, so the .loc slices below select by integer label
# (label slices are inclusive on both ends).
data = pd.read_csv("multiclass_data.csv", header=None)
X = data.loc[:, 0:4]  # columns 0 through 4
y = data.loc[:, 5]    # column 5

# Split once with a fixed seed, then rebind each of the four pieces to a plain
# Python list (nested lists for X, flat lists for y).
X_train, X_test, y_train, y_test = (
    part.values.tolist()
    for part in train_test_split(X, y, test_size=0.33, random_state=42)
)

## My Naive Bayes

In [43]:
class Naive_Bayes:
    """Gaussian Naive Bayes classifier built on plain Python lists.

    ``fit`` stores, for every class label, one ``(mean, stdev, count)`` tuple
    per feature column in ``self.class_metrics``. ``predict`` scores each row
    against every class using the class prior multiplied by the Gaussian
    density of each feature, and returns the highest-scoring label.
    """

    def accuracy_metric(self, y_pred, y_test):
        """Return the fraction of positions where ``y_pred`` equals ``y_test``.

        Raises ``ZeroDivisionError`` when ``y_test`` is empty and
        ``IndexError`` when ``y_pred`` is shorter than ``y_test``.
        """
        correct = 0
        for i in range(len(y_test)):
            if y_test[i] == y_pred[i]:
                correct += 1
        return correct / float(len(y_test))

    @staticmethod
    def mean(numbers):
        """Return the arithmetic mean of a non-empty sequence of numbers."""
        return sum(numbers) / float(len(numbers))

    @staticmethod
    def stdev(numbers):
        """Return the sample standard deviation (n - 1 denominator).

        A single-element sequence has no spread to estimate, so 0.0 is
        returned instead of dividing by zero.
        """
        if len(numbers) == 1:
            return 0.0
        avg = Naive_Bayes.mean(numbers)
        variance = sum((x - avg) ** 2 for x in numbers) / float(len(numbers) - 1)
        return sqrt(variance)

    @staticmethod
    def calculate_probability(x, mean, stdev):
        """Return the Gaussian density of ``x`` for the given mean and stdev.

        When ``stdev`` is 0 the distribution is treated as a point mass at
        ``mean``: 1.0 if ``x`` equals the mean, otherwise 0.0. This avoids a
        division by zero for constant feature columns.
        """
        if stdev == 0:
            return 1.0 if x == mean else 0.0
        exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
        return (1 / (sqrt(2 * pi) * stdev)) * exponent

    def calculate_class_probabilities(self, row):
        """Return ``{label: score}`` for one feature row.

        The score is the class prior (class row count / total row count)
        multiplied by the Gaussian density of every feature value. Scores are
        unnormalized, so they do not sum to 1.
        """
        # Total number of training rows; every feature tuple of a class
        # carries the same row count, so the first one is used.
        total_rows = sum(metrics[0][2] for metrics in self.class_metrics.values())
        probabilities = dict()

        # Walk the stored metrics of each class and accumulate the score
        # that the row belongs to that class.
        for label, metrics in self.class_metrics.items():
            probabilities[label] = metrics[0][2] / float(total_rows)
            for i, (mean, stdev, _) in enumerate(metrics):
                probabilities[label] *= self.calculate_probability(row[i], mean, stdev)
        return probabilities

    def predict(self, X_test):
        """Return a list with the predicted label for every row of ``X_test``.

        The rows are only read; the caller's data is never modified.
        """
        # Kept as an attribute for callers that inspect it after predicting.
        self.test_set = X_test

        predictions = list()
        for row in X_test:
            probabilities = self.calculate_class_probabilities(row)
            best_label, best_prob = None, -1
            # Pick the class with the highest score for this row; on ties the
            # first class encountered wins.
            for label, probability in probabilities.items():
                if best_label is None or probability > best_prob:
                    best_prob = probability
                    best_label = label
            predictions.append(best_label)

        return predictions

    def fit(self, X_train, y_train):
        """Compute per-class, per-feature statistics from the training data.

        ``X_train`` is a sequence of feature rows and ``y_train`` the matching
        sequence of labels. The result is stored in ``self.class_metrics`` as
        ``{label: [(mean, stdev, count), ...]}`` with one tuple per feature.
        """
        # Group the training rows by class label.
        separated = dict()
        for i in range(len(X_train)):
            vector = X_train[i]
            class_value = y_train[i]
            if class_value not in separated:
                separated[class_value] = list()
            separated[class_value].append(vector)

        # Mean, standard deviation and row count of every feature column, per
        # class. All columns are kept: the labels arrive separately in
        # y_train, so no column of X_train is a label to be dropped.
        self.class_metrics = dict()
        for class_value, rows in separated.items():
            self.class_metrics[class_value] = [
                (self.mean(column), self.stdev(column), len(column))
                for column in zip(*rows)
            ]


# Train the hand-written classifier on the list-based training split and
# score its predictions on the held-out split.
dataset = data  # alias of the loaded frame; not read by any statement in this cell
nb = Naive_Bayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
scores = nb.accuracy_metric(y_pred, y_test)
# The message is Serbian for "Accuracy on the test set".
print('Tacnost na testirajucem skupu: %s' % scores)

Tacnost na testirajucem skupu: 0.9491525423728814


## Built-in Naive Bayes

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Reference run with scikit-learn's GaussianNB on the same split.
# The split is recreated from X and y because X_train/X_test/y_train/y_test
# were rebound to plain Python lists earlier in the notebook; the same
# test_size and random_state reproduce the identical partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, but
# keeping the documented order stays correct if another metric is swapped in.
accuracy_score(y_test, y_pred)

0.9491525423728814