In [9]:
import numpy as np
from random import randrange
import csv
import math
from collections import defaultdict as dt
import pandas as pd


def load_csv_dataset(filename):
    """Load a numeric CSV file into a list of float rows.

    The first line of the file is treated as a header and discarded.
    Completely blank lines are skipped so they do not turn into empty
    rows. Every remaining field is converted with ``float``.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each row being a list of floats.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed; newline='' is
    # the mode the csv module expects for files it reads.
    with open(filename, 'rt', newline='') as handle:
        reader = csv.reader(handle)
        next(reader, None)  # drop the header row; tolerate an empty file
        return [[float(value) for value in row] for row in reader if row]


def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as computed by ``np.mean``."""
    average = np.mean(numbers)
    return average


def stdev(numbers):
    """Return the population standard deviation of ``numbers``.

    ``np.std`` divides by ``N`` (``ddof=0``), which is spelled out here
    to make the choice visible.
    """
    spread = np.std(numbers, ddof=0)
    return spread


def sigmoid(z):
    """Return the logistic function ``1 / (1 + e**-z)`` of ``z``.

    The computation is split by sign so that ``math.exp`` is only ever
    called with a non-positive argument. This avoids the
    ``OverflowError`` that ``math.exp(-z)`` raises for large negative
    ``z``.

    Args:
        z: A real number.

    Returns:
        A float in the closed interval ``[0.0, 1.0]``.
    """
    if z >= 0:
        return 1.0 / (1.0 + math.exp(-z))
    # For negative z use the algebraically equal form e**z / (1 + e**z);
    # e**z underflows harmlessly to 0.0 instead of overflowing.
    exp_z = math.exp(z)
    return exp_z / (1.0 + exp_z)


def cross_validation_split(dataset, n_folds):
    """Randomly partition ``dataset`` into ``n_folds`` folds of equal size.

    Each fold holds ``int(len(dataset) / n_folds)`` rows drawn without
    replacement; rows left over after filling every fold are not placed
    in any fold. ``dataset`` itself is not modified.

    Args:
        dataset: Sequence of rows.
        n_folds: Number of folds to create.

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    remaining = list(dataset)  # work on a copy so the caller's list survives
    rows_per_fold = int(len(dataset) / n_folds)
    folds = []
    for _ in range(n_folds):
        # Draw rows one at a time, removing each from the pool once picked.
        picked = [
            remaining.pop(randrange(len(remaining)))
            for _ in range(rows_per_fold)
        ]
        folds.append(picked)
    return folds


def accuracy_metric(actual, predicted):
    """Return the percentage of positions where the two sequences agree.

    Args:
        actual: Sequence of true labels.
        predicted: Sequence of predicted labels, indexed in parallel
            with ``actual``.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``actual``
        yields 0.0 instead of raising ``ZeroDivisionError`` (this can
        happen when a cross-validation fold contains no rows).

    Raises:
        IndexError: If ``predicted`` is shorter than ``actual``.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct / float(total) * 100.0


def evaluate_algorithm(dataset, algorithm, n_folds):
    """Score ``algorithm`` on ``dataset`` with k-fold cross-validation.

    The dataset is split with ``cross_validation_split``. Each fold in
    turn is held out as the test set while the rows of all other folds
    form the training set. Test rows are copies whose last element (the
    label) is replaced by ``None`` before being handed to ``algorithm``.

    Args:
        dataset: Sequence of rows whose last element is the class label.
        algorithm: Callable invoked as ``algorithm(train_set, test_set)``;
            its return value is compared element-wise with the held-out
            labels.
        n_folds: Number of folds.

    Returns:
        A list with one ``[accuracy, confusion]`` pair per fold, where
        ``accuracy`` comes from ``accuracy_metric`` and ``confusion`` is
        the ``pd.crosstab`` of actual versus predicted labels.
    """
    folds = cross_validation_split(dataset, n_folds)
    results = []

    for held_out in folds:
        # Training data: every row from every fold except the held-out one.
        others = list(folds)
        others.remove(held_out)
        train_set = [row for other in others for row in other]

        # Test data: copies of the held-out rows with the label hidden.
        test_set = []
        for row in held_out:
            masked = list(row)
            masked[-1] = None
            test_set.append(masked)

        predicted = algorithm(train_set, test_set)
        actual = [row[-1] for row in held_out]

        accuracy = accuracy_metric(actual, predicted)
        confusion = pd.crosstab(
            pd.Series(actual, name='Actual'),
            pd.Series(predicted, name='Predicted'),
        )
        results.append([accuracy, confusion])

    return results



# Naive Bayes


def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their last element (the class label).

    Returns:
        A dict mapping each label to the list of rows carrying it, with
        rows kept in their original order.
    """
    grouped = {}
    for row in dataset:
        label = row[-1]
        grouped.setdefault(label, []).append(row)
    return grouped


def model(dataset):
    """Summarise every feature column of ``dataset`` as ``(mean, stdev)``.

    Statistics are computed for each column, then the entry for the last
    column (the class label) is dropped.

    Returns:
        A list of ``(mean, stdev)`` tuples, one per feature column.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    summaries.pop()  # the final column is the label, not a feature
    return summaries


def model_by_class(dataset):
    """Build one per-feature ``(mean, stdev)`` model for each class label.

    Returns:
        A dict mapping each class label found in ``dataset`` to the
        result of ``model`` applied to that class's rows.
    """
    return {
        label: model(rows)
        for label, rows in separate_by_class(dataset).items()
    }


def calculate_pdf(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value. When ``stdev`` is zero the distribution is
        treated as a point mass: 1.0 if ``x`` equals ``mean``, else 0.0.
    """
    if stdev == 0.0:
        # Degenerate distribution: avoid dividing by zero below.
        return 1.0 if x == mean else 0.0
    squared_deviation = math.pow(x - mean, 2)
    variance = math.pow(stdev, 2)
    exponent = math.exp(-(squared_deviation / (2 * variance)))
    scale = 1 / (math.sqrt(2 * math.pi) * stdev)
    return scale * exponent


def calculate_class_probabilities(models, input):
    """Compute an unnormalised likelihood of ``input`` for every class.

    For each class the Gaussian densities of the individual features
    (from ``calculate_pdf``) are multiplied together.

    Args:
        models: Dict mapping class label to a list of ``(mean, stdev)``
            tuples, one per feature.
        input: Row of feature values, indexed in parallel with the
            per-class feature list.

    Returns:
        A dict mapping each class label to the product of its feature
        densities.
    """
    likelihoods = {}
    for label, feature_stats in models.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= calculate_pdf(input[position], mu, sigma)
        likelihoods[label] = likelihood
    return likelihoods


def predict(models, inputVector):
    """Return the class label with the highest likelihood for ``inputVector``.

    Ties keep the class encountered first. ``None`` is returned when
    ``models`` contains no classes.
    """
    scores = calculate_class_probabilities(models, inputVector)
    winner, top_score = None, -1
    for label, score in scores.items():
        # Skip candidates that do not beat the current leader.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner


def getPredictions(models, testSet):
    """Return the predicted label for every row of ``testSet``, in order."""
    return [predict(models, row) for row in testSet]


def naive_bayes(train, test):
    """Fit per-class Gaussian models on ``train`` and predict labels for ``test``.

    Returns:
        A list with one predicted label per row of ``test``.
    """
    return getPredictions(model_by_class(train), test)

def confusion_matrix(actual, predicted):
    """Cross-tabulate ``actual`` (rows) against ``predicted`` (columns)."""
    table = pd.crosstab(index=actual, columns=predicted)
    return table

def main():
    """Run cross-validated Gaussian Naive Bayes on ``iris.csv`` and print results.

    Loads the dataset with ``load_csv_dataset``, evaluates ``naive_bayes``
    through ``evaluate_algorithm``, then prints the accuracy and the
    confusion matrix of every fold followed by the average accuracy.
    """
    filename = 'iris.csv'
    dataset = load_csv_dataset(filename)

    n_folds = 10

    print("---------- Gaussian Naive Bayes ---------------")

    # One [accuracy, confusion] pair per fold.
    fold_results = evaluate_algorithm(dataset, naive_bayes, n_folds)

    for fold_number, (accuracy, confusion) in enumerate(fold_results, start=1):
        print('\nFold {}:'.format(fold_number))
        print('Accuracy in this fold:', accuracy, '%')
        print('confusion matrix:')
        # Convert to a bare array so only the counts are printed.
        print(np.array(confusion))

    total_accuracy = sum(accuracy for accuracy, _ in fold_results)
    # The fold count is taken from n_folds so the message stays correct
    # if the number of folds is changed.
    print('\nAverage Accuracy for %d folds: %f'
          % (n_folds, total_accuracy / len(fold_results)))

    
    
# Note: no final (combined) confusion matrix is reported. Cross-validation randomly
# splits the dataset into train and test parts for every fold, so a confusion matrix
# is printed per fold, and a single combined matrix is considered meaningless here.


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



---------- Gaussian Naive Bayes ---------------

Fold 1:
Accuracy in this fold: 93.33333333333333 %
confusion matrix:
[[7 0 0]
 [0 3 0]
 [0 1 4]]

Fold 2:
Accuracy in this fold: 100.0 %
confusion matrix:
[[6 0 0]
 [0 4 0]
 [0 0 5]]

Fold 3:
Accuracy in this fold: 100.0 %
confusion matrix:
[[4 0 0]
 [0 5 0]
 [0 0 6]]

Fold 4:
Accuracy in this fold: 100.0 %
confusion matrix:
[[3 0 0]
 [0 4 0]
 [0 0 8]]

Fold 5:
Accuracy in this fold: 100.0 %
confusion matrix:
[[8 0 0]
 [0 4 0]
 [0 0 3]]

Fold 6:
Accuracy in this fold: 100.0 %
confusion matrix:
[[5 0 0]
 [0 6 0]
 [0 0 4]]

Fold 7:
Accuracy in this fold: 100.0 %
confusion matrix:
[[5 0 0]
 [0 5 0]
 [0 0 5]]

Fold 8:
Accuracy in this fold: 100.0 %
confusion matrix:
[[3 0 0]
 [0 8 0]
 [0 0 4]]

Fold 9:
Accuracy in this fold: 100.0 %
confusion matrix:
[[3 0 0]
 [0 7 0]
 [0 0 5]]

Fold 10:
Accuracy in this fold: 100.0 %
confusion matrix:
[[6 0 0]
 [0 4 0]
 [0 0 5]]

Average Accuracy for 10 folds: 99.333333
