In [1]:
# Importing important libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math

In [2]:
import csv
def load_csv(file):
    """Read a purely numeric CSV file into a list of rows of floats.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty line of the file; each entry is
        a list holding that line's fields converted to ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float`` (for
            example a textual header row).
    """
    # The context manager closes the file even when a field fails to parse.
    # newline='' lets the csv module do its own line-ending handling.
    with open(file, 'rt', newline='') as handle:
        # csv.reader yields [] for blank lines; skip them so every returned
        # row actually holds data and can be indexed safely later on.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

# Load the dataset from a CSV file given by a relative path and report how
# many rows were read.
file = "diabetes.csv"
dataset = load_csv(file)
print('Loaded data from {0} with {1} rows'.format(file, len(dataset)))

Loaded data from diabetes.csv with 769 rows


In [3]:
from random import randrange
def partition_data(dataset, ratio, seed=None):
    """Randomly split ``dataset`` into a training set and a testing set.

    Args:
        dataset: Sequence of rows. It is copied, never modified.
        ratio: Fraction of the rows, between 0 and 1 inclusive, to place in
            the training set. The training set receives
            ``int(len(dataset) * ratio)`` rows.
        seed: Optional seed. When given, rows are drawn with a private
            generator seeded with this value, so the same seed always
            produces the same split. When ``None`` (the default) the
            module-level ``randrange`` is used.

    Returns:
        ``[train_set, test_set]``: two lists that together contain every row
        of ``dataset`` exactly once.

    Raises:
        ValueError: If ``ratio`` is not within ``[0, 1]``.
    """
    if not 0 <= ratio <= 1:
        raise ValueError('ratio must be between 0 and 1, got {0!r}'.format(ratio))

    if seed is None:
        pick = randrange
    else:
        # A private generator keeps a seeded split from disturbing (or being
        # disturbed by) the global random state.
        from random import Random
        pick = Random(seed).randrange

    train_size = int(len(dataset) * ratio)
    test_set = list(dataset)
    train_set = []

    # Move randomly chosen rows out of the test pool until the training set
    # has reached its target size.
    while len(train_set) < train_size:
        train_set.append(test_set.pop(pick(len(test_set))))

    return [train_set, test_set]

# Split the loaded rows: a 0.67 fraction goes to the training set and the
# remaining rows form the testing set; then report the three sizes.
train_set, test_set = partition_data(dataset, 0.67)
print('Split total data ({0} rows) into training set ({1} rows) and testing set ({2} rows)'.format(len(dataset), len(train_set), len(test_set)))
    

Split total data (769 rows) into training set (515 rows) and testing set (254 rows)


In [4]:
def group_by_class(dataset):
    """Bucket rows by their class label.

    The last value of each row is taken as the label (converted with
    ``int``); the remaining values of the row are stored under that label.

    Args:
        dataset: Iterable of rows whose final element is the class label.

    Returns:
        A dict mapping each label to the list of label-stripped rows, in the
        order the rows were encountered.
    """
    buckets = {}
    for row in dataset:
        # setdefault creates the bucket the first time a label is seen.
        buckets.setdefault(int(row[-1]), []).append(row[:-1])
    return buckets

# Group the training rows by class label and report how many rows fall into
# each class.
classified_set = group_by_class(train_set)

for klass, data_points in classified_set.items():
    print('Class {0} contains {1} data points'.format(klass, len(data_points)))

Class 0 contains 334 data points
Class 1 contains 181 data points


In [5]:
def mean(n):
    """Return the arithmetic mean of the numbers in ``n``.

    Raises:
        ZeroDivisionError: If ``n`` is empty.
    """
    total = sum(n)
    return total / float(len(n))

def stdev(n):
    """Return the sample standard deviation of the numbers in ``n``.

    The sum of squared deviations from the mean is divided by
    ``len(n) - 1`` before the square root is taken.

    Raises:
        ZeroDivisionError: If ``n`` has exactly one element (the divisor is
            zero) or is empty.
    """
    centre = mean(n)
    squared_deviations = sum((value - centre) ** 2 for value in n)
    divisor = float(len(n) - 1)
    return math.sqrt(squared_deviations / divisor)

In [6]:
import multiprocessing as mp

def format_calc(t):
    """Summarise a sequence of numbers as a ``(mean, stdev)`` tuple."""
    average = mean(t)
    spread = stdev(t)
    return average, spread

def prepare_data(dataset):
    """Compute per-feature ``(mean, stdev)`` summaries for every class.

    Args:
        dataset: Dict mapping a class label to the list of rows belonging to
            that class.

    Returns:
        A dict mapping each class label to a list with one
        ``(mean, stdev)`` tuple per feature column, in column order.

    NOTE(review): the work is handed to a process pool, so the workers must
    be able to import ``format_calc``; per the multiprocessing docs this may
    not hold for functions defined in an interactive session -- confirm on
    the target platform.
    """
    summary = {}
    pool = mp.Pool(mp.cpu_count())
    try:
        for klass, data_points in dataset.items():
            # zip(*rows) transposes the rows into feature columns, so each
            # worker call summarises one column.
            summary[klass] = pool.map(format_calc, zip(*data_points))
    finally:
        # Shut the workers down even when a column fails to summarise;
        # otherwise the processes would be left running.
        pool.close()
        pool.join()
    return summary

# Build the per-class list of (mean, stdev) tuples from the grouped training
# rows, then report how many tuples each class has.
summary_set = prepare_data(classified_set)

for klass, tupl in summary_set.items():
    print('Class {0} contains {1} tuples'.format(klass, len(tupl)))

Class 0 contains 8 tuples
Class 1 contains 8 tuples


In [7]:
def gauss(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    squared_distance = math.pow(x - mean, 2)
    twice_variance = 2 * math.pow(stdev, 2)
    ex = math.exp(-(squared_distance / twice_variance))
    normaliser = math.sqrt(2 * math.pi) * stdev
    return (1 / normaliser) * ex

In [8]:
def predict(summary_set, data_point):
    """Return the class whose feature summaries best match ``data_point``.

    Each class score starts at 1 and is multiplied by the Gaussian density
    of every summarised feature; the class with the highest score wins.

    Args:
        summary_set: Dict mapping a class label to a sequence of
            ``(mean, stdev)`` pairs, one per feature.
        data_point: Indexable row; position ``i`` is scored against the
            ``i``-th pair of each class.

    Returns:
        The key of ``summary_set`` with the largest score.
    """
    scores = {}
    for label, feature_stats in summary_set.items():
        likelihood = 1
        for position, (mu, sigma) in enumerate(feature_stats):
            likelihood *= gauss(data_point[position], mu, sigma)
        scores[label] = likelihood
    return max(scores, key=scores.get)

In [10]:
def get_accuracy(summary_set, test_set):
    """Return the percentage of rows in ``test_set`` predicted correctly.

    A row counts as correct when its last element equals the class returned
    by ``predict`` for that row.

    Args:
        summary_set: Per-class feature summaries passed on to ``predict``.
        test_set: Sequence of rows whose final element is the true class.

    Returns:
        The accuracy as a float between 0 and 100.

    Raises:
        ZeroDivisionError: If ``test_set`` is empty.
    """
    hits = sum(
        1 for row in test_set
        if row[-1] == predict(summary_set, row)
    )
    return hits / float(len(test_set)) * 100

# Score the class summaries against the testing set and print the accuracy
# percentage rounded to two decimals.
accuracy = get_accuracy(summary_set, test_set)

print('The Naive-Bayes Model yields {0}% accuracy'.format(round(accuracy, 2)))

The Naive-Bayes Model yields 72.83% accuracy
