In [1]:
import csv
import math

In [2]:
def load_data(filename):
    with open(filename,'r') as dataset:
        buffer = csv.reader(dataset)
        data = list(buffer)
        for i in range(len(data)):
            data[i] = [float(x) for x in data[i]]
    return data

In [3]:
def divide(x,y):
    if y == 0:
        return 0
    else:
        return x / y

In [4]:
def mean(numbers):
    return sum(numbers)/len(numbers)

In [23]:
def stddev(numbers):
    avg = mean(numbers)
    var = divide(sum([math.pow((x-avg),2) for x in numbers]),len(numbers)-1)
    return math.sqrt(var)

In [6]:
def separate_by_class(dataset):
    classified = {}
    for row in dataset:
        if row[-1] not in classified:
            classified[row[-1]] = []
        classified[row[-1]].append(row)
    return classified

In [19]:
def split_data(dataset,train_ratio):
    length = int(len(dataset)*train_ratio)
    test = list(dataset)
    train = []
    while len(train) < length:
        train.append(test.pop(0))
    return (train,test)

In [8]:
def summarize(dataset):
    summaries = [(mean(a),stddev(a)) for a in zip(*dataset)]
    del summaries[-1]
    return summaries

In [21]:
def summarize_by_class(dataset):
    classified = separate_by_class(dataset)
    summaries = {}
    for c,i in classified.items():
        summaries[c] = summarize(i)
    return summaries

In [25]:
def probability(x,mean,stddev):
    e = math.exp(-1 * divide(math.pow((x-mean),2),(2*(math.pow(stddev,2)))))
    final = divide(-1,math.sqrt(2*stddev*math.pi)) * e
    return final

In [11]:
def calculate_class_probabilities(summaries,instance):
    probabilities = {}
    for classValue,summaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(summaries)):
            mean,stddev = summaries[i]
            x = instance[i]
            probabilities[classValue] *= probability(x,mean,stddev)
    return probabilities

In [12]:
def predict(summaries,instance):
    probabilities = calculate_class_probabilities(summaries,instance)
    bestLabel,bestProb = None,-1
    for classValue,probability in probabilities.items():
        if probability > bestProb or bestLabel == None:
            bestLabel = classValue
            bestProb = probability
    return bestLabel

In [13]:
def get_predictions(summaries,test):
    predictions = []
    for i in range(len(test)):
        r = predict(summaries,test[i])
        predictions.append(r)
    return predictions

In [29]:
def accuracy(predictions,test):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return divide(correct,float(len(test))) * 100.0

In [30]:
dataset = load_data("dataset/data.csv")

train,test = split_data(dataset,train_ratio = 0.7)

summaries = summarize_by_class(train)

predictions = get_predictions(summaries,test)

accuracy = accuracy(predictions,test)


In [31]:
accuracy

60.0

In [32]:
actual = [r[-1] for r in test ]

In [33]:
actual

[10.0, 10.0, 10.0, 10.0, 5.0]

In [34]:
predictions

[10.0, 5.0, 5.0, 10.0, 5.0]