In [12]:
import csv
import numpy as np
import matplotlib.pyplot as plt
def loadCsv(filename):
	lines = csv.reader(open(filename, "rb"))
	dataset = list(lines)
	del dataset[0] #delete row headings
	for i in range(len(dataset)):
		dataset[i] = [float(x) for x in dataset[i]]
		del dataset[i][0] #delete first element the row because it contains the observation number, which is not needed
	return dataset

import random
def splitDataset(dataset, splitRatio):
	trainSize = int(len(dataset) * splitRatio) #calculate lenth of training size.  this will set an upper limit on 
	trainSet = []
	copy = list(dataset) #convert each element in the list to a list
	while len(trainSet) < trainSize:
		index = random.randrange(len(copy))
		trainSet.append(copy.pop(index))
	return trainSet, copy

def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i] #list i into vector
        if (vector[0] not in separated): #if the last element in the vector is not in seperated dictionary..
            separated[vector[0]] = [] #create key-value for last element.  value is empty list
        separated[vector[0]].append(vector) #append list i to the list for the value
    return separated

import math
def mean(numbers):
	return sum(numbers)/float(len(numbers)) #return average (use float in denominator to get decimal number for average)
 
def stdev(numbers):
	avg = mean(numbers) #assign average to avg
	variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)#calculate variance
	return math.sqrt(variance) #calculate standard deviation by square rooting variance

#lets combine the above into a function
def summarize(dataset):
	summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]    
	del summaries[0] #delete because summaries[0] contains the target variable, in this example (target variable is usually in the beginning or ending column)
	return summaries

def summarizeByClass(dataset):
	separated = separateByClass(dataset)
	summaries = {}
	for classValue, instances in separated.iteritems():
		summaries[classValue] = summarize(instances)
	return summaries

import math
def calculateProbability(x, mean, stdev):
	exponent = math.exp((-(x-mean)**2)/(2*stdev**2))
	return (1 / ((2*math.pi)**(0.5) * stdev)) * exponent

def calculateClassProbabilities(summaries, inputVector):
	probabilities = {}
	for classValue, classSummaries in summaries.iteritems(): #classValue is the key 0 or 1.  classSummaries contains mean and standard deviation i.e. (mean, standard deviation) for a variable/attribute/column 
		probabilities[classValue] = 1 #set the initial probability to 1 so that it can be multiplied later on
		for i in range(len(classSummaries)):
			mean, stdev = classSummaries[i] #classSummaries[i] contains mean and standard deviation for a variable/attribute/column.  split the mean and standardivation into seperate variables called mean and standard deviation.
			x = inputVector[i] #inputvector[i] is variable/attribute value for a particular observation
			probabilities[classValue] *= calculateProbability(x, mean, stdev) #calculate probability that the variable/attribute for a particular observation belongs to each class (class in this example is 0 or 1). Do that for each variable/attribute/column and multiply all the probabilities together.  the final result is the probability that the attribute/variable/column belongs each class. 
	return probabilities

def predict(summaries, inputVector):
	probabilities = calculateClassProbabilities(summaries, inputVector)
	bestLabel, bestProb = None, None
	for classValue, probability in probabilities.iteritems(): #loop through each key-value pair in probabilities
		if bestLabel is None or probability > bestProb: #compare probability for a class with the best probability thus far.  If the probability is higher then..
			bestProb = probability
			bestLabel = classValue #remember that class
	return bestLabel #return the class with the highest probability

def getPredictions(summaries, testSet):
	predictions = []
	for i in range(len(testSet)):
		result = predict(summaries, testSet[i])
		predictions.append(result)
	return predictions

def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x][0] == predictions[x]: #if the class, which in this case, is the last element of the row equals the prediction then..
			correct += 1 #count number of corrections
	return (correct/float(len(testSet))) * 100.0 #calculate accuracy by using number of corrections divided by total number of rows in dataset

def main():
    filename = 'Flying_Fitness.csv'
    splitRatio = 0.67 #training split
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    testTarget = []
    for i in testSet:
        testTarget.append(i[0])
    print "Test Target: ", testTarget
    summaries = summarizeByClass(trainingSet)
    threshold = 0.5
    predictions = getPredictions(summaries, testSet)
    print "Predictions: ",predictions
    accuracy = getAccuracy(testSet, predictions)
    print "Accuracy: ",accuracy


main()

Test Target:  [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Predictions:  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Accuracy:  85.7142857143
