In [186]:
import csv
import math
import random

In [187]:
#ETL
def loadcsv() :
    line = csv.reader(open(r'C:\datasets\pima-indians_diabetes.csv'))
    dataset = list(line)

    #set all data to float (make sure they are not string!)
    #for i in range(len(dataset)) :
    for i in range(dataset[1:]) :
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset

In [188]:
#split data for training
def splitdataset(dataset, splitratio) :
    trainsize = int(len(dataset) * splitratio)
    trainset = []
    copy = list(dataset)
    while len(trainset) < trainsize :
        index = random.randrange(len(copy))
        trainset.append(copy.pop(index))
    return [trainset, copy]

In [189]:
def separatebyclass(dataset) :
    separated = {}
    for i in range(len(dataset)) :
        vector = dataset[i]
        if (vector[-1] not in separated) :
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    return separated

In [190]:
def mean(numbers) :
    return sum(numbers)/float(len(numbers))

def stdev(numbers) :
    avg = mean(numbers)
    variance = sum([pow(x-avg, 2) for x in numbers])/float(len(numbers)-1)
    return math.sqrt(variance)

In [191]:
def summarize(dataset) :
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries

In [192]:
def summarizebyclass(dataset) :
    separated = separatebyclass(dataset)
    summaries = {}
    for classvalue, instances in separated.items() :
        summaries[classvalue] = summaries(instances)
    return summaries

In [193]:
def calculateprobability(x, mean, stdev) :
    exponent = math.exp(-(math.pow(x-mean, 2)/(2*math.pow(stdev, 2))))
    return (1/(math.sqrt(2*math.pi)*stdev))*exponent

In [194]:
def calculateclassproabilities(summaries, inputvector) :
    probabilities = {}
    for classvalue, classsummaries in summaries.items() :
        probabilities[classvalue] = 1
        for i in range(len(classsummaries)) :
            mean, stdev = classsummaries[i]
            x = inputvector[i]
            probabilities[classvalue] *= calculateprobability(x, mean, stdev)
        return probabilities

In [195]:
def predict(summaries, inputvector) :
    probability = calculateclassproabilities(summaries, inputvector)
    bestlabel, bestprob = None, -1
    for classvalue, probability in probability.items() :
        if bestlabel is None or probability > bestprob:
            bestprob = probability
            bestlabel = classvalue
    return bestlabel

In [196]:
def getpredictions(summaries, testset) :
    predictions = []
    for i in range(len(testset)) :
        result = predict(summaries, testset[i])
        predictions.append(result)
    return predictions

In [197]:
def getaccuracy(testset, predictions) :
    correct = 0
    for x in range(len(testset)) :
        if testset[x][-1] == predictions[x] :
            correct += 1
    return (correct/float(len(testset)))*100.0

In [198]:
def main() :
    #filename = 'pima-indians_diabetes.csv'
    splitratio = 0.67
    dataset = loadcsv()
    trainingset, testset = splitdataset(dataset, splitratio)
    print('Split {0} rows into train = {1} and test = {2} rows'.format(len(dataset), len(trainingset), len(testset)))

    summaries = summarizebyclass(trainingset)

    preddictions = getpredictions(summaries, testset)
    accuracy = getaccuracy(testset, preddictions)
    print('Accuracy: {0}%'.format(accuracy))

main()

TypeError: 'list' object cannot be interpreted as an integer