# Handling Data

In [27]:
import csv
import random
import math

In [28]:
def load_csv(filename):
    """Read a numeric CSV file and return its rows as lists of floats.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty CSV row; each entry is a list
        holding every field of that row converted with ``float``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If any field cannot be converted to ``float``.
    """
    # Open the path that was actually passed in (the previous version ignored
    # the argument) and let the context manager close the handle.
    # newline='' is what the csv module asks for when reading files.
    with open(filename, newline='') as handle:
        # csv.reader yields [] for blank lines; skip them so that every
        # returned row really carries values.
        return [[float(field) for field in row]
                for row in csv.reader(handle) if row]

In [29]:
def split_dataset(dataset, split_ratio):
    """Randomly partition ``dataset`` into a training part and a remainder.

    Args:
        dataset: Sequence of rows; it is copied, not modified.
        split_ratio: Fraction of the rows to place in the training part;
            the row count is ``int(len(dataset) * split_ratio)``.

    Returns:
        A two-element list ``[train_rows, remaining_rows]``.
    """
    remaining = list(dataset)
    wanted = int(len(dataset) * split_ratio)
    chosen = []
    # Each pass moves exactly one randomly selected row out of the pool.
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        chosen.append(remaining.pop(pick))
    return [chosen, remaining]

# Summarize the Data

In [30]:
def seperate_by_class(dataset):
    """Group rows by their last element.

    Args:
        dataset: Sequence of rows; the last item of each row is used as
            the grouping key.

    Returns:
        A dict mapping each distinct last item to the list of rows that end
        with it, in their original order.
    """
    grouped = {}
    for row in dataset:
        grouped.setdefault(row[-1], []).append(row)
    return grouped

In [31]:
def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

In [32]:
def std_dev(numbers):
    """Return the sample standard deviation of ``numbers``.

    The squared deviations from the mean are summed and divided by
    ``len(numbers) - 1`` before the square root is taken.
    """
    centre = mean(numbers)
    squared_deviations = [(value - centre) ** 2 for value in numbers]
    degrees_of_freedom = float(len(numbers) - 1)
    return math.sqrt(sum(squared_deviations) / degrees_of_freedom)

In [33]:
def summarize(dataset):
    """Compute ``(mean, std_dev)`` for every column except the last one.

    Args:
        dataset: Sequence of equally shaped rows.

    Returns:
        A list of ``(mean, std_dev)`` tuples, one per column, with the entry
        for the final column removed.
    """
    column_stats = []
    for column in zip(*dataset):
        column_stats.append((mean(column), std_dev(column)))
    # The statistics of the last column are computed and then discarded.
    del column_stats[-1]
    return column_stats

In [34]:
def summarize_by_class(dataset):
    """Return per-class column summaries for ``dataset``.

    The rows are grouped with ``seperate_by_class`` and each group is passed
    to ``summarize``.

    Returns:
        A dict mapping each class value to the result of ``summarize`` for
        the rows of that class.
    """
    return {
        label: summarize(rows)
        for label, rows in seperate_by_class(dataset).items()
    }

# Making Predictions

In [42]:
def calculate_probability(x, mean, std_dev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Point at which the density is evaluated.
        mean: Mean of the distribution.
        std_dev: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mean)^2 / (2 * std_dev^2)) / (sqrt(2 * pi) * std_dev)``.
    """
    squared_distance = math.pow(x - mean, 2)
    doubled_variance = 2 * math.pow(std_dev, 2)
    exponent = math.exp(-(squared_distance / doubled_variance))
    scale = math.sqrt(2 * math.pi) * std_dev
    return (1 / scale) * exponent

In [43]:
def calculate_class_probability(summaries, input_vector):
    """Multiply the per-attribute densities of ``input_vector`` for each class.

    Args:
        summaries: Dict mapping a class value to a list of
            ``(mean, std_dev)`` pairs, one per attribute.
        input_vector: Sequence whose i-th item is scored against the i-th
            pair of every class.

    Returns:
        A dict mapping each class value to the product of
        ``calculate_probability`` over its attribute pairs (``1`` when a
        class has no pairs).
    """
    scores = {}
    for label, attribute_stats in summaries.items():
        product = 1
        for position, (mu, sigma) in enumerate(attribute_stats):
            product *= calculate_probability(input_vector[position], mu, sigma)
        scores[label] = product
    return scores

In [44]:
def predict(summaries, input_vector):
    """Return the class value with the largest class probability.

    Args:
        summaries: Per-class attribute summaries, passed through to
            ``calculate_class_probability``.
        input_vector: The row to classify.

    Returns:
        The class value whose probability is highest; on ties the class
        encountered first wins. ``None`` when ``summaries`` is empty.
    """
    winner, top_score = None, -1
    scores = calculate_class_probability(summaries, input_vector)
    for label, score in scores.items():
        # Keep the current winner unless this score is strictly higher.
        if winner is not None and not score > top_score:
            continue
        winner, top_score = label, score
    return winner

In [45]:
def get_predictions(summaries, test_set):
    """Return the result of ``predict`` for every row of ``test_set``, in order."""
    return [predict(summaries, row) for row in test_set]

# Get Accuracy

In [46]:
def get_accuracy(test_set, predictions):
    """Return the percentage of rows whose last item equals its prediction.

    Args:
        test_set: Sequence of rows; the last item of each row is the
            expected label.
        predictions: Sequence of predicted labels, indexed like
            ``test_set``.

    Returns:
        ``matches / len(test_set) * 100`` as a float.
    """
    matches = sum(
        1
        for position, row in enumerate(test_set)
        if row[-1] == predictions[position]
    )
    return (matches / float(len(test_set))) * 100

# Main

In [47]:
def main():
    """Run the whole pipeline: load, split, summarize, predict, report.

    Loads the dataset from a fixed path, splits it 67/33 at random, builds
    the per-class summaries from the training part, predicts every row of
    the test part and prints the split sizes and the accuracy.
    """
    # Raw string: the Windows path contains backslashes, and '\d' is not a
    # valid escape sequence in a normal string literal. The runtime value is
    # the same as before, without the invalid-escape warning.
    filename = r'F:\dt\datasets_14370_19291_pima-indians-diabetes.csv'
    split_ratio = 0.67
    dataset = load_csv(filename)
    training_set, test_set = split_dataset(dataset, split_ratio)
    print(('split {0}rows into train = {1} and test = {2} rows').format(len(dataset), len(training_set), len(test_set)))
    summaries = summarize_by_class(training_set)
    prediction = get_predictions(summaries, test_set)
    accuracy = get_accuracy(test_set, prediction)
    print(('Accuracy: {0}%').format(accuracy))


# Run only when executed as a script or notebook cell, not when imported.
if __name__ == '__main__':
    main()

split 768rows into train = 514 and test = 254 rows
Accuracy: 66.92913385826772%
