In [1]:
import csv
import random
import pandas
#import numpy as np
import math
import operator

with open('earthquakes-asia.csv', 'r') as csvfile:
    lines = csv.reader(csvfile)
    #for row in lines:
        #print(', '.join(row))

In [2]:
def loadDataset(filename, split, trainingSet=[] , testSet=[]):
    with open(filename, 'r') as csvfile:
        colnames = ['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22']
        data = pandas.read_csv(filename,names=colnames)
        dataset = list(zip(data['2'], data['3'], data['5']))
        for x in range(len(dataset)):
            dataset[x] = list(dataset[x])
        for x in range(len(dataset)):
            for y in range(3):
                dataset[x][y] = float(dataset[x][y])
            if random.random() < split:
                trainingSet.append(dataset[x])
            else:
                testSet.append(dataset[x])

In [3]:
trainingSet = []
testSet = []
loadDataset(r'earthquakes-asia.csv', 0.66, trainingSet, testSet)
print('Train: ' + repr(len(trainingSet)))
print('Test: ' + repr(len(testSet)))

Train: 9971
Test: 5078


In [4]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

In [5]:
data1 = [2, 2, 2, 'a']
data2 = [4, 4, 4, 'b']
distance = euclideanDistance(data1, data2, 3)
print('Distance: ' + repr(distance))

Distance: 3.4641016151377544


In [6]:
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)
    for x in range(len(trainingSet)):
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors

In [7]:
trainSet = [[2, 2, 2, 'a'], [4, 4, 4, 'b']]
testInstance = [5, 5, 5]
k = 1
neighbors = getNeighbors(trainSet, testInstance, k)
print(neighbors)

[[4, 4, 4, 'b']]


In [8]:
def getResponse(neighbors, k):
    classVotes = {}
    total = 0
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        total = total + response
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
        sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return round(total/k, 1)

In [9]:
neighbors = [[1, 1, 1, 1], [2, 2, 2, 4], [3, 3, 3, 10]]
print(getResponse(neighbors, 3))

5.0


In [10]:
def getError(testSet, predictions):
    for x in range(len(testSet)):
        error = abs((predictions[x] - testSet[x][-1]) / testSet[x][-1])
    return (error / float(len(testSet))) * 100.0

In [11]:
testSet = [[1, 1, 1, 1], [2, 2, 2, 1], [3, 3, 3, 2]]
predictions = [1, 1, 1]
accuracy = getError(testSet, predictions)
print(accuracy)

16.666666666666664


In [12]:
def main():
    # prepare data
    trainingSet = []
    testSet = []
    split = 0.67
    loadDataset('earthquakes-asia.csv', split, trainingSet, testSet)
    print ('Train set: ' + repr(len(trainingSet)))
    print ('Test set: ' + repr(len(testSet)))
    # generate predictions
    predictions= []
    k = 3
    for x in range(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors, k)
        predictions.append(result)
        #print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getError(testSet, predictions)
    print('Average Percent Error: ' + repr(accuracy) + '%')

In [13]:
main()

Train set: 10069
Test set: 4980
Average Percent Error: 0.00037887398651208475%
