In [1]:
import math
import operator
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

In [8]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)
 
def getNeighbors(trainingSet, testInstance, k):
    distances = []
    length = len(testInstance)-1
    print (testInstance)
    for x in range(len(trainingSet)):
        #print (testInstance,trainingSet[x])
        dist = euclideanDistance(testInstance, trainingSet[x], length)
        distances.append((trainingSet[x], dist))
    distances.sort(key=operator.itemgetter(1))
    neighbors = []
    for x in range(k):
        neighbors.append(distances[x][0])
    return neighbors
 
def getResponse(neighbors):
	classVotes = {}
	for x in range(len(neighbors)):
		response = neighbors[x][-1]
		if response in classVotes:
			classVotes[response] += 1
		else:
			classVotes[response] = 1
	sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
	return sortedVotes[0][0]
 
def getAccuracy(testSet, predictions):
	correct = 0
	for x in range(len(testSet)):
		if testSet[x] == predictions[x]:
			correct += 1
	return (correct/float(len(testSet))) * 100.0

In [3]:
#Getting the important
def get_important_features(data,label,thresh):
    correlation = []
    for col in data.columns:
        correlation.append((pearsonr(data[col],label)[0],col))

    return [x[1] for x in sorted(correlation) if x[0]>thresh]
    
def get_important_feature_data(data,important_features):
    important_feature_data = []
    #print important_features
    for index in range(tornado_data.shape[0]):
        row = [data[col][index] for col in important_features]
        important_feature_data.append(row)
    return important_feature_data

In [4]:
df = pd.read_csv('Tornadoes_SPC_1950to2015.csv')
tonadoes_1996 = df[df['yr'] >= 1996][['st','loss','closs']]


In [5]:
#Removing columns with string data and label
#print (tornado_data[0:12])
tornado_data = df.drop(['st','date','time','mag'],axis=1)
#Getting the labels
mag_data = df['mag']
tornado_data["mag"]=mag_data
print (tornado_data[0:12])

    om    yr  mo  dy  tz  stf  stn  inj  fat  loss  closs   slat   slon  \
0    1  1950   1   3   3   29    1    3    0   6.0    0.0  38.77 -90.22   
1    2  1950   1   3   3   17    2    3    0   5.0    0.0  39.10 -89.30   
2    3  1950   1   3   3   39    1    1    0   4.0    0.0  40.88 -84.58   
3    4  1950   1  13   3    5    1    1    1   3.0    0.0  34.40 -94.37   
4    5  1950   1  25   3   29    2    5    0   5.0    0.0  37.60 -90.68   
5    6  1950   1  25   3   17    3    0    0   5.0    0.0  41.17 -87.33   
6    7  1950   1  26   3   48    1    2    0   0.0    0.0  26.88 -98.12   
7    8  1950   2  11   3   48    2    0    0   4.0    0.0  29.42 -95.25   
8    9  1950   2  11   3   48    3   12    1   4.0    0.0  29.67 -95.05   
9   10  1950   2  11   3   48    4    5    0   5.0    0.0  32.35 -95.20   
10  11  1950   2  11   3   48    5    6    0   5.0    0.0  32.98 -94.63   
11  12  1950   2  12   3   48    6    8    1   4.0    0.0  33.33 -94.42   

     elat   elon   len  

In [6]:
important_features = get_important_features(tornado_data,mag_data,thresh=0.1)

In [7]:
important_feature_data = get_important_feature_data(tornado_data,important_features)
important_feature_data[0:12]

[[6.0, -90.219999999999999, 0, 3, 150, 9.5, 3],
 [5.0, -89.299999999999997, 0, 3, 130, 3.6000000000000001, 3],
 [4.0, -84.579999999999998, 0, 1, 10, 0.10000000000000001, 1],
 [3.0, -94.370000000000005, 1, 1, 17, 0.59999999999999998, 3],
 [5.0, -90.680000000000007, 0, 5, 300, 2.2999999999999998, 2],
 [5.0, -87.329999999999998, 0, 0, 100, 0.10000000000000001, 2],
 [0.0, -98.120000000000005, 0, 2, 133, 4.7000000000000002, 2],
 [4.0, -95.25, 0, 0, 400, 9.9000000000000004, 2],
 [4.0, -95.049999999999997, 1, 12, 1000, 12.0, 3],
 [5.0, -95.200000000000003, 0, 5, 100, 4.5999999999999996, 2],
 [5.0, -94.629999999999995, 0, 6, 67, 4.5, 2],
 [4.0, -94.420000000000002, 1, 8, 833, 8.0, 2]]

In [11]:
predictions = []
num_rows = mag_data.shape[0]
#print (mag_data.shape)
k = 5
for x in range(1000):
    #print (mag_data[x])
    neighbours = getNeighbors(important_feature_data[0:10000],important_feature_data[10000+x] , k)
    #print (neighbours)
    result = getResponse(neighbours)
    #print (result)
    predictions.append(result)
#print (type(predictions))
print (type(mag_data[10000:11000]))
mag = list (mag_data[10000:11000])
#mag = np.array(mag_data[10:12])
print(type(mag))
accuracy = getAccuracy(mag,predictions)
print (accuracy)

[0.0, -100.52, 0, 0, 17, 0.10000000000000001, 2]
[4.0, -76.579999999999998, 0, 5, 50, 2.0, 3]
[2.0, -102.84999999999999, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -88.980000000000004, 0, 0, 10, 11.4, 1]
[4.0, -90.0, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -89.719999999999999, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -99.620000000000005, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -94.579999999999998, 0, 0, 33, 0.20000000000000001, 0]
[0.0, -98.5, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -84.25, 0, 0, 30, 2.0, 2]
[4.0, -81.629999999999995, 0, 0, 33, 0.5, 2]
[0.0, -97.700000000000003, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -98.75, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -97.599999999999994, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -97.620000000000005, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -98.079999999999998, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -98.700000000000003, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -97.799999999999997, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -97.819999999999993, 0, 0

[0.0, -97.280000000000001, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -97.879999999999995, 0, 0, 10, 0.10000000000000001, 1]
[4.0, -96.579999999999998, 0, 1, 10, 0.10000000000000001, 1]
[2.0, -97.920000000000002, 0, 0, 10, 0.10000000000000001, 1]
[1.0, -97.799999999999997, 0, 0, 10, 0.10000000000000001, 0]
[2.0, -97.799999999999997, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -97.319999999999993, 0, 0, 10, 7.7000000000000002, 1]
[3.0, -97.319999999999993, 0, 0, 10, 0.10000000000000001, 1]
[6.0, -97.069999999999993, 0, 4, 333, 4.2999999999999998, 3]
[0.0, -99.230000000000004, 0, 0, 10, 0.10000000000000001, 0]
[5.0, -97.420000000000002, 0, 1, 10, 0.10000000000000001, 1]
[3.0, -98.069999999999993, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -94.829999999999998, 0, 0, 10, 0.10000000000000001, 0]
[5.0, -98.120000000000005, 0, 0, 67, 5.0, 3]
[0.0, -97.950000000000003, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -96.849999999999994, 0, 3, 167, 2.0, 3]
[0.0, -97.519999999999996, 0, 0, 10, 0.10000000000000001

[4.0, -86.219999999999999, 0, 0, 10, 1.0, 1]
[4.0, -87.700000000000003, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -88.079999999999998, 0, 0, 10, 12.1, 2]
[0.0, -87.719999999999999, 0, 0, 10, 0.10000000000000001, 2]
[4.0, -87.150000000000006, 0, 0, 10, 0.10000000000000001, 1]
[4.0, -87.519999999999996, 0, 0, 10, 0.10000000000000001, 2]
[5.0, -86.170000000000002, 0, 0, 10, 0.10000000000000001, 1]
[3.0, -85.700000000000003, 0, 0, 10, 0.10000000000000001, 2]
[5.0, -159.66999999999999, 0, 0, 70, 2.5, 1]
[4.0, -88.969999999999999, 0, 0, 100, 0.29999999999999999, 0]
[6.0, -86.700000000000003, 2, 27, 300, 22.899999999999999, 2]
[4.0, -88.150000000000006, 0, 0, 10, 7.5999999999999996, 3]
[4.0, -85.200000000000003, 0, 2, 27, 3.2999999999999998, 2]
[4.0, -86.799999999999997, 0, 0, 10, 2.0, 2]
[4.0, -111.81999999999999, 0, 0, 33, 0.29999999999999999, 1]
[3.0, -90.280000000000001, 0, 0, 100, 0.5, 2]
[4.0, -92.069999999999993, 0, 2, 150, 12.300000000000001, 2]
[4.0, -92.450000000000003, 0, 0, 50, 6.40

[0.0, -97.420000000000002, 0, 0, 20, 0.29999999999999999, 1]
[0.0, -99.079999999999998, 0, 0, 10, 0.10000000000000001, 0]
[5.0, -97.400000000000006, 0, 1, 17, 0.10000000000000001, 2]
[4.0, -95.349999999999994, 0, 1, 67, 3.2999999999999998, 1]
[5.0, -84.599999999999994, 0, 1, 200, 9.3000000000000007, 1]
[7.0, -84.370000000000005, 6, 364, 550, 78.700000000000003, 4]
[6.0, -85.480000000000004, 0, 11, 100, 60.899999999999999, 3]
[6.0, -84.269999999999996, 1, 33, 300, 24.100000000000001, 4]
[4.0, -84.299999999999997, 0, 0, 100, 0.10000000000000001, 1]
[1.0, -84.0, 0, 0, 10, 0.10000000000000001, 2]
[4.0, -83.420000000000002, 0, 1, 20, 0.10000000000000001, 0]
[5.0, -82.719999999999999, 0, 0, 200, 8.1999999999999993, 1]
[5.0, -82.870000000000005, 0, 1, 100, 4.9000000000000004, 2]
[6.0, -82.950000000000003, 0, 5, 10, 0.10000000000000001, 3]
[5.0, -85.819999999999993, 0, 1, 10, 0.10000000000000001, 1]
[6.0, -82.799999999999997, 7, 93, 400, 34.0, 5]
[3.0, -92.349999999999994, 0, 0, 67, 1.0, 2]
[5

[4.0, -86.230000000000004, 0, 0, 10, 0.10000000000000001, 2]
[0.0, -87.129999999999995, 0, 0, 10, 0.10000000000000001, 1]
[3.0, -83.769999999999996, 0, 0, 17, 0.20000000000000001, 1]
[3.0, -87.469999999999999, 0, 0, 10, 0.10000000000000001, 1]
[4.0, -88.450000000000003, 0, 0, 100, 1.0, 0]
[4.0, -83.200000000000003, 0, 0, 37, 2.0, 1]
[3.0, -83.200000000000003, 0, 0, 33, 0.90000000000000002, 1]
[0.0, -80.599999999999994, 0, 0, 10, 0.10000000000000001, 0]
[3.0, -95.680000000000007, 0, 0, 10, 0.10000000000000001, 1]
[5.0, -87.430000000000007, 0, 0, 10, 2.0, 2]
[0.0, -80.670000000000002, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -85.099999999999994, 0, 0, 33, 0.80000000000000004, 2]
[0.0, -98.370000000000005, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -99.599999999999994, 0, 0, 17, 0.5, 0]
[0.0, -102.38, 0, 0, 67, 1.0, 0]
[0.0, -102.38, 0, 0, 17, 0.20000000000000001, 0]
[0.0, -101.38, 0, 0, 23, 0.10000000000000001, 0]
[6.0, -101.92, 0, 0, 3000, 5.0, 2]
[0.0, -101.78, 0, 0, 33, 0.1000000000000000

[5.0, -91.780000000000001, 0, 2, 300, 39.0, 2]
[2.0, -97.599999999999994, 0, 0, 50, 2.0, 2]
[4.0, -69.769999999999996, 0, 0, 33, 2.2999999999999998, 1]
[3.0, -68.900000000000006, 0, 2, 27, 0.29999999999999999, 1]
[5.0, -70.799999999999997, 0, 1, 100, 0.29999999999999999, 1]
[4.0, -80.700000000000003, 0, 0, 10, 0.10000000000000001, 1]
[3.0, -82.420000000000002, 0, 0, 400, 0.80000000000000004, 1]
[3.0, -105.15000000000001, 0, 0, 17, 0.10000000000000001, 0]
[0.0, -94.730000000000004, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -111.75, 0, 2, 33, 0.10000000000000001, 2]
[1.0, -84.299999999999997, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -101.27, 0, 0, 30, 0.5, 1]
[0.0, -96.719999999999999, 0, 0, 60, 1.2, 0]
[4.0, -80.599999999999994, 0, 1, 10, 0.10000000000000001, 1]
[1.0, -92.25, 0, 0, 50, 0.29999999999999999, 0]
[1.0, -80.200000000000003, 0, 0, 10, 0.10000000000000001, 0]
[3.0, -102.53, 0, 0, 10, 0.10000000000000001, 0]
[5.0, -77.049999999999997, 0, 0, 10, 0.10000000000000001, 1]
[0.0, -96.59

[3.0, -97.400000000000006, 0, 0, 17, 2.0, 1]
[0.0, -102.42, 0, 0, 10, 0.10000000000000001, 0]
[3.0, -97.829999999999998, 0, 0, 17, 0.20000000000000001, 0]
[0.0, -100.81999999999999, 0, 0, 13, 0.10000000000000001, 0]
[0.0, -102.83, 0, 0, 200, 1.0, 1]
[3.0, -100.83, 0, 0, 17, 0.10000000000000001, 0]
[0.0, -122.03, 0, 0, 13, 0.10000000000000001, 0]
[0.0, -98.599999999999994, 0, 0, 10, 0.10000000000000001, 0]
[4.0, -80.099999999999994, 0, 0, 10, 0.10000000000000001, 1]
[4.0, -82.549999999999997, 0, 0, 50, 0.10000000000000001, 2]
[3.0, -97.5, 0, 0, 17, 1.0, 1]
[0.0, -110.93000000000001, 0, 0, 10, 0.10000000000000001, 0]
[0.0, -101.83, 0, 0, 17, 0.10000000000000001, 0]
[0.0, -89.079999999999998, 0, 5, 123, 2.0, 3]
[4.0, -89.650000000000006, 0, 3, 180, 43.5, 2]
[0.0, -89.370000000000005, 0, 5, 50, 2.0, 2]
[5.0, -88.120000000000005, 0, 18, 400, 70.400000000000006, 3]
[4.0, -79.769999999999996, 0, 0, 40, 0.80000000000000004, 1]
[5.0, -97.0, 0, 2, 200, 14.4, 3]
[5.0, -95.200000000000003, 0, 0, 1

In [None]:
for x in range(10):
    print (x)

In [174]:
def main():
    # prepare data
    trainingSet=[]
    testSet=[]
    split = 0.67
    df = pd.read_csv('Tornadoes_SPC_1950to2015.csv')
    #tornado_data = df.drop(['st','date','time','mag'],axis=1)
    dataset = df
    print(dataset)
    for x in range(len(dataset)-1):
        if dataset[x]!='mag':
            trainingSet.append(dataset[x])
        else:
            testSet.append(dataset[x])
    print (trainingSet)
    print (testSet)
    print (dataset['mag'])
    print ('Train set: ' + repr(len(trainingSet)))
    print ('Test set: ' + repr(len(testSet)))
    # generate predictions
    predictions=[]
    k = 3
    for x in xrange(len(testSet)):
        neighbors = getNeighbors(trainingSet, testSet[x], k)
        result = getResponse(neighbors)
        predictions.append(result)
        print('> predicted=' + repr(result) + ', actual=' + repr(testSet[x][-1]))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: ' + repr(accuracy) + '%')
    
