In [29]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time



import warnings
warnings.filterwarnings('ignore')

In [2]:
# KNN_Start

def loadData(filename):
    # Load data from file into X
    X = []
    count = 0
    
    text_file = open(filename, "r")
    lines = text_file.readlines()
        
    for line in lines:
        X.append([])
        words = line.split(",")
        # Convert values of the first attribute into float
        for word in words:
            if (word=='M'):
                word = 0.333
            if (word=='F'):
                word = 0.666
            if (word=='I'):
                word = 1
            X[count].append(float(word))
        count += 1
    
    return np.asarray(X)


def testNorm(X_norm):
    xMerged = np.copy(X_norm[0])
    # Merge datasets
    for i in range(len(X_norm)-1):
        xMerged = np.concatenate((xMerged,X_norm[i+1]))
    print(np.mean(xMerged,axis=0))
    print(np.sum(xMerged,axis=0))


# This is an example the main of KNN with train-and-test + Euclidean
def knnMain(filename,percentTrain,k):
 
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    X_split = splitTT(X_norm,percentTrain)
    # KNN: Euclidean
    accuracy = knn(X_split[0],X_split[1],k)
    
    return accuracy



In [3]:

X = loadData('abalone.data')
print(X)

[[ 0.333   0.455   0.365  ...  0.101   0.15   15.    ]
 [ 0.333   0.35    0.265  ...  0.0485  0.07    7.    ]
 [ 0.666   0.53    0.42   ...  0.1415  0.21    9.    ]
 ...
 [ 0.333   0.6     0.475  ...  0.2875  0.308   9.    ]
 [ 0.666   0.625   0.485  ...  0.261   0.296  10.    ]
 [ 0.333   0.71    0.555  ...  0.3765  0.495  12.    ]]


In [4]:
# Normalisation 
def dataNorm(X):
    X_norm = np.empty(X.shape)
    # 8 normalized input attributes plus 1 non-normalized output attribute
    for x in range(X.shape[1] -1):
        X_norm[:,x]  =(X[:,x] - np.min(X[:,x]))/(np.max(X[:,x])-np.min(X[:,x]))
    X_norm[:,-1] = X[:,-1]
    return X_norm

X_norm = dataNorm(X)
print(X_norm)



[[ 0.          0.51351351  0.5210084  ...  0.1323239   0.14798206
  15.        ]
 [ 0.          0.37162162  0.35294118 ...  0.06319947  0.06826109
   7.        ]
 [ 0.49925037  0.61486486  0.61344538 ...  0.18564845  0.2077728
   9.        ]
 ...
 [ 0.          0.70945946  0.70588235 ...  0.37788018  0.30543099
   9.        ]
 [ 0.49925037  0.74324324  0.72268908 ...  0.34298881  0.29347285
  10.        ]
 [ 0.          0.85810811  0.84033613 ...  0.49506254  0.49177877
  12.        ]]


In [5]:
column_means = X_norm.mean(axis=0)
print(column_means)


[0.47750066 0.60674608 0.59307774 0.12346584 0.29280756 0.24100033
 0.23712127 0.2365031  9.93368446]


In [6]:
column_sums = X_norm.sum(axis=0)
print(column_sums)

[ 1994.52023988  2534.37837838  2477.28571429   515.71681416
  1223.05719851  1006.65837256   990.45556287   987.87344295
 41493.        ]


*Part C*

In [7]:
# Splitting (TT)
def splitTT(X_norm, percentTrain):
    I = X.shape[0]
    # shuffle the data with numpy.random.shuffle() before splitting the dataset
    np.random.shuffle(X_norm)
    # X_train for X_split
    X_train = X_norm[ :round(I*percentTrain),:]
    X_test = X_norm[round(I*percentTrain):, ]

    X_split = [X_train, X_test]
    return X_split

X_split = splitTT(X_norm,0.6)
#testNorm(X_split)
    
print(X_split)

[array([[ 0.        ,  0.89189189,  0.8907563 , ...,  0.7235023 ,
         0.49676134, 11.        ],
       [ 0.        ,  0.55405405,  0.52941176, ...,  0.18564845,
         0.17289487, 10.        ],
       [ 0.        ,  0.47972973,  0.47058824, ...,  0.11323239,
         0.10164425,  7.        ],
       ...,
       [ 0.        ,  0.66216216,  0.66386555, ...,  0.31533904,
         0.26258097, 10.        ],
       [ 0.49925037,  0.77027027,  0.73109244, ...,  0.41474654,
         0.29646238,  9.        ],
       [ 0.49925037,  0.46621622,  0.46218487, ...,  0.1441738 ,
         0.1429995 , 10.        ]]), array([[ 1.        ,  0.53378378,  0.49579832, ...,  0.12969059,
         0.1429995 , 11.        ],
       [ 0.49925037,  0.73648649,  0.74789916, ...,  0.28900592,
         0.32884903, 10.        ],
       [ 0.49925037,  0.7027027 ,  0.69747899, ...,  0.35352205,
         0.30742402,  9.        ],
       ...,
       [ 0.49925037,  0.67567568,  0.63865546, ...,  0.30546412,
        

In [8]:
# Split X_Split

X_train = np.array(X_split[0])
X_test = np.array(X_split[1])

print(X_train)
print()
print(X_test)

[[ 0.          0.89189189  0.8907563  ...  0.7235023   0.49676134
  11.        ]
 [ 0.          0.55405405  0.52941176 ...  0.18564845  0.17289487
  10.        ]
 [ 0.          0.47972973  0.47058824 ...  0.11323239  0.10164425
   7.        ]
 ...
 [ 0.          0.66216216  0.66386555 ...  0.31533904  0.26258097
  10.        ]
 [ 0.49925037  0.77027027  0.73109244 ...  0.41474654  0.29646238
   9.        ]
 [ 0.49925037  0.46621622  0.46218487 ...  0.1441738   0.1429995
  10.        ]]

[[ 1.          0.53378378  0.49579832 ...  0.12969059  0.1429995
  11.        ]
 [ 0.49925037  0.73648649  0.74789916 ...  0.28900592  0.32884903
  10.        ]
 [ 0.49925037  0.7027027   0.69747899 ...  0.35352205  0.30742402
   9.        ]
 ...
 [ 0.49925037  0.67567568  0.63865546 ...  0.30546412  0.22770304
   9.        ]
 [ 1.          0.54054054  0.61344538 ...  0.19749835  0.18236173
   8.        ]
 [ 1.          0.37837838  0.34453782 ...  0.06451613  0.05630294
   7.        ]]


In [9]:
# splitCV() , that takes in the normalized dataset X_norm and the value k
def splitCV(X_norm, k):
    np.random.shuffle(X_norm)
    return np.array_split(X_norm,k)

X_split = splitCV(X_norm, 2)
testNorm(X_split)

[0.47750066 0.60674608 0.59307774 0.12346584 0.29280756 0.24100033
 0.23712127 0.2365031  9.93368446]
[ 1994.52023988  2534.37837838  2477.28571429   515.71681416
  1223.05719851  1006.65837256   990.45556287   987.87344295
 41493.        ]


In [10]:
training_data=X_split[0]
testing_data=X_split[1]

row0 = training_data[0]

In [11]:
# Using Euclidean Distance

def euclideanDistance(rowA, rowB):
    distance = 0.0
    for x in range(len(rowA)-1):
        distance += (rowA[x] - rowB[x])**2
    return sqrt(distance)



In [12]:
training_data=X_split[0]
testing_data=X_split[1]

row0 = training_data[0]
for row in training_data:
  distance = euclideanDistance(row0, row)
  print(distance)

0.0
0.5904121386805619
1.042715030478458
0.3122854947139375
1.4918240584834035
0.014620601300211684
0.47739318565274363
0.7722252338511681
1.1321387899819013
1.3817450258314004
1.1162119526881542
0.6617999983387515
1.4565744666759766
1.3284575392479567
0.7619273617274166
1.844431163930366
1.4261608942793245
0.7340852779819854
1.2442082017798124
1.3530081125162123
0.9508711218594933
1.3290173676940522
1.4155386999281951
1.155357403650249
1.6977802424699082
0.4538891412024839
1.3342121535112628
1.0143740250229174
1.214070964003888
0.8976893533522216
0.8321918827685727
1.051071579525138
0.7406727451421558
1.3513848547898337
1.092055053310688
0.7332964305592727
0.5145854087974098
0.23565127635999114
1.0479452402456186
1.8918944385229075
1.3763519637751886
0.7916237844438517
1.4114644764556348
1.406037487495969
1.4872905477800962
1.3213804514442171
1.5722414754820566
0.4685338610959914
1.0934821492966524
1.082416013585014
1.4132378600673434
1.0489455815377986
1.176752280339661
0.70538648547

In [13]:
def get_neighbours(training_data, test_row, num_neighbors):
    distances = list()
    for train_row in training_data: 
        distance = euclideanDistance(test_row, train_row)
        distances.append((train_row, distance))
    distances.sort(key=lambda tup: tup[1])
    neighbours = list()
    for n in range(num_neighbors):
        neighbours.append(distances[n][0])
    return neighbours
 

In [14]:
test_row = testing_data[16]
neighbours = (get_neighbours(training_data, test_row, 3))
for neighbour in neighbours: 
    print(neighbour)

[0.49925037 0.77027027 0.76470588 0.14159292 0.43899416 0.39038332
 0.36273864 0.31240658 9.        ]
[0.49925037 0.75675676 0.76470588 0.15044248 0.43261909 0.35709482
 0.35615537 0.35127055 9.        ]
[ 0.49925037  0.75675676  0.74789916  0.15929204  0.44430671  0.36180229
  0.38380513  0.3472845  10.        ]


In [15]:
# Making classifiation prediction with the neighbours above

def predictClassification(training_data, test_row, num_neighbors):
    neighbours = get_neighbours(training_data, test_row, num_neighbors)
    outputValues = [row[-1] for row in neighbours]
    # Getting the most represented class among the neighbours
    prediction = max(set(outputValues), key=outputValues.count)
    return prediction
    
    


In [16]:
prediction = (X_train, X_test, 1)
print((training_data[0][-1], prediction))

(6.0, (array([[ 0.        ,  0.89189189,  0.8907563 , ...,  0.7235023 ,
         0.49676134, 11.        ],
       [ 0.        ,  0.55405405,  0.52941176, ...,  0.18564845,
         0.17289487, 10.        ],
       [ 0.        ,  0.47972973,  0.47058824, ...,  0.11323239,
         0.10164425,  7.        ],
       ...,
       [ 0.        ,  0.66216216,  0.66386555, ...,  0.31533904,
         0.26258097, 10.        ],
       [ 0.49925037,  0.77027027,  0.73109244, ...,  0.41474654,
         0.29646238,  9.        ],
       [ 0.49925037,  0.46621622,  0.46218487, ...,  0.1441738 ,
         0.1429995 , 10.        ]]), array([[ 1.        ,  0.53378378,  0.49579832, ...,  0.12969059,
         0.1429995 , 11.        ],
       [ 0.49925037,  0.73648649,  0.74789916, ...,  0.28900592,
         0.32884903, 10.        ],
       [ 0.49925037,  0.7027027 ,  0.69747899, ...,  0.35352205,
         0.30742402,  9.        ],
       ...,
       [ 0.49925037,  0.67567568,  0.63865546, ...,  0.30546412,
  

In [17]:
test_row1 = testing_data[100]
prediction = predictClassification(training_data, test_row1, 2)
print(prediction)
print('Expected %d, Got %d.' % (training_data[100][-1], prediction))

7.0
Expected 9, Got 7.


In [18]:
# The KNN() function should take in the training dataset X_train , testing dataset X_test , 
# and the number of nearest neighbors, K , and returns the accuracy score as a result of the classification.
# Get accruacy % 
def accuracyMetric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def KNN(X_train, X_test, num_neighbors):
    predictions = []
    actualVal = [a[-1] for a in X_test]
    for row in X_test:
        out = predictClassification(X_train, row, num_neighbors)
        predictions.append(out)
    return predictions, actualVal


In [19]:
def knn(train, test, num_neighbors):
    predictions, actual_val=KNN(train, test, num_neighbors)
    accuracy = accuracyMetric(actual_val, predictions)
    return accuracy

In [20]:
def knnMain_TT(filename,percentTrain,k):
 
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    X_split = splitTT(X_norm, percentTrain)
    # KNN: Euclidean
    accuracy = knn(X_split[0],X_split[1],k)
    
    return accuracy

In [66]:
print("0.7, 1")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.7, 1)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.7, 5")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.7, 5)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.7, 10")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.7, 10)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.7, 15")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.7, 15)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.7, 20")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.7, 20)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

0.7, 1
21.94732641660016
9.55148500000007

0.7, 5
24.18196328810854
9.528658999999948

0.7, 10
23.224261771747805
9.463391000000001

0.7, 15
25.538707102952912
9.469817000000035

0.7, 20
26.895450917797287
9.476550999999972


In [67]:
print("0.6, 1")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.6, 1)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.6, 5")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.6, 5)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.6, 10")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.6, 10)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.6, 15")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.6, 15)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.6, 20")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.6, 20)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

0.6, 1
21.244763614602036
10.826109000000088

0.6, 5
21.603830041891083
10.816377999999986

0.6, 10
24.83542788749252
10.79488399999991

0.6, 15
24.117295032914424
10.79551200000003

0.6, 20
25.73309395571514
10.794289999999933


In [77]:
print("0.5, 1")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.5, 1)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.5, 5")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.5, 5)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.5, 10")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.5, 10)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.5, 15")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.5, 15)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

print()
print("0.5, 20")
t0 = time.process_time()
result_accuracy0_TT=knnMain_TT('abalone.data', 0.5, 20)
elapsed_time0_TT = time.process_time() - t0

print(result_accuracy0_TT)
print(elapsed_time0_TT)

0.5, 1
21.876495931067495
11.203926000000024

0.5, 5
21.302058401148873
11.22302000000002

0.5, 10
24.605074198180947
11.201097000000004

0.5, 15
27.23791287697463
11.187688999999978

0.5, 20
25.17951172809957
11.261899999999969


In [26]:
def knnMain_CV(filename,n_folds,k):
 
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    newSplit = splitCV(X_norm, n_folds)
    # KNN: Euclidean
#     newSplit = np.random.sample(X_split, len(X_split))
    test = newSplit[n_folds-1]
    training_set = newSplit[:n_folds-1]
    for train in training_set:
        accuracy = knn(train, test, k)
        return accuracy

In [73]:
print("5, 1")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 5, 1)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("5, 5")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 5, 5)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("5, 10")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 5, 10)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("5, 15")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 5, 15)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("5, 20")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 5, 20)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()

5, 1
20.119760479041915
1.7758960000001025

5, 5
22.51497005988024
1.771645000000035

5, 10
23.952095808383234
1.7787349999999833

5, 15
22.75449101796407
1.7946190000000115

5, 20
26.586826347305387
1.7977849999999762



In [78]:
print("10, 1")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 10, 1)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("10, 5")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 10, 5)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("10, 10")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 10, 10)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("10, 15")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 10, 15)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("10, 20")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 10, 20)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()

10, 1
21.34292565947242
0.45858400000008714

10, 5
22.302158273381295
0.4523669999999811

10, 10
23.980815347721823
0.4557999999999538

10, 15
24.700239808153476
0.4521750000000111

10, 20
24.46043165467626
0.4551500000000033



In [75]:
print("15, 1")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 15, 1)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("15, 5")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 15, 5)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("15, 10")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 15, 10)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("15, 15")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 15, 15)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()
print("15, 20")
t0 = time.process_time()
result_accuracy0=knnMain_CV('abalone.data', 15, 20)
elapsed_time0 = time.process_time() - t0

print(result_accuracy0)
print(elapsed_time0)

print()

15, 1
23.741007194244602
0.208438000000001

15, 5
21.58273381294964
0.2072849999999562

15, 10
21.58273381294964
0.2075760000000173

15, 15
19.424460431654676
0.20916999999997188

15, 20
23.741007194244602
0.20929200000000492



In [144]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
import time



import warnings
warnings.filterwarnings('ignore')


# KNN_Start
def loadData(filename):
    # Load data from file into X
    X = []
    count = 0
    
    text_file = open(filename, "r")
    lines = text_file.readlines()
        
    for line in lines:
        X.append([])
        words = line.split(",")
        # Convert values of the first attribute into float
        for word in words:
            if (word=='M'):
                word = 0.333
            if (word=='F'):
                word = 0.666
            if (word=='I'):
                word = 1
            X[count].append(float(word))
        count += 1
    
    return np.asarray(X)


def testNorm(X_norm):
    xMerged = np.copy(X_norm[0])
    # Merge datasets
    for i in range(len(X_norm)-1):
        xMerged = np.concatenate((xMerged,X_norm[i+1]))
    print(np.mean(xMerged,axis=0))
    print(np.sum(xMerged,axis=0))


# This is an example the main of KNN with train-and-test + Euclidean
def knnMain(filename,percentTrain,k):
 
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    X_split = splitTT(X_norm,percentTrain)
    # KNN: Euclidean
    accuracy = knn(X_split[0],X_split[1],k)
    
    return accuracy


# Normalisation 
def dataNorm(X):
    X_norm = np.empty(X.shape)
    # 8 normalized input attributes plus 1 non-normalized output attribute
    for x in range(X.shape[1] -1):
        X_norm[:,x]  =(X[:,x] - np.min(X[:,x]))/(np.max(X[:,x])-np.min(X[:,x]))
    X_norm[:,-1] = X[:,-1]
    return X_norm


# Splitting (TT)
def splitTT(X_norm, percentTrain):
    I = X.shape[0]
    # shuffle the data with numpy.random.shuffle() before splitting the dataset
    np.random.shuffle(X_norm)
    # X_train for X_split
    X_train = X_norm[ :round(I*percentTrain),:]
    X_test = X_norm[round(I*percentTrain):, ]

    X_split = [X_train, X_test]
    return X_split

# splitCV() , that takes in the normalized dataset X_norm and the value k
def splitCV(X_norm, k):
    np.random.shuffle(X_norm)
    return np.array_split(X_norm,k)

# Euclidean Distance
def euclideanDistance(rowA, rowB):
    distance = 0.0
    for x in range(len(rowA)-1):
        distance += (rowA[x] - rowB[x])**2
    return sqrt(distance)

# Getting neighbour
def get_neighbours(training_data, test_row, num_neighbors):
    distances = list()
    for train_row in training_data: 
        distance = euclideanDistance(test_row, train_row)
        distances.append((train_row, distance))
    distances.sort(key=lambda tup: tup[1])
    neighbours = list()
    for n in range(num_neighbors):
        neighbours.append(distances[n][0])
    return neighbours
 
# Making classifiation prediction with the neighbours above
def predictClassification(training_data, test_row, num_neighbors):
    neighbours = get_neighbours(training_data, test_row, num_neighbors)
    outputValues = [row[-1] for row in neighbours]
    # Getting the most represented class among the neighbours
    prediction = max(set(outputValues), key=outputValues.count)
    return prediction
    
    
# The KNN() function should take in the training dataset X_train , testing dataset X_test , 
# and the number of nearest neighbors, K , and returns the accuracy score as a result of the classification.
# Get accruacy % 
def accuracyMetric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

def KNN(X_train, X_test, num_neighbors):
    predictions = []
    actualVal = [a[-1] for a in X_test]
    for row in X_test:
        out = predictClassification(X_train, row, num_neighbors)
        predictions.append(out)
    return predictions, actualVal
 
def knn(train, test, num_neighbors):
    predictions, actual_val=KNN(train, test, num_neighbors)
    accuracy = accuracyMetric(actual_val, predictions)
    return accuracy

# Get knnMain for TT
def knnMain_TT(filename,percentTrain,k):
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    X_split = splitTT(X_norm, percentTrain)
    # KNN: Euclidean
    accuracy = knn(X_split[0],X_split[1],k)
    
    return accuracy

# Get knnMain for CV
def knnMain_CV(filename,n_folds,k):
    # Data load
    X = loadData(filename)
    # Normalization
    X_norm = dataNorm(X)
    # Data split: train-and-test
    newSplit = splitCV(X_norm, n_folds)
    # KNN: Euclidean
#     newSplit = np.random.sample(X_split, len(X_split))
    test = newSplit[n_folds-1]
    training_set = newSplit[:n_folds-1]
    for train in training_set:
        accuracy = knn(train, test, k)
        return accuracy

In [None]:
def knnMain_CV_CR(filename,n_folds,k):
 
    # Data load
    X = loadData(filename)
    X_norm = dataNorm(X)
    # Data split: train-and-test
    newSplit = splitCV(X_norm, n_folds)
    test = newSplit[n_folds-1]
    training_set = newSplit[:n_folds-1]
    for train in training_set:
        predictions, actualVal = KNN(training_data, testing_data, k)
        return predictions, actualVal

filename = 'abalone.data'
n_fold = 5
k = 15 
classification = knnMain_CV_CR(filename, n_fold, k)

from sklearn.metrics import classification_report
y_true = classification[1]
y_pred = classification[0]

report = classification_report(y_true, y_pred)
print(report)


