In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold
import time

In [2]:
colnames = ["BI-RADS assessment", "Age", "Shape", "Margin", "Density", "Severity"]
df = pd.read_csv("F://Uni/991/Machine_Learning/ML_HW2/Dataset/1- KNN/mammographic_masses.data", names=colnames, header=None)

In [3]:
#df[df=='?']=np.nan
#df = df.fillna(method='ffill')

In [4]:
df[df=='?']=np.nan
df = df.dropna()

In [5]:
def Normalize(n, x):
    norm_x = x
    for i in range(0,n):
        min_x = np.amin(x[:,i])
        max_x = np.amax(x[:,i])
        norm_x[:,i] = (x[:,i] - min_x)/(max_x - min_x)
    return norm_x

In [6]:
def Euclidean(test1, train1):
    distance1 = np.sum((np.power((test1-train1),2)), axis = 1)
    return np.sqrt(distance1)

In [7]:
def Manhattan(test1, train1):
    distance1 = np.sum(abs(test1-train1), axis = 1)
    return distance1

In [8]:
def Cosine(test1, train1):
    distance1 = np.dot(test1, train1.T)/(np.linalg.norm(test1)*np.linalg.norm(train1))
    return distance1

In [9]:
def get_neighbors(train2, ytrain2, test_row2, k2, distnce_criterion):
    if distnce_criterion=="Euclidian":
        dist2 = Euclidean(test_row2, train2)
    elif distnce_criterion=="Cosine":
        dist2 = Cosine(test_row2, train2)
    else:
        dist2 = Manhattan(test_row2, train2)
    othery = ytrain2[np.argsort(dist2),0]
    neighbors_y = othery[0:k2]
    return neighbors_y

In [10]:
def predict(train3, ytrain3, test_row3, k3, distnce_criterion):
    neighbors = get_neighbors(train3, ytrain3, test_row3, k3, distnce_criterion)
    neighbors2 = list(neighbors)
    prediction = max(neighbors2, key=neighbors2.count)
    return prediction

In [11]:
def KNNRegression(train4, ytrain4, test4, k4, distnce_criterion):
    prediction1 = np.zeros((test4.shape[0],1))
    for i in range(test4.shape[0]):
        prediction1[i,0] = predict(train4, ytrain4, test4[i,:], k4, distnce_criterion)
    return prediction1

In [12]:
def ConfusionMatrix(y_pred, y_test):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    for i in range(y_test.size):
        if y_pred[i] == y_test[i]:
            if y_test[i]==1:
                tp+=1
            else:
                tn+=1
        else:
            if y_pred[i]==1:
                fp+=1
            else:
                fn+=1

    print(np.array([[tn, fp],[fn, tp]]))
    return tn, fp, fn, tp

In [13]:
x = np.ones((df.shape[0], df.shape[1]-1))
for l in range(df.shape[1]-1):
    x[:,l] = df[colnames[l]]
x = Normalize(x.shape[1],x)
y = np.expand_dims(np.array(df["Severity"]), 1)

In [14]:
print("Part 1 without Shuffle")
klist = [1,3,5,7,15,30]
distnce_criterion_list = ["Euclidian", "Cosine", "Manhattan"]
kf = KFold(n_splits=10,shuffle=False) 

l=0
accuracy = np.zeros((10,len(klist)))

for train_index, test_index in kf.split(x):
    j=0
    print("\nValidation #", l+1)    
    x_train, x_test = x[train_index,:], x[test_index,:]
    y_train, y_test = y[train_index,:], y[test_index,:]
    for k in klist:
        print("K:", k)
        t00 = time.time()
        y_pred = KNNRegression(x_train, y_train, x_test, k, distnce_criterion_list[0])
        t01 = time.time()
        print("Time Amount:", t01-t00)
        tn, fp, fn, tp = ConfusionMatrix(y_pred, y_test)
        accuracy[l,j] = (tp+tn)/y_test.size       
        print("Accuracy:", accuracy[l,j])
        j+=1
    l+=1
acc_avg = np.sum(accuracy, axis=0)/l
print(acc_avg)
best_k = klist[np.argmax(acc_avg)]
print(best_k)

Part 1 without Shuffle

Validation # 1
K: 1
Time Amount: 0.07795429229736328
[[27 12]
 [ 8 36]]
Accuracy: 0.7590361445783133
K: 3
Time Amount: 0.06696963310241699
[[27 12]
 [ 9 35]]
Accuracy: 0.7469879518072289
K: 5
Time Amount: 0.07216238975524902
[[28 11]
 [ 8 36]]
Accuracy: 0.7710843373493976
K: 7
Time Amount: 0.07695555686950684
[[28 11]
 [ 8 36]]
Accuracy: 0.7710843373493976
K: 15
Time Amount: 0.07598185539245605
[[28 11]
 [ 8 36]]
Accuracy: 0.7710843373493976
K: 30
Time Amount: 0.07597732543945312
[[27 12]
 [ 7 37]]
Accuracy: 0.7710843373493976

Validation # 2
K: 1
Time Amount: 0.06320452690124512
[[36 12]
 [ 4 31]]
Accuracy: 0.8072289156626506
K: 3
Time Amount: 0.07195830345153809
[[34 14]
 [ 5 30]]
Accuracy: 0.7710843373493976
K: 5
Time Amount: 0.04346799850463867
[[35 13]
 [ 4 31]]
Accuracy: 0.7951807228915663
K: 7
Time Amount: 0.06669187545776367
[[34 14]
 [ 4 31]]
Accuracy: 0.7831325301204819
K: 15
Time Amount: 0.048978328704833984
[[34 14]
 [ 4 31]]
Accuracy: 0.783132530120

In [15]:
print("Part 2")
dis_accuracy = np.zeros((10,len(distnce_criterion_list)))
l=0

for train_index, test_index in kf.split(x):
    j=0
    print("\nValidation #", l+1) 
    x_train, x_test = x[train_index,:], x[test_index,:]
    y_train, y_test = y[train_index,:], y[test_index,:]
    for distnce_criterion in distnce_criterion_list:
        print(distnce_criterion, "Distance")
        y_pred = KNNRegression(x_train, y_train, x_test, best_k, distnce_criterion)
        tn, fp, fn, tp = ConfusionMatrix(y_pred, y_test)
        dis_accuracy[l,j] = (tp+tn)/y_test.size
        print("Accuracy:", dis_accuracy[l,j])
        j+=1
    l+=1
acc_avg = np.sum(dis_accuracy, axis=0)/l
print(acc_avg)

Part 2

Validation # 1
Euclidian Distance
[[27 12]
 [ 7 37]]
Accuracy: 0.7710843373493976
Cosine Distance
[[39  0]
 [44  0]]
Accuracy: 0.46987951807228917
Manhattan Distance
[[27 12]
 [ 7 37]]
Accuracy: 0.7710843373493976

Validation # 2
Euclidian Distance
[[34 14]
 [ 4 31]]
Accuracy: 0.7831325301204819
Cosine Distance
[[48  0]
 [35  0]]
Accuracy: 0.5783132530120482
Manhattan Distance
[[34 14]
 [ 4 31]]
Accuracy: 0.7831325301204819

Validation # 3
Euclidian Distance
[[33  8]
 [ 2 40]]
Accuracy: 0.8795180722891566
Cosine Distance
[[41  0]
 [42  0]]
Accuracy: 0.4939759036144578
Manhattan Distance
[[33  8]
 [ 2 40]]
Accuracy: 0.8795180722891566

Validation # 4
Euclidian Distance
[[27  9]
 [ 5 42]]
Accuracy: 0.8313253012048193
Cosine Distance
[[36  0]
 [47  0]]
Accuracy: 0.43373493975903615
Manhattan Distance
[[26 10]
 [ 5 42]]
Accuracy: 0.8192771084337349

Validation # 5
Euclidian Distance
[[35  5]
 [ 6 37]]
Accuracy: 0.8674698795180723
Cosine Distance
[[40  0]
 [43  0]]
Accuracy: 0.48192

In [16]:
print("Part 3")
from sklearn.neighbors import KNeighborsClassifier
   
l=0
accuracy = np.zeros((10,len(klist)))
     
for train_index, test_index in kf.split(x):
    j=0
    print("\nValidation #", l+1)    
    x_train, x_test = x[train_index,:], x[test_index,:]
    y_train, y_test = y[train_index,:], y[test_index,:]
    for k in klist:
        print("K:", k)
        t10 = time.time()
        classifier = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        classifier.fit(x_train, np.squeeze(y_train))
        y_pred2 = classifier.predict(x_test)
        t11 = time.time()
        print("Time Amount: ", t11-t10)
        tn, fp, fn, tp = ConfusionMatrix(y_pred2, y_test)
        accuracy[l,j] = (tp+tn)/y_test.size       
        print("Accuracy:", accuracy[l,j])
        j+=1
    l+=1
acc_avg = np.sum(accuracy, axis=0)/l
print(acc_avg)
best_k = klist[np.argmax(acc_avg)]
print(best_k)

Part 3

Validation # 1
K: 1
Time Amount:  0.015992164611816406
[[26 13]
 [11 33]]
Accuracy: 0.7108433734939759
K: 3
Time Amount:  0.013993263244628906
[[27 12]
 [ 8 36]]
Accuracy: 0.7590361445783133
K: 5
Time Amount:  0.02398395538330078
[[28 11]
 [ 8 36]]
Accuracy: 0.7710843373493976
K: 7
Time Amount:  0.011994600296020508
[[27 12]
 [ 8 36]]
Accuracy: 0.7590361445783133
K: 15
Time Amount:  0.012994766235351562
[[28 11]
 [ 7 37]]
Accuracy: 0.7831325301204819
K: 30
Time Amount:  0.021986961364746094
[[28 11]
 [ 8 36]]
Accuracy: 0.7710843373493976

Validation # 2
K: 1
Time Amount:  0.006995677947998047
[[35 13]
 [ 6 29]]
Accuracy: 0.7710843373493976
K: 3
Time Amount:  0.005994558334350586
[[34 14]
 [ 5 30]]
Accuracy: 0.7710843373493976
K: 5
Time Amount:  0.008002042770385742
[[35 13]
 [ 4 31]]
Accuracy: 0.7951807228915663
K: 7
Time Amount:  0.00899815559387207
[[35 13]
 [ 4 31]]
Accuracy: 0.7951807228915663
K: 15
Time Amount:  0.0020372867584228516
[[34 14]
 [ 4 31]]
Accuracy: 0.78313253

In [19]:
print("Part 1 with Shuffle")
klist = [1,3,5,7,15,30]
distnce_criterion_list = ["Euclidian", "Cosine", "Manhattan"]
kf = KFold(n_splits=10,shuffle=True)
folds = list()
l=0
accuracy = np.zeros((10,len(klist)))

for train_index, test_index in kf.split(x):
    j=0
    folds.append((train_index, test_index))
    print("\nValidation #", l+1)    
    x_train, x_test = x[train_index,:], x[test_index,:]
    y_train, y_test = y[train_index,:], y[test_index,:]
    for k in klist:
        print("K:", k)
        t00 = time.time()
        y_pred = KNNRegression(x_train, y_train, x_test, k, distnce_criterion_list[0])
        t01 = time.time()
        print("Time Amount:", t01-t00)
        tn, fp, fn, tp = ConfusionMatrix(y_pred, y_test)
        accuracy[l,j] = (tp+tn)/y_test.size       
        print("Accuracy:", accuracy[l,j])
        j+=1
    l+=1
acc_avg = np.sum(accuracy, axis=0)/l
print(acc_avg)
best_k = klist[np.argmax(acc_avg)]
print(best_k)

Part 1 with Shuffle

Validation # 1
K: 1
Time Amount: 0.07995367050170898
[[27 12]
 [ 8 36]]
Accuracy: 0.7590361445783133
K: 3
Time Amount: 0.05070233345031738
[[32  7]
 [ 9 35]]
Accuracy: 0.8072289156626506
K: 5
Time Amount: 0.0721127986907959
[[33  6]
 [ 7 37]]
Accuracy: 0.8433734939759037
K: 7
Time Amount: 0.05604219436645508
[[32  7]
 [ 7 37]]
Accuracy: 0.8313253012048193
K: 15
Time Amount: 0.06252169609069824
[[27 12]
 [ 6 38]]
Accuracy: 0.7831325301204819
K: 30
Time Amount: 0.062494754791259766
[[27 12]
 [ 7 37]]
Accuracy: 0.7710843373493976

Validation # 2
K: 1
Time Amount: 0.059148311614990234
[[37  4]
 [10 32]]
Accuracy: 0.8313253012048193
K: 3
Time Amount: 0.06250143051147461
[[36  5]
 [10 32]]
Accuracy: 0.8192771084337349
K: 5
Time Amount: 0.04687142372131348
[[37  4]
 [ 9 33]]
Accuracy: 0.8433734939759037
K: 7
Time Amount: 0.07430577278137207
[[35  6]
 [ 8 34]]
Accuracy: 0.8313253012048193
K: 15
Time Amount: 0.04397726058959961
[[32  9]
 [ 8 34]]
Accuracy: 0.795180722891566