#Imports 


In [162]:
import numpy as np
import math
import random
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
import pandas as pd

# KNN Classifier

In [260]:
def isNaN(num):
    return num != num

def se(target, output):
  """
      The function takes in two parameters, target and output, and returns the average of the squared
      difference between the two
        
      :param target: the target values
      :param output: the output of the model
      :return: The average of the squared error.
  """
  return np.average((output - target) ** 2)

        


class KNN:

    def __init__(self, k=2, distance_metric="euclidean", 
                 weighted=False, problem="classification"):
        self.k = k
        self.distance_metric = distance_metric
        self.weighted = weighted
        self.problem = problem
        self.x_train = []
        self.y_train = []

    def train(self, features, labels):
        self.x_train = features
        self.y_train = labels

    def evaluate(self, features, labels):
        predictions = self.predict(features)
        accuracy = 0
        if self.get_problem_type() == 'r':
          
          loss = self._se(labels, predictions)
          return loss


        for prediction, target in zip(predictions, labels):
            if prediction == target:
                accuracy += 1

        accuracy = accuracy/len(labels)
        loss = 1 - accuracy
        print(accuracy, loss)

        return accuracy, loss

    def predict(self, data):
        predicted = []
        for i in data:

            distances = self._distance_func(i)
            distances = sorted(zip(distances, self.y_train)) # join and sort distances and labels 
            values = distances[:self.k]
            k_distance, k_labels = zip(*values)  # split distance and labels
            
            unique_labels, count = np.unique(k_labels, return_counts=True) # get unique labels and their frequency

            if self.get_problem_type() == 'r':
        
              k_mean = np.sum(k_distance)/len(k_distance)
              if self.weighted:
                predicted.append(1/ (k_mean)**2)
              else:
                predicted.append(k_mean)
            else:
              
               ## if weighted distance
              if self.weighted  and self.get_problem_type() != 'r' :
                k_labels = list(k_labels)
                k_distance = np.array(k_distance)
                k_distance = 1 / (k_distance ** 2) # squared distance inverse
                weighted_k = []
                for label in unique_labels:   # get sum of target corresponding inverse value in k
                  sums = 0
                  for i in range(len(k_distance)):
                    if k_labels[i] == label:
                      sums += k_distance[i]
                  weighted_k.append(sums)
                # predicition = max sum of k targets
                predicted.append(unique_labels[np.argmax(weighted_k)])
              else:
                # predicition = max frequencey of k targets
                predicted.append(unique_labels[count.argmax()])

          

        return np.array(predicted)

    def _distance_func(self, data):
        if self.distance_metric.lower() == 'euclidean':
            return self._euclidean(data)
        elif self.distance_metric.lower() == 'heom':
            return self._heom(data)

    def _euclidean(self, data):
       distances = []
       for x in self.x_train:
         distance = 0
         for i in range(len(x)):
           if isinstance(x[i], str) and isinstance(data[i], str):
             if x[i] != data[i]:
               distance += 1
           else:
            distance += pow((x[i] - data[i]), 2)
         distances.append(pow(distance, 1/2))
        #  for x in self.x_train:
        #     distances.append(np.sqrt(np.sum(np.square(np.subtract(x, data)))))
       return distances
    

    def _heom(self, data):
      distances = []
      for x in self.x_train:
         distance = 0
         for i in range(len(x)):
            if isinstance(x[i], str) and isinstance(data[i], str):
             if x[i] != data[i]:
               distance += 1
            elif isNaN(x[i]) or isNaN(data[i]):
               distance += 1
            else:
               train_attribute = self.x_train[:, i]
               attribute_range = np.nanmax(train_attribute, axis = 0) - np.nanmin(train_attribute, axis = 0)
               distance +=  np.abs(x[i] - data[i]) / attribute_range         
         distances.append(pow(distance, 2))
      return distances

    def get_problem_type(self):
      if self.problem.lower() == 'regression' or self.problem.lower() == 'r':
        return 'r'
      
      return 'c'

    def _se(self, target, output):
        """
        The function takes in two parameters, target and output, and returns the average of the squared
        difference between the two
        
        :param target: the target values
        :param output: the output of the model
        :return: The average of the squared error.
        """
        return np.average((output - target) ** 2)




          


In [136]:
print(se(100, 10))

8100.0


# *Magic Telescope*

In [131]:
names = ['fLength', 'fWidth', 'fSize', 'fConc', 'fConc1', 'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist', 'class']

In [132]:
magic_telescope = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/magic/magic04.data',
                              names=names)

In [133]:
magic_telescope.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [58]:
#convert to ndarray, split features and target
magic_telescope = np.array(magic_telescope)
y = magic_telescope[:, -1]
x = np.delete(magic_telescope, -1, 1)

In [59]:
# normalize features 
scaler = MinMaxScaler().fit(x)
x = scaler.transform(x)

In [60]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=9)

In [61]:
#create weighted KNN model
# K=2, distance metric = Heterogeneous Euclidean-Overlap Metric
model = KNN(k=2, weighted=True,distance_metric='heom')

In [62]:
model.train(x_train, y_train)

In [63]:
# evaluate test set
model.evaluate(x_test, y_test)



0.6443743427970557 0.35562565720294426


(0.6443743427970557, 0.35562565720294426)

In [None]:
prediction = model.predict(x_test)

In [None]:
# compare predicted vs target
prediction == y_test

#Iris

## *Preprocesssing & Training*

In [120]:
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state = 42, test_size = 0.2)

In [121]:
model = KNN(k=10)

In [122]:
model.train(x_train, y_train)

In [123]:
model.evaluate(x_test, y_test)

1.0 0.0


(1.0, 0.0)

In [124]:
prediction = model.predict(x_test)

In [125]:
prediction == y_test

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True])

#Credit Approval

In [149]:
# create row header
names = []
for i in range(16):
  names.append('A'+str(i+1))

In [150]:
# load credit approval dataset
credit = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data',
                     names=names)

In [151]:
credit.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120,0,+


In [152]:
# convert to ndarray, split features and target
credit = np.array(credit)
y = credit[:, -1]
x = np.delete(credit, -1, 1)

In [102]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=9)

In [103]:
# create model
# use k = 2 
# set distance metric to Heterogeneous Euclidean-Overlap Metric
model = KNN(k=2, distance_metric='heom')

In [104]:
model.train(x_train, y_train)

In [105]:
# evaluate test set
model.evaluate(x_test, y_test)

0.791907514450867 0.20809248554913296


(0.791907514450867, 0.20809248554913296)

In [106]:
prediction = model.predict(x_test)

In [107]:
# compare predicted vs target
prediction == y_test

array([False,  True, False,  True, False,  True, False,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True, False,  True,  True,  True,  True, False,  True,
       False,  True, False,  True,  True,  True,  True,  True, False,
        True, False,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True, False,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
       False,  True, False,  True,  True, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True, False, False,  True,
        True,  True,

#Boston Housing

##*Data Preprocessing*

In [339]:
boston = tf.keras.datasets.boston_housing.load_data(path='boston_housing.npz',
                                                    test_split=0.2, seed=3)


In [340]:
(x_train, y_train), ( x_test, y_test) = boston

In [350]:
# create a KNN regression model
model = KNN(k=3, problem='regression', weighted=False, distance_metric='euclidean')

In [351]:
model.train(x_train, y_train)

In [352]:
# evaluate test set
model.evaluate(x_test, y_test)

298.76247052395996

In [290]:
predicted = model.predict(x_test)

In [291]:
for i in range(len(predicted)):
  print("predicted: ", predicted[i], " Target: ", y_test[i])

predicted:  44.46441856246228  Target:  18.6
predicted:  18.294110762610423  Target:  23.0
predicted:  34.44540379288161  Target:  42.3
predicted:  26.03803665227254  Target:  17.2
predicted:  10.232828963884407  Target:  16.2
predicted:  21.397083416018354  Target:  20.0
predicted:  44.86812452514644  Target:  30.3
predicted:  30.940219485367574  Target:  20.9
predicted:  16.646116561669295  Target:  20.4
predicted:  29.680305694584824  Target:  24.8
predicted:  14.833761356816554  Target:  18.7
predicted:  16.154216316066414  Target:  16.8
predicted:  22.921136362843697  Target:  22.5
predicted:  30.048392388942503  Target:  18.8
predicted:  27.143507446229684  Target:  23.7
predicted:  10.151170649495798  Target:  23.8
predicted:  7.904538647972268  Target:  19.6
predicted:  13.894984141686672  Target:  20.4
predicted:  10.209477543685248  Target:  16.1
predicted:  40.390223269573795  Target:  44.0
predicted:  40.90770964266054  Target:  19.3
predicted:  48.98643907563132  Target:  

In [252]:
summ = 0
for i in range(len(predicted)):
  summ += (predicted[i] - y_test[i])** 2

print("Sum error ", summ, "MSE: ", summ/len(predicted))


Sum error  26884.691071472873 MSE:  263.57540266149874


In [253]:
print(se(y_test,predicted))

263.57540266149863
