In [58]:
import numpy as np

Implement a function to compute the Euclidean distance between two points.

In [59]:
# Sample points for the distance demos; v3 repeats v1 so that one of the
# demonstrated distances is exactly zero.
v1, v3 = np.array([1, 1]), np.array([1, 1])
v2 = np.array([2, 2])

In [60]:
def euclidian_norm(v1 : np.ndarray, v2:np.ndarray) -> np.float64:
    """Return the Euclidean (L2) distance between two points.

    The element-wise difference of ``v1`` and ``v2`` is squared, summed over
    all elements and passed through a square root.
    """
    delta = v1 - v2
    squared_total = np.sum(delta * delta)
    return np.sqrt(squared_total)

# Distance between two distinct points, then between two identical points.
print(euclidian_norm(v1, v2), euclidian_norm(v1, v3), sep='\n')

1.4142135623730951
0.0


Implement a function to compute the Manhattan distance.

In [61]:
def manhattan_norm(v1: np.ndarray, v2: np.ndarray) -> np.float64:
    """Return the Manhattan (L1) distance between two points.

    This is the sum of the absolute element-wise differences of ``v1`` and
    ``v2``. The scalar keeps the dtype of the difference, so integer inputs
    give an integer result (the demo in this notebook prints ``2`` and ``0``)
    even though the annotation says ``np.float64``.
    """
    gaps = np.abs(v1 - v2)
    return np.sum(gaps)

# Distance between two distinct points, then between two identical points.
print(manhattan_norm(v1, v2), manhattan_norm(v1, v3), sep='\n')


2
0


In [62]:
import heapq
a = [10, -1, 32]
# heapq.heapify() rearranges the list in place and returns None, so its
# result is not bound to a name; the heap is the list `a` itself.
heapq.heapify(a)
print(a)

[-1, 10, 32]


In [63]:
# Toy 2-D training set. Samples labelled 1 sit near the origin, samples
# labelled 0 are far from it; row i of X is paired with y[i].
X = np.array([[1, 1], [1, 2], [6, 6], [2, 1], [7, 10], [10, 8], [1, 1]])
y = np.array([1, 1, 0, 1, 0, 0, 1])

# Every sample needs exactly one label.
assert len(X) == len(y), 'Wrong data'

Write a simple KNN classifier for 2D data with k=3.

In [64]:
from sklearn.base import ClassifierMixin, BaseEstimator
import heapq
class KNNClassifier(ClassifierMixin, BaseEstimator):
    """Brute-force k-nearest-neighbours classifier.

    Parameters
    ----------
    norm : str, default='euclidian_norm'
        Name of the distance function: ``'euclidian_norm'`` or
        ``'manhattan_norm'``.
    k : int, default=3
        Number of nearest training samples that vote on each prediction.

    Attributes
    ----------
    X, y : np.ndarray
        Training samples and their labels, stored by ``fit``.
    func_norm : callable
        Distance function selected by ``norm``.
    """

    def __init__(self, norm='euclidian_norm', k=3):
        super().__init__()
        # BaseEstimator.get_params() (used by repr, clone and set_params)
        # reads every __init__ argument back from the attribute of the same
        # name, so `norm` has to be stored alongside `k`.
        self.norm = norm
        self.k = k
        # Lenient selection so the attribute exists right after construction;
        # fit() re-resolves it and rejects unknown names.
        self.func_norm = euclidian_norm
        if norm == 'manhattan_norm':
            self.func_norm = manhattan_norm

    def _resolve_norm(self):
        """Return the distance function named by ``self.norm``.

        Raises
        ------
        ValueError
            If ``self.norm`` is not one of the supported names.
        """
        if self.norm == 'euclidian_norm':
            return euclidian_norm
        if self.norm == 'manhattan_norm':
            return manhattan_norm
        raise ValueError(
            f"Unknown norm {self.norm!r}; expected 'euclidian_norm' or 'manhattan_norm'"
        )

    def _check_k(self):
        """Raise ValueError unless ``self.k`` is at least 1."""
        if self.k < 1:
            raise ValueError(f'k must be at least 1, got {self.k!r}')

    def fit(self, X : np.ndarray, y : np.ndarray):
        """Store the training data and validate the hyper-parameters.

        Parameters
        ----------
        X : array-like
            Training samples, one per row.
        y : array-like
            One label per training sample.

        Returns
        -------
        self : KNNClassifier
            The fitted classifier, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length, if there are no training
            samples, if ``k`` is below 1 or if ``norm`` is not recognised.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError('Wrong data')
        if X.shape[0] == 0:
            raise ValueError('Cannot fit on an empty training set')
        self._check_k()
        # Resolved here (not only in __init__) so that a later
        # set_params(norm=...) takes effect on the next fit.
        self.func_norm = self._resolve_norm()
        self.X = X
        self.y = y
        return self

    def get_common_class(self, closest_classes: list):
        """Return the most frequent label in ``closest_classes``.

        ``np.unique`` returns the labels sorted and ``np.argmax`` picks the
        first maximum, so a tie goes to the smallest label.
        """
        classes, count = np.unique(closest_classes, return_counts=True)
        return classes[np.argmax(count)]

    def predict(self, U):
        """Predict a label for every row of ``U``.

        Parameters
        ----------
        U : array-like
            Query samples, one per row.

        Returns
        -------
        np.ndarray
            Integer array with one predicted label per query sample.

        Raises
        ------
        ValueError
            If ``k`` is below 1.
        """
        self._check_k()
        U = np.asarray(U)
        n_train = self.X.shape[0]

        if self.k >= n_train:
            # Every training sample is a neighbour, so each query gets the
            # overall most common training label.
            return np.full(U.shape[0], self.get_common_class(list(self.y)), dtype=int)

        y_pred = np.zeros(U.shape[0], dtype=np.int64)

        for u_index, u in enumerate(U):
            # Distances are negated so that heapq's min-heap behaves like a
            # max-heap: the root is always the farthest neighbour kept so far.
            k_nearest = [(-self.func_norm(u, self.X[i]), i) for i in range(self.k)]
            heapq.heapify(k_nearest)

            for i in range(self.k, n_train):
                max_distance = -k_nearest[0][0]
                distance = self.func_norm(u, self.X[i])
                if distance < max_distance:
                    # Insert the closer sample and drop the current farthest.
                    heapq.heappushpop(k_nearest, (-distance, i))
            k_nearest_indices = [neighbour[1] for neighbour in k_nearest]

            y_pred[u_index] = self.get_common_class(self.y[k_nearest_indices])

        return y_pred


In [None]:
# Train the custom classifier on the toy data: Manhattan distance, 3 neighbours.
my_classifier = KNNClassifier(norm='manhattan_norm', k=3)
my_classifier.fit(X, y)

In [66]:
# Two query points: one next to the label-1 samples near the origin and one
# far from it, together with the labels expected for them.
X_test = np.array([[0, 0], [10, 20]])
y_test = np.array([1, 0])

# Show the predicted labels, then the accuracy on the two query points.
y_pred = my_classifier.predict(X_test)
print(y_pred, my_classifier.score(X_test, y_test), sep='\n')

[1 0]
1.0


Test KNN on Iris

In [87]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load Iris as plain arrays and keep 80 % of the rows for training; the fixed
# random_state makes the shuffled split reproducible.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# The scaler is fitted on the training split only; the test split is
# transformed with the statistics learned there.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train.shape

(120, 4)

In [88]:
# Refit the custom classifier on the scaled Iris training split, print which
# test predictions match the true labels, then display the accuracy.
my_classifier.fit(X_train, y_train)
print(y_test == my_classifier.predict(X_test))
my_classifier.score(X_test, y_test)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]


1.0

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# Reference result: scikit-learn's own 3-nearest-neighbours classifier,
# fitted and scored on the same split as the custom classifier.
sklearn_classifier = KNeighborsClassifier(n_neighbors=3)
sklearn_classifier.fit(X_train, y_train)
sklearn_classifier.score(X_test, y_test)

1.0