Let's take KNNClassifier from layer_1 and improve it

Add cosine distance and Jaccard distance

In [16]:
import numpy as np
def cosine_distance(x, y):
    """Return the cosine distance ``1 - cos(angle)`` between two vectors.

    Parameters
    ----------
    x, y : np.ndarray
        Vectors of equal length.

    Returns
    -------
    float
        0 for vectors pointing the same way, 1 for orthogonal vectors and
        2 for opposite vectors. If either vector has zero norm the angle is
        undefined and 1.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # A zero vector has no direction; dividing would produce NaN (0/0),
        # which cannot be ordered against other distances.
        return 1.0
    return 1 - x.dot(y) / norm_product

def jaccard_distance(x: set, y: set):
    """Return the Jaccard distance ``1 - |x & y| / |x | y|`` between two sets.

    Parameters
    ----------
    x, y : set
        Sets to compare.

    Returns
    -------
    float
        0 for identical sets, 1 for disjoint sets. Two empty sets are
        treated as identical and yield 0.0.
    """
    union_size = len(x.union(y))
    if union_size == 0:
        # Both sets are empty: the ratio would be 0/0, but equal sets have
        # zero distance by definition.
        return 0.0
    return 1 - len(x.intersection(y)) / union_size

def euclidian_norm(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``."""
    delta = x - y
    squared_total = np.square(delta).sum()
    return np.sqrt(squared_total)

def manhattan_norm(x, y):
    """Return the Manhattan (L1) distance between ``x`` and ``y``."""
    gap = x - y
    return np.absolute(gap).sum()

# Two vectors of equal length pointing in different directions:
# both the Euclidean and the cosine distance are non-zero.
v1 = np.array([3, 1])
v2 = np.array([1, 3])
print(euclidian_norm(v1, v2))
print(cosine_distance(v1, v2))

# Two vectors pointing in the same direction but with different lengths:
# the Euclidean distance is the same as above, the cosine distance is 0.
v1 = np.array([3, 3])
v2 = np.array([1, 1])
print(euclidian_norm(v1, v2))
print(cosine_distance(v1, v2), 'because it measures vector alignment')


2.8284271247461903
0.40000000000000013
2.8284271247461903
0.0 because it measures vector alignment


Implement weighted KNN (uniform, distance, Gaussian and Epanechnikov kernels)

In [91]:
from sklearn.base import ClassifierMixin, BaseEstimator
import heapq
from collections import Counter

class KNNClassifier(ClassifierMixin, BaseEstimator):
    """k-nearest-neighbours classifier with optional kernel-weighted voting.

    Parameters
    ----------
    norm : {'euclidian', 'manhattan', 'cosine', 'jaccard'}, default='euclidian'
        Name of the distance function used to compare samples.
    k : int, default=3
        Number of nearest neighbours that take part in the vote.
    kernel_type : {'uniform', 'distance', 'gaussian', 'epanechnikov'} or None, default='uniform'
        Weighting applied to each neighbour's vote. ``None`` selects a plain
        (unweighted) majority vote.
    h : float, default=1.0
        Bandwidth of the 'gaussian' and 'epanechnikov' kernels.
    """

    def __init__(self, norm='euclidian', k=3, kernel_type='uniform', h=1.0):
        super().__init__()

        # Every constructor argument is stored under its own name:
        # BaseEstimator.get_params (and therefore repr/clone) reads them back
        # with getattr.
        self.norm = norm
        self.k = k
        self.kernel_type = kernel_type
        self.h = h

        self.func_norm = self._resolve_norm(norm)

    @staticmethod
    def _resolve_norm(norm):
        """Return the distance function registered under the name ``norm``.

        Raises
        ------
        ValueError
            If ``norm`` is not one of the supported names.
        """
        norms = {
            'euclidian': euclidian_norm,
            'manhattan': manhattan_norm,
            'cosine': cosine_distance,
            'jaccard': jaccard_distance,
        }
        if norm not in norms:
            raise ValueError(f'Unknown norm {norm!r}; expected one of {sorted(norms)}')
        return norms[norm]

    def fit(self, X: np.ndarray, y: np.ndarray):
        """Memorise the training samples ``X`` and their labels ``y``.

        Returns
        -------
        self
            The fitted classifier, so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` disagree on the number of samples or are empty.
        """
        if X.shape[0] != y.shape[0]:
            raise ValueError('Wrong data')
        if X.shape[0] == 0:
            raise ValueError('Training set is empty')

        # Resolve again in case `norm` was changed via set_params after
        # construction (e.g. by a grid search working on a clone).
        self.func_norm = self._resolve_norm(self.norm)
        self.X = X
        self.y = y
        return self

    def get_common_class(self, closest_classes: list):
        """Return the most frequent label in ``closest_classes``.

        Ties go to the smallest label, because ``np.unique`` returns the
        labels sorted and ``np.argmax`` picks the first maximum.
        """
        classes, count = np.unique(closest_classes, return_counts=True)
        return classes[np.argmax(count)]

    def _get_weight(self, distance):
        """Return the vote weight of a neighbour located at ``distance``.

        Only the 'distance' kernel (``1 / distance``) diverges at zero; it
        returns ``inf`` there so that an exact match decides the vote. The
        other kernels are finite at zero and are evaluated normally.

        Raises
        ------
        ValueError
            If ``kernel_type`` is not a supported kernel name.
        """
        kernel = self.kernel_type
        if kernel == 'uniform':
            return 1.0
        if kernel == 'distance':
            return float('inf') if distance == 0 else 1.0 / distance
        if kernel == 'gaussian':
            return np.exp(-(distance ** 2) / (2 * self.h ** 2))
        if kernel == 'epanechnikov':
            scaled = distance / self.h
            # The Epanechnikov kernel has compact support: zero outside |u| <= 1.
            return 0.75 * (1 - scaled ** 2) if abs(scaled) <= 1 else 0.0
        raise ValueError(f'Unknown kernel_type {kernel!r}')

    def get_weighted_common_class(self, closest_classes, distances):
        """Return the label with the largest total kernel weight.

        Parameters
        ----------
        closest_classes : sequence
            Labels of the selected neighbours.
        distances : sequence of float
            Distance to each neighbour, aligned with ``closest_classes``.

        Falls back to an unweighted majority vote when ``kernel_type`` is
        ``None`` or when every neighbour receives zero weight.
        """
        if self.kernel_type is None:
            return self.get_common_class(closest_classes)

        class_weights = Counter()
        for neighbour_class, distance in zip(closest_classes, distances):
            weight = self._get_weight(distance)
            if np.isinf(weight):
                # An infinitely weighted neighbour outvotes everything else.
                return neighbour_class
            class_weights[neighbour_class] += weight

        if sum(class_weights.values()) == 0:
            # No neighbour falls inside the kernel support, so the weights
            # carry no information: use the unweighted vote instead.
            return self.get_common_class(closest_classes)

        return max(class_weights, key=class_weights.get)

    def predict(self, U):
        """Predict a label for every row of ``U``.

        Returns
        -------
        np.ndarray
            One predicted label per query, with the dtype of the training
            labels.

        Raises
        ------
        ValueError
            If ``k`` is smaller than 1.
        """
        if self.k < 1:
            raise ValueError('k must be a positive integer')

        n_train = self.X.shape[0]
        # Asking for more neighbours than there are training samples simply
        # means that all of them vote (still subject to kernel weighting).
        n_neighbours = min(self.k, n_train)

        y_pred = np.empty(U.shape[0], dtype=self.y.dtype)
        for u_index, u in enumerate(U):
            distances = [self.func_norm(u, sample) for sample in self.X]
            # Indices of the nearest samples, closest first; equal distances
            # keep the lower index.
            nearest = heapq.nsmallest(n_neighbours, range(n_train), key=distances.__getitem__)
            y_pred[u_index] = self.get_weighted_common_class(
                self.y[nearest],
                [distances[i] for i in nearest],
            )

        return y_pred

In [96]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42, shuffle=True)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so nothing about the test samples reaches preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Compare kernel-weighted voting with a plain majority vote (kernel_type=None)
# at the same, deliberately large, k.
for kernel_type in ('epanechnikov', None):
    my_classifier = KNNClassifier(k=100, kernel_type=kernel_type)
    my_classifier.fit(X_train, y_train)
    print(my_classifier.score(X_test, y_test))


1.0
0.4
