In [11]:
import copy

import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import cdist

class KNNClassifier:
    metric_type = 'minkowski'
    p = None
    kernel = None
    X_train = None
    y_train = None
    annoy_i = None
    n_trees = 10
    
    def __init__(self, kernel=lambda x: 1 - np.abs(x), metric='minkowski', p=2.0):
        self.kernel = kernel
        self.metric_type = metric
        self.p = p
        
    def get_distances(self):
        dists = list()
        
        for i in range(len(self.X_vals)):
            for j in range(i + 1, len(self.X_vals)):
                dists.append(self.metric(self.X_vals[i], self.X_vals[j]))
                
        return np.asarray(dists)
        
    def metric(self, a, b):
        if self.metric_type == 'minkowski':
            return np.sum(np.abs(a - b) ** self.p) ** (1.0 / self.p)
        if self.metric_type == 'cosine':
            return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        if self.metric_type == 'chebyshev':
            return np.max(np.abs(a - b))
        
        raise AttributeError(f'no metric with name {self.metric_type}')
    
    def fit(self, X, y):
        self.X_vals = X.values
        self.y_vals = y.values
        self.X_train = X
        self.y_train = y
        self.counter_pattern = dict()
        
        for label in np.unique(self.y_train):
            self.counter_pattern[label] = 0
    
    def base_predict(self, X, neighboubrs_mapper, element_processor, double_compute=False):
        nn = NearestNeighbors(
            metric=self.metric_type,
            p=self.p
        )
    
        nn.fit(self.X_train)
        dists, neighs = neighboubrs_mapper(nn)(X)
        dists_X, neighs_X = dists, neighs
        if double_compute:
            dists_X, neighs_X = neighboubrs_mapper(nn)(self.X_train)
        
        y_test = list()
        
        for i in range(len(X)):
            pred = element_processor(dists[i], neighs[i], dists_X, neighs_X)
            y_test.append(pred)
            
        return np.asarray(y_test)
    
    def __constant_window_processor(self, dists, neighs, ignored1, ignored2):
        tmp_counter = copy.deepcopy(self.counter_pattern)
            
        # no neigh
        if len(dists) == 0:
            return None, None
            
        for (dist, neigh) in zip(dists, neighs):
            n_type = self.y_vals[neigh]
            
            tmp_counter[n_type] += self.kernel(dist / h)

        pred = max(tmp_counter, key=tmp_counter.get)
        
        return pred
    
    def __float_window_processor(self, dists, neighs, ignored1, ignored2):
        tmp_counter = copy.deepcopy(self.counter_pattern)
        
        for (dist, neigh) in zip(dists[:-2], neighs[:-2]):
            n_type = self.y_vals[neigh]
            tmp_counter[n_type] += self.kernel(dist / dists[-1])
            
        pred = max(tmp_counter, key=tmp_counter.get)
        
        return pred
    
    def __lowess_window_processor(self, dists, neighs, dists_X, neighs_X):
        tmp_counter = copy.deepcopy(self.counter_pattern)
        
        for (dist, neigh) in zip(dists[:-3], neighs[:-3]):
            n_type = self.y_vals[neigh]
            print(dists_X[neigh])
            tmp_counter[n_type] += self.kernel(n_type - self.__float_window_processor(dists_X[neigh][1:], neighs_X[neigh][1:], None, None))
            
        pred = max(tmp_counter, key=tmp_counter.get)
        
        return pred  
    
    def predict_constant_window(self, X, h):
        return self.base_predict(X, lambda nn: lambda x: nn.radius_neighbors(X=x, radius=h), self.__constant_window_processor)
    
    def predict_float_window(self, X, k):
        return self.base_predict(X, lambda nn: lambda x: nn.kneighbors(X=x, n_neighbors=k), self.__float_window_processor)
    def predict_float_lowess_window(self, X, k):
        return self.base_predict(X, lambda nn: lambda x: nn.kneighbors(X=x, n_neighbors=k + 1), self.__lowess_window_processor, double_compute=True)


In [12]:
def base_kernel_constructor(a, b, c):
    return lambda x: c * (1 - np.abs(a) ** a) ** b

default_kernel = base_kernel_constructor(1, 1, 1)

def gaussian_kernel(x):
    return 1 / np.sqrt(2 * np.pi) * np.exp(-0.5 * x**2)

epanechnikov_kernel = base_kernel_constructor(0.75, 2, 1)

def cosine_kernel(x):
    return np.pi / 4 * np.cos(np.pi / 2 * x)

def uniform_kernel(ignored):
    return 0.5

triquadratic_kernel = base_kernel_constructor(35.0/32, 2, 3)


In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import RadiusNeighborsClassifier, KNeighborsClassifier
from sklearn.metrics import mean_squared_error, accuracy_score

df = pd.read_csv('phone_infos.csv')
X = df.drop(columns=['price_range'])
y = df['price_range']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=153)

X_test

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
1955,1515,1,2.1,1,4,1,24,0.9,176,5,6,747,1247,3104,6,5,20,1,0,0
1549,1772,1,1.6,0,17,1,45,0.5,159,2,18,837,1405,1146,6,1,17,1,1,0
188,1905,1,0.5,0,3,1,6,0.5,151,1,5,849,898,1545,9,5,10,1,1,0
1242,989,1,1.6,1,11,1,24,0.6,156,8,17,614,679,2819,16,13,5,1,0,1
1199,894,0,0.7,0,2,1,58,0.1,123,2,3,158,747,3305,12,7,2,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
442,902,1,0.7,0,3,0,3,0.3,134,7,9,675,1285,3411,12,11,15,0,0,1
1484,1689,1,0.5,0,11,0,9,0.5,150,5,15,467,675,2637,8,4,3,1,0,0
1530,571,1,2.0,1,5,1,58,0.6,101,6,11,31,1536,555,19,10,11,1,0,1
385,1880,1,1.8,0,4,1,18,0.7,138,3,5,71,699,3333,17,15,7,1,0,0


In [4]:
h = 1450

knn = KNNClassifier(metric='minkowski', p=10)
knn.fit(X_train, y_train)
y_test1, y_test_proba1 = knn.predict_constant_window(X_test, h)

knn_sklearn = RadiusNeighborsClassifier(radius=h)
knn_sklearn.fit(X_train, y_train)
y_test2 = knn_sklearn.predict(X_test)
y_test_proba2 = knn_sklearn.predict_proba(X_test)

print("ours", accuracy_score(y_test.values, y_test1))
print("theirs", accuracy_score(y_test, y_test2))
print("ours vs theirs", accuracy_score(y_test1, y_test2))

ValueError: too many values to unpack (expected 2)

In [None]:
d = knn.get_distances()

In [None]:
d.mean()

In [None]:
k = 5

knn = KNNClassifier(kernel=epanechnikov_kernel)
knn.fit(X_train, y_train)
y_test1, y_test_proba1 = knn.predict_float_window(X_test, k)

knn_sklearn = KNeighborsClassifier(n_neighbors=k)
knn_sklearn.fit(X_train, y_train)
y_test2 = knn_sklearn.predict(X_test)
y_test3 = knn_sklearn.predict_proba(X_test)

print("ours", accuracy_score(y_test.values, y_test1))
print("theirs", accuracy_score(y_test, y_test2))
print("ours vs theirs", accuracy_score(y_test1, y_test2))
print("ours vs theirs 2", mean_squared_error(y_test_proba1, y_test3))

In [None]:
d = knn.get_distances()

In [None]:
d.mean()

In [None]:
import optuna

def objective_const(trial: optuna.trial.Trial):
    metric = trial.suggest_categorical('metric', ['minkowski', 'cosine', 'chebyshev'])
    p = None
    
    if metric == 'minkowski':
        p = trial.suggest_float('p', 1, 10, log=True)
        
    h = trial.suggest_float('h', 1200, 4000, log=True)
    kernel = trial.suggest_categorical('kernel', ['default', 'epanechnikov', 'gaussian', 'cosine', 'uniform', 'triquadratic'])
    
    knn = KNNClassifier(
        metric=metric,
        p=p,
        kernel=globals()[f'{kernel}_kernel']
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict_constant_window(X_test, h)
    
    return 1 - accuracy_score(y_test, y_pred)

study = optuna.create_study()  
study.optimize(objective_const, n_jobs=-1, n_trials=1000, show_progress_bar=True)
study.best_params

In [5]:
import optuna

def objective_float(trial: optuna.trial.Trial):
    metric = trial.suggest_categorical('metric', ['minkowski', 'cosine', 'chebyshev'])
    p = None
    
    if metric == 'minkowski':
        p = trial.suggest_float('p', 1, 10)
        
    k = trial.suggest_int('k', 1, len(X_train.values) - 1)
    kernel = trial.suggest_categorical('kernel', ['default', 'epanechnikov', 'gaussian', 'cosine', 'uniform', 'triquadratic'])
    
    knn = KNNClassifier(
        metric=metric,
        p=p,
        kernel=globals()[f'{kernel}_kernel']
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict_float_window(X_test, k)
    
    return 1 - accuracy_score(y_test, y_pred)

study = optuna.create_study()  
study.optimize(objective_float, n_jobs=-1, n_trials=1000, show_progress_bar=True)  
study.best_params


[I 2024-04-04 11:23:07,027] A new study created in memory with name: no-name-2cdc0efa-cf21-45e7-ad9d-000a20774a39


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2024-04-04 11:23:07,680] Trial 0 finished with value: 0.08625000000000005 and parameters: {'metric': 'chebyshev', 'k': 43, 'kernel': 'uniform'}. Best is trial 0 with value: 0.08625000000000005.
[I 2024-04-04 11:23:08,799] Trial 4 finished with value: 0.07999999999999996 and parameters: {'metric': 'minkowski', 'p': 1.4269782753571878, 'k': 125, 'kernel': 'cosine'}. Best is trial 4 with value: 0.07999999999999996.
[I 2024-04-04 11:23:08,885] Trial 5 finished with value: 0.11750000000000005 and parameters: {'metric': 'minkowski', 'p': 4.1705359005104405, 'k': 168, 'kernel': 'epanechnikov'}. Best is trial 4 with value: 0.07999999999999996.
[I 2024-04-04 11:23:10,539] Trial 2 finished with value: 0.21250000000000002 and parameters: {'metric': 'minkowski', 'p': 5.632584270249414, 'k': 587, 'kernel': 'epanechnikov'}. Best is trial 4 with value: 0.07999999999999996.
[I 2024-04-04 11:23:10,731] Trial 7 finished with value: 0.565 and parameters: {'metric': 'cosine', 'k': 978, 'kernel': 'triqu

{'metric': 'chebyshev', 'k': 25, 'kernel': 'cosine'}

In [None]:
import optuna

def objective_float_sklearn(trial: optuna.trial.Trial):
    metric = trial.suggest_categorical('metric', ['minkowski', 'cosine', 'chebyshev'])
    p = None
    
    if metric == 'minkowski':
        p = trial.suggest_float('p', 1, 10, log=True)
        
    k = trial.suggest_int('k', 1, len(X_train.values) - 1, log=True)
    
    kernel = trial.suggest_categorical('kernel', ['default', 'epanechnikov', 'gaussian', 'cosine', 'uniform', 'triquadratic'])
    kernel_func = globals()[f'{kernel}_kernel']
    
    def weight(dists):
        weights = list()
        for d in dists:
            weights.append(np.asarray([kernel_func(dist / d[-1]) for dist in d]))
            
        return np.asarray(weights)
    
    knn = KNeighborsClassifier(
        metric=metric,
        p=p,
        weights=weight,
        n_neighbors=k
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    print(y_pred)
    
    return 1 - accuracy_score(y_test.values, y_pred)

study = optuna.create_study()  
study.optimize(objective_float_sklearn, n_jobs=1, n_trials=1000, show_progress_bar=True)  
study.best_params


In [1]:
import optuna

def objective_const_sklearn(trial: optuna.trial.Trial):
    metric = trial.suggest_categorical('metric', ['minkowski', 'cosine', 'chebyshev'])
    p = None
    
    if metric == 'minkowski':
        p = trial.suggest_float('p', 1, 10)
        
    h = trial.suggest_float('h', 1200, 4000)
    kernel = trial.suggest_categorical('kernel', ['default', 'epanechnikov', 'gaussian', 'cosine', 'uniform', 'triquadratic'])
    kernel_func = globals()[f'{kernel}_kernel']

    def weight(dists):        
        weigths = list()
        for dist in dists:
            print(dist)
            weigths.append(kernel_func(dist / h))
            
        return np.asarray(weigths)
    
    knn = RadiusNeighborsClassifier(
        metric=metric,
        p=p,
        weights=weight,
        radius=h
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    return 1 - accuracy_score(y_test, y_pred)

study = optuna.create_study()  
study.optimize(objective_const_sklearn, n_jobs=1, n_trials=1000, show_progress_bar=True)
study.best_params

[I 2024-04-04 11:22:22,997] A new study created in memory with name: no-name-a8a77b9c-5ced-4c76-a5bc-cbb513afd964


  0%|          | 0/1000 [00:00<?, ?it/s]

[W 2024-04-04 11:22:23,017] Trial 0 failed with parameters: {'metric': 'minkowski', 'p': 5.077990485425423, 'h': 3800.308984967217, 'kernel': 'default'} because of the following error: KeyError('default_kernel').
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.10/site-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/bt/5b964mjj6qd8yl08sxkbyv9w0000gn/T/ipykernel_98832/877755744.py", line 12, in objective_const_sklearn
    kernel_func = globals()[f'{kernel}_kernel']
KeyError: 'default_kernel'
[W 2024-04-04 11:22:23,018] Trial 0 failed with value None.


KeyError: 'default_kernel'

In [13]:
nn = KNNClassifier(
    metric='minkowski',
    p=4.0,
    kernel=cosine_kernel
)
nn.fit(X_train, y_train)

k = 25

pred = nn.predict_float_lowess_window(X_test, k)
accuracy_score(y_test, pred)

[  0.         208.81228615 249.05209251 254.48919886 259.42529962
 305.27821589 306.62997376 321.37039153 322.55124984 333.22021624
 333.329993   344.18384478 364.48000599 373.93976567 375.09736347
 375.50630302 377.66829307 388.32390027 391.84339338 396.50945557
 397.58942284 402.44074096 403.51950916 406.4326125  410.31032998
 411.33467837]
[  0.         208.78749734 242.40417856 252.16715952 296.66868881
 305.86359593 325.78868014 335.93594631 340.19129602 349.08978711
 349.66327498 356.01804804 362.86892842 369.07000674 369.74669695
 377.00744709 377.66829307 384.93270958 396.13366919 406.20476211
 408.4459245  410.87501031 413.54368518 425.41322749 425.95865214
 429.60900962]
[  0.         208.78749734 244.47839478 264.12755263 272.86050138
 273.18837096 274.96407462 276.95833215 278.08795968 299.55947951
 321.11116499 338.85352166 339.94135067 342.7132243  359.32870551
 361.13180623 366.50305878 371.61928192 379.0139857  386.98169823
 393.65115208 397.3064056  402.06200437 407.59

0.91

In [None]:
import optuna

def objective_float_lowess(trial: optuna.trial.Trial):
    metric = trial.suggest_categorical('metric', ['minkowski', 'chebyshev'])
    p = None
    
    if metric == 'minkowski':
        p = trial.suggest_float('p', 1, 10)
        
    k = trial.suggest_int('k', 1, len(X_train.values) - 1, log=True)
    kernel = trial.suggest_categorical('kernel', ['default', 'epanechnikov', 'gaussian', 'cosine', 'uniform', 'triquadratic'])
    
    knn = KNNClassifier(
        metric=metric,
        p=p,
        kernel=globals()[f'{kernel}_kernel']
    )
    knn.fit(X_train, y_train)
    y_pred = knn.predict_float_lowess_window(X_test, k)
    
    return 1 - accuracy_score(y_test, y_pred)

study = optuna.create_study()  
study.optimize(objective_float_lowess, n_jobs=-1, n_trials=50, show_progress_bar=True)  
study.best_params
