## Initial work

## init (+data)

In [None]:
!pip install matplotlib
!pip install optuna

### imports

In [None]:
import numpy as np
import csv
import pandas as pd

from sklearn.model_selection import train_test_split

### data preparation

download data

In [None]:
# Read the raw table straight from the public GitHub mirror.
df = pd.read_csv(
    "https://github.com/Roni42/just_some_public_data/blob/main/good_disk_new.csv?raw=true"
)

# 'length_minutes' and 'year' carry one extra character on each side of the
# number (presumably a wrapper such as quotes/brackets -- confirm against the
# CSV); drop the first and last character and parse the remainder as int.
df['length_minutes'] = df['length_minutes'].str[1:-1].map(int)
df['year'] = df['year'].str[1:-1].map(int)

# Price rounded to the nearest thousand (adding 500 before the floor division).
df['price_k'] = df['price'].add(500).floordiv(1000).map(int)

df.head(5)

In [None]:
# @title year

from matplotlib import pyplot as plt

# Histogram of the 'year' column; the plot call returns the axes it drew on,
# so the top/right frame lines can be hidden on that object directly.
ax = df['year'].plot(kind='hist', bins=20, title='year')
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Keep only the first 1000 rows; every later cell works on this subset.
df=df.head(1000)

fit transform

In [None]:
from sklearn.preprocessing import StandardScaler

numerical_features = ['length_minutes', 'year', 'number_of_disks', 'num_of_rates']

# `df` was produced by slicing (`head`); take an explicit copy so the column
# assignment below writes into an independent frame instead of a slice
# (avoids SettingWithCopyWarning and accidental aliasing).
df = df.copy()

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/valid/test split further down, so validation/test statistics leak into
# the scaling. Fitting on the training rows only would be the stricter setup.
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
df.head(5)

In [None]:
# @title year

from matplotlib import pyplot as plt

# Histogram of 'year' again, this time after the standardization step.
df['year'].plot(kind='hist', bins=20, title='year')
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Keep only rows whose rounded price is below 13 (thousand), then list the
# remaining distinct price buckets -- these are the class labels used later.
df = df.loc[df['price_k'] < 13]
df['price_k'].unique()

show

In [None]:
# Column dtypes / non-null counts (printed), followed by summary statistics
# (displayed as the cell's result).
df.info()
df.describe()

separate data

In [None]:
# Feature matrix: the four numeric columns; target: the rounded price bucket.
x = df.loc[:, ['number_of_disks', 'length_minutes', 'year', 'num_of_rates']]
y = df['price_k'].tolist()

In [None]:
# Compare the feature-matrix shape with the label list viewed as one column.
print(x.shape, pd.DataFrame(y).shape)

# Turn the label list into a flat 1-D NumPy array and show its shape.
y = np.asarray(y).ravel()
y.shape

In [None]:
# 70 % of the rows go to training; the remaining 30 % are halved into a
# validation and a test part (15 % each). Both splits use a fixed seed.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, shuffle=True, test_size=0.3, random_state=42
)
x_valid, x_test, y_valid, y_test = train_test_split(
    x_temp, y_temp, shuffle=True, test_size=0.5, random_state=42
)

# Sanity check: the three parts together cover every sample exactly once.
assert len(x_train) + len(x_valid) + len(x_test) == len(x)
assert len(y_train) + len(y_valid) + len(y_test) == len(y)

## algos


### lib

Lib version

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib
import matplotlib.pyplot as plt

In [None]:
import optuna


def lib_optimize(trial):
    """Optuna objective for scikit-learn's ``KNeighborsClassifier``.

    Draws ``n_neighbors`` (1..200) and ``weights`` ('uniform' / 'distance')
    from ``trial``, fits the classifier on the notebook-level training split
    and returns its accuracy on the validation split.
    """
    # Dict literals evaluate in order, so the parameters are suggested in the
    # same sequence: n_neighbors first, then weights.
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 200),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
    }
    classifier = KNeighborsClassifier(**params)
    classifier.fit(x_train, y_train)
    return accuracy_score(y_valid, classifier.predict(x_valid))


# Seed the sampler so that a fresh "Restart & Run All" repeats the same search
# and therefore reports the same best parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(lib_optimize, n_trials=400, show_progress_bar=True)

best_params = study.best_params
print(f"Best params: {best_params}")

In [None]:
# Best hyper-parameters found by the study and the validation accuracy they reached.
print(study.best_params, study.best_value)

`{'n_neighbors': 19, 'weights': 'distance'}`

In [None]:
# Refit the configuration noted above (19 neighbours, distance weighting)
# and report its accuracy on the validation split.
model = KNeighborsClassifier(weights='distance', n_neighbors=19)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)
print(accuracy_score(y_valid, y_pred))

`Trial 63 finished with value: 0.6213292117465224 and parameters: {'n_neighbors': 123, 'weights': 'distance'}. Best is trial 63 with value: 0.6213292117465224`

#### own optimise1.31.

In [None]:
neighbors_range = list(range(1, 30))

# Accuracy curves: plain names hold the 'uniform' run, the "1" suffix the
# 'distance' run.
train_scores, valid_scores, test_scores = [], [], []
train_scores1, valid_scores1, test_scores1 = [], [], []

# Each weighting scheme is paired with the three lists that collect its
# train / valid / test accuracies.
weighting_runs = (
    ('uniform', (train_scores, valid_scores, test_scores)),
    ('distance', (train_scores1, valid_scores1, test_scores1)),
)
data_splits = ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))

for n_neighbors in neighbors_range:
    for weighting, curves in weighting_runs:
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weighting)
        model.fit(x_train, y_train)
        for (features, target), curve in zip(data_splits, curves):
            curve.append(accuracy_score(target, model.predict(features)))

plt.figure(figsize=(10, 6))
labelled_curves = (
    (train_scores, 'Train Accuracy'),
    (valid_scores, 'Valid (Test) Accuracy'),
    (test_scores, 'Real Test Accuracy'),
    (train_scores1, 'Train Accuracy | distance'),
    (valid_scores1, 'Valid (Test) Accuracy | distance'),
    (test_scores1, 'Real Test Accuracy | distance'),
)
for curve, label in labelled_curves:
    plt.plot(neighbors_range, curve, label=label)

plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('Impact of Number of Neighbors on Model Accuracy')

plt.legend()
plt.show()

### my version

In [None]:
from scipy import stats
import math

from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd

# Shorthand for the constant pi.
PI = np.pi

# Names accepted by the KNN constructor for its `kernel`, `metric` and
# `window` arguments (the constructor asserts membership in these lists).
kernels = ['uniform', 'gaussian', 'triangular', 'epanechnikov']
metrics = ['manhattan', 'euclidean', 'cosine']
windows = ['fixed', 'mutable']


class KNN:
    """Kernel-weighted nearest-neighbour classifier.

    Every neighbour of a query point votes for its own label with weight
    ``kernel(distance / width) * sample_weight``; the label with the largest
    total vote is predicted.

    * ``window='mutable'``: the ``n_neighbors`` closest training points vote
      and ``width`` is the distance to the ``(n_neighbors + 1)``-th closest one.
    * ``window='fixed'``: every training point within ``radius`` votes and
      ``width`` is ``radius``. The Gaussian kernel never reaches zero, so with
      it all training points vote.

    Args:
        kernel: one of the module-level ``kernels``.
        n_neighbors: neighbour count for the 'mutable' window.
        window: one of the module-level ``windows``.
        metric: one of the module-level ``metrics``.
        radius: window width for the 'fixed' window; may also be supplied
            later through ``predict(..., radius=...)``.
    """

    _x: pd.DataFrame = None
    _y: np.ndarray = None

    # Kernel profiles K(u) with u = distance / window width.
    _KERNELS = {
        # Constant inside the window (the previous version multiplied by u,
        # which gave the closest neighbours the smallest weight).
        'uniform': lambda u: 0.5 * (np.abs(u) < 1.0),
        'triangular': lambda u: (1 - np.abs(u)) * (np.abs(u) < 1.0),
        'epanechnikov': lambda u: (3 / 4 * (1 - u ** 2)) * (np.abs(u) < 1.0),
        'gaussian': lambda u: 1 / np.sqrt(2 * np.pi) * np.exp(u ** 2 / -2),
    }

    def __init__(
            self,
            kernel: str = 'gaussian',
            n_neighbors: int = 3,
            window: str = 'mutable',
            metric: str = 'euclidean',
            radius: float = None,
    ):
        assert kernel in kernels, f"kernel must be in {kernels}"
        self.ker_name = kernel
        self.kernel = self._KERNELS[kernel]
        self.n_neighbors = n_neighbors
        assert window in windows, f"window must be in {windows}"
        self.window = window
        assert metric in metrics, f"metric must be in {metrics}"
        # Always defined, so the 'fixed' window reports a missing radius via
        # its assertion instead of an AttributeError.
        self.radius = radius
        # State of the most recent one_iter() call, consumed by diff().
        self.last_res = {}
        self.last_sum = 0.0
        # The three metric names are scikit-learn built-ins. The hand-written
        # 'manhattan' callable used before actually computed the Chebyshev
        # (max) distance, and Python callables are far slower.
        self.NN = NearestNeighbors(metric=metric)

    def _resolve_weights(self, weights):
        """Return per-training-sample weights as a float array (all ones if None)."""
        if weights is None:
            return np.ones(len(self._y))
        # Lists cannot be indexed with an index array, so convert first.
        weights = np.asarray(weights, dtype=float)
        assert len(weights) == len(self._y), "weights must have one entry per training sample"
        return weights

    def fit(self, x: pd.DataFrame, y: np.array):
        """Store the labels and index the training points.

        Args:
            x: training features, DataFrame or 2-D array-like.
            y: training labels, one per row of ``x``.

        Returns:
            self
        """
        features = np.asarray(x)
        labels = np.asarray(y)
        assert len(features) == len(labels), "x and y must be the same len"
        assert len(features) > 0, "x and y must be > 0 in len"
        self._y = labels
        self.NN.fit(features)
        return self

    def one_iter(self, x, n_neighbors: int = None, weights: np.ndarray = None):
        """Classify a single query point.

        Args:
            x: a 2-D container holding exactly one query row, e.g. ``[row]``.
            n_neighbors: if given, replaces the stored neighbour count.
            weights: optional per-training-sample multipliers.

        Returns:
            The label with the largest accumulated vote. The per-label votes
            and their sum are kept in ``last_res`` / ``last_sum`` for diff().
        """
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors
        weights = self._resolve_weights(weights)

        if self.window == "fixed":
            assert self.radius is not None, "radius mast be not None"
            div = self.radius
            # The Gaussian kernel has unbounded support: search everywhere but
            # keep scaling by the configured radius. A local variable is used
            # so self.radius is not overwritten for later queries.
            search_radius = np.inf if self.ker_name == 'gaussian' else self.radius
            dist, ind = self.NN.radius_neighbors(x, search_radius, return_distance=True)
            ndist, nind = np.asarray(dist[0], dtype=float), ind[0]
            assert len(ndist) != 0, "radius not enought"
        else:
            assert self.n_neighbors > 0, "neighbours must be >= 1"
            # One extra neighbour is fetched only to define the window width.
            dist, ind = self.NN.kneighbors(x, self.n_neighbors + 1, return_distance=True)
            div = dist[0][-1]
            ndist, nind = np.asarray(dist[0][:-1], dtype=float), ind[0][:-1]

        # A zero width means every considered neighbour coincides with the
        # query; treat them all as distance 0 instead of dividing by zero.
        scaled = ndist / div if div > 0 else np.zeros_like(ndist)

        # Final vote of each neighbour.
        m_dist = self.kernel(scaled) * weights[nind]
        classes = self._y[nind]
        res_w = {}
        for label, vote in zip(classes, m_dist):
            res_w[label] = res_w.get(label, 0) + vote
        self.last_sum = m_dist.sum()
        self.last_res = res_w
        return max(res_w, key=res_w.get)

    def predict(self, x, weights: np.ndarray = None, radius: float = None, n_neighbors: int = None):
        """Classify every row of ``x``.

        Args:
            x: query features, DataFrame or 2-D array-like.
            weights: optional per-training-sample multipliers.
            radius: if given, replaces the stored 'fixed' window radius.
            n_neighbors: if given, replaces the stored neighbour count.

        Returns:
            A list with one predicted label per row.
        """
        if n_neighbors is not None:
            self.n_neighbors = n_neighbors
        if radius is not None:
            self.radius = radius
        weights = self._resolve_weights(weights)
        return [self.one_iter([row], weights=weights) for row in np.asarray(x)]

    def diff(self):
        """Return each label's share of the total vote from the last one_iter().

        A new dict is returned and the stored votes are left untouched, so
        repeated calls give the same answer. If the total vote is zero every
        share is reported as 0.0.
        """
        total = self.last_sum
        if total == 0:
            return {label: 0.0 for label in self.last_res}
        return {label: vote / total for label, vote in self.last_res.items()}


Tester:

In [None]:
# Quick check of the hand-written classifier on the validation split.
# With window='mutable' the neighbourhood is defined by n_neighbors; the
# radius passed to predict() is only stored on the model.
model = KNN(kernel='triangular', metric='euclidean', window='mutable', n_neighbors=13)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid, radius=1.2)
accuracy_score(y_valid, y_pred)


In [None]:
import optuna
# from KNNrealisation import KNN

def my_optimize(trial):
    """Optuna objective for the hand-written ``KNN`` classifier.

    Samples kernel, metric and window type, plus the neighbour count
    ('mutable' window) or the radius ('fixed' window), fits on the
    notebook-level training split and returns validation accuracy.

    Raises:
        optuna.TrialPruned: when a 'fixed' window leaves some validation
            point without any neighbour (KNN signals this with an assertion).
    """
    # NOTE(review): for the 'mutable' window this value is overridden by the
    # separately sampled 'neighbors' below; it is kept so the study's
    # parameter names stay unchanged.
    n_neighbors = trial.suggest_int("n_neighbors", 1, 100)
    kernel = trial.suggest_categorical("kernel", ["uniform", "triangular", "epanechnikov", "gaussian"])
    metric = trial.suggest_categorical("metric", ["manhattan", "euclidean", "cosine"])
    window = trial.suggest_categorical("window", ["fixed", "mutable"])
    model = KNN(
        kernel=kernel,
        n_neighbors=n_neighbors,
        metric=metric,
        window=window)
    model.fit(x_train, y_train)

    # predict()'s second positional parameter is `weights`; the kernel name
    # must not be passed there (it is already configured on the model).
    if window == 'mutable':
        neighbors = trial.suggest_int('neighbors', 2, 20)
        res = model.predict(x_valid, n_neighbors=neighbors)
    else:
        radius = trial.suggest_float('radius', 1.2, 3)
        try:
            res = model.predict(x_valid, radius=radius)
        except AssertionError:
            # Radius too small for at least one query: discard this trial
            # instead of aborting the whole study.
            raise optuna.TrialPruned()

    return accuracy_score(y_valid, res)



# Seed the sampler so that a fresh "Restart & Run All" repeats the same search.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(my_optimize, n_trials=400, show_progress_bar=True)

best_params = study.best_params
print("params:", study.best_params, "\nvalue:", study.best_value)


In [None]:
# Full record of the best trial, then its parameters and score, one per line.
print(study.best_trial, study.best_params, study.best_value, sep="\n")

`Best params: {'n_neighbors': 78, 'kernel': 'epanechnikov', 'metric': 'euclidean'}`

In [None]:
# Evaluate a fixed configuration of the hand-written classifier on the
# validation split (window='mutable', so the radius is only stored).
model = KNN(kernel='gaussian', metric='euclidean', window='mutable', n_neighbors=19)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid, radius=1.2)
accuracy_score(y_valid, y_pred)


#### my optimiser

In [None]:
neighbors_range = list(range(3, 30))

train_scores, valid_scores, test_scores = [], [], []
train_scores1, valid_scores1, test_scores1 = [], [], []

# One model is fitted once; only the neighbour count changes per prediction.
model = KNN()
model.fit(x_train, y_train)

# Each split is paired with the list that collects its accuracy curve.
evaluation_plan = (
    (x_train, y_train, train_scores),
    (x_valid, y_valid, valid_scores),
    (x_test, y_test, test_scores),
)

for n_neighbors in neighbors_range:
    for features, target, curve in evaluation_plan:
        predicted = model.predict(features, n_neighbors=n_neighbors)
        curve.append(accuracy_score(target, predicted))


plt.figure(figsize=(10, 6))
for curve, label in (
    (train_scores, 'Train Accuracy'),
    (valid_scores, 'Valid (Test) Accuracy'),
    (test_scores, 'Real Test Accuracy'),
):
    plt.plot(neighbors_range, curve, label=label)

plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('Impact of Number of Neighbors on Model Accuracy')

plt.legend()
plt.show()

### lowess

In [None]:
def lowess(model, x=None, y=None, n_neighbors=10):
    """Leave-one-out reliability weights for the training samples.

    For every sample the model is refitted on all *other* samples, the held-out
    sample is classified, and the share of the vote that went to its true label
    becomes its weight (0.0 when the true label got no vote at all).

    Args:
        model: object with ``fit(x, y)``, ``one_iter(x, n_neighbors=...)`` and
            ``diff()`` (the notebook's ``KNN``).
        x: training features; defaults to the notebook-level ``x_train``.
        y: training labels; defaults to the notebook-level ``y_train``.
        n_neighbors: neighbour count used for the held-out prediction.

    Returns:
        A list with one weight per training sample, in training order.

    Note:
        ``model`` is left fitted on the data without the last sample; refit it
        on the full training set before using it for prediction.
    """
    if x is None:
        x = x_train
    if y is None:
        y = y_train
    # Plain arrays make positional deletion well defined (np.delete on a
    # DataFrame would not keep the frame).
    features = np.asarray(x)
    labels = np.asarray(y)

    w = []
    for i in range(len(features)):
        rest_x = np.delete(features, i, axis=0)
        rest_y = np.delete(labels, i, axis=0)
        # fit() is given a DataFrame, the input type it was written for.
        model.fit(pd.DataFrame(rest_x), rest_y)

        model.one_iter([features[i]], n_neighbors=n_neighbors)
        shares = model.diff()
        w.append(shares.get(labels[i], 0.0))
    return w

In [None]:
model = KNN()
model.fit(x_train, y_train)

# Leave-one-out pass: refit without sample i, classify sample i, and keep the
# share of the vote that its true label received as that sample's weight.
w = []
x_train_a = np.array(x_train)
for i in range(len(x_train)):
    x_test1 = x_train_a[i]
    y_test1 = y_train[i]
    x_train1 = np.delete(x_train_a, i, axis=0)
    y_train1 = np.delete(y_train, i, axis=0)
    model.fit(pd.DataFrame(x_train1), y_train1)

    res = model.one_iter([x_test1], n_neighbors=10)
    d = model.diff()
    coef = d.get(y_test1, 0.0)
    w.append(coef)

# The weights are later indexed with an array of neighbour indices inside
# KNN, which a plain Python list does not support -- store them as an ndarray.
w = np.asarray(w, dtype=float)



In [None]:
neighbors_range = list(range(3, 30))

# Plain names: unweighted run; "1" suffix: run with the leave-one-out weights.
train_scores, valid_scores, test_scores = [], [], []
train_scores1, valid_scores1, test_scores1 = [], [], []

model = KNN()
model.fit(x_train, y_train)

data_splits = ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
plain_curves = (train_scores, valid_scores, test_scores)
weighted_curves = (train_scores1, valid_scores1, test_scores1)

for n_neighbors in neighbors_range:
    # Baseline: every training sample counts equally.
    for (features, target), curve in zip(data_splits, plain_curves):
        predicted = model.predict(features, n_neighbors=n_neighbors)
        curve.append(accuracy_score(target, predicted))

    # after: the same predictions with the per-sample weights applied
    for (features, target), curve in zip(data_splits, weighted_curves):
        predicted = model.predict(features, n_neighbors=n_neighbors, weights=w)
        curve.append(accuracy_score(target, predicted))


plt.figure(figsize=(10, 6))
for curve, label in (
    (train_scores, 'Train Accuracy'),
    (valid_scores, 'Valid (Test) Accuracy'),
    (test_scores, 'Real Test Accuracy'),
    (train_scores1, 'Train Accuracy | lowess'),
    (valid_scores1, 'Valid (Test) Accuracy | lowess'),
    (test_scores1, 'Real Test Accuracy | lowess'),
):
    plt.plot(neighbors_range, curve, label=label)

plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.title('Impact of Number of Neighbors on Model Accuracy')

plt.legend()
plt.show()