In [1]:
import optuna
import pandas as pd
import numpy as np

from KNearestNeighbors import KNearestNeighbors
from sklearn.model_selection import train_test_split
from Kernel import Kernel
from Lowess import Lowess
from Metric import Metric

In [2]:
# Seed NumPy's global RNG first so the random subsample below is repeatable.
np.random.seed(2931)

df = pd.read_csv('SpotifyFeatures.csv')

# Shrink the dataset by discarding 230000 randomly chosen rows (drawn without
# replacement); the recorded output below shows 2725 rows remaining.
rows_to_discard = np.random.choice(df.index, 230000, replace=False)
df = df.drop(rows_to_discard).reset_index(drop=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2725 entries, 0 to 2724
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   genre             2725 non-null   object 
 1   artist_name       2725 non-null   object 
 2   track_name        2725 non-null   object 
 3   track_id          2725 non-null   object 
 4   popularity        2725 non-null   int64  
 5   acousticness      2725 non-null   float64
 6   danceability      2725 non-null   float64
 7   duration_ms       2725 non-null   int64  
 8   energy            2725 non-null   float64
 9   instrumentalness  2725 non-null   float64
 10  key               2725 non-null   object 
 11  liveness          2725 non-null   float64
 12  loudness          2725 non-null   float64
 13  mode              2725 non-null   object 
 14  speechiness       2725 non-null   float64
 15  tempo             2725 non-null   float64
 16  time_signature    2725 non-null   object 


In [3]:
# Rewrite the typographic-apostrophe spelling of the genre to the plain-ASCII
# one (presumably the raw data contains both spellings -- confirm).
df.loc[df.genre == 'Children’s Music', 'genre'] = 'Children\'s Music'

# Numeric columns used as model inputs.
point_columns = ['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

# NOTE: despite its name, `features` holds the target labels (genre), while
# `points` holds the numeric input matrix.
features = df.genre.to_numpy()
points = df.loc[:, point_columns].to_numpy()

x_train, x_test, y_train, y_test = train_test_split(points, features, random_state=2931)

# Min-max scaling: the statistics are taken from the training split only and
# then applied to both splits, so no test information is used for scaling.
min_x = np.min(x_train, axis=0)
max_x = np.max(x_train, axis=0)

# A column that is constant in the training split has a zero range; dividing
# by it would fill that column with NaN/inf. Fall back to a divisor of 1 so
# such a column is simply shifted by its minimum.
x_range = np.where(max_x > min_x, max_x - min_x, 1)

x_train = (x_train - min_x) / x_range
x_test = (x_test - min_x) / x_range

***
<h3>KNN</h3>

In [4]:
from sklearn.metrics import accuracy_score

# Baseline for the project's own KNN implementation: 30 neighbours,
# Euclidean metric, Gaussian kernel.
# `knn` and `predictions` are reused later by the LOWESS cell.
knn = KNearestNeighbors(neighbors_count=30, metric=Metric.EUCLIDEAN, kernel=Kernel.GAUSSIAN)
knn.fit(x_train, y_train)

predictions = knn.predict(x_test)

# Accuracy on the held-out test split.
print(accuracy_score(y_test, predictions))

0.2932551319648094


In [5]:
from sklearn.neighbors import KNeighborsClassifier

# Reference baseline: scikit-learn's KNeighborsClassifier with the same
# neighbour count (30); every other setting is left at the library default.
# `lib_knn` and `lib_predictions` are reused later by the LOWESS cell.
lib_knn = KNeighborsClassifier(n_neighbors=30)
lib_knn.fit(x_train, y_train)

lib_predictions = lib_knn.predict(x_test)

# Accuracy on the held-out test split.
print(accuracy_score(y_test, lib_predictions))

0.2829912023460411


In [6]:
def optuna_for_lib(trial):
    """Optuna objective for scikit-learn's KNeighborsClassifier.

    Samples the neighbour count (1..40), the weighting scheme and the distance
    metric from ``trial``, fits a classifier on the training split and returns
    its accuracy on the test split.

    NOTE(review): the score is computed on ``x_test``/``y_test``, i.e. the same
    split that is used for the final reporting -- confirm this is intended.
    NOTE(review): "manhattan" and "cityblock" look like aliases of the same
    metric in scikit-learn -- confirm against the library documentation.
    """
    k = trial.suggest_int("n_neighbors", 1, 40)
    weighting = trial.suggest_categorical("weights", ["uniform", "distance"])
    distance = trial.suggest_categorical("metric", ["euclidean", "cosine", "manhattan", "cityblock", "minkowski"])

    # `p=3` is always passed; it is the Minkowski power parameter.
    candidate = KNeighborsClassifier(n_neighbors=k, weights=weighting, metric=distance, p=3)
    candidate.fit(x_train, y_train)

    return accuracy_score(y_test, candidate.predict(x_test))

# Search the scikit-learn KNN hyperparameters for the highest accuracy.
# The sampler is seeded explicitly so that a fresh "Restart & Run All"
# replays the same sequence of trials; with the default (unseeded) sampler
# the best parameters can differ from run to run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=2931),
)
study.optimize(optuna_for_lib, n_trials=400)

best_params = study.best_params
best_accuracy = study.best_value

print("Best params:", best_params)
print("Best accuracy:", best_accuracy)

Best params: {'n_neighbors': 34, 'weights': 'distance', 'metric': 'cityblock'}
<br>
Best accuracy: 0.3313782991202346

In [7]:
def optuna_for_own(trial):
    """Optuna objective for the project's KNearestNeighbors implementation.

    Samples the neighbour count (1..40), a metric, a kernel (or no kernel) and
    a window width in [0, 1] from ``trial``; a sampled width below 0.01 is
    passed on as ``None``. The model is fitted on the training split and its
    accuracy on the test split is returned.

    NOTE(review): the score is computed on ``x_test``/``y_test``, i.e. the same
    split that is used for the final reporting -- confirm this is intended.
    """
    # Translation tables from the sampled string to the project enums.
    metric_by_name = {
        'cosine': Metric.COSINE,
        'euclidean': Metric.EUCLIDEAN,
        'minkowski': Metric.MINKOWSKI,
    }
    kernel_by_name = {
        'uniform': Kernel.UNIFORM,
        'gaussian': Kernel.GAUSSIAN,
        'triangular': Kernel.TRIANGLE,
        'epanechnikov': Kernel.EPANECHIKOV,
    }

    k = trial.suggest_int("n_neighbors", 1, 40)

    metric_name = trial.suggest_categorical("metric", ["cosine", "euclidean", "minkowski"])
    # Unknown names fall through unchanged, as in an if/elif chain with no else.
    chosen_metric = metric_by_name.get(metric_name, metric_name)

    kernel_name = trial.suggest_categorical("kernel", ["uniform", "gaussian", "triangular", "epanechnikov", None])
    # The `None` choice (no kernel) is not in the table and stays `None`.
    chosen_kernel = kernel_by_name.get(kernel_name, kernel_name)

    sampled_width = trial.suggest_float("window_width", 0, 1)
    # Widths below 0.01 are replaced by None.
    width = None if sampled_width < 0.01 else sampled_width

    candidate = KNearestNeighbors(neighbors_count=k, metric=chosen_metric, kernel=chosen_kernel, window_width=width, minkowski_p=1)
    candidate.fit(x_train, y_train)

    return accuracy_score(y_test, candidate.predict(x_test))

# Search the hyperparameters of the project's own KNN for the highest accuracy.
# The sampler is seeded explicitly so that a fresh "Restart & Run All"
# replays the same sequence of trials; with the default (unseeded) sampler
# the best parameters can differ from run to run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=2931),
)
study.optimize(optuna_for_own, n_trials=1000)

best_params = study.best_params
best_accuracy = study.best_value

print("Best params:", best_params)
print("Best accuracy:", best_accuracy)

Best params: {'n_neighbors': 32, 'metric': 'minkowski', 'kernel': 'gaussian', 'window_width': 0.5157464799797326}
<br>
Best accuracy: 0.3416422287390029

***
<h3>LOWESS</h3>

In [8]:
# LOWESS on top of the project's own KNN: wrap the baseline `knn`, obtain
# corrected weights for the training data, then refit with those weights.
# NOTE(review): `Lowess` is project-local; presumably `correct_weights`
# returns one weight per training object -- confirm against its definition.
lowess = Lowess(knn)
lowess_weights = lowess.correct_weights(x_train, y_train)

# Refit the wrapped model, passing the weights through `object_weights`.
# NOTE(review): if `lowess.knn` is the same object as `knn`, this refit also
# changes the baseline model in place -- confirm.
lowess.knn.fit(x_train, y_train, object_weights=lowess_weights)
lowess_predictions = lowess.knn.predict(x_test)

# `predictions` are the baseline predictions computed before the LOWESS refit.
print('With LOWESS -> ', accuracy_score(y_test, lowess_predictions))
print('Without LOWESS -> ', accuracy_score(y_test, predictions))

In [9]:
# LOWESS on top of the scikit-learn KNN. The classifier is fitted here without
# per-object weights, so the weights are applied by resampling the training
# set instead: each object is drawn with probability proportional to its weight.
# NOTE(review): `Lowess` is project-local; presumably `correct_weights`
# returns one non-negative weight per training object -- confirm.
lowess_lib = Lowess(lib_knn)
lowess_lib_weights = lowess_lib.correct_weights(x_train, y_train)

# Normalise the weights into a probability vector and bootstrap (with
# replacement) a sample of the same size as the training set.
sampling_probabilities = lowess_lib_weights / np.sum(lowess_lib_weights)
train_positions = np.arange(len(x_train))
idx = np.random.choice(train_positions, size=len(x_train), replace=True, p=sampling_probabilities)
x_sampled, y_sampled = x_train[idx], y_train[idx]

# NOTE(review): if `lowess_lib.knn` is the same object as `lib_knn`, this
# refit also changes the baseline model in place -- confirm.
lowess_lib.knn.fit(x_sampled, y_sampled)
lowess_lib_predictions = lowess_lib.knn.predict(x_test)

# `lib_predictions` are the baseline predictions computed before the refit.
print('With LOWESS -> ', accuracy_score(y_test, lowess_lib_predictions))
print('Without LOWESS -> ', accuracy_score(y_test, lib_predictions))

With LOWESS ->  0.29178885630498536
Without LOWESS ->  0.2829912023460411


With LOWESS ->  0.29178885630498536
<br>
Without LOWESS ->  0.2829912023460411