In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import pairwise_distances

import utils.datasets_utils as datasets_utils
import utils.classification_utils as classification_utils

# Notebook-wide seaborn defaults: "Set2" colour palette and the "whitegrid" axes style.
sns.set_palette("Set2")
sns.set_style("whitegrid")

We will experiment with two datasets: `spectral_modified` with the cosine distance (because of its high dimensionality) and `umap_modified` with the Euclidean distance (15 dimensions). We use the modified variant of the spectral features because irrelevant attributes add noise to the proximity measure and redundant attributes bias it towards certain attributes.

In [5]:
# Load every available dataset, then keep only the two this notebook works with.
datasets = datasets_utils.load_datasets()
sub_datasets = (
    dataset
    for dataset in datasets
    if dataset.name in {'spectral_modified', 'umap_modified'}
)
datasets_wrapper = datasets_utils.DatasetsWrapper(sub_datasets)

In [6]:
# Display the shape of each split (train/val/test, X and y) for every wrapped dataset.
datasets_wrapper.get_shapes()

Unnamed: 0,train_X,val_X,test_X,train_y,val_y,test_y
spectral_modified,"(23114, 189)","(2397, 189)","(5993, 189)","(23114,)","(2397,)","(5993,)"
umap_modified,"(23114, 15)","(2397, 15)","(5993, 15)","(23114,)","(2397,)","(5993,)"


Since we will be using the Euclidean distance, bringing the features of the dataset to the same scale is required.

In [8]:
# Hand a MinMaxScaler to the wrapper so the wrapped datasets are rescaled.
# NOTE(review): presumably the wrapper fits the scaler per dataset on the training split
# and reuses it for val/test — confirm in datasets_utils.
scaler = MinMaxScaler()
datasets_wrapper.scale_datasets(scaler=scaler)

# 1. KNN on Spectral Features - Dataset

In [11]:
# Fetch the 'spectral_modified' dataset from the wrapper; this section works on it exclusively.
spectral_dataset = datasets_wrapper.get_dataset('spectral_modified')

The genre 'Easy Listening' is underrepresented (only 15 tracks in the training set belong to this genre, see the counts below) and hence the $k$ parameter in our KNN classifier should be roughly around $10$.
We will experiment with different values.

In [12]:
# The three least frequent genres among the training labels.
spectral_dataset.train.y.value_counts().sort_values().head(3)

(genre, Unnamed: 253_level_1, Unnamed: 253_level_2)
Easy Listening     15
Blues             200
Spoken            200
Name: count, dtype: int64

We will precompute the distances, to be able to experiment with different hyperparameters more efficiently

In [14]:
def calculate_distances(X, metric, Y=None, n_jobs=5):
    """Compute a pairwise-distance matrix labelled with the samples' index values.

    Parameters
    ----------
    X : pandas.DataFrame
        Query samples, one row per sample. Its index labels the rows of the result.
    metric : str or callable
        Distance metric, forwarded unchanged to ``sklearn.metrics.pairwise_distances``.
    Y : pandas.DataFrame, optional
        Reference samples. Its index labels the columns of the result. When omitted,
        distances are computed between the rows of ``X`` itself (square matrix).
        A KNN fitted with ``metric='precomputed'`` needs ``Y`` = training samples
        when building the matrices used for prediction.
    n_jobs : int, default 5
        Number of parallel jobs, forwarded to ``pairwise_distances``.

    Returns
    -------
    pandas.DataFrame
        Distances with ``X.index`` as rows and ``Y.index`` (or ``X.index``) as columns.
    """
    column_labels = X.index if Y is None else Y.index
    distances = pairwise_distances(X, Y, metric=metric, n_jobs=n_jobs)
    return pd.DataFrame(distances, index=X.index, columns=column_labels)

In [None]:
# Cosine distances between every pair of training samples (rows and columns are both the training index).
spectral_train_distances = calculate_distances(spectral_dataset.train.X, metric='cosine')

In [None]:
# A KNN fitted with metric='precomputed' must be queried with distances from each
# validation sample to every TRAINING sample, i.e. shape (n_val, n_train).
# A val-vs-val matrix has the wrong number of columns and is rejected at predict time.
spectral_val_distances = pd.DataFrame(
    pairwise_distances(
        spectral_dataset.val.X,
        spectral_dataset.train.X,
        metric='cosine',
        n_jobs=5,
    ),
    index=spectral_dataset.val.X.index,
    columns=spectral_dataset.train.X.index,
)

In [None]:
# A KNN fitted with metric='precomputed' must be queried with distances from each
# test sample to every TRAINING sample, i.e. shape (n_test, n_train).
# A test-vs-test matrix has the wrong number of columns and is rejected at predict time.
spectral_test_distances = pd.DataFrame(
    pairwise_distances(
        spectral_dataset.test.X,
        spectral_dataset.train.X,
        metric='cosine',
        n_jobs=5,
    ),
    index=spectral_dataset.test.X.index,
    columns=spectral_dataset.train.X.index,
)

In [None]:
# Bundle the precomputed distance matrices with the original labels of each split,
# so the classifiers can consume distances in place of raw features.
spectral_distances_dataset = datasets_utils.create_dataset(
    name='spectral_distances_dataset',
    train_X=spectral_train_distances,
    train_y=spectral_dataset.train.y,
    val_X=spectral_val_distances,
    val_y=spectral_dataset.val.y,
    test_X=spectral_test_distances,
    test_y=spectral_dataset.test.y,
)

In [None]:
# One distance-weighted KNN per candidate neighbourhood size k = 7..16 (centred on
# roughly 10, the value motivated by the rarest genre).
# algorithm must be 'brute': scikit-learn only supports metric='precomputed' with the
# brute-force search; 'kd_tree' raises a ValueError when the classifier is fitted.
spectral_classifiers_list = [
    classification_utils.ClassifierFactory.create_instance(
        dataset=spectral_distances_dataset,
        classifier=KNeighborsClassifier(
            n_neighbors=k,
            weights='distance',
            algorithm='brute',
            metric='precomputed',
        ),
    )
    for k in range(7, 17)
]

spectral_classifiers = classification_utils.ClassifiersCollection(spectral_classifiers_list)

In [None]:
# Fit every classifier held by the collection.
spectral_classifiers.fit_classifiers()

In [None]:
# TODO: placeholder — the loop visits every fitted classifier but does nothing yet;
# per-classifier evaluation still has to be added here.
for classifier in spectral_classifiers.get_all_classifiers():
    pass