In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Resolve the project root. The original checkout location remains the default,
# but it can be overridden with the SONG_CLUSTER_ROOT environment variable so
# the notebook is not tied to a single machine.
PROJECT_ROOT = os.environ.get(
    "SONG_CLUSTER_ROOT",
    "C:\\Users\\lucac\\Documents\\GitHub\\song-cluster",
)
# Only switch directories when the root actually exists; otherwise the relative
# paths below are resolved against the directory the kernel was started in.
if os.path.isdir(PROJECT_ROOT):
    os.chdir(PROJECT_ROOT)

df = pd.read_csv('data/analysis_data/exploration_data.csv')

# Filter the dataset to include only the specified genres
GENRES = [
    'Classical', 'Electronic', 'Experimental', 'Folk',
    'Hip-Hop', 'Instrumental', 'International', 'Jazz',
    'Pop', 'Rock',
]
df = df[df['genre_top'].isin(GENRES)]

# Feature matrix: every column whose name starts with one of these prefixes.
FEATURE_PREFIXES = ('mfcc_', 'centroid_', 'rolloff_', 'zcr_')
feature_cols = [c for c in df.columns if c.startswith(FEATURE_PREFIXES)]
if not feature_cols:
    # Fail here with a clear message instead of later, inside the scaler/model,
    # on an empty feature matrix.
    raise ValueError(
        f"No feature columns with prefixes {FEATURE_PREFIXES} found in the dataset"
    )

# X holds the selected feature columns, y the genre label of each row.
X = df[feature_cols]
y = df['genre_top']

In [2]:
from sklearn.preprocessing   import StandardScaler
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.metrics         import accuracy_score, classification_report, confusion_matrix

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# genre proportions alike in both partitions; the seed makes the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Standardise the features. The scaler is fitted on the training partition
# only and then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Configure the k-nearest-neighbours classifier. All settings are gathered in
# one mapping so they can be inspected or tweaked in a single place.
knn_settings = dict(
    n_neighbors=25,
    weights='uniform',
    metric='euclidean',
    p=2,
    metric_params=None,
    algorithm='auto',
    leaf_size=30,
    n_jobs=-1,
)
knn = KNeighborsClassifier(**knn_settings)

# Fit on the standardised training features; as the last expression of the
# cell, the fitted estimator is also displayed.
knn.fit(X_train_scaled, y_train)

Accuracy: 0.5426599749058971
               precision    recall  f1-score   support

    Classical       0.50      0.72      0.59       242
   Electronic       0.48      0.65      0.55      1852
 Experimental       0.50      0.49      0.49      2109
         Folk       0.46      0.37      0.41       555
      Hip-Hop       0.51      0.32      0.39       707
 Instrumental       0.39      0.08      0.14       414
International       0.65      0.18      0.29       276
         Jazz       0.75      0.03      0.05       113
          Pop       0.43      0.02      0.04       465
         Rock       0.63      0.79      0.70      2831

     accuracy                           0.54      9564
    macro avg       0.53      0.37      0.37      9564
 weighted avg       0.53      0.54      0.51      9564

Confusion Matrix:
[[ 174    8   47    4    0    1    0    0    0    8]
 [  14 1204  289   37   74    8    0    1    1  224]
 [  68  413 1038   66   35   25   16    0    4  444]
 [  14   55  122  203

In [None]:
# Score the fitted classifier on the held-out, standardised test features.
y_pred = knn.predict(X_test_scaled)

# Compute every metric first, then print them in a fixed order.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(per_class_report)
print("Confusion Matrix:")
print(conf_mat)

Accuracy: 0.5426599749058971
               precision    recall  f1-score   support

    Classical       0.50      0.72      0.59       242
   Electronic       0.48      0.65      0.55      1852
 Experimental       0.50      0.49      0.49      2109
         Folk       0.46      0.37      0.41       555
      Hip-Hop       0.51      0.32      0.39       707
 Instrumental       0.39      0.08      0.14       414
International       0.65      0.18      0.29       276
         Jazz       0.75      0.03      0.05       113
          Pop       0.43      0.02      0.04       465
         Rock       0.63      0.79      0.70      2831

     accuracy                           0.54      9564
    macro avg       0.53      0.37      0.37      9564
 weighted avg       0.53      0.54      0.51      9564

Confusion Matrix:
[[ 174    8   47    4    0    1    0    0    0    8]
 [  14 1204  289   37   74    8    0    1    1  224]
 [  68  413 1038   66   35   25   16    0    4  444]
 [  14   55  122  203

AttributeError: 'KNeighborsClassifier' object has no attribute 'feature_importances_'

In [20]:
# save the model
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('models', exist_ok=True)

# The classifier was fitted on standardised features, so the fitted scaler is
# persisted alongside it; without it the saved model cannot be applied to raw
# feature values.
joblib.dump(scaler, 'models/knn_scaler.pkl')

# Kept as the last expression so the cell still displays the model file list.
joblib.dump(knn, 'models/knn_model.pkl')

['models/knn_model.pkl']