In [1]:
import os
import pandas as pd

os.chdir("C:\\Users\\lucac\\Documents\\GitHub\\song-cluster")

# Replace with the path to your full dataset
df = pd.read_csv('data/analysis_data/classifier_data.csv')

# Filter the dataset to include only the specified genres
df = df[df['genre_top'].isin([
    'Classical', 'Electronic', 'Experimental', 'Folk', 
    'Hip-Hop', 'Instrumental', 'International', 'Jazz', 
    'Pop', 'Rock'])]

# Quick peek
print(df.shape)           # e.g. (5000,  ? )
print(df['genre_top'].value_counts())
print(df.columns.tolist())  # should list your feature columns + 'genre'

(47817, 26)
genre_top
Rock             14155
Experimental     10544
Electronic        9260
Hip-Hop           3536
Folk              2773
Pop               2325
Instrumental      2070
International     1378
Classical         1212
Jazz               564
Name: count, dtype: int64
['track_id', 'title', 'artist_name', 'genre_top', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'centroid_mean', 'centroid_variance', 'rolloff_mean', 'rolloff_variance', 'zcr_mean', 'zcr_variance']


In [33]:
# Train/Test Split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Features vs target
X = df.drop(columns=['genre_top', 'track_id', 'title', 'artist_name'])
y = df['genre_top']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Encode the target variable
le = LabelEncoder()

y_train_coded = le.fit_transform(y_train)
y_test_coded = le.fit_transform(y_test)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Instantiate Classifier Object
xgb = XGBClassifier(
    use_label_encoder=False,    # silence warnings
    eval_metric='map',     # multiclass loss
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # handle class imbalance
)

# 2. Fit on training data (top_feats as before)
xgb.fit(X_train, y_train_coded)

Parameters: { "class_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [37]:
# 3. Predict & evaluate
y_pred_xgb = xgb.predict(X_test)

# 1) Accuracy on encoded labels
print("XGBoost Accuracy (coded):", accuracy_score(y_test_coded, y_pred_xgb))

# 2) Decode back to genre strings and show report using labels
y_pred_labels = le.inverse_transform(y_pred_xgb)
y_true_labels = y_test.values

print("Classification Report (genre labels):")
print(classification_report(y_true_labels, y_pred_labels, digits=3))

# 3) Feature importances
print("XGBoost Feature Importances:")
importances = xgb.feature_importances_
importances_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values(by='importance', ascending=False)
print(importances_df)

XGBoost Accuracy (coded): 0.5644081974069427
Classification Report (genre labels):
               precision    recall  f1-score   support

    Classical      0.714     0.640     0.675       242
   Electronic      0.536     0.593     0.563      1852
 Experimental      0.492     0.602     0.542      2109
         Folk      0.474     0.382     0.423       555
      Hip-Hop      0.510     0.426     0.464       707
 Instrumental      0.405     0.159     0.229       414
International      0.681     0.225     0.338       276
         Jazz      0.583     0.062     0.112       113
          Pop      0.277     0.039     0.068       465
         Rock      0.659     0.781     0.715      2831

     accuracy                          0.564      9564
    macro avg      0.533     0.391     0.413      9564
 weighted avg      0.548     0.564     0.540      9564

XGBoost Feature Importances:
              feature  importance
2              mfcc_3    0.104649
0              mfcc_1    0.078157
17  centroid_

In [39]:
# Retrain but top 10 features
top_10_features = importances_df.head(10)['feature'].tolist()
X_train_top_10 = X_train[top_10_features]
X_test_top_10 = X_test[top_10_features]

# 1. Instantiate
xgb_top_10 = XGBClassifier(
    use_label_encoder=True,    # silence warnings
    eval_metric='map',     # mean average precision
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # handle class imbalance
)

# 2. Fit on training data (top_feats as before)
xgb_top_10.fit(X_train_top_10, y_train_coded)

Parameters: { "class_weight", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [43]:
# 3. Predict & evaluate
y_pred_xgb_top_10 = xgb_top_10.predict(X_test_top_10)

# 1) Accuracy
print(f"XGBoost (Top 10 Features) Accuracy: {accuracy_score(y_test_coded, y_pred_xgb_top_10):.3f}")

# 2) Encoding table: class number → genre label
encoding_df = pd.DataFrame({
    'class': range(len(le.classes_)),
    'label': le.classes_
})

# save encoding table to CSV
encoding_df.to_csv('models/encoding_table.csv', index=False)

# 3) Classification report with labels
print("\nClassification Report (Top 10 Features):")
print(classification_report(
    y_test_coded,
    y_pred_xgb_top_10,
    target_names=le.classes_,
    digits=3
))

# 4) Feature importances
print("XGBoost (Top 10 Features) Importances:")
importances_top_10_df = pd.DataFrame({
    'feature': top_10_features,
    'importance': xgb_top_10.feature_importances_
}).sort_values('importance', ascending=False)
print(importances_top_10_df)

XGBoost (Top 10 Features) Accuracy: 0.535

Classification Report (Top 10 Features):
               precision    recall  f1-score   support

    Classical      0.638     0.603     0.620       242
   Electronic      0.513     0.567     0.538      1852
 Experimental      0.477     0.586     0.526      2109
         Folk      0.412     0.328     0.365       555
      Hip-Hop      0.455     0.383     0.416       707
 Instrumental      0.400     0.135     0.202       414
International      0.500     0.127     0.202       276
         Jazz      0.125     0.009     0.017       113
          Pop      0.259     0.015     0.028       465
         Rock      0.625     0.754     0.684      2831

     accuracy                          0.535      9564
    macro avg      0.440     0.351     0.360      9564
 weighted avg      0.509     0.535     0.506      9564

XGBoost (Top 10 Features) Importances:
             feature  importance
0             mfcc_3    0.158710
1             mfcc_1    0.127140
2  ce

In [29]:
# save baseline XGboost (best performance) model
import joblib

joblib.dump(xgb, 'models/xgboost_model.pkl')

['models/xgboost_model.pkl']

In [7]:
import joblib

# Load the saved XGBoost model
xgb_loaded = joblib.load('models/xgboost_model.pkl')

# Retrieve and print key information about the final constructed tree
print("Number of trees:", xgb_loaded.get_booster().num_boosted_rounds())
print("Feature names:", xgb_loaded.feature_names_in_)
print("Model parameters:", xgb_loaded.get_booster().attributes())

Number of trees: 100
Feature names: ['mfcc_1' 'mfcc_2' 'mfcc_3' 'mfcc_4' 'mfcc_5' 'mfcc_6' 'mfcc_7' 'mfcc_8'
 'mfcc_9' 'mfcc_10' 'mfcc_11' 'mfcc_12' 'mfcc_13' 'mfcc_14' 'mfcc_15'
 'mfcc_16' 'centroid_mean' 'centroid_variance' 'rolloff_mean'
 'rolloff_variance' 'zcr_mean' 'zcr_variance']
Model parameters: {}
