In [19]:
import os
import pandas as pd

# Project root used to resolve the relative data/model paths below.
# Defaults to the original author's checkout; set the SONG_CLUSTER_ROOT
# environment variable to run the notebook on another machine without
# editing this cell.
project_root = os.environ.get(
    "SONG_CLUSTER_ROOT",
    "C:\\Users\\lucac\\Documents\\GitHub\\song-cluster",
)
os.chdir(project_root)

# Replace with the path to your full dataset (relative to the project root)
df = pd.read_csv('data/analysis_data/classifier_data.csv')

# Keep only the tracks whose top-level genre is one of the ten genres
# the classifier is trained on.
genres_of_interest = [
    'Classical', 'Electronic', 'Experimental', 'Folk',
    'Hip-Hop', 'Instrumental', 'International', 'Jazz',
    'Pop', 'Rock',
]
df = df[df['genre_top'].isin(genres_of_interest)]

# Quick peek
print(df.shape)                        # (rows, columns) after filtering
print(df['genre_top'].value_counts())  # number of tracks per genre
print(df.columns.tolist())             # metadata columns, 'genre_top', and the feature columns

(47817, 26)
genre_top
Rock             14155
Experimental     10544
Electronic        9260
Hip-Hop           3536
Folk              2773
Pop               2325
Instrumental      2070
International     1378
Classical         1212
Jazz               564
Name: count, dtype: int64
['track_id', 'title', 'artist_name', 'genre_top', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12', 'mfcc_13', 'mfcc_14', 'mfcc_15', 'mfcc_16', 'centroid_mean', 'centroid_variance', 'rolloff_mean', 'rolloff_variance', 'zcr_mean', 'zcr_variance']


In [16]:
# Train/Test Split
from sklearn.model_selection import train_test_split

# 'genre_top' is the label to predict; the remaining three columns are
# track identifiers/metadata and are excluded from the feature matrix.
target_column = 'genre_top'
metadata_columns = ['track_id', 'title', 'artist_name']

# Target vector and feature matrix
y = df[target_column]
X = df.drop(columns=[target_column, *metadata_columns])

# Hold out 20% of the rows for testing. Stratifying on y keeps the genre
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [17]:
# Baseline Random Forest trained on every feature column.

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hyper-parameters of the baseline model. The tree-shape settings are spelled
# out explicitly; class_weight='balanced' is set to handle class imbalance.
baseline_rf_params = {
    'random_state': 42,           # fixed seed for repeatable forests
    'n_jobs': -1,                 # use all available cores
    'class_weight': 'balanced',   # to handle class imbalance
    'max_depth': None,            # trees are grown without a depth limit
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
}

# Baseline model
baseline_rf = RandomForestClassifier(**baseline_rf_params)

baseline_rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Evaluation: score the baseline forest on the held-out test split
y_pred = baseline_rf.predict(X_test)

print("Baseline Random Forest Classifier")
print("===================================")

# Model configuration and headline accuracy, printed one "label value" line each
summary_rows = [
    ("Number of classes:", len(baseline_rf.classes_)),
    ("Classes:", baseline_rf.classes_),
    ("Number of trees:", baseline_rf.n_estimators),
    ("Max depth:", baseline_rf.max_depth),
    ("Min samples split:", baseline_rf.min_samples_split),
    ("Min samples leaf:", baseline_rf.min_samples_leaf),
    ("Max features:", baseline_rf.max_features),
    ("Accuracy:", accuracy_score(y_test, y_pred)),
]
for stat_label, stat_value in summary_rows:
    print(stat_label, stat_value)

# Per-genre precision / recall / F1
print(classification_report(y_test, y_pred, digits=3))

# Rank the feature columns by the forest's importance score, highest first.
# `importances` is reused by the feature-selection cell further down.
print("Feature importances:")
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': baseline_rf.feature_importances_,
})
importances = importance_table.sort_values(by='importance', ascending=False)

print(importances)

In [None]:
# Retrain the Random Forest with the same hyper-parameters as the baseline,
# using only the 15 features the baseline model ranked as most important.
top_15_features = importances.head(15)['feature'].tolist()
X_train_top_15 = X_train[top_15_features]
X_test_top_15 = X_test[top_15_features]

# Number of features actually selected; used in the printed labels so the
# report always states the real count (it previously said "top 10").
n_selected_features = len(top_15_features)

# Train the model with only the top 15 features
rf_top_15 = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    class_weight='balanced',  # to handle class imbalance, as in the baseline
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
)

rf_top_15.fit(X_train_top_15, y_train)

# Evaluation on the held-out test split, restricted to the same features
y_pred_top_15 = rf_top_15.predict(X_test_top_15)
print(f"Accuracy with top {n_selected_features} features:", accuracy_score(y_test, y_pred_top_15))
print(classification_report(y_test, y_pred_top_15, digits=3))

# Importances as re-estimated by the reduced model, highest first
print(f"Feature importances with top {n_selected_features} features:")
importances_top_15 = pd.DataFrame({
    'feature': top_15_features,
    'importance': rf_top_15.feature_importances_
}).sort_values(by='importance', ascending=False)

print(importances_top_15)

Accuracy with top 10 features: 0.5533249686323714
               precision    recall  f1-score   support

    Classical      0.720     0.595     0.652       242
   Electronic      0.518     0.607     0.559      1852
 Experimental      0.476     0.612     0.535      2109
         Folk      0.494     0.310     0.381       555
      Hip-Hop      0.542     0.321     0.403       707
 Instrumental      0.584     0.126     0.207       414
International      0.758     0.181     0.292       276
         Jazz      1.000     0.027     0.052       113
          Pop      0.556     0.022     0.041       465
         Rock      0.627     0.784     0.697      2831

     accuracy                          0.553      9564
    macro avg      0.627     0.358     0.382      9564
 weighted avg      0.564     0.553     0.521      9564

Feature importances with top 15 features:
              feature  importance
0              mfcc_1    0.086261
1              mfcc_4    0.073371
7             mfcc_16    0.070314

In [None]:
# Save the fitted baseline model for later use
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('models', exist_ok=True)

joblib.dump(baseline_rf, 'models/baseline_rf.pkl')

['models/baseline_rf.pkl']

In [20]:
import joblib

# Reload the persisted baseline forest from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
loaded_rf = joblib.load('models/baseline_rf.pkl')

# Inspect the last decision tree in the forest's list of fitted estimators
final_tree = loaded_rf.estimators_[-1]
tree_structure = final_tree.tree_  # low-level tree object holding the size statistics

print("Final Tree Information:")
print("========================")
print("Number of nodes:", tree_structure.node_count)
print("Max depth:", tree_structure.max_depth)

Final Tree Information:
Number of nodes: 22023
Max depth: 32
