In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Silence every warning issued through the `warnings` module for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings (e.g. failed CV scoring,
# deprecations, convergence) are hidden as well — consider narrowing it to specific categories.
warnings.simplefilter("ignore")

# Default (width, height) in inches for matplotlib figures created after this point.
plt.rcParams['figure.figsize'] = (12, 5)

### Build Dataset
- Feature extraction with Librosa for model inputs (X_train/X_test)
- Labelling tracks with genre and echonest tags (Y_train/Y_test)

In [2]:
## BUILD DATASET FROM SCRATCH ##
# Extracts librosa features from one audio file, summarises every frame-wise
# feature matrix by its mean and standard deviation along axis 1, and writes
# each feature list to features/*.npy (the files read back by the loading cell).

# One list per feature family; each list receives one entry per processed track.
feats_mean_mfcc = []
feats_std_mfcc = []
feats_mean_chroma = []
feats_std_chroma = []
feats_tempo = []
feats_mean_contrast = []
feats_std_contrast = []
feats_mean_mel = []
feats_std_mel = []
feats_mean_tonnetz = []
feats_std_tonnetz = []
feats_mean_rms = []
feats_std_rms = []
feats_mean_zcr = []
feats_std_zcr = []

fname = 'happy_pharrel.mp3'

# Load the audio file and extract the features
audio, srate = librosa.load(fname, res_type='kaiser_fast')
mfcc_matrix = librosa.feature.mfcc(y=audio, sr=srate, n_mfcc=40)
chroma_cqt = librosa.feature.chroma_cqt(y=audio, sr=srate, n_chroma=12)
tempo, _ = librosa.beat.beat_track(y=audio, sr=srate)
contrast = librosa.feature.spectral_contrast(y=audio, sr=srate, n_bands=6)
mel = librosa.feature.melspectrogram(y=audio, sr=srate, n_mels=64)
tonnetz = librosa.feature.tonnetz(y=audio, sr=srate)
rms = librosa.feature.rms(y=audio)
zcr = librosa.feature.zero_crossing_rate(y=audio)

# Collapse axis 1 of every feature matrix into a mean and a standard deviation
mean_mfcc = np.mean(mfcc_matrix, axis=1)
std_mfcc = np.std(mfcc_matrix, axis=1)
mean_chroma = np.mean(chroma_cqt, axis=1)
std_chroma = np.std(chroma_cqt, axis=1)
mean_contrast = np.mean(contrast, axis=1)
std_contrast = np.std(contrast, axis=1)
mean_mel = np.mean(mel, axis=1)
std_mel = np.std(mel, axis=1)
mean_tonnetz = np.mean(tonnetz, axis=1)
std_tonnetz = np.std(tonnetz, axis=1)
mean_rms = np.mean(rms, axis=1)
std_rms = np.std(rms, axis=1)
mean_zcr = np.mean(zcr, axis=1)
std_zcr = np.std(zcr, axis=1)

feats_mean_mfcc.append(mean_mfcc)
feats_std_mfcc.append(std_mfcc)
feats_mean_chroma.append(mean_chroma)
feats_std_chroma.append(std_chroma)
feats_tempo.append(tempo)
feats_mean_contrast.append(mean_contrast)
feats_std_contrast.append(std_contrast)
feats_mean_mel.append(mean_mel)
feats_std_mel.append(std_mel)
feats_mean_tonnetz.append(mean_tonnetz)
feats_std_tonnetz.append(std_tonnetz)
feats_mean_rms.append(mean_rms)
feats_std_rms.append(std_rms)
feats_mean_zcr.append(mean_zcr)
feats_std_zcr.append(std_zcr)

# save to disk to avoid re-computing
# np.save does not create missing directories, so make sure the target exists first.
os.makedirs('features', exist_ok=True)
np.save('features/feats_mean_mfcc.npy', feats_mean_mfcc)
np.save('features/feats_std_mfcc.npy', feats_std_mfcc)
np.save('features/feats_mean_chroma.npy', feats_mean_chroma)
np.save('features/feats_std_chroma.npy', feats_std_chroma)
np.save('features/feats_tempo.npy', feats_tempo)
np.save('features/feats_mean_contrast.npy', feats_mean_contrast)
np.save('features/feats_std_contrast.npy', feats_std_contrast)
np.save('features/feats_mean_mel.npy', feats_mean_mel)
np.save('features/feats_std_mel.npy', feats_std_mel)
np.save('features/feats_mean_tonnetz.npy', feats_mean_tonnetz)
np.save('features/feats_std_tonnetz.npy', feats_std_tonnetz)
np.save('features/feats_mean_rms.npy', feats_mean_rms)
np.save('features/feats_std_rms.npy', feats_std_rms)
np.save('features/feats_mean_zcr.npy', feats_mean_zcr)
np.save('features/feats_std_zcr.npy', feats_std_zcr)

print(f"Dataset size: {len(feats_mean_mfcc)}")

Dataset size: 1


In [3]:
## LOAD DATASET FROM DISK IF ALREADY COMPUTED ##

# Tempo is forced into a single column so it can be stacked next to the other blocks.
feats_tempo = np.load('features/feats_tempo.npy').reshape(-1, 1)

# Mean / standard-deviation pairs, one pair per feature family.
feats_mean_mfcc = np.load('features/feats_mean_mfcc.npy')
feats_std_mfcc = np.load('features/feats_std_mfcc.npy')

feats_mean_chroma = np.load('features/feats_mean_chroma.npy')
feats_std_chroma = np.load('features/feats_std_chroma.npy')

feats_mean_contrast = np.load('features/feats_mean_contrast.npy')
feats_std_contrast = np.load('features/feats_std_contrast.npy')

feats_mean_mel = np.load('features/feats_mean_mel.npy')
feats_std_mel = np.load('features/feats_std_mel.npy')

feats_mean_tonnetz = np.load('features/feats_mean_tonnetz.npy')
feats_std_tonnetz = np.load('features/feats_std_tonnetz.npy')

feats_mean_rms = np.load('features/feats_mean_rms.npy')
feats_std_rms = np.load('features/feats_std_rms.npy')

feats_mean_zcr = np.load('features/feats_mean_zcr.npy')
feats_std_zcr = np.load('features/feats_std_zcr.npy')

# The order of this list fixes the column layout of X.
feature_blocks = [
    feats_mean_mfcc, feats_std_mfcc,
    feats_mean_chroma, feats_std_chroma,
    feats_tempo,
    feats_mean_contrast, feats_std_contrast,
    feats_mean_mel, feats_std_mel,
    feats_mean_tonnetz, feats_std_tonnetz,
    feats_mean_rms, feats_std_rms,
    feats_mean_zcr, feats_std_zcr,
]
X = np.hstack(feature_blocks)
print("X shape:", X.shape)

X shape: (1, 263)


### Classification
Model training and evaluation

In [161]:
def perform_grid_search_rf(X, y):
    """Grid-search a feature-selection + random-forest pipeline.

    The pipeline selects features with ``SelectFromModel`` wrapped around a
    ``RandomForestClassifier`` and classifies with a second
    ``RandomForestClassifier``. The grid holds a single value per
    hyperparameter, so exactly one candidate is cross-validated (2 folds).

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit``.
        y: Targets. Either a single binary label vector or a 2-D multi-label
            indicator array with one column per label.

    Returns:
        tuple: ``(best_estimator_, best_params_)`` of the fitted search.
    """
    pipeline = Pipeline([
        ('feature_selection', SelectFromModel(RandomForestClassifier())),
        ('classification', RandomForestClassifier())
    ])

    param_grid = {
        'feature_selection__threshold': ['0.7*mean'],
        'classification__n_estimators': [110],
        'classification__max_depth': [13],
    }

    # The plain 'f1' scorer averages with average='binary', which is rejected
    # for multi-label indicator targets. Use the macro average over labels in
    # that case and keep 'f1' for a single binary label.
    targets = np.asarray(y)
    is_multilabel = targets.ndim == 2 and targets.shape[1] > 1
    scoring = 'f1_macro' if is_multilabel else 'f1'

    grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring=scoring, verbose=3)
    grid_search.fit(X, y)
    return grid_search.best_estimator_, grid_search.best_params_


def perform_grid_search_mlp(X, y):
    """Grid-search a scaling + feature-selection + MLP pipeline.

    The pipeline standardises the features, selects a subset with
    ``SelectFromModel`` wrapped around a ``RandomForestClassifier`` and
    classifies with an ``MLPClassifier``. The grid holds a single value per
    hyperparameter, so exactly one candidate is cross-validated (2 folds).

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit``.
        y: Targets. Either a single binary label vector or a 2-D multi-label
            indicator array with one column per label.

    Returns:
        tuple: ``(best_estimator_, best_params_)`` of the fitted search.
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectFromModel(RandomForestClassifier())),
        ('classification', MLPClassifier())
    ])

    param_grid = {
        'feature_selection__threshold': ['1.1*mean'],
        'classification__hidden_layer_sizes': [(10,)],
        'classification__alpha': [0.0005],
    }

    # The plain 'f1' scorer averages with average='binary', which is rejected
    # for multi-label indicator targets. Use the macro average over labels in
    # that case and keep 'f1' for a single binary label.
    targets = np.asarray(y)
    is_multilabel = targets.ndim == 2 and targets.shape[1] > 1
    scoring = 'f1_macro' if is_multilabel else 'f1'

    grid_search = GridSearchCV(pipeline, param_grid, cv=2, scoring=scoring, verbose=3)
    grid_search.fit(X, y)
    return grid_search.best_estimator_, grid_search.best_params_


def eval_classifier(model, X_test, y_test, labels):
    """Evaluate a trained multi-label classifier on the test set and display results.

    Draws one confusion matrix per label followed by two bar charts (accuracy
    and F1 score per label), then prints the per-label and average scores.

    Args:
        model: Fitted estimator whose ``predict(X_test)`` returns one column
            per label.
        X_test: Test features passed to ``model.predict``.
        y_test: 2-D array-like of true labels, one column per label.
        labels: Sequence of label names, indexed like the columns of ``y_test``.

    Returns:
        None. Results are plotted and printed.
    """
    # Convert to arrays so positional column indexing also works for DataFrames.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(X_test))

    # Display settings. The label count comes from the test targets themselves
    # (not from a notebook-level global), and the grid reserves two extra
    # panels for the accuracy and F1 bar charts.
    n_labels = y_true.shape[1]
    n_cols = 3
    n_panels = n_labels + 2
    n_rows = (n_panels + n_cols - 1) // n_cols
    _, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(n_cols * 4, n_rows * 4))
    axes = axes.flatten()

    accuracies = {}
    f1_scores = {}

    # Get confusion matrix, accuracy, and f1-score for each tag
    for i in range(n_labels):
        true_col = y_true[:, i]
        pred_col = y_pred[:, i]

        cm = confusion_matrix(true_col, pred_col)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm)
        disp.plot(ax=axes[i], cmap='YlGnBu')
        axes[i].set_title(f'Label: {labels[i]}')

        accuracies[labels[i]] = accuracy_score(true_col, pred_col)
        f1_scores[labels[i]] = f1_score(true_col, pred_col)

    _plot_score_bars(axes[n_labels], accuracies, 'Accuracy')
    _plot_score_bars(axes[n_labels + 1], f1_scores, 'F1 Score')

    # Hide grid cells that received no plot.
    for unused_ax in axes[n_panels:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

    _print_scores("Accuracies:", accuracies, 'Accuracy')
    _print_scores("\nF1 Scores:", f1_scores, 'F1 Score')


def _plot_score_bars(ax, scores, metric_name):
    """Draw a horizontal bar chart of one score per label on ``ax``."""
    sns.barplot(x=list(scores.values()), y=list(scores.keys()), palette='viridis', ax=ax)
    ax.set_xlabel(metric_name)
    ax.set_ylabel('Label')
    ax.set_title(f'{metric_name} per Label')
    ax.set_xlim(0, 1)


def _print_scores(heading, scores, metric_name):
    """Print each label's score to two decimals, then the rounded mean."""
    print(heading)
    for label, value in scores.items():
        print(f"{label}: {value:.2f}")
    print(f"Average {metric_name}:", round(np.mean(list(scores.values())), 2))


In [4]:
## UNCOMMENT THE FOLLOWING TO TRAIN MODEL FROM SCRATCH ##

# best_rf_model, best_rf_params = perform_grid_search_rf(X_train, y_train)
# joblib.dump(best_rf_model, 'models/best_rf_model.joblib')

# print("\nBest params:", best_rf_params)
# # (max_depth=13, n_estimators=110, threshold=0.7*mean) => 0.75 avg acc, 0.55 avg f1

In [5]:
## UNCOMMENT THE FOLLOWING TO TRAIN MODEL FROM SCRATCH ##

# best_mlp_model, best_mlp_params = perform_grid_search_mlp(X_train, y_train)
# joblib.dump(best_mlp_model, 'models/best_mlp_model.joblib')

# print("\nBest params:", best_mlp_params)
# # (alpha=0.0005, hidden_layer_sizes=(10,), threshold=1.1*mean) => 0.75 avg acc, 0.58 avg f1

In [6]:
## LOAD TRAINED MODEL FROM DISK ##

# NOTE: joblib.load unpickles the file contents, so only load model files you trust.
rf_model_path = 'models/best_rf_model.joblib'
mlp_model_path = 'models/best_mlp_model.joblib'

best_rf_model = joblib.load(rf_model_path)
best_mlp_model = joblib.load(mlp_model_path)

In [9]:
labels = [
    "acoustic",
    "danceable",
    "energetic",
    "instrumental",
    "lively",
    "speechy",
    "happy",
]

# Predict on X and attach the label names as column headers in one step
y_pred_rf = pd.DataFrame(best_rf_model.predict(X), columns=labels)
y_pred_rf

Unnamed: 0,acoustic,danceable,energetic,instrumental,lively,speechy,happy
0,0,1,0,0,0,1,1
