In [1]:
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import joblib
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay

# Silence every warning category for the remainder of the session.
warnings.simplefilter(action="ignore")

# Default (width, height) applied to every matplotlib figure in this notebook.
plt.rcParams.update({'figure.figsize': (12, 5)})

### Build Dataset
- Feature extraction with librosa to build the model inputs (X_train/X_test)
- Labelling tracks with genre and Echo Nest tags (Y_train/Y_test)

In [2]:
## BUILD DATASET FROM SCRATCH ##
## Re-extracts every feature from the audio file and overwrites the cached
## .npy files under features/. Those files are read back by the np.load
## calls that follow this cell.

def _mean_std(frames):
    """Return (mean, std) of `frames`, both taken along axis 1."""
    return np.mean(frames, axis=1), np.std(frames, axis=1)


# One list per feature family; each list receives one entry per track.
feats_mean_mfcc = []
feats_std_mfcc = []
feats_mean_chroma = []
feats_std_chroma = []
feats_tempo = []
feats_mean_contrast = []
feats_std_contrast = []
feats_mean_mel = []
feats_std_mel = []
feats_mean_tonnetz = []
feats_std_tonnetz = []
feats_mean_rms = []
feats_std_rms = []
feats_mean_zcr = []
feats_std_zcr = []

fname = 'happy_pharrel.mp3'

# Load the audio file and extract the features
audio, srate = librosa.load(fname, res_type='kaiser_fast')
mfcc_matrix = librosa.feature.mfcc(y=audio, sr=srate, n_mfcc=40)
chroma_cqt = librosa.feature.chroma_cqt(y=audio, sr=srate, n_chroma=12)
# NOTE(review): depending on the librosa version, tempo may be a scalar or a
# 1-element array; the loader reshapes the saved values to a column either way.
tempo, _ = librosa.beat.beat_track(y=audio, sr=srate)
contrast = librosa.feature.spectral_contrast(y=audio, sr=srate, n_bands=6)
mel = librosa.feature.melspectrogram(y=audio, sr=srate, n_mels=64)
tonnetz = librosa.feature.tonnetz(y=audio, sr=srate)
rms = librosa.feature.rms(y=audio)
zcr = librosa.feature.zero_crossing_rate(y=audio)

# Collapse axis 1 of every feature matrix into a mean and a std vector.
mean_mfcc, std_mfcc = _mean_std(mfcc_matrix)
mean_chroma, std_chroma = _mean_std(chroma_cqt)
mean_contrast, std_contrast = _mean_std(contrast)
mean_mel, std_mel = _mean_std(mel)
mean_tonnetz, std_tonnetz = _mean_std(tonnetz)
mean_rms, std_rms = _mean_std(rms)
mean_zcr, std_zcr = _mean_std(zcr)

# Append this track's statistics to the per-feature lists.
feats_mean_mfcc.append(mean_mfcc)
feats_std_mfcc.append(std_mfcc)
feats_mean_chroma.append(mean_chroma)
feats_std_chroma.append(std_chroma)
feats_tempo.append(tempo)
feats_mean_contrast.append(mean_contrast)
feats_std_contrast.append(std_contrast)
feats_mean_mel.append(mean_mel)
feats_std_mel.append(std_mel)
feats_mean_tonnetz.append(mean_tonnetz)
feats_std_tonnetz.append(std_tonnetz)
feats_mean_rms.append(mean_rms)
feats_std_rms.append(std_rms)
feats_mean_zcr.append(mean_zcr)
feats_std_zcr.append(std_zcr)

# Save to disk to avoid re-computing. The directory is created first because
# np.save does not create missing parent directories.
os.makedirs('features', exist_ok=True)
feature_lists = {
    'feats_mean_mfcc': feats_mean_mfcc,
    'feats_std_mfcc': feats_std_mfcc,
    'feats_mean_chroma': feats_mean_chroma,
    'feats_std_chroma': feats_std_chroma,
    'feats_tempo': feats_tempo,
    'feats_mean_contrast': feats_mean_contrast,
    'feats_std_contrast': feats_std_contrast,
    'feats_mean_mel': feats_mean_mel,
    'feats_std_mel': feats_std_mel,
    'feats_mean_tonnetz': feats_mean_tonnetz,
    'feats_std_tonnetz': feats_std_tonnetz,
    'feats_mean_rms': feats_mean_rms,
    'feats_std_rms': feats_std_rms,
    'feats_mean_zcr': feats_mean_zcr,
    'feats_std_zcr': feats_std_zcr,
}
for feat_name, feat_values in feature_lists.items():
    np.save(f'features/{feat_name}.npy', feat_values)

print(f"Dataset size: {len(feats_mean_mfcc)}")

Dataset size: 1


In [3]:
## READ THE CACHED FEATURE ARRAYS BACK FROM DISK ##

def _read_feature(stem):
    """Load features/<stem>.npy and return the stored array."""
    return np.load(f'features/{stem}.npy')


feats_mean_mfcc = _read_feature('feats_mean_mfcc')
feats_std_mfcc = _read_feature('feats_std_mfcc')
feats_mean_chroma = _read_feature('feats_mean_chroma')
feats_std_chroma = _read_feature('feats_std_chroma')
# Tempo is reshaped to a single column before it is stacked with the rest.
feats_tempo = _read_feature('feats_tempo').reshape(-1, 1)
feats_mean_contrast = _read_feature('feats_mean_contrast')
feats_std_contrast = _read_feature('feats_std_contrast')
feats_mean_mel = _read_feature('feats_mean_mel')
feats_std_mel = _read_feature('feats_std_mel')
feats_mean_tonnetz = _read_feature('feats_mean_tonnetz')
feats_std_tonnetz = _read_feature('feats_std_tonnetz')
feats_mean_rms = _read_feature('feats_mean_rms')
feats_std_rms = _read_feature('feats_std_rms')
feats_mean_zcr = _read_feature('feats_mean_zcr')
feats_std_zcr = _read_feature('feats_std_zcr')

# Column order of X is fixed by the order of this tuple.
# NOTE(review): presumably this must match the layout the models were
# trained on -- confirm against the training notebook.
feature_blocks = (
    feats_mean_mfcc, feats_std_mfcc,
    feats_mean_chroma, feats_std_chroma,
    feats_tempo,
    feats_mean_contrast, feats_std_contrast,
    feats_mean_mel, feats_std_mel,
    feats_mean_tonnetz, feats_std_tonnetz,
    feats_mean_rms, feats_std_rms,
    feats_mean_zcr, feats_std_zcr,
)
X = np.hstack(feature_blocks)
print("X shape:", X.shape)

X shape: (1, 263)


In [6]:
## RESTORE THE PERSISTED MODELS ##
# joblib.load unpickles the file contents, so only load files from a trusted source.

rf_model_path = 'models/best_rf_model.joblib'
mlp_model_path = 'models/best_mlp_model.joblib'

best_rf_model = joblib.load(rf_model_path)
best_mlp_model = joblib.load(mlp_model_path)

In [9]:
# Column names given to the prediction table.
# NOTE(review): assumed to follow the output order of the trained model -- confirm.
labels = ["acoustic", "danceable", "energetic", "instrumental", "lively", "speechy", "happy"]

# Run best_rf_model on X and wrap the raw prediction array in a DataFrame
# so that each output column carries its tag name.
y_pred_rf = pd.DataFrame(data=best_rf_model.predict(X), columns=labels)
y_pred_rf

Unnamed: 0,acoustic,danceable,energetic,instrumental,lively,speechy,happy
0,0,1,0,0,0,1,1
