# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Setup: Import Packages and FMA Data

In [25]:
%matplotlib inline

import os
from pathlib import Path

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import sklearn as skl
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle

import utils

# Default size (width, height in inches) applied to every matplotlib figure in this notebook.
plt.rcParams["figure.figsize"] = (17, 5)

In [15]:
# Dataset locations are taken from the environment:
#   FMA_METADATA_DIRECTORY: folder holding tracks.csv, genres.csv, features.csv and echonest.csv.
#   FMA_AUDIO_DIRECTORY: folder where the mp3 files are stored (used when loading audio).
FMA_METADATA_DIRECTORY = os.getenv("FMA_METADATA_DIRECTORY")
FMA_AUDIO_DIRECTORY = os.getenv("FMA_AUDIO_DIRECTORY")

# Fail early with an actionable message: os.getenv returns None for an unset
# variable and Path(None) would otherwise raise an opaque TypeError.
if FMA_METADATA_DIRECTORY is None:
    message = "Environment variable FMA_METADATA_DIRECTORY is not set; point it to the folder containing the FMA metadata CSV files."
    raise ValueError(message)

metadata_directory = Path(FMA_METADATA_DIRECTORY)

# Load metadata and features.
tracks = utils.load(metadata_directory / "tracks.csv")
genres = utils.load(metadata_directory / "genres.csv")
features = utils.load(metadata_directory / "features.csv")
echonest = utils.load(metadata_directory / "echonest.csv")

# Sanity checks: features and tracks must share the exact same index, and every
# echonest row must refer to a track present in tracks.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()

# Last expression of the cell: shows the (rows, columns) shape of each table.
tracks.shape, genres.shape, features.shape, echonest.shape

((106574, 52), (163, 4), (106574, 518), (13129, 249))

## **Jump to 5 for Model Training**

## 1 Metadata

The metadata table, a CSV file in the `fma_metadata.zip` archive, is composed of many columns:
1. The index is the ID of the song, taken from the website, used as the name of the audio file.
2. Per-track, per-album and per-artist metadata from the Free Music Archive website.
3. Two columns to indicate the subset (small, medium, large) and the split (training, validation, test).

In [None]:
# Preview the first rows of each top-level column group of the metadata table.
for column_group in ("track", "album", "artist", "set"):
    ipd.display(tracks[column_group].head())

### 1.1 Subsets

The small and medium subsets can be selected with the code below.

In [None]:
# Keep the rows whose subset label compares <= "small".
# NOTE(review): this relies on the subset labels being ordered (small < medium < large),
# presumably an ordered categorical produced by utils.load -- confirm.
small = tracks.loc[tracks[("set", "subset")] <= "small"]
small.shape

In [None]:
# Keep the rows whose subset label compares <= "medium".
# NOTE(review): this relies on the subset labels being ordered (small < medium < large),
# presumably an ordered categorical produced by utils.load -- confirm.
medium = tracks.loc[tracks[("set", "subset")] <= "medium"]
medium.shape

## 2 Genres

The genre hierarchy is stored in `genres.csv` and distributed in `fma_metadata.zip`.

In [None]:
# Distinct values of the "top_level" column; they are used as index labels below.
top_level_ids = genres["top_level"].unique()
print("{} top-level genres".format(len(top_level_ids)))

# Rows of those top-level genres, largest "#tracks" first.
genres.loc[top_level_ids].sort_values(by="#tracks", ascending=False)

In [None]:
# The ten rows with the smallest "#tracks" values.
genres.sort_values(by="#tracks", ascending=True).head(n=10)

## 3 Features

1. Features extracted from the audio for all tracks.
2. For some tracks, data collected from the [Echonest](http://the.echonest.com/) API.

In [None]:
n_rows, n_columns = features.shape
print("{1} features for {0} tracks".format(n_rows, n_columns))

# Each entry is rendered as one table. A nested list selects several
# top-level column groups at once so they are shown together in a single table.
columns = [
    "mfcc",
    "chroma_cens",
    "tonnetz",
    "spectral_contrast",
    ["spectral_centroid", "spectral_bandwidth", "spectral_rolloff"],
    ["rmse", "zcr"],
]
for selection in columns:
    preview = features[selection].head()
    # Render numbers with two decimals.
    ipd.display(preview.style.format("{:.2f}"))

### 3.1 Echonest features

In [None]:
n_rows, n_columns = echonest.shape
print("{1} features for {0} tracks".format(n_rows, n_columns))

# Preview the first rows of each Echonest column group (temporal features are shown separately).
for column_group in ("metadata", "audio_features", "social_features", "ranks"):
    ipd.display(echonest["echonest", column_group].head())

In [None]:
# Preview the temporal feature columns.
temporal_preview = echonest["echonest", "temporal_features"].head()
ipd.display(temporal_preview)

# Plot the temporal features of the row labelled 2 as a curve.
x = echonest.loc[2, ("echonest", "temporal_features")]
plt.plot(x)

### 3.2 Features like MFCCs are discriminant

In [None]:
# Boolean row masks: tracks of the small subset and of the two genres being compared.
small = tracks["set", "subset"] <= "small"
genre1 = tracks["track", "genre_top"] == "Instrumental"
genre2 = tracks["track", "genre_top"] == "Hip-Hop"
selected = small & (genre1 | genre2)

# Project the MFCC features of the selected tracks onto their first two principal components.
mfcc_features = features.loc[selected, "mfcc"]
X = PCA(n_components=2).fit_transform(mfcc_features)

# Encode the two genre names as integers so they can drive the colour map.
genre_names = tracks.loc[selected, ("track", "genre_top")]
y = LabelEncoder().fit_transform(genre_names)

plt.scatter(X[:, 0], X[:, 1], c=y, cmap="RdBu", alpha=0.5)
X.shape, y.shape

## 4 Audio

You can load the waveform and listen to audio in the notebook itself.

In [None]:
# Resolve the audio file for track ID 2 inside the audio directory.
filename = utils.get_audio_path(FMA_AUDIO_DIRECTORY, 2)
print(f"File: {filename}")

# sr=None asks librosa to keep the file's sampling rate; mono=True mixes down to one channel.
x, sr = librosa.load(filename, sr=None, mono=True)
print(f"Duration: {x.shape[-1] / sr:.2f}s, {x.size} samples")

# Excerpt boundaries in seconds, converted to sample indices for slicing.
start, end = 7, 17
excerpt = x[start * sr : end * sr]
ipd.Audio(data=excerpt, rate=sr)

And use [librosa](https://github.com/librosa/librosa) to compute spectrograms and audio features.

In [None]:
# Whole waveform, with vertical lines at the current start and end values.
librosa.display.waveshow(y=x, sr=sr, alpha=0.5)
plt.vlines(x=[start, end], ymin=-1, ymax=1)

# Zoom on 2000 samples from the middle of the signal.
# Note that `start` is rebound here from seconds to a sample index.
start = len(x) // 2
window = x[start : start + 2000]
plt.figure()
plt.plot(window)
plt.ylim(-1, 1)

In [None]:
# Magnitude of the short-time Fourier transform.
stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))

# Mel spectrogram computed from the squared magnitude, i.e. a power spectrogram.
mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)

# `mel` holds power values, so power_to_db (10 * log10) is the matching
# conversion; amplitude_to_db (20 * log10) would double every dB value.
log_mel = librosa.power_to_db(mel)

# hop_length must match the STFT above so the time axis is labelled correctly.
librosa.display.specshow(log_mel, sr=sr, hop_length=512, x_axis="time", y_axis="mel");

In [None]:
# MFCCs (20 coefficients) computed from the mel power spectrogram expressed in dB.
log_power_mel = librosa.power_to_db(mel)
mfcc = librosa.feature.mfcc(S=log_power_mel, n_mfcc=20)

# Standardize before plotting.
# NOTE(review): StandardScaler works column-wise; assuming mfcc is (n_mfcc, frames),
# each frame is standardized across coefficients -- confirm this is intended.
mfcc = StandardScaler().fit_transform(mfcc)
librosa.display.specshow(mfcc, sr=sr, x_axis="time");

## 5 Genre classification

### 5.1 From features

In [16]:
# Boolean row masks over `tracks`.
# `medium` keeps every track whose subset label compares <= "medium"
# (presumably small and medium tracks, given an ordered subset label -- confirm).
subset_label = tracks["set", "subset"]
split_label = tracks["set", "split"]
medium = subset_label <= "medium"
train = split_label == "training"
val = split_label == "validation"
test = split_label == "test"

# Top-level feature groups used as model input (all listed groups, not only MFCCs).
feature_columns = [
    "mfcc",
    "chroma_cens",
    "tonnetz",
    "spectral_contrast",
    "spectral_centroid",
    "spectral_bandwidth",
    "spectral_rolloff",
    "rmse",
    "zcr",
]

train_rows = medium & train
test_rows = medium & test

# Targets: top-level genre of each selected track.
y_train = tracks.loc[train_rows, ("track", "genre_top")]
y_test = tracks.loc[test_rows, ("track", "genre_top")]

# Inputs: the selected feature groups as plain NumPy arrays.
X_train = features.loc[train_rows, feature_columns].values
X_test = features.loc[test_rows, feature_columns].values

# Keep one row per track and flatten any remaining dimensions into a single feature axis.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

print(f"{y_train.size} training examples, {y_test.size} testing examples")
print(f"{X_train.shape[1]} features, {np.unique(y_train).size} classes")

19922 training examples, 2573 testing examples
350 features, 16 classes


In [None]:
# Be sure training samples are shuffled (features and labels are permuted together).
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The statistics are estimated on the training set only and reused for the test set.
# The results are assigned back explicitly: copy=False only *tries* to scale in
# place, so relying on the side effect could silently leave the data unscaled.
scaler = StandardScaler(copy=False)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Configuration switches for this cell.
reduce_features = False  # When True, apply PCA followed by Lasso-based selection.
model_classifier = "RF"  # One of "RF", "GPM", "NN", "SVM".

if reduce_features:
    # Apply PCA to reduce dimensionality.
    pca = PCA(n_components=0.99)  # Keep 99% of variance.
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)

    # Lasso for feature selection.
    # NOTE(review): the regression target is the integer-encoded genre, which
    # imposes an arbitrary order on the classes -- confirm this is acceptable.
    y_train_encoded = LabelEncoder().fit_transform(y_train)
    lasso = Lasso(alpha=0.001)
    lasso.fit(X_train_pca, y_train_encoded)

    # Keep the components whose Lasso coefficient is non-zero.
    mask = lasso.coef_ != 0
    print(f"Selected {mask.sum()} features from {X_train_pca.shape[1]}")

    # Use the selected features for training and testing.
    X_train = X_train_pca[:, mask]
    X_test = X_test_pca[:, mask]

# Choose a classifier for the model.
if model_classifier == "RF":  # Random Forest
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
elif model_classifier == "GPM":  # Gradient Boosting Machine
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
elif model_classifier == "NN":  # Neural Network
    clf = MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", max_iter=200, random_state=42)
elif model_classifier == "SVM":  # Support Vector Machine
    clf = SVC()
else:
    message = "Invalid model_classifier. Choose from 'RF', 'GPM', 'NN', 'SVM'."
    raise ValueError(message)


# Fit on the training set and report accuracy on the held-out test set.
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print(f"Accuracy: {score:.2%}")

Accuracy: 62.42%
