# [FMA: A Dataset For Music Analysis](https://github.com/mdeff/fma)

Michaël Defferrard, Kirell Benzi, Pierre Vandergheynst, Xavier Bresson, EPFL LTS2.

## Setup: Import Packages and FMA Data

In [1]:
%matplotlib inline

import os
from pathlib import Path

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np

import utils.fma_extraction as fma_utils
import utils.ml_training as ml_utils

# Default size (width, height) for figures created after this point.
plt.rc("figure", figsize=(17, 5))

In [None]:
# Dataset locations are taken from the environment.
# FMA_METADATA_DIRECTORY: directory holding the CSV files loaded below.
# FMA_AUDIO_DIRECTORY: presumably where the mp3 files are stored; it is not
# used in this cell and may legitimately be unset here.
FMA_METADATA_DIRECTORY = os.getenv("FMA_METADATA_DIRECTORY")
FMA_AUDIO_DIRECTORY = os.getenv("FMA_AUDIO_DIRECTORY")

# os.getenv returns None for an unset variable, and Path(None) would then fail
# with an opaque TypeError. Fail early with a message naming the variable.
if FMA_METADATA_DIRECTORY is None:
    raise RuntimeError(
        "Environment variable FMA_METADATA_DIRECTORY is not set; point it at "
        "the directory containing tracks.csv, genres.csv, features.csv and "
        "echonest.csv."
    )
metadata_dir = Path(FMA_METADATA_DIRECTORY)

# Load metadata and features.
tracks = fma_utils.load(metadata_dir / "tracks.csv")
genres = fma_utils.load(metadata_dir / "genres.csv")
features = fma_utils.load(metadata_dir / "features.csv")
echonest = fma_utils.load(metadata_dir / "echonest.csv")

# Sanity checks: features and tracks must have element-wise equal indexes,
# and every echonest row must refer to an index value present in tracks.
np.testing.assert_array_equal(features.index, tracks.index)
assert echonest.index.isin(tracks.index).all()  # noqa: S101

# Merge features with echonest on the index. The left join keeps every row of
# `features`, including rows that have no echonest entry.
features = features.merge(echonest, left_index=True, right_index=True, how="left")
print("Features columns:", features.columns.to_list())

# Last expression of the cell: the notebook displays the four shapes.
tracks.shape, genres.shape, features.shape, echonest.shape

## Echonest Features

In [None]:
# Collect every feature column whose first two levels are either
# ("echonest", "audio_features") or ("echonest", "temporal_features").
echonest_groups = (
    ("echonest", "audio_features"),
    ("echonest", "temporal_features"),
)
echonest_features = [col for col in features.columns if col[:2] in echonest_groups]
print(echonest_features)

In [None]:
# Report the size of the echonest table: shape is (rows, columns).
n_tracks, n_features = echonest.shape
print("{1} features for {0} tracks".format(n_tracks, n_features))

# Preview the audio-feature group and list the columns it contains.
audio_features = echonest["echonest", "audio_features"]
ipd.display(audio_features.head())
print("Audio features include", audio_features.columns)

In [None]:
# Preview the first rows of the temporal-feature group.
temporal_features = echonest["echonest", "temporal_features"]
ipd.display(temporal_features.head())

# Plot the temporal features of the row labelled 2.
x = echonest.loc[2, ("echonest", "temporal_features")]
plt.plot(x)

## Classification by Features

In [None]:
# Prepare the dataset splits and destructure them into training, testing,
# and validation sets.
X_train, X_test, X_val, y_train, y_test, y_val = ml_utils.prepare_dataset_splits(
    tracks=tracks,
    features=features,
    subset="medium",
)

# Check the shapes of the feature arrays (train, test, validation).
for split_features in (X_train, X_test, X_val):
    print(split_features.shape)

# Summarise example counts, feature dimensionality and number of classes.
print(f"{y_train.size} training examples, {y_test.size} testing examples")
print(f"{X_train.shape[1]} features, {np.unique(y_train).size} classes")

In [None]:
# Experiment configuration: which classifier to train, and whether features
# should be reduced during preprocessing.
model_classifer = "SVM"
reduce_features = False

# Preprocess the training and testing data (removing features if requested).
# NOTE(review): X_train / X_test are rebound to the preprocessed output, so
# re-running this cell passes already-preprocessed data back in. Confirm that
# is acceptable, or re-run the split cell first.
X_train, X_test, y_train_encoded, y_test_encoded = ml_utils.preprocess_data(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    reduce_features=reduce_features,
)

# Train the selected model, evaluate it on the test split, and keep both the
# fitted classifier and its score.
clf, score = ml_utils.train_and_evaluate(
    X_train=X_train,
    y_train_encoded=y_train_encoded,
    X_test=X_test,
    y_test_encoded=y_test_encoded,
    model_classifier=model_classifer,
)

