# FMA small dataset: metadata and genre label preparation

In [2]:
import ast
import os
import warnings

import librosa as lb
import numpy as np
import pandas as pd
import scipy as sp


warnings.filterwarnings("ignore")

In [2]:
def load(filepath):       # FMA module function to load metadata
    """Load one FMA metadata CSV file into a pandas DataFrame.

    The parsing options are chosen from the base name of ``filepath``
    (checked in this order):

    * contains ``features`` or ``echonest``: three header rows, first column
      used as the index;
    * contains ``genres``: one header row, first column used as the index;
    * contains ``tracks``: two header rows, first column used as the index,
      followed by per-column conversions (literal-evaluated list columns,
      datetime columns, an ordered ``('set', 'subset')`` categorical and
      several plain categorical columns).

    Parameters
    ----------
    filepath : str
        Path to the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Raises
    ------
    ValueError
        If the base name contains none of the recognised keywords. Without
        this check the function would fall through and return ``None``,
        which only surfaces later as an unrelated attribute error.
    """
    filename = os.path.basename(filepath)

    # Both files share the same layout: a three-level column header.
    if 'features' in filename or 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        # These columns are stored as text; literal_eval turns each cell
        # back into the Python object it spells out.
        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        # Ordered categorical so subsets can be compared with <= / >=.
        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks

    raise ValueError(
        "Unrecognised metadata file {!r}: the file name must contain one of "
        "'features', 'echonest', 'genres' or 'tracks'".format(filename))

# Metadata preparation

Load the metadata and select the small subset of the FMA dataset. For convenience, the paths of the music files are added as a column to the metadata dataframe. The resulting dataframe (**"tracks_genre_small.csv"**) is used both as a container for the file paths and as the source of the training labels.

In [4]:
# Directory the metadata CSV files (tracks.csv, genres.csv) are read from.
METADATA_BASE_DIR = "../data/fma_metadata/"
# Root directory of the audio files; track file paths are built relative to it.
TRACKS_BASE_DIR = "../data/fma_small/"

In [8]:
# Load the track and genre metadata tables.
tracks = load(os.path.join(METADATA_BASE_DIR, "tracks.csv"))
genres = load(os.path.join(METADATA_BASE_DIR, "genres.csv"))

# Per-track columns of the tracks belonging to the "small" subset.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of `tracks`.
small = tracks[tracks['set', 'subset'] <= 'small'].track.copy()

# Track ids as zero-padded six-character strings, in the same order as `small`.
tracklist = [str(id_).zfill(6) for id_ in small.index]

# Build each path from its own track id (<base>/<first three digits>/<id>.mp3)
# so that row i always receives the path of track i. Relying on the sorted
# order of a directory listing would misalign or crash as soon as a file is
# missing or an unrelated file sits in one of the folders.
pathlist = [os.path.join(TRACKS_BASE_DIR, track_id[:3], track_id + ".mp3")
            for track_id in tracklist]

# Fail early, with a readable message, if the audio files are not all present.
missing = [path for path in pathlist if not os.path.isfile(path)]
if missing:
    raise FileNotFoundError(
        "{} audio file(s) of the small subset were not found under {!r}, "
        "e.g. {!r}".format(len(missing), TRACKS_BASE_DIR, missing[0]))

small["filepath"] = pathlist

# Persist the top-level genre and the file path of every track.
data = small.loc[:, ["genre_top", "filepath"]]
data.to_csv("../data/tracks_genre_small.csv")

# Creating labels for multi-label classification

The FMA dataset metadata contains information not only about the main genre, but also about subgenres. This information was saved in the form of a matrix with **7794** rows (number of tracks) and **114** columns (all subgenres available in the small dataset).

In [10]:
# Load the track and genre metadata tables.
tracks = load(os.path.join(METADATA_BASE_DIR, "tracks.csv"))
genres = load(os.path.join(METADATA_BASE_DIR, "genres.csv"))

# Per-track columns of the tracks belonging to the "small" subset.
# .copy() makes this an independent frame, so adding a column below does not
# write into a slice of `tracks`.
small = tracks[tracks['set', 'subset'] <= 'small'].track.copy()

# Track ids as zero-padded six-character strings, in the same order as `small`.
tracklist = [str(id_).zfill(6) for id_ in small.index]

# Build each path from its own track id (<base>/<first three digits>/<id>.mp3)
# so that row i always receives the path of track i. Relying on the sorted
# order of a directory listing would misalign or crash as soon as a file is
# missing or an unrelated file sits in one of the folders.
pathlist = [os.path.join(TRACKS_BASE_DIR, track_id[:3], track_id + ".mp3")
            for track_id in tracklist]

# Fail early, with a readable message, if the audio files are not all present.
missing = [path for path in pathlist if not os.path.isfile(path)]
if missing:
    raise FileNotFoundError(
        "{} audio file(s) of the small subset were not found under {!r}, "
        "e.g. {!r}".format(len(missing), TRACKS_BASE_DIR, missing[0]))

small["filepath"] = pathlist

# Set of files that have an incorrect size (much less than 30 seconds).
# The entries are built from TRACKS_BASE_DIR exactly like the values of the
# "filepath" column; a hard-coded "fma_small/..." prefix would never compare
# equal to those values and the blacklist would silently filter nothing.
blacklist = {
    os.path.join(TRACKS_BASE_DIR, track_id[:3], track_id + ".mp3")
    for track_id in ("098565", "098567", "098569",
                     "099134", "108925", "133297")
}

In [11]:
# Keep the top-level genre, the audio file path and the full genre list of each track.
data_all_genres = small[["genre_top", "filepath", "genres_all"]]
# Drop every row whose file path is listed in the blacklist.
data_all_genres = data_all_genres[~data_all_genres["filepath"].isin(blacklist)]

In [12]:
# Show the filtered frame as the cell output.
data_all_genres

Unnamed: 0_level_0,genre_top,filepath,genres_all
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,Hip-Hop,fma_small/000/000002.mp3,[21]
5,Hip-Hop,fma_small/000/000005.mp3,[21]
10,Pop,fma_small/000/000010.mp3,[10]
140,Folk,fma_small/000/000140.mp3,[17]
141,Folk,fma_small/000/000141.mp3,[17]
...,...,...,...
154308,Hip-Hop,fma_small/154/154308.mp3,"[811, 539, 21]"
154309,Hip-Hop,fma_small/154/154309.mp3,"[811, 539, 21]"
154413,Pop,fma_small/154/154413.mp3,"[10, 76]"
154414,Pop,fma_small/154/154414.mp3,"[10, 76]"


Extract the set of genres that are present in the small FMA dataset.

In [23]:
# Genre titles only. .copy() makes genre_map independent of `genres`, so the
# new column below is not written into a slice of it.
genre_map = genres.loc[:, ["title"]].copy()
# Consecutive integer ID for every genre row. len() counts rows, whereas
# .size counts cells and only equals the row count for a one-column frame.
genre_map["ID"] = range(len(genre_map))
# Series indexed like `genres`, giving the consecutive ID of each genre.
id_remapper = genre_map.ID

In [24]:
# Gather every genre id that appears in the `genres_all` entry of any remaining track.
unique_genres = set()
for track_genre_ids in data_all_genres["genres_all"]:
    unique_genres.update(set(track_genre_ids))

In [25]:
# Rows of genre_map for the genres that occur in the small subset.
# .loc no longer accepts a set as an indexer (deprecated in pandas 1.5, an
# error from pandas 2.0), so the set is converted to a list. list() keeps the
# set's own iteration order, so the row order is the same as the one older
# pandas versions produced from the set.
small_genres = genre_map.loc[list(unique_genres), :].copy()
# Renumber the selected genres 0..n-1; these numbers serve as column indices
# of the label matrix.
small_genres["ID"] = range(len(small_genres))
# Series mapping a genre's index label to its 0..n-1 position.
id_remapper = small_genres.ID

In [26]:
# Print the titles of the selected genres, in the row order of small_genres.
print(small_genres.title.to_list())

['Avant-Garde', 'International', 'Sound Art', 'Novelty', 'Turkish', 'Pop', 'New Age', 'Rock', 'Romany (Gypsy)', 'Electronic', 'Sound Effects', 'Folk', 'Soundtrack', 'Hip-Hop', 'Audio Collage', 'Punk', 'Post-Rock', 'Lo-Fi', 'Compilation', 'Rap', 'Field Recordings', 'Metal', 'Noise', 'Psych-Folk', 'Trip-Hop', 'Breakbeat', 'Krautrock', 'Tango', 'Experimental', 'Dance', 'Electroacoustic', 'Chip Music', 'Ambient Electronic', 'Hip-Hop Beats', 'Loud-Rock', 'Latin America', 'Drone', 'Salsa', 'Free-Folk', 'Noise-Rock', 'Psych-Rock', 'Goth', 'Electro-Punk', 'Indie-Rock', 'Abstract Hip-Hop', 'Industrial', 'No Wave', 'Experimental Pop', 'French', 'Reggae - Dub', 'Drum & Bass', 'Afrobeat', 'Nerdcore', 'Garage', 'Indian', 'New Wave', 'Post-Punk', 'Reggae - Dancehall', 'Sludge', 'African', 'Freak-Folk', 'Progressive', 'Alternative Hip-Hop', 'Death-Metal', 'Middle East', 'Singer-Songwriter', 'Shoegaze', 'Kid-Friendly', 'Synth Pop', 'Spanish', 'Ambient', 'Hardcore', 'Thrash', 'Power-Pop', 'Space-Rock',

Create a 2D NumPy array that represents the genres of each track as dummy (indicator) variables.

In [27]:
# Indicator matrix: one row per track, one column per genre found in the subset.
n_tracks, n_genres = len(data_all_genres), len(unique_genres)
genre_dummies = np.zeros((n_tracks, n_genres))
for row_pos, genre_ids in enumerate(data_all_genres["genres_all"]):
    # Translate the track's genre ids into column positions and switch them on.
    track_genres = [id_remapper[genre_id] for genre_id in genre_ids]
    genre_dummies[row_pos, track_genres] = 1

In [28]:
# Write the indicator matrix to disk in NumPy's .npy format.
np.save("../data/genres_multilabel.npy", genre_dummies)

In [198]:
# Write the genre title / ID table to CSV.
# NOTE(review): genre_map numbers every genre of the genres table, while the
# columns of genre_dummies follow the renumbered IDs in small_genres, so the
# IDs saved here are not the column indices of the saved label matrix.
# Confirm whether small_genres should be saved instead (or as well).
genre_map.to_csv("../data/all_genres.csv")

In [29]:
# Number of distinct genre ids collected in unique_genres.
len(unique_genres)

114