# Classification de genres musicaux

Lionel Baptiste, Ghali El Ouarzazi, Joévin Soulenq

Basé sur les travaux de Michaël Defferrard : https://github.com/mdeff/fma

## Prédiction du genre musical depuis un fichier audio

* On entraine un modèle de classification
* On a une musique arbitraire au format MP3
* On cherche à extraire son indicateur MFCC via LibROSA
* On calcule les 7 indicateurs statistiques suivants depuis la MCFF : min, max, median, mean, skewness, kurtosis et std* * On prédit le genre musical de la chanson

In [68]:
import time
import os

import IPython.display as ipd
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.utils import shuffle
from pandas import Series
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import librosa
import librosa.display

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import utils

In [69]:
# Répertoire ou sont présent les fichiers MP3
AUDIO_DIR = os.environ.get('AUDIO_DIR')

# Chargement des métadonnées et des indicateurs
tracks       = utils.load('fma_metadata/tracks.csv')
genres       = utils.load('fma_metadata/genres.csv')
features_fma = utils.load('fma_metadata/features.csv')

## Apprentissage

Entraînement du classifieur choisi (SVM ou RN)

In [110]:
small = tracks['set', 'subset'] <= 'medium' # 'small', 'medium', 'large' ou 'full'

train = tracks['set', 'split'] == 'training'
val   = tracks['set', 'split'] == 'validation'
test  = tracks['set', 'split'] == 'test'

y_train = tracks.loc[small & train, ('track', 'genre_top')]
y_test  = tracks.loc[small & test,  ('track', 'genre_top')]
X_train = features_fma.loc[small & train, 'mfcc']
X_test  = features_fma.loc[small & test,  'mfcc']

# Mélange l'ensemble des attributs (indispensable pour avoir un résultat correct)
X_train, y_train = shuffle(X_train, y_train, random_state=42)

# Standardise les données en retirant la moyenne et réduit la variance "zero mean and unit variance".
scaler = StandardScaler(copy=False)
scaler.fit_transform(X_train)
scaler.transform(X_test)

# Classification
t = time.process_time()
#clf = SVC(kernel='rbf')                                     # SVM
clf = MLPClassifier(hidden_layer_sizes=(10), max_iter=1000)  # RN
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Précision : {:.2%}'.format(score))
print('Temps d\'exécution : {:.2f}'.format(time.process_time() - t))

Précision : 58.96%
Temps d'exécution : 173.84


## Chargement du fichier audio

Chargement du ficher audio directement dans le notebook

In [111]:
numero = 140565
filename = utils.get_audio_path(AUDIO_DIR, numero)
print('Fichier: {}'.format(filename))

x, sr = librosa.load(filename, sr=None, mono=True)
print('Durée: {:.2f}s, {} échantillons'.format(x.shape[-1] / sr, x.size))

start, end = 7, 17
ipd.Audio(data=x[start*sr:end*sr], rate=sr)

Fichier: /home/joevin/TA/fma_small/140/140565.mp3
Durée: 29.99s, 1439471 échantillons


In [112]:
artiste = tracks.loc[numero:numero:, ['track_id','artist']].iloc[0, 12]
titre   = tracks.loc[numero:numero:, ['track_id','track']].iloc[0, 19]
genre   = tracks.loc[numero:numero:, ['track_id','track']].iloc[0, 7]
print ('{} - {}\nGenre musical observé : {}'.format(titre, artiste, genre))

pretend your happy - transient
Genre musical observé : Electronic


## Extraction

In [113]:
def columns():
    feature_sizes = dict(mfcc=20)
    moments = ('mean', 'std', 'skew', 'kurtosis', 'median', 'min', 'max')

    columns = []
    for name, size in feature_sizes.items():
        for moment in moments:
            it = ((name, moment, '{:02d}'.format(i+1)) for i in range(size))
            columns.extend(it)

    names = ('feature', 'statistics', 'number')
    columns = pd.MultiIndex.from_tuples(columns, names=names)

    return columns.sort_values()

features = pd.Series(index=columns(), dtype=np.float32, name=numero)

def feature_stats(name, values):
    features[name, 'mean'] = np.mean(values, axis=1)
    features[name, 'std'] = np.std(values, axis=1)
    features[name, 'skew'] = stats.skew(values, axis=1)
    features[name, 'kurtosis'] = stats.kurtosis(values, axis=1)
    features[name, 'median'] = np.median(values, axis=1)
    features[name, 'min'] = np.min(values, axis=1)
    features[name, 'max'] = np.max(values, axis=1)

In [114]:
# chargement du fichier
x, sr = librosa.load(filename, sr=None, mono=True)

# préparation LibROSA
stft = np.abs(librosa.stft(x, n_fft=2048, hop_length=512))
mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)

# extraction du MFCC
f = librosa.feature.mfcc(S=librosa.power_to_db(mel), n_mfcc=20)
feature_stats('mfcc', f)

# conversion de type Series vers DataFrame puis transposée
extracted_mfcc = features.to_frame().T 

# tableau obtenu
extracted_mfcc.head()

feature,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,std,std,std,std,std,std
number,01,02,03,04,05,06,07,08,09,10,...,11,12,13,14,15,16,17,18,19,20
140565,0.571171,2.753291,-0.433967,-0.259532,0.452921,-0.132181,-0.191425,-0.030144,-0.284373,0.174169,...,9.282961,12.125919,9.162485,7.975543,7.863668,7.778516,7.520745,7.19881,7.608623,6.799826


## Comparaison

On compare nos indicateurs extraits avec ceux de la base FMA (provenant de *features.csv*).

In [115]:
fma_mfcc = features_fma.loc[numero,['track_id','mfcc']].to_frame().T
fma_mfcc.head()

feature,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc,mfcc
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,std,std,std,std,std,std
number,01,02,03,04,05,06,07,08,09,10,...,11,12,13,14,15,16,17,18,19,20
140565,-0.292312,-0.089246,-0.397742,-1.079161,-0.818577,0.291896,-0.043776,-0.028966,0.068056,0.094826,...,9.765505,9.634953,8.320844,7.577446,7.970215,7.536709,8.12822,7.529292,6.88137,6.999118


## Prédiction

In [116]:
prediction_mfcc = extracted_mfcc.astype(float)

# Pré-raitement, utile ?
# Mélange l'ensemble des attributs (indispensable pour avoir un résultat correct)
prediction_mfcc = shuffle(prediction_mfcc, random_state=42)

# Standardise les données en retirant la moyenne et réduit la variance "zero mean and unit variance".
scaler = StandardScaler(copy=False)
scaler.fit_transform(prediction_mfcc)
scaler.transform(prediction_mfcc)

# prédiction
prediction = clf.predict(prediction_mfcc)
print('Genre musical trouvé: {}'.format(prediction[0]))

# prédiction avec seuils de confiance
#dec = clf.decision_function(prediction_mfcc) # SVM
dec = clf.predict_proba(prediction_mfcc)      # RN

colonnes = ['Blues', 'Classical', 'Country', 'Easy Listening', 'Electronic', 'Experimental', 'Folk', 'Hip-Hop', 'Instrumental', 'International', 'Jazz', 'Old-Time / Historic', 'Pop', 'Rock', 'Soul-RnB', 'Spoken']
data = pd.DataFrame(data=dec, columns=colonnes)
data.head()

Genre musical trouvé: Electronic


Unnamed: 0,Blues,Classical,Country,Easy Listening,Electronic,Experimental,Folk,Hip-Hop,Instrumental,International,Jazz,Old-Time / Historic,Pop,Rock,Soul-RnB,Spoken
0,5.607574e-269,5.412370999999999e-203,0.0,0.0,0.999262,0.000738,1.835923e-223,1.8111939999999999e-56,1.526161e-88,1.3280839999999998e-283,1.004966e-257,0.0,5.8814319999999994e-130,2.208104e-89,2.788638e-295,5.816607e-249
