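This notebook explores how to work with the FMA (Free Music Archive) dataset: loading its metadata and precomputed audio features, and training baseline genre classifiers on the `small` subset.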

In [None]:
import os.path
import pickle

import librosa
import librosa.display
import matplotlib
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
import sklearn.preprocessing

In [None]:
# Report the installed scikit-learn release so the results below can be
# tied to a specific library version.
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

In [None]:
# Load the three metadata tables shipped with the dataset.
# features.csv and tracks.csv use their first column as the row index and
# spread their column labels over several header rows (three and two
# respectively), so pandas builds MultiIndex columns for both.
FEATURES_CSV = "fma_metadata/features.csv"
GENRES_CSV = "fma_metadata/genres.csv"
TRACKS_CSV = "fma_metadata/tracks.csv"

features = pd.read_csv(FEATURES_CSV, header=[0, 1, 2], index_col=0)
genres = pd.read_csv(GENRES_CSV)
tracks = pd.read_csv(TRACKS_CSV, header=[0, 1], index_col=0)

In [None]:
# Boolean mask over all tracks that marks the rows assigned to the 'small'
# subset. The mask itself is kept because the feature table is filtered
# with it as well.
small = tracks[('set', 'subset')].eq('small')
small_tracks = tracks.loc[small]

In [None]:
# Sanity check: every per-track top-genre label should be found among the
# distinct labels of the same column; anything that is not gets reported.
all_genres = small_tracks['track','genre_top']
# The distinct labels do not change while looping, so compute them once
# instead of re-scanning the whole column for every track.
distinct_genres = all_genres.unique()
for genre in all_genres:
    if genre not in distinct_genres:
        print('not in list')

In [None]:
# Keep only the feature rows of tracks in the 'small' subset.
# NOTE(review): `small` was built from `tracks`, so this relies on `features`
# and `tracks` sharing the same row index - confirm.
small_features = features.loc[small]

In [None]:
# Number of columns in the chroma_cens feature group.
small_features['chroma_cens'].shape[1]

In [None]:
# Preview the first five rows of the filtered feature table.
small_features.head(n=5)

In [None]:
# Preview the first five rows of the mfcc feature group only.
small_features['mfcc'].head(n=5)

In [None]:
# Total number of rows in the full tracks table.
tracks.shape[0]

In [None]:
# Number of rows that belong to the 'small' subset.
small_tracks.shape[0]

In [None]:
# All distinct top-level genres present in the small subset.
small_tracks[('track', 'genre_top')].unique()

In [None]:
# Print how many tracks carry each top-level genre, in order of first appearance.
genre_column = small_tracks[('track', 'genre_top')]
for genre in genre_column.unique():
    n_tracks = int((genre_column == genre).sum())
    print(genre, n_tracks)

The tracks are therefore evenly distributed across genres in the small subset (8 genres, 1,000 tracks each).

In [None]:
# Partition the small subset using the split assignment stored in the
# ('set', 'split') column of the metadata.
split_of_track = small_tracks[('set', 'split')]
small_train = small_tracks.loc[split_of_track == 'training']
small_val = small_tracks.loc[split_of_track == 'validation']
small_test = small_tracks.loc[split_of_track == 'test']

In [None]:
# Sizes of the training, validation and test partitions, in that order.
for split_frame in (small_train, small_val, small_test):
    print(len(split_frame))

In [None]:
# Select the feature rows of each partition; the boolean masks come from the
# track metadata and are applied to the feature table by row label.
split_labels = small_tracks[('set', 'split')]
X_train = small_features.loc[split_labels.eq('training')]
X_val = small_features.loc[split_labels.eq('validation')]
X_test = small_features.loc[split_labels.eq('test')]

In [None]:
# Targets: the top-level genre of every track, taken per partition.
genre_key = ('track', 'genre_top')
y_train = small_train[genre_key]
y_val = small_val[genre_key]
y_test = small_test[genre_key]

In [None]:
# Number of labels per partition, in the order train / validation / test.
for labels in (y_train, y_val, y_test):
    print(len(labels))

In [None]:
# Encode genre names as integer class ids.
#
# The encoder is fitted once, on the training labels only, and then applied
# unchanged to the other partitions. Re-fitting it per partition would let
# each split build its own name -> id mapping; the ids would only agree by
# coincidence, and `le` would end up describing the last split fitted
# rather than the training data. With a single fit, a genre that is missing
# from the training labels raises an error instead of being silently mapped.
#
# The raw labels are read from the split frames (not from y_train / y_val /
# y_test) so that re-running this cell does not re-encode integer codes.
le = sklearn.preprocessing.LabelEncoder()
le.fit(small_train['track','genre_top'])
y_train = le.transform(small_train['track','genre_top'])
y_val = le.transform(small_val['track','genre_top'])
y_test = le.transform(small_test['track','genre_top'])

In [None]:
# Genre names known to the fitted encoder; position i holds the name that
# is encoded as integer i.
le.classes_

In [None]:
# First ten encoded training labels, to compare against the raw genre names.
y_train[:10]

In [None]:
# Raw genre names of the first ten training tracks, for a side-by-side
# check against the encoded labels.
small_train[('track', 'genre_top')].head(10)

In [None]:
# Number of feature columns in the mfcc group.
X_train['mfcc'].shape[1]

In [None]:
# Number of feature columns in the chroma_cens group.
X_train['chroma_cens'].shape[1]

In [None]:
# Restrict every partition to the mfcc feature group.
mfcc_key = 'mfcc'
X_train_mfcc, X_val_mfcc, X_test_mfcc = (
    split_features[mfcc_key] for split_features in (X_train, X_val, X_test)
)

In [None]:
# Restrict every partition to the chroma_cens feature group.
chroma_key = 'chroma_cens'
X_train_chroma, X_val_chroma, X_test_chroma = (
    split_features[chroma_key] for split_features in (X_train, X_val, X_test)
)

In [None]:
# Genre names ordered by their integer code, used as `target_names` in the
# classification reports below.
# The codes are derived from the fitted encoder instead of a hard-coded
# 0..7 list, so this keeps working if the number of genres changes.
# NOTE: this rebinds `genres`, which previously held the table read from
# genres.csv; the name is kept because the evaluation cells refer to it.
genres = le.inverse_transform(np.arange(len(le.classes_)))
genres

In [None]:
# Baseline: L2-regularised multinomial logistic regression on the mfcc
# features, scored on the test partition.
lr = sklearn.linear_model.LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    multi_class='multinomial',
)
lr.fit(X_train_mfcc, y_train)
y_pred = lr.predict(X_test_mfcc)
mfcc_report = sklearn.metrics.classification_report(y_test, y_pred, target_names=genres)
print(mfcc_report)

In [None]:
# Baseline: L2-regularised logistic regression on the chroma_cens features,
# scored on the test partition. Solver and multi-class strategy are left at
# the library defaults here.
lr = sklearn.linear_model.LogisticRegression(penalty='l2', C=1)
lr.fit(X_train_chroma, y_train)
y_pred = lr.predict(X_test_chroma)
chroma_report = sklearn.metrics.classification_report(y_test, y_pred, target_names=genres)
print(chroma_report)

In [None]:
# Baseline: 7-nearest-neighbours classifier on the mfcc features, scored on
# the test partition.
neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=7)
neigh.fit(X_train_mfcc, y_train)
y_pred = neigh.predict(X_test_mfcc)
knn_mfcc_report = sklearn.metrics.classification_report(y_test, y_pred, target_names=genres)
print(knn_mfcc_report)

In [None]:
# Baseline: 7-nearest-neighbours classifier on the chroma_cens features,
# scored on the test partition.
neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=7)
neigh.fit(X_train_chroma, y_train)
y_pred = neigh.predict(X_test_chroma)
knn_chroma_report = sklearn.metrics.classification_report(y_test, y_pred, target_names=genres)
print(knn_chroma_report)

In [None]:
# Sweep the inverse regularisation strength C for logistic regression on the
# mfcc features.
# Candidates are compared on the validation partition: choosing a
# hyperparameter by its test score would leak test information into model
# selection, so the test partition is left for the final estimate.
C_params = [0.001, 0.1, 1, 5]
for c in C_params:
    print("C = ", c)
    lr = sklearn.linear_model.LogisticRegression(C=c, penalty='l2', solver='lbfgs', multi_class='multinomial')
    lr.fit(X_train_mfcc, y_train)
    y_pred = lr.predict(X_val_mfcc)
    print(sklearn.metrics.classification_report(y_val, y_pred, target_names=genres))


In [None]:
# Sweep the inverse regularisation strength C for logistic regression on the
# chroma_cens features.
# Candidates are compared on the validation partition: choosing a
# hyperparameter by its test score would leak test information into model
# selection, so the test partition is left for the final estimate.
C_params = [0.001, 0.1, 1, 5]
for c in C_params:
    print("C = ", c)
    lr = sklearn.linear_model.LogisticRegression(C=c, penalty='l2')
    lr.fit(X_train_chroma, y_train)
    y_pred = lr.predict(X_val_chroma)
    print(sklearn.metrics.classification_report(y_val, y_pred, target_names=genres))

In [None]:
# Sweep the neighbourhood size (1 through 10) for k-NN on the mfcc features.
# Candidates are compared on the validation partition: choosing a
# hyperparameter by its test score would leak test information into model
# selection, so the test partition is left for the final estimate.
for n in range(1,11):
    print("n_neighbors = ", n)
    neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train_mfcc, y_train)
    y_pred = neigh.predict(X_val_mfcc)
    print(sklearn.metrics.classification_report(y_val, y_pred, target_names=genres))

In [None]:
# Sweep the neighbourhood size (1 through 10) for k-NN on the chroma_cens
# features.
# Candidates are compared on the validation partition: choosing a
# hyperparameter by its test score would leak test information into model
# selection, so the test partition is left for the final estimate.
for n in range(1,11):
    print("n_neighbors = ", n)
    neigh = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n)
    neigh.fit(X_train_chroma, y_train)
    y_pred = neigh.predict(X_val_chroma)
    print(sklearn.metrics.classification_report(y_val, y_pred, target_names=genres))