In [1]:
%matplotlib inline

In [2]:
import os
import sys

import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import librosa
import librosa.display

import utils

plt.rcParams['figure.figsize'] = (17, 5)

In [3]:
# Location of the mp3 files, read from the environment (None when unset).
AUDIO_DIR = os.environ.get('AUDIO_DIR')

# Read the four metadata tables through the project's `utils.load` helper.
tracks, genres, features, echonest = [
    utils.load(csv_path)
    for csv_path in (
        'fma_metadata/tracks.csv',
        'fma_metadata/genres.csv',
        'fma_metadata/features.csv',
        'fma_metadata/echonest.csv',
    )
]

# Sanity checks: features and tracks must share exactly the same index, and
# every echonest row must refer to a known track.
assert echonest.index.isin(tracks.index).all()
np.testing.assert_array_equal(features.index, tracks.index)

# Display the shape of each table as the cell output.
tuple(table.shape for table in (tracks, genres, features, echonest))

  'category', categories=SUBSETS, ordered=True)


((106574, 52), (163, 4), (106574, 518), (13129, 249))

## 5 Genre classification

### 5.1 From features

In [15]:
# Row masks over `tracks`: membership in the "small" subset (the `<=`
# comparison presumably relies on an ordered categorical -- TODO confirm in
# utils.load) and the three predefined splits.
split = tracks['set', 'split']
small = tracks['set', 'subset'] <= 'small'
train, val, test = (split == name for name in ('training', 'validation', 'test'))

train_rows = small & train
test_rows = small & test

# Targets: top-level genre, encoded as integers. The encoder is fitted on the
# training labels only and then reused for the test labels.
genre_top = tracks['track', 'genre_top']
enc = skl.preprocessing.LabelEncoder()
y_train = enc.fit_transform(genre_top[train_rows])
y_test = enc.transform(genre_top[test_rows])

# Inputs: all feature columns for the same rows.
X_train = features.loc[train_rows, :]
X_test = features.loc[test_rows, :]

print('{} training examples, {} testing examples'.format(y_train.size, y_test.size))
print('{} features, {} classes'.format(X_train.shape[1], np.unique(y_train).size))

6400 training examples, 800 testing examples
518 features, 8 classes


In [16]:
# Baseline: SVC with default hyper-parameters on all standardized features.

# Be sure training samples are shuffled (fixed seed for reproducibility).
X_train, y_train = skl.utils.shuffle(X_train, y_train, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The statistics are learned on the training split only and reused on the
# test split.
#
# The transformed values are assigned back explicitly. Relying on
# StandardScaler(copy=False) to mutate its input is not safe here: in-place
# scaling is only attempted and is not guaranteed for inputs that are not
# plain NumPy arrays (X_train / X_test are DataFrames), so discarding the
# return value could silently leave the features unscaled.
#
# The results are wrapped back into DataFrames (same index and columns) so
# that later cells can keep using `.iloc` / `.loc` on X_train and X_test.
scaler = skl.preprocessing.StandardScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train),
                       index=X_train.index, columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test),
                      index=X_test.index, columns=X_test.columns)

# Support vector classification.
clf = skl.svm.SVC()
clf.fit(X_train, y_train)
score = clf.score(X_test, y_test)
print('Accuracy: {:.2%}'.format(score))

Accuracy: 47.88%


In [17]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

### 5.2 Minimalist Genetic Algorithm for Feature Selection

In [19]:
from genetic_selection import GeneticSelectionCV

In [21]:
# Settings for the genetic feature search wrapped around a default SVC.
# Candidates are scored by accuracy with cv=5; n_jobs=-1 is passed to request
# parallel evaluation.
svc_ga_settings = dict(
    cv=5,
    verbose=1,
    scoring="accuracy",
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=40,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.5,
    tournament_size=3,
    caching=True,
    n_jobs=-1,
)

model = skl.svm.SVC()
selector = GeneticSelectionCV(model, **svc_ga_settings)

In [22]:
# Run the genetic feature search on the training split. The value returned by
# fit() is rebound to `selector`; its progress log is printed because the
# selector was created with verbose=1.
selector = selector.fit(X_train, y_train)

Selecting features with genetic algorithm.
gen	nevals	avg                      	std                                	min                          	max                          
0  	50    	[   0.588475  257.4     ]	[  4.86593307e-03   8.93756119e+00]	[   0.57578125  240.        ]	[   0.59828125  282.        ]
1  	37    	[   0.5914375  260.36     ]	[  4.28717655e-03   9.74630186e+00]	[   0.5815625  243.       ]  	[   0.60078125  288.        ]
2  	25    	[   0.593625  263.12    ]  	[  4.48336770e-03   9.83186656e+00]	[   0.5734375  242.       ]  	[   0.60078125  285.        ]
3  	31    	[   0.5955  263.6   ]      	[  4.56880985e-03   8.78635305e+00]	[   0.5796875  239.       ]  	[   0.60421875  283.        ]
4  	25    	[   0.59615  262.5    ]    	[  5.62098468e-03   8.31685037e+00]	[   0.57734375  236.        ]	[   0.60328125  279.        ]
5  	35    	[   0.59798125  260.84      ]	[  4.59893840e-03   7.06359682e+00]	[   0.58453125  248.        ]	[   0.60546875  277.        ]
6  	32    	[  

In [38]:
# Number of features retained by the fitted selector.
selector.n_features_

268

In [39]:
# Show the estimator stored on the fitted selector.
selector.estimator_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [37]:
# Restrict both splits to the columns chosen by the genetic search;
# `selector.support_` is used directly as the positional column indexer.
kept_columns = selector.support_
X_train_new, X_test_new = X_train.iloc[:, kept_columns], X_test.iloc[:, kept_columns]

# Support vector classification on the reduced feature set, scored on the
# test split.
clf_gs = skl.svm.SVC()
clf_gs.fit(X_train_new, y_train)
score_gs = clf_gs.score(X_test_new, y_test)
print('Accuracy: {:.2%}'.format(score_gs))

Accuracy: 48.75%


In [40]:
# Same genetic feature search, this time wrapped around a default
# LogisticRegression and limited to 10 generations.
logreg_ga_settings = dict(
    cv=5,
    verbose=1,
    scoring="accuracy",
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=10,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.5,
    tournament_size=3,
    caching=True,
    n_jobs=-1,
)

model2 = skl.linear_model.LogisticRegression()
selector2 = GeneticSelectionCV(model2, **logreg_ga_settings)

# Run the search on the training split; the value returned by fit() is
# rebound to `selector2`.
selector2 = selector2.fit(X_train, y_train)

Selecting features with genetic algorithm.
gen	nevals	avg                          	std                                	min                          	max                          
0  	50    	[   0.55184375  258.58      ]	[  5.78741226e-03   1.18930063e+01]	[   0.54234375  236.        ]	[   0.56484375  288.        ]
1  	34    	[   0.5559375  265.28     ]  	[  5.82148164e-03   1.15966202e+01]	[   0.5378125  230.       ]  	[   0.568125  288.      ]    
2  	34    	[   0.55745625  266.2       ]	[  5.35324814e-03   9.14330356e+00]	[   0.5446875  246.       ]  	[   0.56984375  288.        ]
3  	31    	[   0.55965625  270.06      ]	[  5.61013661e-03   9.09815366e+00]	[   0.543125  249.      ]    	[   0.56984375  290.        ]
4  	31    	[   0.56083438  269.86      ]	[  6.22413476e-03   1.12035887e+01]	[   0.5371875  231.       ]  	[   0.56984375  286.        ]
5  	24    	[   0.56144375  271.24      ]	[  6.25449526e-03   9.65931675e+00]	[   0.5459375  241.       ]  	[   0.56984375  289.        