In [1]:
import numpy as np
import pandas as pd
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
from sklearn.decomposition import PCA
import random
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.utils import shuffle

# All Features

In [2]:
genres = pd.read_csv('fma_metadata/tracks.csv', low_memory=False).dropna(subset=['track.7'])
genres = genres.rename(index=str, columns={'Unnamed: 0': 'id', 'track.7': 'genre'})
genres = genres[['id', 'genre']]
genres = genres[1:]

features = pd.read_csv('fma_metadata/features.csv', low_memory=False)
features = features.rename(index=str, columns={'feature': 'id'})
features = features[1:]

features = features.merge(genres, on='id')
features.head()

Unnamed: 0,id,chroma_cens,chroma_cens.1,chroma_cens.2,chroma_cens.3,chroma_cens.4,chroma_cens.5,chroma_cens.6,chroma_cens.7,chroma_cens.8,...,tonnetz.40,tonnetz.41,zcr,zcr.1,zcr.2,zcr.3,zcr.4,zcr.5,zcr.6,genre
0,2,7.1806526184,5.2303090096,0.24932080507,1.3476201296,1.4824777842,0.53137123585,1.4815930128,2.691454649,0.86686819792,...,0.012225749902,0.012110591866,5.758890152,0.45947265625,0.085629448295,0.0712890625,0.0,2.0898721218,0.061448108405,Hip-Hop
1,3,1.8889633417,0.76053929329,0.34529656172,2.2952005863,1.6540306807,0.067592434585,1.3668476343,1.0540937185,0.10810308903,...,0.014211839065,0.017740072682,2.8246941566,0.46630859375,0.084578499198,0.06396484375,0.0,1.7167237997,0.0693301633,Hip-Hop
2,5,0.52756297588,-0.077654317021,-0.27961030602,0.6858831048,1.9375696182,0.880838871,-0.92319184542,-0.92723226547,0.66661673784,...,0.012690781616,0.014759079553,6.8084154129,0.375,0.05311408639,0.04150390625,0.0,2.1933031082,0.044860601425,Hip-Hop
3,10,3.7022454739,-0.29119303823,2.1967420578,-0.234449476,1.3673638105,0.9984113574,1.7706941366,1.6045658588,0.52121698856,...,0.01795193553,0.013921394013,21.434211731,0.4521484375,0.077514506876,0.07177734375,0.0,3.542324543,0.040800448507,Pop
4,134,0.91844475269,0.67414724827,0.5778182745,1.2811170816,0.93374562263,0.078176945448,1.1992042065,-0.17522314191,0.92548191547,...,0.016322381794,0.015819497406,4.731086731,0.41943359375,0.064369551837,0.05078125,0.0,1.8061059713,0.054622855037,Hip-Hop


In [3]:
data = features.sample(25000)

training_set = data[:19922]
validation_set = data[19922:22427]
test_set = data[22427:]

enc = LabelEncoder()
y_train = enc.fit_transform(training_set['genre'])
y_val = enc.transform(validation_set['genre'].values)
y_test = enc.transform(test_set['genre'].values)

scaler = StandardScaler(copy=False)

data = data.drop('genre', axis=1)

training_set = data[:19922]
validation_set = data[19922:22427]
test_set = data[22427:]

X_train = training_set.as_matrix()
X_val = validation_set.as_matrix()
X_test = test_set.as_matrix()

scaler.fit_transform(X_train)
scaler.transform(X_val)
scaler.transform(X_test)



array([[-1.17484855e+00,  3.92205278e-02,  3.29844360e-01, ...,
        -4.91687495e-01, -8.82104808e-01, -6.78403548e-01],
       [ 1.41900109e+00, -1.20246905e-01, -6.43709126e-02, ...,
        -4.91687495e-01, -8.56998456e-01, -1.04822339e+00],
       [-4.60494985e-01, -8.13729202e-02, -5.27365307e-02, ...,
        -1.28962138e-01,  9.33860194e-01, -2.30803373e-01],
       ...,
       [-1.44993421e-01, -9.23905387e-02, -7.93337293e-02, ...,
         2.22875268e+00,  1.08314730e+00, -1.01384874e+00],
       [-3.92752091e-01, -4.33050514e-02,  5.74505532e-04, ...,
        -4.91687495e-01,  3.34326864e-01,  2.49769153e-01],
       [-1.48450831e+00, -1.16816053e-01, -1.27333837e-01, ...,
         5.24005410e-02,  1.11014976e+00, -5.21418404e-01]])

In [4]:
clf = SVC(C=1.5, gamma='auto')
clf.fit(X_train, y_train) 

clf.score(X_test, y_test), clf.score(X_train, y_train)

(0.27788573649436454, 1.0)

# PCA Reduced Features

In [2]:
genres = pd.read_csv('fma_metadata/tracks.csv', low_memory=False).dropna(subset=['track.7'])
genres = genres.rename(index=str, columns={'Unnamed: 0': 'id', 'track.7': 'genre'})
genres = genres[['id', 'genre']]
genres = genres[1:]

features = pd.read_csv('fma_metadata/features.csv', low_memory=False)
features = features.rename(index=str, columns={'feature': 'id'})
features = features[1:]

features = features.merge(genres, on='id')
features.head()

Unnamed: 0,id,chroma_cens,chroma_cens.1,chroma_cens.2,chroma_cens.3,chroma_cens.4,chroma_cens.5,chroma_cens.6,chroma_cens.7,chroma_cens.8,...,tonnetz.40,tonnetz.41,zcr,zcr.1,zcr.2,zcr.3,zcr.4,zcr.5,zcr.6,genre
0,2,7.1806526184,5.2303090096,0.24932080507,1.3476201296,1.4824777842,0.53137123585,1.4815930128,2.691454649,0.86686819792,...,0.012225749902,0.012110591866,5.758890152,0.45947265625,0.085629448295,0.0712890625,0.0,2.0898721218,0.061448108405,Hip-Hop
1,3,1.8889633417,0.76053929329,0.34529656172,2.2952005863,1.6540306807,0.067592434585,1.3668476343,1.0540937185,0.10810308903,...,0.014211839065,0.017740072682,2.8246941566,0.46630859375,0.084578499198,0.06396484375,0.0,1.7167237997,0.0693301633,Hip-Hop
2,5,0.52756297588,-0.077654317021,-0.27961030602,0.6858831048,1.9375696182,0.880838871,-0.92319184542,-0.92723226547,0.66661673784,...,0.012690781616,0.014759079553,6.8084154129,0.375,0.05311408639,0.04150390625,0.0,2.1933031082,0.044860601425,Hip-Hop
3,10,3.7022454739,-0.29119303823,2.1967420578,-0.234449476,1.3673638105,0.9984113574,1.7706941366,1.6045658588,0.52121698856,...,0.01795193553,0.013921394013,21.434211731,0.4521484375,0.077514506876,0.07177734375,0.0,3.542324543,0.040800448507,Pop
4,134,0.91844475269,0.67414724827,0.5778182745,1.2811170816,0.93374562263,0.078176945448,1.1992042065,-0.17522314191,0.92548191547,...,0.016322381794,0.015819497406,4.731086731,0.41943359375,0.064369551837,0.05078125,0.0,1.8061059713,0.054622855037,Hip-Hop


In [3]:
data = features.sample(25000)

training_set = data[:19922]
validation_set = data[19922:22427]
test_set = data[22427:]

enc = LabelEncoder()
y_train = enc.fit_transform(training_set['genre'])
y_val = enc.transform(validation_set['genre'].values)
y_test = enc.transform(test_set['genre'].values)

scaler = StandardScaler(copy=False)

pca = PCA(n_components=275)
pca.fit(data.drop('genre', axis=1).as_matrix().T)
data = pd.DataFrame(pca.components_.T)

training_set = data[:19922]
validation_set = data[19922:22427]
test_set = data[22427:]

X_train = training_set.as_matrix()
X_val = validation_set.as_matrix()
X_test = test_set.as_matrix()

scaler.fit_transform(X_train)
scaler.transform(X_val)
scaler.transform(X_test)

array([[-1.55977573,  1.51909576, -0.29091038, ..., -1.03029923,
        -0.83162177,  0.53587042],
       [ 0.02166803, -0.13373329, -0.34006884, ...,  0.21611512,
        -1.85053953, -0.416559  ],
       [-0.2856438 ,  0.12723044, -1.17623878, ..., -0.32664934,
        -0.60598507, -0.18088152],
       ...,
       [-1.22440888,  1.1130143 , -1.03924643, ...,  1.14719908,
        -0.12780583,  1.26973126],
       [ 0.55109119, -0.55012009,  0.34411873, ...,  0.89581704,
         0.12089336,  1.15717321],
       [-0.95215279,  0.80248139, -1.21813032, ...,  1.0174678 ,
        -0.25658993,  0.10892382]])

In [4]:
clf = SVC(C=1.5, gamma='auto')
clf.fit(X_train, y_train) 

clf.score(X_test, y_test), clf.score(X_train, y_train)

(0.6583754372328022, 0.8857544423250677)