In [147]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import numpy as np

In [71]:
#Drop any songs that have NaN as a genre
genres = pd.read_csv('fma_metadata/tracks.csv', low_memory=False).dropna(subset=['track.7'])
genres = genres.rename(index=str, columns={'Unnamed: 0': 'id', 'track.7': 'genre'})

In [72]:
#Reduce size of df to just id and genre
genres = genres[['id', 'genre']]

In [75]:
#Get 25,000 of the songs that don't have NaN as a genre
genres = genres[1:].sample(25000)

In [88]:
genres = genres.set_index('id')
genres.head()

Unnamed: 0_level_0,genre
id,Unnamed: 1_level_1
93953,Rock
20086,Old-Time / Historic
108626,Experimental
32776,Electronic
82250,Experimental


In [84]:
features = pd.read_csv('fma_metadata/features.csv', low_memory=False)
features = features.rename(index=str, columns={'feature': 'id'})

In [91]:
#filter out the same track IDs
features = features.loc[features.id.isin(genres.index.values)].set_index('id')

In [92]:
features.head()

Unnamed: 0_level_0,chroma_cens,chroma_cens.1,chroma_cens.2,chroma_cens.3,chroma_cens.4,chroma_cens.5,chroma_cens.6,chroma_cens.7,chroma_cens.8,chroma_cens.9,...,tonnetz.39,tonnetz.40,tonnetz.41,zcr,zcr.1,zcr.2,zcr.3,zcr.4,zcr.5,zcr.6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.8889633417,0.76053929329,0.34529656172,2.2952005863,1.6540306807,0.067592434585,1.3668476343,1.0540937185,0.10810308903,0.61918509007,...,0.063831120729,0.014211839065,0.017740072682,2.8246941566,0.46630859375,0.084578499198,0.06396484375,0.0,1.7167237997,0.0693301633
5,0.52756297588,-0.077654317021,-0.27961030602,0.6858831048,1.9375696182,0.880838871,-0.92319184542,-0.92723226547,0.66661673784,1.0385463238,...,0.040730185807,0.012690781616,0.014759079553,6.8084154129,0.375,0.05311408639,0.04150390625,0.0,2.1933031082,0.044860601425
134,0.91844475269,0.67414724827,0.5778182745,1.2811170816,0.93374562263,0.078176945448,1.1992042065,-0.17522314191,0.92548191547,1.4385091066,...,0.05876616016,0.016322381794,0.015819497406,4.731086731,0.41943359375,0.064369551837,0.05078125,0.0,1.8061059713,0.054622855037
135,0.97840219736,0.62357187271,1.12926054,-1.0439702272,-1.1272884607,-0.03264175728,-0.87843364477,-0.91623413563,-0.74325716496,0.60923475027,...,0.10427882522,0.01673123613,0.020464096218,-0.038451351225,0.23486328125,0.050837226212,0.05078125,0.00341796875,0.29787299037,0.024899475276
136,0.9150006175,-0.64347624779,-0.46050721407,-0.53070127964,-0.36446049809,-0.22685964406,-0.060376849025,-0.26767292619,-0.063476271927,0.24851760268,...,0.076808303595,0.017914786935,0.016705930233,0.55877000093,0.1474609375,0.036686290056,0.0341796875,0.00341796875,0.80502045155,0.016904523596


In [93]:
features.reindex(genres.index.values)

Unnamed: 0_level_0,chroma_cens,chroma_cens.1,chroma_cens.2,chroma_cens.3,chroma_cens.4,chroma_cens.5,chroma_cens.6,chroma_cens.7,chroma_cens.8,chroma_cens.9,...,tonnetz.39,tonnetz.40,tonnetz.41,zcr,zcr.1,zcr.2,zcr.3,zcr.4,zcr.5,zcr.6
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
93953,-1.5874287486e-01,-5.9277850389e-01,-3.9923959970e-01,-2.5748467445e-01,-2.2578692436e-01,-2.6395587251e-02,2.4995682240e+00,1.8951414824e+00,4.1370522976e-01,-3.3995303512e-01,...,5.8169201016e-02,1.9423143938e-02,1.7380004749e-02,1.9453805542e+02,7.7929687500e-01,3.7423402071e-02,2.9785156250e-02,3.4179687500e-03,9.8379526138e+00,3.3667922020e-02
20086,-3.7079554796e-01,-1.7796246707e-01,-5.0857496262e-01,-3.5529714823e-01,-2.8024035692e-01,-5.2816200256e-01,-5.8888262510e-01,3.8071337342e-01,1.0215874016e-01,2.4042625725e-01,...,1.2132487446e-01,2.4534692988e-02,2.3457542062e-02,1.0027783966e+02,3.9257812500e-01,5.3145077080e-02,5.2246093750e-02,0.0000000000e+00,5.9697732925e+00,1.7180748284e-02
108626,-5.7287716866e-01,-1.0494841337e+00,-1.0454161167e+00,-1.2939493656e+00,-8.8696432114e-01,9.9912440777e-01,2.9571924210e+00,4.1035461426e+00,4.1013078690e+00,7.1374140680e-02,...,1.0699443519e-01,1.8340403214e-02,2.6030596346e-02,3.7762420177e+00,4.1357421875e-01,4.5061796904e-02,3.3691406250e-02,0.0000000000e+00,1.6431454420e+00,3.8413606584e-02
32776,-1.0992000103e+00,-1.3128474951e+00,-9.4788420200e-01,3.7686908245e-01,2.1139997244e-01,-3.9794772863e-01,-4.6096768975e-01,-3.8589403033e-01,3.6123901606e-01,-2.1094664931e-01,...,7.2626195848e-02,1.5987791121e-02,1.7749791965e-02,6.4292490482e-01,5.7714843750e-01,1.7851457000e-01,1.7919921875e-01,0.0000000000e+00,2.6403039694e-01,8.0036029220e-02
82250,1.3813786507e+00,1.3853936195e+00,1.9287285805e+00,1.6893213987e+00,2.0497715473e+00,1.5189225674e+00,2.5142071247e+00,2.0595555305e+00,2.1993477345e+00,1.3730596304e+00,...,7.2355799377e-02,1.6297915950e-02,1.8117737025e-02,-5.3211545944e-01,7.0751953125e-01,2.1427534521e-01,1.9873046875e-01,0.0000000000e+00,4.7845137119e-01,1.3050070405e-01
130639,2.6179051399e-01,-7.5792305171e-02,-7.0668959618e-01,1.3321990967e+00,-8.9946824312e-01,-1.0267541409e+00,-9.5349943638e-01,9.0396785736e-01,1.8090956211e+00,-3.3172917366e-01,...,9.1820351779e-02,1.6073934734e-02,2.4608863518e-02,6.6841335297e+00,4.0332031250e-01,5.3696155548e-02,2.8808593750e-02,3.9062500000e-03,2.2833302021e+00,5.5931042880e-02
57983,-5.8587020636e-01,2.4682320654e-01,-4.6796211600e-01,-5.6815713644e-01,9.3867599964e-01,-4.1668754816e-01,5.9253686666e-01,-7.7173632383e-01,-8.3341342211e-01,-8.5945940018e-01,...,7.1509018540e-02,1.9288280979e-02,2.3950980976e-02,4.6157867432e+01,5.4345703125e-01,4.0402315557e-02,3.0761718750e-02,0.0000000000e+00,5.2917671204e+00,3.9432965219e-02
3911,-7.8936213255e-01,-1.2030998468e+00,-1.2867045403e+00,-6.6311138868e-01,-7.3343783617e-01,-7.8307104111e-01,-4.6657964587e-01,-7.9180926085e-01,-9.8169612885e-01,-4.7835335135e-01,...,1.4213909209e-01,2.6115613058e-02,2.6961628348e-02,1.5862684631e+02,2.9052734375e-01,1.9128674641e-02,1.7578125000e-02,4.8828125000e-04,8.1485996246e+00,1.1354391463e-02
17492,-4.4171807170e-01,-8.2176303864e-01,-8.5352426767e-01,-2.9371833801e-01,-1.0421884060e+00,-4.6237495542e-01,-6.6695809364e-01,9.0923406184e-02,-3.2917076349e-01,-1.2510716915e-01,...,7.8870743513e-02,1.5943966806e-02,1.5549796633e-02,7.9809899330e+00,4.1894531250e-01,6.3676856458e-02,5.4199218750e-02,0.0000000000e+00,2.0853135586e+00,4.1035722941e-02
130399,9.2106932402e-01,-1.3525570929e-01,1.0887333006e-01,1.0954540223e-01,1.4803910255e-01,4.2632082105e-01,-4.4390964508e-01,3.7885561585e-01,-1.8943348527e-01,3.1912153959e-01,...,8.5279494524e-02,1.7762662843e-02,1.6224807128e-02,4.1198448181e+01,6.9677734375e-01,6.9253578782e-02,5.4199218750e-02,4.3945312500e-03,5.7529473305e+00,7.4808232486e-02


In [106]:
features = features.merge(genres, on='id')

## First trying without any feature reduction

In [148]:
training_set = features[:19922]
validation_set = features[19922:22427]
test_set = features[22427:]
train_X = training_set.drop('genre', axis=1)
train_Y = training_set['genre']
vali_X = scaler.fit_transform(validation_set.drop('genre', axis=1))
vali_Y = validation_set['genre'].values
test_X = scaler.fit_transform(test_set.drop('genre', axis=1))
test_Y = test_set['genre'].values
scaler = StandardScaler()
train_X = scaler.fit_transform(train_X)
logreg = LogisticRegression(C=0.09, multi_class='ovr')
logreg.fit(train_X, train_Y)
print('Training Accuracy: ' + str(logreg.score(train_X, train_Y)))
print('Validation Accuracy: ' + str(logreg.score(vali_X, vali_Y)))
print('Test Accuracy: ' + str(logreg.score(test_X, test_Y)))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Training Accuracy: 0.6884348960947696
Validation Accuracy: 0.5604790419161677
Test Accuracy: 0.6404974737660318


## Now with PCA

In [160]:
train_X = training_set.drop('genre', axis=1)
train_Y = training_set['genre']
vali_X = scaler.fit_transform(validation_set.drop('genre', axis=1))
vali_Y = validation_set['genre'].values
test_X = scaler.fit_transform(test_set.drop('genre', axis=1))
test_Y = test_set['genre'].values
pca = PCA(n_components=500)
principal = pca.fit(train_X)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [161]:
scaler = StandardScaler()
train_X = pca.transform(scaler.fit_transform(train_X))
vali_X = pca.transform(vali_X)
test_X = pca.transform(test_X)
logreg = LogisticRegression(C=0.09, multi_class='ovr')
logreg.fit(train_X, train_Y)
print('Training Accuracy: ' + str(logreg.score(train_X, train_Y)))
print('Validation Accuracy: ' + str(logreg.score(vali_X, vali_Y)))
print('Test Accuracy: ' + str(logreg.score(test_X, test_Y)))

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Training Accuracy: 0.6612287922899307
Validation Accuracy: 0.5608782435129741
Test Accuracy: 0.6335017489312087
