In [58]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cross_validation, neighbors, metrics, svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

songs_dataset = pd.read_json('MasterSongList.json')

In [4]:
songs_dataset.loc[:,'genres'] = songs_dataset['genres'].apply(''.join)
def consolidateGenre(genre):
    if len(genre)>0:
        return genre.split(':')[0]
    else: return genre

songs_dataset.loc[:, 'genres'] = songs_dataset['genres'].apply(consolidateGenre)

In [6]:
audio_feature_list = [audio_feature for audio_feature in songs_dataset['audio_features']]
audio_features_headers = ['key','energy','liveliness','tempo','speechiness','acousticness','instrumentalness','time_signature'
                         ,'duration','loudness','valence','danceability','mode','time_signature_confidence','tempo_confidence'
                         ,'key_confidence','mode_confidence']
audio_features = pd.DataFrame(audio_feature_list, columns=audio_features_headers)
audio_features.loc[:,].dropna(axis=0,how='all',inplace=True)
audio_features['genres'] = songs_dataset['genres']
audio_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence,genres
0,11.0,0.912744,0.083704,132.069,0.293137,0.005423,1e-06,0.0,4.0,218.30667,-3.89,0.752186,0.72692,0.552,0.541,1.0,1.0,pop
1,6.0,0.745704,0.119955,100.008,0.046255,0.02623,0.012727,1.0,4.0,235.06086,-7.687,0.351282,0.691817,0.737,0.634,0.796,1.0,pop
2,5.0,0.709932,0.231455,130.03,0.121741,0.036662,0.0,0.0,4.0,232.46104,-5.15,0.37439,0.704729,0.565,0.565,0.743,1.0,
3,3.0,0.705822,0.053292,126.009,0.126016,0.001966,0.0,0.0,4.0,194.09333,-3.898,0.592798,0.875137,0.004,0.114,1.0,0.742,dance
4,3.0,0.741757,0.072774,129.985,0.051255,0.096732,0.000474,0.0,4.0,285.42667,-5.86,0.58563,0.730711,0.271,0.324,0.822,1.0,reggaeton


In [35]:
rock_1500 = audio_features.loc[audio_features['genres'] == 'rock'].sample(n=1500)
rap_1500 = audio_features.loc[audio_features['genres'] == 'rap'].sample(n=1500)
jazz_1500 = audio_features.loc[audio_features['genres'] == 'jazz'].sample(n=1500)
dance_1500 = audio_features.loc[audio_features['genres'] == 'dance'].sample(n=1500)
dataset = pd.concat([rock_1500,rap_1500,jazz_1500,dance_1500], ignore_index=True)
dataset = dataset.fillna(dataset.median(),inplace=True)
dataset.shape

(6000, 18)

In [29]:
df_features = dataset.drop('genres',axis =1)
df_features.head()

Unnamed: 0,key,energy,liveliness,tempo,speechiness,acousticness,instrumentalness,time_signature,duration,loudness,valence,danceability,mode,time_signature_confidence,tempo_confidence,key_confidence,mode_confidence
0,0.0,0.829562,0.326078,144.955,0.031807,0.008758,0.0,0.0,4.0,143.69333,-5.383,0.836835,0.518001,0.293,0.492,0.795,0.84
1,11.0,0.885721,0.34749,163.245,0.069079,0.160855,0.017547,0.0,4.0,295.8,-10.937,0.279321,0.286817,0.569,0.564,0.45,1.0
2,9.0,0.890157,0.061336,120.956,0.030632,0.004076,0.003903,1.0,4.0,224.36676,-6.996,0.955193,0.530167,0.561,0.406,0.744,1.0
3,0.0,0.753911,0.067912,137.545,0.031465,0.001832,0.0,1.0,4.0,199.62667,-9.386,0.962784,0.577222,1.0,0.661,0.921,0.89
4,0.0,0.420254,0.154516,134.559,0.052239,0.020613,0.000162,0.0,4.0,169.77333,-12.195,0.422416,0.645729,0.678,0.39,0.715,0.947


In [30]:
labels_final = dataset['genres']

In [32]:
standard_scaler = preprocessing.StandardScaler()
df_features_final = standard_scaler.fit_transform(df_features)

In [33]:
X = df_features_final
y = labels_final

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.3, random_state = 101)

In [67]:
names = ['Logistic Regression','SVC','Random Forest']
original_classifiers =  [LogisticRegression(solver='saga',multi_class='multinomial'),
                        SVC(gamma=2, C=1),
                        RandomForestClassifier(n_estimators=5,min_samples_split=2, max_features='log2')]

In [68]:
for name, clf in zip(names,original_classifiers):
    clf.fit(X_train,y_train)
    predictions = clf.predict(X_test)
    print(name)
    print(metrics.confusion_matrix(y_test, predictions))
    print(metrics.classification_report(list(y_test),list(predictions)))

Logistic Regression
[[338  11  58  48]
 [ 12 388   6  47]
 [ 66  19 330  22]
 [ 32  52  16 355]]
             precision    recall  f1-score   support

      dance       0.75      0.74      0.75       455
       jazz       0.83      0.86      0.84       453
        rap       0.80      0.76      0.78       437
       rock       0.75      0.78      0.77       455

avg / total       0.78      0.78      0.78      1800

SVC
[[ 91 293  44  27]
 [  1 446   1   5]
 [ 11 233 182  11]
 [ 19 341  10  85]]
             precision    recall  f1-score   support

      dance       0.75      0.20      0.32       455
       jazz       0.34      0.98      0.51       453
        rap       0.77      0.42      0.54       437
       rock       0.66      0.19      0.29       455

avg / total       0.63      0.45      0.41      1800

Random Forest
[[331  19  57  48]
 [ 27 375  16  35]
 [ 64  25 329  19]
 [ 65  61  28 301]]
             precision    recall  f1-score   support

      dance       0.68      0.73   

In [82]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
param_grids = [{'solver':['newton-cg', 'sag', 'saga', 'lbfgs'],'multi_class':['ovr', 'multinomial']},
              {'C':[0.1,1, 10],'gamma':[1,0.1,0.01,0.001]},
              {'n_estimators':[5, 10, 100],'min_samples_split':[2, 3, 4, 5, 10],'max_features':['sqrt', 'log2', 'auto']}]
for clf,param_grid in zip(original_classifiers,param_grids):
    grid = GridSearchCV(clf,param_grid)
    grid.fit(X_train,y_train)
    print(grid.best_params_)

{'multi_class': 'ovr', 'solver': 'newton-cg'}
{'C': 10, 'gamma': 0.01}
{'max_features': 'log2', 'min_samples_split': 4, 'n_estimators': 100}


## SVM

In [129]:
from sklearn.feature_selection import SelectKBest,SelectFromModel,RFE
clf_svm = SVC(C=10,gamma=0.01)
selector_svm = SelectKBest(k=5)
X_new = selector_svm.fit_transform(X,y)
df_features.columns[selector_svm.get_support(indices=True)].tolist()

['key', 'energy', 'liveliness', 'tempo', 'speechiness']

In [133]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new,y,test_size=0.3, random_state = 101)
clf_svm.fit(X_train,y_train)
prediction = clf_svm.predict(X_test)
accuracy = metrics.accuracy_score(y_test,prediction)
print(accuracy)

0.723333333333


## Random Forest - SelectFromModel

In [158]:
clf_rf = RandomForestClassifier(n_estimators=5,min_samples_split=2, max_features='log2')
clf_rf.fit(X_train,y_train)
predictions = clf_rf.predict(X_test)

thresholds =  clf_rf.feature_importances_
for thresh in thresholds:
    selector_rf = SelectFromModel(clf_rf,threshold = thresh,prefit=True)
    select_X_train = selector_rf.transform(X_train)
    rfc_new = RandomForestClassifier()
    rfc_new.fit(select_X_train,y_train)
    select_X_test = selector_rf.transform(X_test)
    prediction = rfc_new.predict(select_X_test)
    accuracy = metrics.accuracy_score(y_test,prediction)
    print('Threshold:',thresh)
    print('Number of features:',select_X_train.shape[1])
    print('The features are:',df_features.columns[selector_rf.get_support(indices=True)].tolist())
    print('Accuracy score:',accuracy)
    print('\n')

Threshold: 0.166356543024
Number of features: 4
The features are: ['key', 'energy', 'liveliness', 'tempo']
Accuracy score: 0.706111111111


Threshold: 0.267599398555
Number of features: 2
The features are: ['energy', 'liveliness']
Accuracy score: 0.573333333333


Threshold: 0.311821931273
Number of features: 1
The features are: ['liveliness']
Accuracy score: 0.430555555556


Threshold: 0.254222127149
Number of features: 3
The features are: ['energy', 'liveliness', 'tempo']
Accuracy score: 0.694444444444




## Random Forest - RFE

In [167]:
from sklearn.feature_selection import RFE
rfc = RandomForestClassifier()
rfe_model = RFE(rfc,n_features_to_select=6)
X = df_features_final
y = labels_final

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.3, random_state = 101)

rfe_model = rfe_model.fit(X_train,y_train)
print(rfe_model.support_)
print(rfe_model.ranking_)

[False  True False  True  True  True False False False False False False
  True False False  True False]
[10  1  8  1  1  1  2 11 12  4  5  3  1  7  6  1  9]


In [166]:
prediction = rfe_model.predict(X_test)
print(metrics.accuracy_score(y_test,prediction))

0.731666666667
