In [1]:
import librosa
import librosa.display
import librosa.feature
import json
import glob
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import classification_report

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [2]:
#read in the data from a csv file.
data = pd.read_csv('SpotifyFeatures.csv')

In [3]:
#remove unnecessary columns
data = data.drop(['artist_name'],axis=1)
data = data.drop(['track_id'],axis=1)
data = data.drop(['track_name'],axis=1)
data = data.drop(['key'],axis=1)
data = data.drop(['mode'],axis=1)
data = data.drop(['time_signature'],axis=1)
data

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,Movie,0,0.61100,0.389,99373,0.9100,0.000000,0.3460,-1.828,0.0525,166.969,0.8140
1,Movie,1,0.24600,0.590,137373,0.7370,0.000000,0.1510,-5.559,0.0868,174.003,0.8160
2,Movie,3,0.95200,0.663,170267,0.1310,0.000000,0.1030,-13.879,0.0362,99.488,0.3680
3,Movie,0,0.70300,0.240,152427,0.3260,0.000000,0.0985,-12.178,0.0395,171.758,0.2270
4,Movie,4,0.95000,0.331,82625,0.2250,0.123000,0.2020,-21.150,0.0456,140.576,0.3900
5,Movie,0,0.74900,0.578,160627,0.0948,0.000000,0.1070,-14.970,0.1430,87.479,0.3580
6,Movie,2,0.34400,0.703,212293,0.2700,0.000000,0.1050,-12.675,0.9530,82.873,0.5330
7,Movie,15,0.93900,0.416,240067,0.2690,0.000000,0.1130,-8.949,0.0286,96.827,0.2740
8,Movie,0,0.00104,0.734,226200,0.4810,0.000860,0.0765,-7.725,0.0460,125.080,0.7650
9,Movie,10,0.31900,0.598,152694,0.7050,0.001250,0.3490,-7.790,0.0281,137.496,0.7180


In [4]:
def pluckRows(dataframe, numrows):
    genreNames = ['Hip-Hop', 'Soul', 'Classical', 'Blues', 'Rock']
    columns = list(dataframe.columns.values)
    res = pd.DataFrame(columns=columns)
    for genre in genreNames:
        temp = dataframe.loc[dataframe['genre'] == genre].head(numrows)
        res = res.append(temp, ignore_index=True)
    return res
data = pluckRows(data, 100)
data

Unnamed: 0,genre,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,liveness,loudness,speechiness,tempo,valence
0,Hip-Hop,96,0.149000,0.837,213594,0.364,0.000000,0.2710,-11.713,0.2760,123.984,0.4630
1,Hip-Hop,95,0.259000,0.889,180522,0.496,0.000000,0.2520,-6.365,0.0905,86.003,0.5440
2,Hip-Hop,93,0.039500,0.837,288624,0.636,0.001250,0.3420,-7.643,0.0860,145.972,0.2740
3,Hip-Hop,88,0.001950,0.942,177806,0.383,0.000000,0.0922,-8.099,0.5650,100.021,0.3800
4,Hip-Hop,92,0.194000,0.729,183907,0.625,0.009860,0.2480,-5.266,0.0315,146.034,0.2610
5,Hip-Hop,92,0.103000,0.896,145543,0.671,0.000000,0.5520,-6.977,0.2890,112.502,0.3580
6,Hip-Hop,92,0.258000,0.740,166606,0.613,0.003720,0.1230,-4.880,0.1450,75.023,0.4730
7,Hip-Hop,91,0.172000,0.680,194520,0.559,0.000000,0.3180,-5.545,0.1290,202.006,0.1370
8,Hip-Hop,93,0.052100,0.861,228760,0.603,0.000000,0.0924,-5.788,0.1760,98.043,0.5040
9,Hip-Hop,90,0.107000,0.792,195637,0.743,0.000000,0.1830,-2.806,0.0851,150.024,0.7420


In [5]:
#extract labels from data
labels = data['genre']
labels

0      Hip-Hop
1      Hip-Hop
2      Hip-Hop
3      Hip-Hop
4      Hip-Hop
5      Hip-Hop
6      Hip-Hop
7      Hip-Hop
8      Hip-Hop
9      Hip-Hop
10     Hip-Hop
11     Hip-Hop
12     Hip-Hop
13     Hip-Hop
14     Hip-Hop
15     Hip-Hop
16     Hip-Hop
17     Hip-Hop
18     Hip-Hop
19     Hip-Hop
20     Hip-Hop
21     Hip-Hop
22     Hip-Hop
23     Hip-Hop
24     Hip-Hop
25     Hip-Hop
26     Hip-Hop
27     Hip-Hop
28     Hip-Hop
29     Hip-Hop
        ...   
470       Rock
471       Rock
472       Rock
473       Rock
474       Rock
475       Rock
476       Rock
477       Rock
478       Rock
479       Rock
480       Rock
481       Rock
482       Rock
483       Rock
484       Rock
485       Rock
486       Rock
487       Rock
488       Rock
489       Rock
490       Rock
491       Rock
492       Rock
493       Rock
494       Rock
495       Rock
496       Rock
497       Rock
498       Rock
499       Rock
Name: genre, Length: 500, dtype: object

In [6]:
# #encode the labels
# encoder = LabelEncoder()
# y = encoder.fit_transform(labels)
# y
y = labels

In [7]:
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, 1:], dtype = float))
X

array([[ 1.49525621, -0.67235683,  1.36208241, ...,  2.0686253 ,
         0.23458701,  0.02844841],
       [ 1.45276044, -0.37389051,  1.62121705, ...,  0.02658723,
        -1.02268254,  0.35937685],
       [ 1.3677689 , -0.96946648,  1.36208241, ..., -0.02295009,
         0.96244681, -0.74371796],
       ...,
       [-0.07708733, -1.07390256, -0.46681012, ..., -0.61299451,
        -1.05436172, -0.44955934],
       [-0.07708733, -0.79445669,  0.15610968, ..., -0.6427169 ,
         0.08718117,  0.63310778],
       [-0.28956619,  0.49166182,  0.02155901, ..., -0.64161607,
        -1.35824352,  0.86189782]])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
forest = RandomForestClassifier(n_estimators=100, max_depth=5)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)
pred = forest.predict(X_test)
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

       Blues       0.59      0.68      0.63        19
   Classical       1.00      1.00      1.00        18
     Hip-Hop       1.00      0.95      0.97        19
        Rock       0.89      0.80      0.84        20
        Soul       0.71      0.71      0.71        24

   micro avg       0.82      0.82      0.82       100
   macro avg       0.84      0.83      0.83       100
weighted avg       0.83      0.82      0.82       100



In [11]:
accuracy_score(y_test, pred, normalize=True, sample_weight=None)

0.82

In [12]:
train_error = forest.score(X_train, y_train)
test_error = forest.score(X_test, y_test)

print('\tTrain accuracy: %s' % str(train_error))
print('\tTest accuracy: %s' % str(test_error))

	Train accuracy: 0.9275
	Test accuracy: 0.82
