In [1]:
from IPython.display import display
from IPython.core.display import HTML 
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

# This line will hide code by default when the notebook is exported as HTML
di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# This line will add a button to toggle visibility of code blocks, for use with the HTML export version
di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)

import pandas as pd
import numpy as np

pd.set_option('mode.chained_assignment',None)

In [2]:
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.cross_validation import train_test_split

In [4]:
from sklearn.cross_validation import cross_val_score

In [5]:
data_df = pd.read_csv('spotify_top_200_clustered.csv')

In [6]:
data_df.head(2)

Unnamed: 0,track_id,Artist,Track.Name,top.rank,total.plays,average.daily.plays,days.in.chart,artist_id,followers,genres,...,speechiness,acousticness,instrumentalness,danceability,key,duration_ms,loudness,mode,valence,cluster
0,003eoIwxETJujVWmNFMoZy,Alessia Cara,Growing Pains,91,9809956,700711.142857,14,2wUjUUtkb5lvLKcGKsKqsR,4431425,"[u'canadian contemporary r&b', u'dance pop', u...",...,0.733,0.0822,0.0,0.353,1,193680,-6.276,0,0.437,1
1,00B7TZ0Xawar6NZ00JFomN,Cardi B,Best Life (feat. Chance The Rapper),61,11882167,848726.214286,14,4kYSro6naA4h99UJvo89HB,5927731,"[u'pop', u'rap']",...,0.553,0.287,0.0,0.62,9,284856,-7.438,1,0.665,1


In [7]:
y = data_df.cluster
X = data_df[['energy', 'liveness', 'tempo', 'speechiness'
            , 'acousticness', 'instrumentalness', 'danceability', 'key'
            , 'duration_ms' ,'loudness', 'mode', 'valence']]

In [8]:
X.head()

Unnamed: 0,energy,liveness,tempo,speechiness,acousticness,instrumentalness,danceability,key,duration_ms,loudness,mode,valence
0,0.755,0.39,191.153,0.733,0.0822,0.0,0.353,1,193680,-6.276,0,0.437
1,0.625,0.314,167.911,0.553,0.287,0.0,0.62,9,284856,-7.438,1,0.665
2,0.799,0.092,94.033,0.0873,0.187,0.0,0.836,7,217653,-4.247,0,0.772
3,0.56,0.153,94.949,0.173,0.0426,0.0,0.857,8,91011,-8.278,1,0.482
4,0.919,0.107,95.991,0.05,0.0063,3e-06,0.581,6,210347,-4.324,0,0.847


In [9]:
y.head()

0    1
1    1
2    1
3    2
4    1
Name: cluster, dtype: int64

In [10]:
# dividing X, y into train and test data 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0) 

In [11]:
dtree_model = DecisionTreeClassifier(max_depth = 3).fit(X_train, y_train) 
dtree_predictions = dtree_model.predict(X_test) 

In [12]:
accuracy_score(y_test, dtree_predictions)

0.88349514563106801

In [13]:
# creating a confusion matrix 
cm = confusion_matrix(y_test, dtree_predictions) 

In [14]:
cm

array([[144,   8,   1],
       [ 22, 175,   2],
       [  6,   9,  45]])

In [17]:
#optimise max depth parameter
for i in range(2,11):
    
    dtree_model = DecisionTreeClassifier(max_depth = i).fit(X_train, y_train) 
    dtree_predictions = dtree_model.predict(X_test)
    cm = confusion_matrix(y_test, dtree_predictions) 
    scores = accuracy_score(y_test, dtree_predictions)
    print 'Max depth: %s' % i
    print cm
    print("Accuracy: %0.2f " % scores)
    print

Max depth: 2
[[132  15   6]
 [ 15 165  19]
 [  3  11  46]]
Accuracy: 0.83 

Max depth: 3
[[144   8   1]
 [ 22 175   2]
 [  6   9  45]]
Accuracy: 0.88 

Max depth: 4
[[133  19   1]
 [  5 193   1]
 [  5  16  39]]
Accuracy: 0.89 

Max depth: 5
[[132  16   5]
 [  6 188   5]
 [  6   8  46]]
Accuracy: 0.89 

Max depth: 6
[[137  10   6]
 [  9 186   4]
 [  8   6  46]]
Accuracy: 0.90 

Max depth: 7
[[143   7   3]
 [  7 190   2]
 [  8   6  46]]
Accuracy: 0.92 

Max depth: 8
[[142   8   3]
 [  7 191   1]
 [  8   7  45]]
Accuracy: 0.92 

Max depth: 9
[[141   5   7]
 [  8 189   2]
 [  8   7  45]]
Accuracy: 0.91 

Max depth: 10
[[144   7   2]
 [  9 188   2]
 [  7   7  46]]
Accuracy: 0.92 



In [18]:
#since there is little improvement in accuracy at max_depth>5, use 5 for final model for simplicity
dtree_model = DecisionTreeClassifier(max_depth = 5).fit(X_train, y_train) 
dtree_predictions = dtree_model.predict(X_test)

In [19]:
scores = cross_val_score(dtree_model,X,y,cv=10)
cm = confusion_matrix(y_test, dtree_predictions)

In [20]:
scores

array([ 0.90361446,  0.87951807,  0.8969697 ,  0.9030303 ,  0.87878788,
        0.89090909,  0.8902439 ,  0.84756098,  0.89634146,  0.85276074])

In [21]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print cm

Accuracy: 0.88 (+/- 0.04)
[[128  16   9]
 [  6 188   5]
 [  6   7  47]]


In [47]:
from sklearn import tree
import graphviz

In [48]:
dot_data = tree.export_graphviz(dtree_model,
                            feature_names=X.columns,
                            class_names=['1','2','3'],    
                            filled=True, rounded=True,  
                            special_characters=True) 
 

In [49]:
graph = graphviz.Source(dot_data)

In [50]:
#render the graph as a png file
!dot -Tpng tree.dot -o tree.png 

In [51]:
new_data = pd.read_csv('song_features2.csv')

In [52]:
new_data.head()

Unnamed: 0,track_id,energy,liveness,tempo,speechiness,acousticness,instrumentalness,danceability,key,duration_ms,loudness,mode,valence
0,14msK75pk3pA33pzPVNtBF,0.321,0.0884,70.142,0.323,0.578,0.0,0.725,1,178640.0,-10.744,0,0.319
1,6MWtB6iiXyIwun0YzU6DFP,0.539,0.101,99.947,0.178,0.163,2e-06,0.833,11,149520.0,-7.399,0,0.385
2,3KkXRkHbMCARz0aVfEt68P,0.479,0.0703,89.911,0.0466,0.556,0.0,0.76,2,158040.0,-5.574,1,0.913
3,2JvzF1RMd7lE3KmFlsyZD8,0.364,0.271,123.984,0.276,0.149,0.0,0.837,8,213594.0,-11.713,1,0.463
4,4NzMOnvSJVNKF7nw5NkXIP,0.389,0.106,120.046,0.332,0.74,0.162,0.905,8,193143.0,-14.505,1,0.196


In [53]:
new_X = new_data[['energy', 'liveness', 'tempo', 'speechiness'
            , 'acousticness', 'instrumentalness', 'danceability', 'key'
            , 'duration_ms' ,'loudness', 'mode', 'valence']] 

In [54]:
#fit model to the new data to predict clusters
dtree_predictions = dtree_model.predict(new_X)

In [56]:
new_data['cluster'] = dtree_predictions

In [62]:
counts = new_data.groupby(['cluster']).count()

In [64]:
counts['track_id']

cluster
1    75
2    94
3    31
Name: track_id, dtype: int64