In [1]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MaxAbsScaler, Binarizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

%matplotlib inline

# Spotify Song Attributes

In [18]:
# Read the song-attribute table; the first CSV column becomes the row index.
data = pd.read_csv(
    "data.csv",
    index_col=0,
    encoding='latin-1',
)
data.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,target,song_title,artist
0,0.0102,0.833,204600,0.434,0.0219,2,0.165,-8.795,1,0.431,150.062,4,0.286,1,Mask Off,Future
1,0.199,0.743,326933,0.359,0.00611,1,0.137,-10.401,1,0.0794,160.083,4,0.588,1,Redbone,Childish Gambino
2,0.0344,0.838,185707,0.412,0.000234,2,0.159,-7.148,1,0.289,75.044,4,0.173,1,Xanny Family,Future
3,0.604,0.494,199413,0.338,0.51,5,0.0922,-15.236,1,0.0261,86.468,4,0.23,1,Master Of None,Beach House
4,0.18,0.678,392893,0.561,0.512,5,0.439,-11.648,0,0.0694,174.004,4,0.904,1,Parallel Lines,Junior Boys


In [None]:
# Show every 100th row for a quick look across the whole table.
data.iloc[::100]

In [6]:
# Class balance: number of rows for each value of `target`.
data.target.value_counts()

1    1020
0     997
Name: target, dtype: int64

In [None]:
# Danceability distribution per target class: one panel per class, stacked
# in two rows on shared axes so the histograms are directly comparable.
ax = data.hist(
    column='danceability',
    by='target',
    bins=20,
    layout=(2, 1),
    sharex=True,
    sharey=True,
)

In [None]:
# Pairwise scatter plots of the columns, with histograms on the diagonal.
scatter = pd.plotting.scatter_matrix(
    data,
    diagonal='hist',
    figsize=(12, 8),
)

In [19]:
# One-hot encode the categorical song attributes.
# Only columns that are still present are encoded, so re-running this cell
# after `data` has already been encoded is a no-op instead of a KeyError.
data_categoricals = [col for col in ('key', 'mode', 'time_signature')
                     if col in data.columns]
if data_categoricals:
    data = pd.get_dummies(data, columns=data_categoricals, drop_first=False)

In [None]:
# Preview the first rows of `data` (one-hot encoded by the preceding cell).
data.head()

In [None]:
# (rows, columns) of the current frame.
data.shape

In [20]:
# Remove the free-text columns before modelling.
# errors='ignore' keeps the cell re-runnable once the columns are already
# gone (a plain drop would raise KeyError on the second run).
data = data.drop(['song_title', 'artist'], axis=1, errors='ignore')

In [23]:
# First cut: 75% of the rows go to train, the remainder to test.
train, test = train_test_split(data, random_state=12345, train_size=0.75)
# Second cut: 70% of that training part stays as train, the remainder becomes dev.
train, dev = train_test_split(train, random_state=12345, train_size=0.7)



In [24]:
# Fit a depth-limited decision tree on the training split.
# random_state pins the tie-breaking between equally good splits so the
# reported numbers are reproducible across runs.
dt = DecisionTreeClassifier(max_depth=5,
                            min_samples_split=10,
                            min_samples_leaf=10,
                            random_state=12345)

# Predictions are kept in local variables rather than written back into
# `train`/`dev`: a 'pred' column stored on the frame would be picked up as an
# input feature (a label leak) the next time this cell runs. Dropping 'pred'
# with errors='ignore' also discards such a column left over from an earlier run.
X = train.drop(columns=['target', 'pred'], errors='ignore')
y = train.target
dt.fit(X, y)

# Assess the model on the data it was fitted on.
train_pred = dt.predict(X)
print("DT features importance ")
print(pd.Series(dt.feature_importances_,
                index=X.columns))
# labels=dt.classes_ guarantees the matrix rows/columns line up with the
# index/columns used for display below.
cm = confusion_matrix(y_true=y,
                      y_pred=train_pred,
                      labels=dt.classes_)
print('********** DC confusion matrix (train)*************')
print(pd.DataFrame(cm,
                   index=dt.classes_,
                   columns=dt.classes_))

print('accuracy(train):', accuracy_score(y_true=y, y_pred=train_pred))
print('********** DC classification report (train) *************')
print(classification_report(y_true=y,
                            y_pred=train_pred))

# Validate the model on the held-out dev split (the test split stays untouched).
X = dev.drop(columns=['target', 'pred'], errors='ignore')
y = dev.target
dev_pred = dt.predict(X)
print('accuracy(dev):', accuracy_score(y_true=y, y_pred=dev_pred))
print('********** DC classification report (dev) *************')
print(classification_report(y_true=y,
                            y_pred=dev_pred))

DT features importance 
acousticness        0.016801
danceability        0.151355
duration_ms         0.045687
energy              0.118221
instrumentalness    0.236026
liveness            0.023708
loudness            0.167922
speechiness         0.072665
tempo               0.054649
valence             0.107338
key_0               0.000000
key_1               0.000000
key_2               0.000000
key_3               0.000000
key_4               0.000000
key_5               0.000000
key_6               0.000000
key_7               0.000000
key_8               0.000000
key_9               0.000000
key_10              0.000000
key_11              0.000000
mode_0              0.000000
mode_1              0.000000
time_signature_1    0.000000
time_signature_3    0.000000
time_signature_4    0.005628
time_signature_5    0.000000
dtype: float64
********** DC confusion matrix (train)*************
     0    1
0  385  123
1  115  435
accuracy(train): 0.775047258979206
********** DC classificati

In [None]:
# Hold out 25% of the rows as test, keeping 75% as the training portion.
train, test = train_test_split(data, random_state=12345, train_size=0.75)

# Separate the label from the input columns.
y = train['target']
X = train.drop(columns='target')

# Carve a dev set out of the training portion (70% train / 30% dev).
X_train, X_dev, y_train, y_dev = train_test_split(
    X, y, random_state=12345, train_size=0.7
)

y_train.head()

In [None]:
# First rows of the dev-split feature matrix.
X_dev.head()

In [None]:
# Build the feature set without the text columns.
# errors='ignore' makes this work whether or not 'song_title'/'artist' were
# already removed from `data` by an earlier cell; without it the drop raises
# KeyError when the notebook is run top to bottom.
train_no_artists = train.drop(['song_title', 'artist'], axis=1, errors='ignore')

X = train_no_artists.drop('target', axis=1)
y = train_no_artists['target']

# 70% of the training portion is used for fitting, the remainder is the dev set.
X_train_no_artists, X_dev_no_artists, y_train_no_artists, y_dev_no_artists = \
    train_test_split(X, y, train_size=0.7, random_state=54321)

y_train_no_artists.head()

In [None]:
# Unconstrained decision tree for the feature set without artist/title.
# random_state fixes the tie-breaking between equally good splits so that
# refitting gives the same tree (and the same reported metrics).
clf_dt_no_artists = DecisionTreeClassifier(random_state=54321)

In [None]:
# Train the tree on the fitting part of the no-artist split.
clf_dt_no_artists.fit(X=X_train_no_artists, y=y_train_no_artists)

In [None]:
# Class labels seen by the fitted tree.
print(clf_dt_no_artists.classes_)

In [None]:
# Importance the fitted tree assigns to each input column.
pd.Series(
    data=clf_dt_no_artists.feature_importances_,
    index=X_train_no_artists.columns,
)

In [None]:
# (rows, columns) of the feature matrix the tree was fitted on.
X_train_no_artists.shape

In [None]:
# Predict for every row of `train_no_artists`. The feature columns are selected
# by name from the fitted training matrix instead of relying on the notebook
# global `X`, which other cells reassign to different feature sets.
# NOTE(review): these rows include the ones the tree was fitted on, so metrics
# computed from this column are optimistic; use X_dev_no_artists /
# y_dev_no_artists for a held-out estimate.
train_no_artists['target_pred_no_artists'] = clf_dt_no_artists.predict(
    train_no_artists[X_train_no_artists.columns])
train_no_artists.head(10)

In [None]:
# Confusion matrix for the no-artist tree: rows are true labels, columns are
# predicted labels.
print('Confusion Matrix for model without Artist and song names\n')
cm = confusion_matrix(
    train_no_artists['target'],
    train_no_artists['target_pred_no_artists'],
)

print(
    pd.DataFrame(
        cm,
        columns=clf_dt_no_artists.classes_,
        index=clf_dt_no_artists.classes_,
    )
)

In [None]:
# Per-class precision / recall / F1 for the no-artist tree.
print('Classification Report for model without Artist and song names\n')
print(
    classification_report(
        train_no_artists['target'],
        train_no_artists['target_pred_no_artists'],
    )
)

In [None]:
# Overall fraction of rows the no-artist tree labels correctly.
print('Accuracy for model without Artist and song names\n')
print(
    accuracy_score(
        train_no_artists['target'],
        train_no_artists['target_pred_no_artists'],
    )
)

In [None]:
# Feature set that keeps the artist as a one-hot encoded input.
# errors='ignore': 'song_title' may already have been removed from `data`
# by an earlier cell, in which case a plain drop raises KeyError.
train_w_artists = train.drop(['song_title'], axis=1, errors='ignore')

# Encode only the categorical columns that are still present in raw form;
# earlier cells may already have encoded or removed some of them, and asking
# get_dummies for a missing column raises KeyError.
w_artists_categoricals = [col for col in ('key', 'mode', 'time_signature', 'artist')
                          if col in train_w_artists.columns]
if 'artist' not in w_artists_categoricals:
    # Make the degraded case visible rather than silently fitting a model
    # that is labelled "with artists" but has no artist features.
    print("Warning: 'artist' column not found in `train`; "
          "the 'with artists' feature set will not contain artist information.")
if w_artists_categoricals:
    train_w_artists = pd.get_dummies(train_w_artists,
                                     columns=w_artists_categoricals,
                                     drop_first=False)

X = train_w_artists.drop('target', axis=1)
y = train_w_artists['target']

# 70% of the training portion is used for fitting, the remainder is the dev set.
X_train_w_artists, X_dev_w_artists, y_train_w_artists, y_dev_w_artists = \
    train_test_split(X, y, train_size=0.7, random_state=54321)

y_train_w_artists.head()


In [None]:
# Unconstrained decision tree for the feature set that includes the artist.
# random_state fixes the tie-breaking between equally good splits so that
# refitting gives the same tree (and the same reported metrics).
clf_dt_w_artists = DecisionTreeClassifier(random_state=54321)

In [None]:
# Train the tree on the fitting part of the with-artist split.
clf_dt_w_artists.fit(X=X_train_w_artists, y=y_train_w_artists)

In [None]:
# Class labels seen by the with-artist tree fitted in the previous cell
# (the original printed the no-artist model's classes here by mistake).
print(clf_dt_w_artists.classes_)