In [1]:
import requests
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from collections import OrderedDict

In [2]:
#getting the client id and secret
clientid=''#Yout client ID
clientsecret=''Your client secret
authurl='https://accounts.spotify.com/api/token'
response=requests.post(authurl,{
    'grant_type':'client_credentials',
    'client_id':clientid,
    'client_secret':clientsecret,
})
authjson=response.json()
accesstoken=authjson['access_token']
headers={'Authorization':'Bearer {}'.format(accesstoken)}

In [3]:
#using the playlists' songs, we create dataframes with details of the songs, and an extra column, genre
def playlistdataframe(playlistid,genre):    
    base='https://api.spotify.com/v1/playlists/'
    r=requests.get(base+playlistid+'/tracks',headers=headers)
    playlist=r.json()
    ids=[]
    for song in playlist['items']:
        if type(playlist['items']) != 'NoneType':
            ids.append(song['track']['id'])
    playlistids=",".join(ids)
    r=requests.get("https://api.spotify.com/v1/audio-features/?ids={}".format(playlistids), headers=headers)
    details=r.json()
    df=pd.DataFrame(details['audio_features'])
    df['genre']=genre
    return df

In [6]:
#rock1=playlistdataframe('37i9dQZF1DXcF6B6QPhFDv','rock')
rock2=playlistdataframe('37i9dQZF1DWWJOmJ7nRx0C','rock')
rock3=playlistdataframe('37i9dQZF1DX82GYcclJ3Ug','rock')
rock4=playlistdataframe('37i9dQZF1DWXRqgorJj26U','rock')
hiphop1=playlistdataframe('37i9dQZF1DX0XUsuxWHRQd','hip hop')
hiphop2=playlistdataframe('37i9dQZF1DWVA1Gq4XHa6U','hip hop')
hiphop3=playlistdataframe('37i9dQZF1DX186v583rmzp','hip hop')
hiphop4=playlistdataframe('37i9dQZF1DWT5MrZnPU1zD','hip hop')
#edm1=playlistdataframe('37i9dQZF1DX4dyzvuaRJ0n','electronic')
edm2=playlistdataframe('37i9dQZF1DXaXB8fQg7xif','electronic')
edm3=playlistdataframe('37i9dQZF1DWXLeA8Omikj7','electronic')
edm4=playlistdataframe('37i9dQZF1DX6VdMW310YC7','electronic')
class1=playlistdataframe('37i9dQZF1DWWEJlAGA9gs0','classical')
class2=playlistdataframe('37i9dQZF1DWV0gynK7G6pD','classical')
class3=playlistdataframe('37i9dQZF1DWVFeEut75IAL','classical')
pop1=playlistdataframe('37i9dQZF1DWUa8ZRTfalHk','pop')
pop2=playlistdataframe('37i9dQZF1DXbYM3nMM0oPk','pop')
pop3=playlistdataframe('37i9dQZF1DX0s5kDXi1oC5','pop')
metal1=playlistdataframe('37i9dQZF1DWWOaP4H0w5b0','metal')
metal2=playlistdataframe('37i9dQZF1DXakaomPRkkDa','metal')
metal3=playlistdataframe('37i9dQZF1DX9qNs32fujYe','metal')

In [7]:
#concatenation to make a mega dataframe
musicdf=pd.concat([rock3,rock2,rock4,hiphop1,hiphop2,hiphop3,hiphop4,edm2,edm3,edm4,class1,class2,class3,pop1,pop2,pop3,metal1,metal2,metal3])
musicdf=musicdf.drop_duplicates(subset=['id'],keep='first')
musicdf.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
0,0.54,0.631,8,-4.569,1,0.0336,0.00316,0.0,0.104,0.194,84.903,audio_features,6z7mOTf4O20AApGsTNn7fC,spotify:track:6z7mOTf4O20AApGsTNn7fC,https://api.spotify.com/v1/tracks/6z7mOTf4O20A...,https://api.spotify.com/v1/audio-analysis/6z7m...,151989,4,rock
1,0.544,0.886,0,-2.667,1,0.0381,0.00188,0.0,0.324,0.567,104.09,audio_features,0AdfjZjynCi5aVfKcaxGvW,spotify:track:0AdfjZjynCi5aVfKcaxGvW,https://api.spotify.com/v1/tracks/0AdfjZjynCi5...,https://api.spotify.com/v1/audio-analysis/0Adf...,215645,4,rock
2,0.472,0.811,8,-5.121,1,0.0612,0.000839,0.000143,0.373,0.637,164.991,audio_features,4v2Bq0xDB7uNN73I5b44Du,spotify:track:4v2Bq0xDB7uNN73I5b44Du,https://api.spotify.com/v1/tracks/4v2Bq0xDB7uN...,https://api.spotify.com/v1/audio-analysis/4v2B...,118733,4,rock
3,0.536,0.845,2,-5.765,1,0.0615,0.0033,0.0,0.0465,0.6,159.953,audio_features,3MTfE3Mo3lfwAymYLU2luX,spotify:track:3MTfE3Mo3lfwAymYLU2luX,https://api.spotify.com/v1/tracks/3MTfE3Mo3lfw...,https://api.spotify.com/v1/audio-analysis/3MTf...,208296,4,rock
4,0.697,0.584,8,-6.429,0,0.0324,0.0616,2e-06,0.0935,0.96,107.083,audio_features,7s42bmqI0kQpoO80H3Vnqu,spotify:track:7s42bmqI0kQpoO80H3Vnqu,https://api.spotify.com/v1/tracks/7s42bmqI0kQp...,https://api.spotify.com/v1/audio-analysis/7s42...,170718,4,rock


In [8]:
#separating the independent variables from the dependent
X=musicdf[['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 'valence']]
y=musicdf.genre
%store X
%store y

Stored 'X' (DataFrame)
Stored 'y' (Series)


In [9]:
#standardizing the values to make better predictions
standardized_df=StandardScaler().fit_transform(X)
scaledmusicdf=pd.DataFrame(X,columns=X.columns)
scaledmusicdf.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0.00316,0.54,0.631,0.0,8,0.104,-4.569,0.0336,84.903,4,0.194
1,0.00188,0.544,0.886,0.0,0,0.324,-2.667,0.0381,104.09,4,0.567
2,0.000839,0.472,0.811,0.000143,8,0.373,-5.121,0.0612,164.991,4,0.637
3,0.0033,0.536,0.845,0.0,2,0.0465,-5.765,0.0615,159.953,4,0.6
4,0.0616,0.697,0.584,2e-06,8,0.0935,-6.429,0.0324,107.083,4,0.96


In [10]:
#split up of the training and testing data
X_train, X_test, y_train, y_test=train_test_split(scaledmusicdf, y, test_size=0.2, random_state=4)

In [11]:
#KNN: finding the best k value
acc=0
best_k=0
ypred=0
for i in range(1, 50):
    knn=KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat=knn.predict(X_test)
    knnacc=accuracy_score(y_test, yhat)
    if knnacc>acc:
        acc=knnacc
        best_k=i
        ypred=yhat
print("Best accuracy: {}\nBest K value: {}".format(acc, best_k))
knnpred=KNeighborsClassifier(n_neighbors=best_k)
knnpred.fit(X_train, y_train)
print(knnpred.score(X_test, y_test))
print(precision_recall_fscore_support(y_test, ypred, average='micro'))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test,ypred))

Best accuracy: 0.4492307692307692
Best K value: 4
0.4492307692307692
(0.4492307692307692, 0.4492307692307692, 0.4492307692307692, None)
[[38  3  0  0  0  0]
 [ 7 34  5  3  3  6]
 [ 1  6 32  7 10  5]
 [ 0  7 10 24  4  9]
 [ 0 13 16  8  4  6]
 [ 0 13 16 14  7 14]]
              precision    recall  f1-score   support

   classical       0.83      0.93      0.87        41
  electronic       0.45      0.59      0.51        58
     hip hop       0.41      0.52      0.46        61
       metal       0.43      0.44      0.44        54
         pop       0.14      0.09      0.11        47
        rock       0.35      0.22      0.27        64

    accuracy                           0.45       325
   macro avg       0.43      0.46      0.44       325
weighted avg       0.42      0.45      0.43       325



In [12]:
#random forest classification
forestpred=RandomForestClassifier(n_estimators=100, max_depth=6)
forestpred.fit(X_train, y_train)
print(forestpred.score(X_test, y_test))
randomforest=forestpred.predict(X_test)
print(confusion_matrix(y_test,randomforest))
print(classification_report(y_test, randomforest))

0.7230769230769231
[[38  3  0  0  0  0]
 [ 5 43  3  2  2  3]
 [ 0  0 53  0  8  0]
 [ 0  0  0 44  0 10]
 [ 1  1 11  0 29  5]
 [ 0  8  2 19  7 28]]
              precision    recall  f1-score   support

   classical       0.86      0.93      0.89        41
  electronic       0.78      0.74      0.76        58
     hip hop       0.77      0.87      0.82        61
       metal       0.68      0.81      0.74        54
         pop       0.63      0.62      0.62        47
        rock       0.61      0.44      0.51        64

    accuracy                           0.72       325
   macro avg       0.72      0.73      0.72       325
weighted avg       0.72      0.72      0.71       325



In [13]:
#logistic regression classification
logpred=LogisticRegression(solver='lbfgs', max_iter=10000)
logpred.fit(X_train, y_train)
logpredict=logpred.predict(X_test)
print(logpred.score(X_test, y_test))
print(confusion_matrix(y_test,logpredict))
print(classification_report(y_test, logpredict))

0.6738461538461539
[[39  2  0  0  0  0]
 [ 4 35  7  3  1  8]
 [ 0  1 52  0  6  2]
 [ 0  1  0 45  0  8]
 [ 1  1 10  1 21 13]
 [ 0  3  2 21 11 27]]
              precision    recall  f1-score   support

   classical       0.89      0.95      0.92        41
  electronic       0.81      0.60      0.69        58
     hip hop       0.73      0.85      0.79        61
       metal       0.64      0.83      0.73        54
         pop       0.54      0.45      0.49        47
        rock       0.47      0.42      0.44        64

    accuracy                           0.67       325
   macro avg       0.68      0.68      0.68       325
weighted avg       0.67      0.67      0.67       325



In [14]:
#SVM
svmpred=SVC(kernel='linear')
svmpred.fit(X_train, y_train)
svmpred.score(X_test, y_test)

0.6707692307692308

In [15]:
#neural networks
nnpred=MLPClassifier(hidden_layer_sizes=600)
nnpred.fit(X_train, y_train)
nnpred.score(X_test, y_test)

0.5938461538461538

In [16]:
#summary of the used classifiers with 10 instances
classifiers=[knnpred, forestpred, logpred, svmpred, nnpred]
modelresult=[]
for i in classifiers:
    modelresult.append(cross_val_score(i, X_train, y_train, scoring='accuracy', cv=10))
modeldf=pd.DataFrame(modelresult, columns=[x for x in range(1,11)], index=["KNN", "Random Forest", "Logistic", "SVM", "NN"])
modeldf["Mean"] = modeldf.mean(axis=1)
modeldf



Unnamed: 0,1,2,3,4,5,6,7,8,9,10,Mean
KNN,0.407692,0.469231,0.484615,0.338462,0.507692,0.530769,0.515385,0.438462,0.503876,0.395349,0.459153
Random Forest,0.676923,0.715385,0.715385,0.723077,0.807692,0.784615,0.730769,0.723077,0.751938,0.658915,0.728778
Logistic,0.684615,0.7,0.7,0.7,0.815385,0.753846,0.776923,0.746154,0.736434,0.674419,0.728778
SVM,0.676923,0.692308,0.676923,0.7,0.792308,0.746154,0.769231,0.761538,0.736434,0.666667,0.721849
NN,0.569231,0.653846,0.661538,0.646154,0.792308,0.738462,0.723077,0.7,0.728682,0.658915,0.687221


In [17]:
#hyperparameter tuning for knn
knn=KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
params={'leaf_size':[1,2,3,4,5], 'weights':['uniform','distance'], 'algorithm':['auto','ball_tree','brute'],'n_jobs':[-1]}
knnmodel=GridSearchCV(knn, param_grid=params,n_jobs=1)
knnmodel.fit(X_train, y_train)
knnbest=knnmodel.best_params_
print(knnbest)
print(f"Training best: {knnmodel.score(X_train, y_train)}\nTesting best: {knnmodel.score(X_test, y_test)}")

{'algorithm': 'auto', 'leaf_size': 1, 'n_jobs': -1, 'weights': 'uniform'}
Training best: 0.6302003081664098
Testing best: 0.4492307692307692


In [18]:
#hyperparameter tuning for random forest
forest=RandomForestClassifier()
params={'criterion':['gini', 'entropy'],'n_estimators':[5,10,15,20,25], 'min_samples_leaf':[1,2,3],
        'min_samples_split':[3,4,5,6,7],'random_state':[123],'n_jobs':[-1]}
forestmodel=GridSearchCV(forest,param_grid=params,n_jobs=-1)
forestmodel.fit(X_train,y_train)
forestbest=forestmodel.best_params_
print(forestbest)
print(f"Training best: {forestmodel.score(X_train, y_train)}\nTesting best: {forestmodel.score(X_test, y_test)}")

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 25, 'n_jobs': -1, 'random_state': 123}
Training best: 0.9815100154083205
Testing best: 0.7046153846153846


In [19]:
#hyperparameter tuning for logistic regression
logregression=LogisticRegression(max_iter=100000)
params={'C':np.logspace(-3,3,7), 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty':['l2']}
logmodel=GridSearchCV(logregression, param_grid=params, cv=3)
logmodel.fit(X_train, y_train)
logbest=logmodel.best_params_
print(logbest)
print(f"Training best: {logmodel.score(X_train, y_train)}\nTesting best: {logmodel.score(X_test, y_test)}")

{'C': 100.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Training best: 0.7542372881355932
Testing best: 0.6707692307692308


In [20]:
#hyperparameter tuning for SVM
svm=SVC()
params={'C':[0.001, 0.01, 0.1, 1, 10], 'gamma':[0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf']} #poly and sigmoid take too long
svmmodel=GridSearchCV(svm, param_grid=params, refit=True, scoring='accuracy', verbose=10, cv=3, n_jobs=-1)
svmmodel.fit(X_train, y_train)
svmbest=svmmodel.best_params_
print(svmbest)
print(f"Training best: {svmmodel.score(X_train, y_train)}\nTesting best: {svmmodel.score(X_test, y_test)}")

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0698s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.0622s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.0min finished


{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
Training best: 0.7573189522342064
Testing best: 0.6953846153846154


In [21]:
#hyperparameter tuning for neural networks
nn=MLPClassifier()
params = {'alpha': 10.0**(-np.arange(1, 10)), 'hidden_layer_sizes':[200, 400, 600, 800, 1000], 
          'activation': ['relu', 'tanh', 'logistic']}
nnmodel=GridSearchCV(nn, param_grid=params, n_jobs=-1, scoring='accuracy', cv=3)
nnmodel.fit(X_train, y_train)
nnbest=nnmodel.best_params_
print(nnbest)
print(f"Training best: {nnmodel.score(X_train, y_train)}\nTesting best: {nnmodel.score(X_test, y_test)}")

{'activation': 'tanh', 'alpha': 1e-06, 'hidden_layer_sizes': 400}
Training best: 0.7110939907550077
Testing best: 0.6369230769230769


In [22]:
#summary of the best versions of each classifier for 10 instances
knnfinal=KNeighborsClassifier(leaf_size=knnbest['leaf_size'], n_neighbors=21, algorithm=knnbest['algorithm'], 
                              weights=knnbest['weights'])
forestfinal=RandomForestClassifier(criterion=forestbest['criterion'], min_samples_split=forestbest['min_samples_split'], 
                                   min_samples_leaf=forestbest['min_samples_leaf'], n_estimators=forestbest['n_estimators'],
                                   random_state=0)
logfinal=LogisticRegression(max_iter=100000, C=logbest['C'], solver=logbest['solver'])
svmfinal=SVC(kernel=svmbest['kernel'], gamma=svmbest['gamma'], C=svmbest['C'])
nnfinal=MLPClassifier(activation=nnbest['activation'], alpha=nnbest['alpha'], hidden_layer_sizes=nnbest['hidden_layer_sizes'])


classifiers=[knnfinal, forestfinal, logfinal, svmfinal, nnfinal]
results=[]
for i in classifiers:
    results.append(cross_val_score(i, X_train, y_train, scoring='accuracy', cv=3))
finaldf=pd.DataFrame(results, columns=[x for x in range(1,4)], index=["KNN", "Random Forest", "Logistic", "SVM", "NN"])
finaldf["Mean"] = finaldf.mean(axis=1)
finaldf
#Random Forest is the best, so let's use it for predictions
#future scope would involve boosting



Unnamed: 0,1,2,3,Mean
KNN,0.468822,0.438799,0.479167,0.462263
Random Forest,0.706697,0.766744,0.738426,0.737289
Logistic,0.683603,0.775982,0.731481,0.730355
SVM,0.681293,0.769053,0.740741,0.730362
NN,0.623557,0.736721,0.696759,0.685679


In [23]:
#insert playlist id
trial=playlistdataframe('3aFp6OcR1rukcTX0A8ppxe',0)

In [24]:
#extracting categories from the mega dataframe
categories=pd.factorize(musicdf['genre'])[1]
converter=OrderedDict()
for i in range(len(categories)):
    converter[i]=categories[i]

In [25]:
#factorizing the genre column; making it numerical for categorization
genres=pd.factorize(y)[0]
forestfinal.fit(X, genres)

RandomForestClassifier(criterion='entropy', min_samples_split=6,
                       n_estimators=25, random_state=0)

In [26]:
#predicting genres for the given features of the playlist's songs
finalX=trial[['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 'valence']]
predictions=forestfinal.predict(finalX)

In [27]:
#probabilities of the predictions
predprob=forestfinal.predict_proba(finalX)
print(predprob)

[[0.44622222 0.00666667 0.026      0.         0.         0.52111111]
 [0.19188889 0.47825397 0.12634921 0.         0.19850794 0.005     ]
 [0.29704762 0.         0.         0.         0.01       0.69295238]
 [0.59073016 0.         0.21866667 0.         0.096      0.09460317]
 [0.21466667 0.27548718 0.01942857 0.         0.43041758 0.06      ]
 [0.50371429 0.         0.         0.         0.         0.49628571]
 [0.         0.35488889 0.024      0.         0.62111111 0.        ]
 [0.42811111 0.         0.06666667 0.         0.10698413 0.3982381 ]
 [0.20409524 0.03555556 0.         0.         0.016      0.74434921]
 [0.30666667 0.         0.         0.         0.         0.69333333]
 [0.57666667 0.         0.02666667 0.         0.02       0.37666667]
 [0.22466667 0.15333333 0.27704762 0.         0.33828571 0.00666667]
 [0.34185714 0.         0.05555556 0.         0.         0.6025873 ]
 [0.08904762 0.55709524 0.024      0.         0.32985714 0.        ]
 [0.21047619 0.03       0.65819048

In [28]:
#converting the numerical predictions into categorical
predicted_genres = [converter[prediction] for prediction in predictions]
trial['genre']=predicted_genres

In [29]:
r=requests.get('https://api.spotify.com/v1/playlists/3aFp6OcR1rukcTX0A8ppxe/tracks', headers=headers)
details=r.json()
songnames=[]
for i in range(len(details['items'])):
    songdetails=details['items'][i]['track']['name']
    songnames.append(songdetails)
print(songnames)

['Stranded', 'Tequila Shots', '¡Viva La Gloria!', 'All I Wanted', 'Famous', 'Days Go By', 'It Ain’t Me (with Selena Gomez)', 'Cautious', 'Gods & Machines', 'Whiskey In The Jar', 'When You Know', 'Loner', 'Plush - 2017 Remaster', 'No Sleep Till Brooklyn', 'Sweater Weather', 'Heart-Shaped Box', 'Errbody', 'The Other Half Of Me', 'Futsal Shuffle 2020 - Bonus Track', 'Symphony No.25 in G minor, K.183: 1. Allegro con brio']


In [30]:
finalpred=pd.DataFrame()
finalpred['Name']=songnames
finalpred['genre']=trial['genre']
finalpred

Unnamed: 0,Name,genre
0,Stranded,metal
1,Tequila Shots,hip hop
2,¡Viva La Gloria!,metal
3,All I Wanted,rock
4,Famous,pop
5,Days Go By,rock
6,It Ain’t Me (with Selena Gomez),pop
7,Cautious,rock
8,Gods & Machines,metal
9,Whiskey In The Jar,metal
