In [1]:
import requests
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from collections import OrderedDict

In [69]:
#getting the client id and secret
clientid=''#Enter your client ID here
clientsecret=''#Enter your client secret
authurl='https://accounts.spotify.com/api/token'
response=requests.post(authurl,{
    'grant_type':'client_credentials',
    'client_id':clientid,
    'client_secret':clientsecret,
})
authjson=response.json()
accesstoken=authjson['access_token']
headers={'Authorization':'Bearer {}'.format(accesstoken)}

In [3]:
#using the playlists' songs, we create dataframes with details of the songs, and an extra column, genre
def playlistdataframe(playlistid,genre):    
    base='https://api.spotify.com/v1/playlists/'
    r=requests.get(base+playlistid+'/tracks',headers=headers)
    playlist=r.json()
    ids=[]
    for song in playlist['items']:
        ids.append(song['track']['id'])
    playlistids=",".join(ids)
    r=requests.get("https://api.spotify.com/v1/audio-features/?ids={}".format(playlistids), headers=headers)
    details=r.json()
    df=pd.DataFrame(details['audio_features'])
    df['genre']=genre
    return df

In [4]:
rock1=playlistdataframe('37i9dQZF1DXcF6B6QPhFDv','rock')
rock2=playlistdataframe('37i9dQZF1DWWJOmJ7nRx0C','rock')
#rock3=playlistdataframe('37i9dQZF1DX82GYcclJ3Ug','rock')
rock4=playlistdataframe('37i9dQZF1DWXRqgorJj26U','rock')
hiphop1=playlistdataframe('37i9dQZF1DX0XUsuxWHRQd','hip hop')
hiphop2=playlistdataframe('37i9dQZF1DWVA1Gq4XHa6U','hip hop')
hiphop3=playlistdataframe('37i9dQZF1DX186v583rmzp','hip hop')
# hiphop4=playlistdataframe('37i9dQZF1DWT5MrZnPU1zD','hip hop')
#edm1=playlistdataframe('37i9dQZF1DX4dyzvuaRJ0n','electronic')
edm2=playlistdataframe('37i9dQZF1DXaXB8fQg7xif','electronic')
edm3=playlistdataframe('37i9dQZF1DWXLeA8Omikj7','electronic')
edm4=playlistdataframe('37i9dQZF1DX6VdMW310YC7','electronic')
class1=playlistdataframe('37i9dQZF1DWWEJlAGA9gs0','classical')
class2=playlistdataframe('37i9dQZF1DWV0gynK7G6pD','classical')
class3=playlistdataframe('37i9dQZF1DWVFeEut75IAL','classical')
pop1=playlistdataframe('37i9dQZF1DWUa8ZRTfalHk','pop')
pop2=playlistdataframe('37i9dQZF1DX5gQonLbZD9s','pop')
pop3=playlistdataframe('37i9dQZF1DX0s5kDXi1oC5','pop')
metal1=playlistdataframe('37i9dQZF1DWWOaP4H0w5b0','metal')
metal2=playlistdataframe('37i9dQZF1DXakaomPRkkDa','metal')
metal3=playlistdataframe('37i9dQZF1DX9qNs32fujYe','metal')

In [5]:
#concatenation to make a mega dataframe
musicdf=pd.concat([rock1,rock2,rock4,hiphop1,hiphop2,hiphop3,edm2,edm3,edm4,class1,class2,class3,pop1,pop2,pop3,metal1,metal2,metal3])
musicdf=musicdf.drop_duplicates(subset=['id'],keep='first')
musicdf.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
0,0.601,0.723,8,-4.364,0,0.0384,0.000347,0.0011,0.346,0.581,114.991,audio_features,6BsZx9FarJrLddTNu2k6pU,spotify:track:6BsZx9FarJrLddTNu2k6pU,https://api.spotify.com/v1/tracks/6BsZx9FarJrL...,https://api.spotify.com/v1/audio-analysis/6BsZ...,236933,4,rock
1,0.53,0.759,7,-7.067,1,0.0351,0.00984,0.0,0.319,0.502,131.999,audio_features,3NUmUIyzNLBp8bCFMH8Mif,spotify:track:3NUmUIyzNLBp8bCFMH8Mif,https://api.spotify.com/v1/tracks/3NUmUIyzNLBp...,https://api.spotify.com/v1/audio-analysis/3NUm...,253840,4,rock
2,0.384,0.628,1,-8.018,0,0.031,0.126,1e-05,0.314,0.394,161.438,audio_features,0HOvlNv2ClRN4Gb6Y40dxc,spotify:track:0HOvlNv2ClRN4Gb6Y40dxc,https://api.spotify.com/v1/tracks/0HOvlNv2ClRN...,https://api.spotify.com/v1/audio-analysis/0HOv...,202703,4,rock
3,0.394,0.833,0,-5.18,1,0.042,0.00863,0.811,0.109,0.362,145.082,audio_features,55meRTYBw8S5q7KF3DkjL7,spotify:track:55meRTYBw8S5q7KF3DkjL7,https://api.spotify.com/v1/tracks/55meRTYBw8S5...,https://api.spotify.com/v1/audio-analysis/55me...,250961,4,rock
4,0.471,0.9,8,-2.283,0,0.0832,0.0148,3e-06,0.082,0.362,91.318,audio_features,03Szk0skbXqllHkNCVZI9p,spotify:track:03Szk0skbXqllHkNCVZI9p,https://api.spotify.com/v1/tracks/03Szk0skbXql...,https://api.spotify.com/v1/audio-analysis/03Sz...,270215,4,rock


In [6]:
#separating the independent variables from the dependent
X=musicdf[['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 'valence']]
y=musicdf.genre
%store X
%store y

Stored 'X' (DataFrame)
Stored 'y' (Series)


In [7]:
#standardizing the values to make better predictions
standardized_df=StandardScaler().fit_transform(X)
scaledmusicdf=pd.DataFrame(X,columns=X.columns)
scaledmusicdf.head()

Unnamed: 0,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0.000347,0.601,0.723,0.0011,8,0.346,-4.364,0.0384,114.991,4,0.581
1,0.00984,0.53,0.759,0.0,7,0.319,-7.067,0.0351,131.999,4,0.502
2,0.126,0.384,0.628,1e-05,1,0.314,-8.018,0.031,161.438,4,0.394
3,0.00863,0.394,0.833,0.811,0,0.109,-5.18,0.042,145.082,4,0.362
4,0.0148,0.471,0.9,3e-06,8,0.082,-2.283,0.0832,91.318,4,0.362


In [8]:
#split up of the training and testing data
X_train, X_test, y_train, y_test=train_test_split(scaledmusicdf, y, test_size=0.2, random_state=4)

In [9]:
#KNN: finding the best k value
acc=0
best_k=0
ypred=0
for i in range(1, 50):
    knn=KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    yhat=knn.predict(X_test)
    knnacc=accuracy_score(y_test, yhat)
    if knnacc>acc:
        acc=knnacc
        best_k=i
        ypred=yhat
print("Best accuracy: {}\nBest K value: {}".format(acc, best_k))
knnpred=KNeighborsClassifier(n_neighbors=best_k)
knnpred.fit(X_train, y_train)
print(knnpred.score(X_test, y_test))
print(precision_recall_fscore_support(y_test, ypred, average='micro'))
print(confusion_matrix(y_test, ypred))
print(classification_report(y_test,ypred))

Best accuracy: 0.4794520547945205
Best K value: 19
0.4794520547945205
(0.4794520547945205, 0.4794520547945205, 0.4794520547945205, None)
[[37  5  0  1  0  0]
 [ 4 33  4  3  9  4]
 [ 1  3 22  6  4  2]
 [ 0  8  8 27  8  5]
 [ 1  9 13  7 15  1]
 [ 2 12  9 19  4  6]]
              precision    recall  f1-score   support

   classical       0.82      0.86      0.84        43
  electronic       0.47      0.58      0.52        57
     hip hop       0.39      0.58      0.47        38
       metal       0.43      0.48      0.45        56
         pop       0.38      0.33      0.35        46
        rock       0.33      0.12      0.17        52

    accuracy                           0.48       292
   macro avg       0.47      0.49      0.47       292
weighted avg       0.46      0.48      0.46       292



In [10]:
#random forest classification
forestpred=RandomForestClassifier(n_estimators=100, max_depth=6)
forestpred.fit(X_train, y_train)
print(forestpred.score(X_test, y_test))
randomforest=forestpred.predict(X_test)
print(confusion_matrix(y_test,randomforest))
print(classification_report(y_test, randomforest))

0.726027397260274
[[41  2  0  0  0  0]
 [ 3 41  4  2  5  2]
 [ 0  0 31  1  6  0]
 [ 0  0  0 48  0  8]
 [ 0  3  6  0 37  0]
 [ 0  7  0 20 11 14]]
              precision    recall  f1-score   support

   classical       0.93      0.95      0.94        43
  electronic       0.77      0.72      0.75        57
     hip hop       0.76      0.82      0.78        38
       metal       0.68      0.86      0.76        56
         pop       0.63      0.80      0.70        46
        rock       0.58      0.27      0.37        52

    accuracy                           0.73       292
   macro avg       0.72      0.74      0.72       292
weighted avg       0.72      0.73      0.71       292



In [11]:
#logistic regression classification
logpred=LogisticRegression(solver='lbfgs', max_iter=10000)
logpred.fit(X_train, y_train)
logpredict=logpred.predict(X_test)
print(logpred.score(X_test, y_test))
print(confusion_matrix(y_test,logpredict))
print(classification_report(y_test, logpredict))

0.7226027397260274
[[41  2  0  0  0  0]
 [ 4 34  6  6  6  1]
 [ 0  0 34  0  4  0]
 [ 0  0  0 47  0  9]
 [ 1  2  4  1 34  4]
 [ 0  5  2 21  3 21]]
              precision    recall  f1-score   support

   classical       0.89      0.95      0.92        43
  electronic       0.79      0.60      0.68        57
     hip hop       0.74      0.89      0.81        38
       metal       0.63      0.84      0.72        56
         pop       0.72      0.74      0.73        46
        rock       0.60      0.40      0.48        52

    accuracy                           0.72       292
   macro avg       0.73      0.74      0.72       292
weighted avg       0.72      0.72      0.71       292



In [12]:
#SVM
svmpred=SVC(kernel='linear')
svmpred.fit(X_train, y_train)
svmpred.score(X_test, y_test)

0.7465753424657534

In [13]:
#neural networks
nnpred=MLPClassifier(hidden_layer_sizes=600)
nnpred.fit(X_train, y_train)
nnpred.score(X_test, y_test)

0.660958904109589

In [14]:
#summary of the used classifiers with 10 instances
classifiers=[knnpred, forestpred, logpred, svmpred, nnpred]
modelresult=[]
for i in classifiers:
    modelresult.append(cross_val_score(i, X_train, y_train, scoring='accuracy', cv=10))
modeldf=pd.DataFrame(modelresult, columns=[x for x in range(1,11)], index=["KNN", "Random Forest", "Logistic", "SVM", "NN"])
modeldf["Mean"] = modeldf.mean(axis=1)
modeldf

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,Mean
KNN,0.487179,0.495726,0.504274,0.478632,0.529915,0.504274,0.525862,0.543103,0.534483,0.456897,0.506034
Random Forest,0.74359,0.709402,0.726496,0.726496,0.717949,0.786325,0.775862,0.732759,0.775862,0.758621,0.745336
Logistic,0.709402,0.717949,0.675214,0.675214,0.683761,0.700855,0.767241,0.732759,0.689655,0.75,0.710205
SVM,0.717949,0.683761,0.692308,0.700855,0.700855,0.717949,0.741379,0.724138,0.715517,0.767241,0.716195
NN,0.717949,0.57265,0.57265,0.666667,0.598291,0.615385,0.75,0.646552,0.706897,0.724138,0.657118


In [15]:
#hyperparameter tuning for knn
knn=KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
params={'leaf_size':[1,2,3,4,5], 'weights':['uniform','distance'], 'algorithm':['auto','ball_tree','brute'],'n_jobs':[-1]}
knnmodel=GridSearchCV(knn, param_grid=params,n_jobs=1)
knnmodel.fit(X_train, y_train)
knnbest=knnmodel.best_params_
print(knnbest)
print(f"Training best: {knnmodel.score(X_train, y_train)}\nTesting best: {knnmodel.score(X_test, y_test)}")

{'algorithm': 'auto', 'leaf_size': 1, 'n_jobs': -1, 'weights': 'uniform'}
Training best: 0.5574614065180102
Testing best: 0.4794520547945205


In [16]:
#hyperparameter tuning for random forest
forest=RandomForestClassifier()
params={'criterion':['gini', 'entropy'],'n_estimators':[5,10,15,20,25], 'min_samples_leaf':[1,2,3],
        'min_samples_split':[3,4,5,6,7],'random_state':[123],'n_jobs':[-1]}
forestmodel=GridSearchCV(forest,param_grid=params,n_jobs=-1)
forestmodel.fit(X_train,y_train)
forestbest=forestmodel.best_params_
print(forestbest)
print(f"Training best: {forestmodel.score(X_train, y_train)}\nTesting best: {forestmodel.score(X_test, y_test)}")

{'criterion': 'entropy', 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 25, 'n_jobs': -1, 'random_state': 123}
Training best: 0.9922813036020584
Testing best: 0.7397260273972602


In [17]:
#hyperparameter tuning for logistic regression
logregression=LogisticRegression(max_iter=100000)
params={'C':np.logspace(-3,3,7), 'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], 'penalty':['l2']}
logmodel=GridSearchCV(logregression, param_grid=params, cv=3)
logmodel.fit(X_train, y_train)
logbest=logmodel.best_params_
print(logbest)
print(f"Training best: {logmodel.score(X_train, y_train)}\nTesting best: {logmodel.score(X_test, y_test)}")

{'C': 100.0, 'penalty': 'l2', 'solver': 'newton-cg'}
Training best: 0.7469982847341338
Testing best: 0.7534246575342466


In [18]:
#hyperparameter tuning for SVM
svm=SVC()
params={'C':[0.001, 0.01, 0.1, 1, 10], 'gamma':[0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf']} #poly and sigmoid take too long
svmmodel=GridSearchCV(svm, param_grid=params, refit=True, scoring='accuracy', verbose=10, cv=3, n_jobs=-1)
svmmodel.fit(X_train, y_train)
svmbest=svmmodel.best_params_
print(svmbest)
print(f"Training best: {svmmodel.score(X_train, y_train)}\nTesting best: {svmmodel.score(X_test, y_test)}")

Fitting 3 folds for each of 40 candidates, totalling 120 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0578s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.2s
[Parallel(n_jobs=-1)]: Batch computation too slow (2.1207s.) Setting batch_size=1.
[Parallel(n_jobs=-1)]: Done  98 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:   39.8s finished


{'C': 10, 'gamma': 0.001, 'kernel': 'linear'}
Training best: 0.7521440823327615
Testing best: 0.7534246575342466


In [19]:
#hyperparameter tuning for neural networks
nn=MLPClassifier()
params = {'alpha': 10.0**(-np.arange(1, 10)), 'hidden_layer_sizes':[200, 400, 600, 800, 1000], 
          'activation': ['relu', 'tanh', 'logistic']}
nnmodel=GridSearchCV(nn, param_grid=params, n_jobs=-1, scoring='accuracy', cv=3)
nnmodel.fit(X_train, y_train)
nnbest=nnmodel.best_params_
print(nnbest)
print(f"Training best: {nnmodel.score(X_train, y_train)}\nTesting best: {nnmodel.score(X_test, y_test)}")

{'activation': 'tanh', 'alpha': 1e-06, 'hidden_layer_sizes': 600}
Training best: 0.7521440823327615
Testing best: 0.7226027397260274


In [20]:
#summary of the best versions of each classifier for 10 instances
knnfinal=KNeighborsClassifier(leaf_size=knnbest['leaf_size'], n_neighbors=21, algorithm=knnbest['algorithm'], 
                              weights=knnbest['weights'])
forestfinal=RandomForestClassifier(criterion=forestbest['criterion'], min_samples_split=forestbest['min_samples_split'], 
                                   min_samples_leaf=forestbest['min_samples_leaf'], n_estimators=forestbest['n_estimators'],
                                   random_state=0)
logfinal=LogisticRegression(max_iter=100000, C=logbest['C'], solver=logbest['solver'])
svmfinal=SVC(kernel=svmbest['kernel'], gamma=svmbest['gamma'], C=svmbest['C'])
nnfinal=MLPClassifier(activation=nnbest['activation'], alpha=nnbest['alpha'], hidden_layer_sizes=nnbest['hidden_layer_sizes'])


classifiers=[knnfinal, forestfinal, logfinal, svmfinal, nnfinal]
results=[]
for i in classifiers:
    results.append(cross_val_score(i, X_train, y_train, scoring='accuracy', cv=3))
finaldf=pd.DataFrame(results, columns=[x for x in range(1,4)], index=["KNN", "Random Forest", "Logistic", "SVM", "NN"])
finaldf["Mean"] = finaldf.mean(axis=1)
finaldf
#Random Forest is the best, so let's use it for predictions
#future scope would involve boosting



Unnamed: 0,1,2,3,Mean
KNN,0.503856,0.496144,0.489691,0.496564
Random Forest,0.714653,0.724936,0.744845,0.728145
Logistic,0.724936,0.70437,0.752577,0.727294
SVM,0.719794,0.717224,0.744845,0.727288
NN,0.709512,0.670951,0.726804,0.702422


In [21]:
#insert playlist id
trial=playlistdataframe('3aFp6OcR1rukcTX0A8ppxe',0)

In [22]:
#extracting categories from the mega dataframe
categories=pd.factorize(musicdf['genre'])[1]
converter=OrderedDict()
for i in range(len(categories)):
    converter[i]=categories[i]

In [38]:
#factorizing the genre column; making it numerical for categorization
genres=pd.factorize(y)[0]
forestfinal.fit(X, genres)

RandomForestClassifier(criterion='entropy', min_samples_split=4,
                       n_estimators=25, random_state=0)

In [39]:
#predicting genres for the given features of the playlist's songs
finalX=trial[['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'speechiness', 'tempo', 'time_signature', 'valence']]
predictions=forestfinal.predict(finalX)

In [40]:
#probabilities of the predictions
predprob=forestfinal.predict_proba(finalX)
print(predprob)

[[0.47       0.         0.         0.         0.08       0.45      ]
 [0.13666667 0.47       0.04       0.         0.35333333 0.        ]
 [0.30133333 0.         0.         0.         0.         0.69866667]
 [0.34866667 0.02       0.21666667 0.         0.37466667 0.04      ]
 [0.18       0.04       0.02       0.         0.76       0.        ]
 [0.51333333 0.         0.         0.         0.         0.48666667]
 [0.06       0.06666667 0.01333333 0.         0.84       0.02      ]
 [0.48333333 0.02       0.06666667 0.         0.09333333 0.33666667]
 [0.208      0.         0.04333333 0.         0.08       0.66866667]
 [0.27866667 0.         0.06       0.         0.         0.66133333]
 [0.52       0.         0.         0.         0.09333333 0.38666667]
 [0.062      0.03       0.41       0.         0.498      0.        ]
 [0.21333333 0.         0.         0.         0.02666667 0.76      ]
 [0.30666667 0.         0.03333333 0.         0.08       0.58      ]
 [0.08       0.238      0.072     

In [41]:
#converting the numerical predictions into categorical
predicted_genres = [converter[prediction] for prediction in predictions]
trial['genre']=predicted_genres

In [85]:
r=requests.get('https://api.spotify.com/v1/playlists/3aFp6OcR1rukcTX0A8ppxe/tracks', headers=headers)
details=r.json()
songnames=[]
for i in range(len(details['items'])):
    songdetails=details['items'][i]['track']['name']
    songnames.append(songdetails)
print(songnames)

['Stranded', 'Tequila Shots', '¡Viva La Gloria!', 'All I Wanted', 'Famous', 'Days Go By', 'It Ain’t Me (with Selena Gomez)', 'Cautious', 'Gods & Machines', 'Whiskey In The Jar', 'When You Know', 'Loner', 'Witchcraft', 'Plush - 2017 Remaster', 'No Sleep Till Brooklyn', 'Sweater Weather', 'Heart-Shaped Box', 'Errbody', 'The Other Half Of Me', 'Futsal Shuffle 2020 - Bonus Track']


In [114]:
finalpred=pd.DataFrame()
finalpred['Name']=songnames
finalpred['genre']=trial['genre']
finalpred

Unnamed: 0,Name,genre
0,Stranded,rock
1,Tequila Shots,hip hop
2,¡Viva La Gloria!,metal
3,All I Wanted,pop
4,Famous,pop
5,Days Go By,rock
6,It Ain’t Me (with Selena Gomez),pop
7,Cautious,rock
8,Gods & Machines,metal
9,Whiskey In The Jar,metal
