## Preface: Import Statements

In [1]:
# Standard Library
import os
import warnings

# Spotify Authentication
from spotipy.oauth2 import SpotifyClientCredentials

# Data Extraction/Cleaning
import spotipy
import numpy as np
import pandas as pd

# Data Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()

# Feature Selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Structured Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score

import joblib

# Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Miscellaneous
warnings.filterwarnings('ignore')

## Chapter I: Spotify Authentication

In [2]:
# Spotify user id passed to the playlist lookups in the extraction cells
username = 'w8huf24wdkqfq3ijybse23ad8'

# API credentials are read from the environment so they never live in the notebook.
# Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here instead of failing later at request time.
# NOTE(review): the client secret that used to be hardcoded in this cell has been
# exposed and should be rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']

# Client-credentials flow: app-level access, no user login involved
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Chapter II: Data Extraction

In [3]:
# Create Pop/EDM Dataframe
sourcePlaylistID = 'spotify:playlist:3G5Efx1wXQ9sTgQ34vwB14'
sourcePlaylist = sp.user_playlist(username, sourcePlaylistID)
tracks = sourcePlaylist["tracks"]

# The playlist object only carries the first page of items; follow the paging
# links so that longer playlists are not silently truncated.
songs = list(tracks["items"])
while tracks["next"]:
    tracks = sp.next(tracks)
    songs.extend(tracks["items"])

track_ids = []
track_names = []
for song in songs:
    track = song['track']
    # Skip entries without a track object and local files (which have no id)
    if track is not None and track['id'] is not None:
        track_ids.append(track['id'])
        track_names.append(track['name'])

# Request audio features in batches instead of one HTTP call per track
AUDIO_FEATURES_BATCH = 100  # maximum number of ids accepted per request
features = []
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH):
    features.extend(sp.audio_features(track_ids[start:start + AUDIO_FEATURES_BATCH]))

# audio_features may contain None entries for tracks without analysis data;
# drop those while keeping track names and feature rows aligned.
named_features = [(name, feat) for name, feat in zip(track_names, features) if feat is not None]
popedm = pd.DataFrame(
    [feat for _, feat in named_features],
    index=[name for name, _ in named_features],
)

# Numeric class label for this playlist
popedm['genre'] = 1

popedm.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
Caught Up (feat. Khalid),0.678,0.438,9,-8.847,0,0.0577,0.29,0.0,0.0838,0.501,81.498,audio_features,1opARDDYaOeE1QUdwXmBGu,spotify:track:1opARDDYaOeE1QUdwXmBGu,https://api.spotify.com/v1/tracks/1opARDDYaOeE...,https://api.spotify.com/v1/audio-analysis/1opA...,247546,4,1
Easier,0.562,0.46,5,-4.173,1,0.259,0.476,0.0,0.107,0.623,176.055,audio_features,2bjUEg4jBtKBlPdNrTAppI,spotify:track:2bjUEg4jBtKBlPdNrTAppI,https://api.spotify.com/v1/tracks/2bjUEg4jBtKB...,https://api.spotify.com/v1/audio-analysis/2bjU...,158099,4,1
Moonlight,0.633,0.412,9,-7.339,1,0.0284,0.42,4e-06,0.048,0.214,102.215,audio_features,4TZdeM7zelZGz6JPrXSEZ1,spotify:track:4TZdeM7zelZGz6JPrXSEZ1,https://api.spotify.com/v1/tracks/4TZdeM7zelZG...,https://api.spotify.com/v1/audio-analysis/4TZd...,202360,3,1
Dangerous Woman,0.664,0.602,4,-5.369,0,0.0412,0.0529,0.0,0.356,0.289,134.049,audio_features,7l94dyN2hX9c6wWcZQuOGJ,spotify:track:7l94dyN2hX9c6wWcZQuOGJ,https://api.spotify.com/v1/tracks/7l94dyN2hX9c...,https://api.spotify.com/v1/audio-analysis/7l94...,235947,3,1
Be Alright,0.813,0.456,1,-7.667,0,0.0684,0.169,3e-06,0.105,0.587,108.801,audio_features,6f5TuB9WtbA1g49A4DcMQ4,spotify:track:6f5TuB9WtbA1g49A4DcMQ4,https://api.spotify.com/v1/tracks/6f5TuB9WtbA1...,https://api.spotify.com/v1/audio-analysis/6f5T...,179293,4,1


In [4]:
# Create Rock/Metal Dataframe
sourcePlaylistID = 'spotify:playlist:0JEVlIL7tHkHO1wIEqYImj'
sourcePlaylist = sp.user_playlist(username, sourcePlaylistID)
tracks = sourcePlaylist["tracks"]

# The playlist object only carries the first page of items; follow the paging
# links so that longer playlists are not silently truncated.
songs = list(tracks["items"])
while tracks["next"]:
    tracks = sp.next(tracks)
    songs.extend(tracks["items"])

track_ids = []
track_names = []
for song in songs:
    track = song['track']
    # Skip entries without a track object and local files (which have no id)
    if track is not None and track['id'] is not None:
        track_ids.append(track['id'])
        track_names.append(track['name'])

# Request audio features in batches instead of one HTTP call per track
AUDIO_FEATURES_BATCH = 100  # maximum number of ids accepted per request
features = []
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH):
    features.extend(sp.audio_features(track_ids[start:start + AUDIO_FEATURES_BATCH]))

# audio_features may contain None entries for tracks without analysis data;
# drop those while keeping track names and feature rows aligned.
named_features = [(name, feat) for name, feat in zip(track_names, features) if feat is not None]
rockmetal = pd.DataFrame(
    [feat for _, feat in named_features],
    index=[name for name, _ in named_features],
)

# Numeric class label for this playlist
rockmetal['genre'] = 2

rockmetal.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
On My Own,0.361,0.872,8,-4.312,1,0.0392,0.0117,0.0,0.318,0.578,176.025,audio_features,6HCkDai7cXHo85eW5mBXc7,spotify:track:6HCkDai7cXHo85eW5mBXc7,https://api.spotify.com/v1/tracks/6HCkDai7cXHo...,https://api.spotify.com/v1/audio-analysis/6HCk...,172960,4,2
End of Me,0.59,0.823,5,-4.606,0,0.0394,0.00187,0.0,0.375,0.571,129.969,audio_features,3tfTwukbzAVanIvsYeIEmj,spotify:track:3tfTwukbzAVanIvsYeIEmj,https://api.spotify.com/v1/tracks/3tfTwukbzAVa...,https://api.spotify.com/v1/audio-analysis/3tfT...,170427,4,2
White Rabbit,0.49,0.887,1,-5.363,0,0.0787,0.00176,0.0,0.0564,0.598,175.975,audio_features,4QhSscYz3TPLEwD6lMezvG,spotify:track:4QhSscYz3TPLEwD6lMezvG,https://api.spotify.com/v1/tracks/4QhSscYz3TPL...,https://api.spotify.com/v1/audio-analysis/4QhS...,217787,4,2
Unbreakable,0.458,0.849,3,-4.244,0,0.0404,0.00341,1e-06,0.161,0.389,89.916,audio_features,6M9vEm3Cy3PHr3QkXRX6x3,spotify:track:6M9vEm3Cy3PHr3QkXRX6x3,https://api.spotify.com/v1/tracks/6M9vEm3Cy3PH...,https://api.spotify.com/v1/audio-analysis/6M9v...,201160,4,2
What I've Done,0.623,0.93,5,-5.285,1,0.0324,0.0141,2e-06,0.138,0.287,120.119,audio_features,18lR4BzEs7e3qzc0KVkTpU,spotify:track:18lR4BzEs7e3qzc0KVkTpU,https://api.spotify.com/v1/tracks/18lR4BzEs7e3...,https://api.spotify.com/v1/audio-analysis/18lR...,205613,4,2


In [5]:
# Create Rap/R&B Dataframe
sourcePlaylistID = 'spotify:playlist:0IqWU97Hv3TjHnUb5RvecH'
sourcePlaylist = sp.user_playlist(username, sourcePlaylistID)
tracks = sourcePlaylist["tracks"]

# The playlist object only carries the first page of items; follow the paging
# links so that longer playlists are not silently truncated.
songs = list(tracks["items"])
while tracks["next"]:
    tracks = sp.next(tracks)
    songs.extend(tracks["items"])

track_ids = []
track_names = []
for song in songs:
    track = song['track']
    # Skip entries without a track object and local files (which have no id)
    if track is not None and track['id'] is not None:
        track_ids.append(track['id'])
        track_names.append(track['name'])

# Request audio features in batches instead of one HTTP call per track
AUDIO_FEATURES_BATCH = 100  # maximum number of ids accepted per request
features = []
for start in range(0, len(track_ids), AUDIO_FEATURES_BATCH):
    features.extend(sp.audio_features(track_ids[start:start + AUDIO_FEATURES_BATCH]))

# audio_features may contain None entries for tracks without analysis data;
# drop those while keeping track names and feature rows aligned.
named_features = [(name, feat) for name, feat in zip(track_names, features) if feat is not None]
raprb = pd.DataFrame(
    [feat for _, feat in named_features],
    index=[name for name, _ in named_features],
)

# Numeric class label for this playlist
raprb['genre'] = 3

raprb.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
Lemonade,0.799,0.66,1,-6.153,0,0.079,0.256,0.0,0.111,0.471,140.04,audio_features,02kDW379Yfd5PzW5A6vuGt,spotify:track:02kDW379Yfd5PzW5A6vuGt,https://api.spotify.com/v1/tracks/02kDW379Yfd5...,https://api.spotify.com/v1/audio-analysis/02kD...,195429,4,3
Laugh Now Cry Later (feat. Lil Durk),0.761,0.518,0,-8.871,1,0.134,0.244,3.5e-05,0.107,0.522,133.976,audio_features,2SAqBLGA283SUiwJ3xOUVI,spotify:track:2SAqBLGA283SUiwJ3xOUVI,https://api.spotify.com/v1/tracks/2SAqBLGA283S...,https://api.spotify.com/v1/audio-analysis/2SAq...,261493,4,3
For The Night (feat. Lil Baby & DaBaby),0.823,0.586,6,-6.606,0,0.2,0.114,0.0,0.193,0.347,125.971,audio_features,0PvFJmanyNQMseIFrU708S,spotify:track:0PvFJmanyNQMseIFrU708S,https://api.spotify.com/v1/tracks/0PvFJmanyNQM...,https://api.spotify.com/v1/audio-analysis/0PvF...,190476,4,3
DOLLAZ ON MY HEAD (feat. Young Thug),0.825,0.458,0,-7.47,1,0.0577,0.114,0.0,0.102,0.161,145.115,audio_features,3nS9a01VvXHQriLqJYwRqG,spotify:track:3nS9a01VvXHQriLqJYwRqG,https://api.spotify.com/v1/tracks/3nS9a01VvXHQ...,https://api.spotify.com/v1/audio-analysis/3nS9...,197760,4,3
FRANCHISE (feat. Young Thug & M.I.A.),0.835,0.699,8,-5.405,0,0.277,0.00671,0.0,0.195,0.547,154.981,audio_features,4jVBIpuOiMj1crqd8LoCrJ,spotify:track:4jVBIpuOiMj1crqd8LoCrJ,https://api.spotify.com/v1/tracks/4jVBIpuOiMj1...,https://api.spotify.com/v1/audio-analysis/4jVB...,202795,4,3


In [6]:
# Stack the three genre frames into a single frame. ignore_index=True replaces
# the track-name index with a fresh 0..n-1 integer index.
genre_frames = [popedm, rockmetal, raprb]
df = pd.concat(genre_frames, ignore_index=True)
df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,id,uri,track_href,analysis_url,duration_ms,time_signature,genre
0,0.678,0.438,9,-8.847,0,0.0577,0.29,0.0,0.0838,0.501,81.498,audio_features,1opARDDYaOeE1QUdwXmBGu,spotify:track:1opARDDYaOeE1QUdwXmBGu,https://api.spotify.com/v1/tracks/1opARDDYaOeE...,https://api.spotify.com/v1/audio-analysis/1opA...,247546,4,1
1,0.562,0.46,5,-4.173,1,0.259,0.476,0.0,0.107,0.623,176.055,audio_features,2bjUEg4jBtKBlPdNrTAppI,spotify:track:2bjUEg4jBtKBlPdNrTAppI,https://api.spotify.com/v1/tracks/2bjUEg4jBtKB...,https://api.spotify.com/v1/audio-analysis/2bjU...,158099,4,1
2,0.633,0.412,9,-7.339,1,0.0284,0.42,4e-06,0.048,0.214,102.215,audio_features,4TZdeM7zelZGz6JPrXSEZ1,spotify:track:4TZdeM7zelZGz6JPrXSEZ1,https://api.spotify.com/v1/tracks/4TZdeM7zelZG...,https://api.spotify.com/v1/audio-analysis/4TZd...,202360,3,1
3,0.664,0.602,4,-5.369,0,0.0412,0.0529,0.0,0.356,0.289,134.049,audio_features,7l94dyN2hX9c6wWcZQuOGJ,spotify:track:7l94dyN2hX9c6wWcZQuOGJ,https://api.spotify.com/v1/tracks/7l94dyN2hX9c...,https://api.spotify.com/v1/audio-analysis/7l94...,235947,3,1
4,0.813,0.456,1,-7.667,0,0.0684,0.169,3e-06,0.105,0.587,108.801,audio_features,6f5TuB9WtbA1g49A4DcMQ4,spotify:track:6f5TuB9WtbA1g49A4DcMQ4,https://api.spotify.com/v1/tracks/6f5TuB9WtbA1...,https://api.spotify.com/v1/audio-analysis/6f5T...,179293,4,1


## Chapter III: Data Cleaning

In [7]:
# Remove identifier/URL columns and the key, mode and time_signature columns,
# leaving the audio features, duration_ms and the genre label.
unused_columns = ['type', 'id', 'uri', 'track_href', 'analysis_url', 'time_signature', 'key', 'mode']
df = df.drop(unused_columns, axis=1)
df.head()

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,genre
0,0.678,0.438,-8.847,0.0577,0.29,0.0,0.0838,0.501,81.498,247546,1
1,0.562,0.46,-4.173,0.259,0.476,0.0,0.107,0.623,176.055,158099,1
2,0.633,0.412,-7.339,0.0284,0.42,4e-06,0.048,0.214,102.215,202360,1
3,0.664,0.602,-5.369,0.0412,0.0529,0.0,0.356,0.289,134.049,235947,1
4,0.813,0.456,-7.667,0.0684,0.169,3e-06,0.105,0.587,108.801,179293,1


In [None]:
# Pairwise plots of the remaining columns, coloured by genre label,
# to eyeball how the features relate to each other and to the classes
pair_plot = sns.pairplot(data=df, hue='genre')

## Chapter IV: Feature Selection

In [None]:
# Feature matrix (X) and label vector (y) as plain NumPy arrays
feature_columns = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
]
X = df[feature_columns].values
y = df['genre'].values

# 70/30 train/test split, stratified so each genre keeps its share in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=1
)

# Normalization: min-max scaler fitted on the training rows only,
# then applied to both parts
mms = MinMaxScaler()
mms.fit(X_train)
X_train_nrm = mms.transform(X_train)
X_test_nrm = mms.transform(X_test)

# Standardization: scaler fitted on the training rows only,
# then applied to both parts
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
# Rank features by random-forest importance on the standardized training set
rf = RandomForestClassifier(n_estimators=1000, random_state=0)
rf.fit(X_train_std, y_train)

feature_importance = rf.feature_importances_

# One-row table; the column labels are every df column except the last
# ('genre'), which relies on df's column order matching the order used for X.
best_features = pd.DataFrame(
    feature_importance[np.newaxis, :],
    index=["importance"],
    columns=df.columns[:-1],
)

# Shown as percentages
best_features * 100

## Chapter V: Model Selection

In [None]:
# Instantiate ML Models
log_reg = LogisticRegression(C=0.1)  # Logistic Regression
knn = KNeighborsClassifier(n_neighbors=5)  # K-Nearest Neighbors
svm = SVC(kernel='linear', C=0.1)  # Support Vector Machine
rf = RandomForestClassifier(n_estimators=1000, random_state=0)  # Random Forest
# Seeded like the random forest: without a random_state the network's initial
# weights differ on every run, so its scores would not be reproducible.
nn = MLPClassifier(random_state=0)  # Neural Network

In [None]:
# 10-fold cross-validated accuracy of every model on the standardized training set
classifiers = [log_reg, knn, svm, rf, nn]
std_model_scores = [
    cross_val_score(model, X_train_std, y_train, scoring='accuracy', cv=10)
    for model in classifiers
]

# One row per model, one column per fold, plus the per-model mean
fold_numbers = list(range(1, 11))
model_labels = ["LR", "KNN", "SVM", "Forest", "Neural Network"]
std_models_df = pd.DataFrame(std_model_scores, index=model_labels, columns=fold_numbers)
std_models_df["Mean"] = std_models_df.mean(axis=1)

# Shown as percentages
std_models_df * 100

In [None]:
# Box plot of the per-fold accuracies on standardized data, one box per model
fig, axes = plt.subplots(figsize=(18, 8))
bplot_models = axes.boxplot(std_model_scores, vert=True, patch_artist=True)

# Fill each box with its own colour
colors_d = ["#123863", "#215e7c", "#083242", "#091d33", "#3e3e3e"]
for box, fill in zip(bplot_models['boxes'], colors_d):
    box.set_facecolor(fill)

axes.set_title('Standardized Accuracy', fontsize=18)

# Boxes sit at positions 1..n; label them with the model abbreviations
axes.set_xticks(list(range(1, len(std_model_scores) + 1)))
axes.set_xticklabels(['LR', 'KNN', 'SVM', 'RF', 'NN'])

# Enlarge the tick labels on both axes
for label in [*axes.get_xticklabels(), *axes.get_yticklabels()]:
    label.set_fontsize(18)

In [None]:
# 10-fold cross-validated accuracy of every model on the min-max normalized training set
classifiers = [log_reg, knn, svm, rf, nn]
nrm_model_scores = [
    cross_val_score(model, X_train_nrm, y_train, scoring='accuracy', cv=10)
    for model in classifiers
]

# One row per model, one column per fold, plus the per-model mean
fold_numbers = list(range(1, 11))
model_labels = ["LR", "KNN", "SVM", "Forest", "Neural Network"]
nrm_models_df = pd.DataFrame(nrm_model_scores, index=model_labels, columns=fold_numbers)
nrm_models_df["Mean"] = nrm_models_df.mean(axis=1)

# Shown as percentages
nrm_models_df * 100

In [None]:
# Visually Represent Normalized Model Scores
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(18, 8))
bplot_models = axes.boxplot(nrm_model_scores, vert=True, patch_artist=True)

# Fill each box with its own colour
colors_d = ["#3c96d3", "#2A9D8F", "#E9C46A", "#F4A261", "#E76F51"]
for patch, color in zip(bplot_models['boxes'], colors_d):
    patch.set_facecolor(color)

# Titled to match the data being plotted, so this figure can be told apart
# from the 'Standardized Accuracy' box plot.
axes.set_title('Normalized Accuracy', fontsize=18)

# Boxes sit at positions 1..n; label them with the model abbreviations
plt.setp(axes, xticks=[i + 1 for i in range(len(nrm_model_scores))], xticklabels=['LR', 'KNN', 'SVM', 'RF', 'NN'])

# Enlarge the tick labels on both axes
for label in [*axes.get_xticklabels(), *axes.get_yticklabels()]:
    label.set_fontsize(18)

In [None]:
# Compare Standardized Train vs Test Accuracy for every model.
# Each entry: (printed heading, estimator, score line template).
std_evaluations = [
    ("Logistic Regression", log_reg, "Train Score: {} \nTest Score: {}"),
    ("K-Nearest Neighbors", knn, "Train score: {} \nTest score: {}"),
    ("Support Vector Machine", svm, "Train score: {} \nTest score: {}"),
    ("Random Forest", rf, "Train score: {} \nTest score: {}"),
    ("Neural Network", nn, "Train score: {} \nTest score: {}"),
]

for heading, model, template in std_evaluations:
    # Fit on the standardized training rows, then score both parts of the split
    model.fit(X_train_std, y_train)
    train_score = model.score(X_train_std, y_train)
    test_score = model.score(X_test_std, y_test)
    print(heading)
    print(template.format(train_score, test_score))
    print()

In [None]:
# Compare Normalized Train vs Test Accuracy for every model.
# This cell must use the min-max normalized matrices (X_train_nrm / X_test_nrm);
# fitting on the standardized ones would just repeat the standardized comparison.
# Each entry: (printed heading, estimator, score line template).
nrm_evaluations = [
    ("Logistic Regression", log_reg, "Train Score: {} \nTest Score: {}"),
    ("K-Nearest Neighbors", knn, "Train score: {} \nTest score: {}"),
    ("Support Vector Machine", svm, "Train score: {} \nTest score: {}"),
    ("Random Forest", rf, "Train score: {} \nTest score: {}"),
    ("Neural Network", nn, "Train score: {} \nTest score: {}"),
]

for heading, model, template in nrm_evaluations:
    # Fit on the normalized training rows, then score both parts of the split
    model.fit(X_train_nrm, y_train)
    train_score = model.score(X_train_nrm, y_train)
    test_score = model.score(X_test_nrm, y_test)
    print(heading)
    print(template.format(train_score, test_score))
    print()

## Chapter VI: Hyperparameter Tuning

## Chapter VII: Conclusion

In [None]:
# Standardize Entire X (this refits the shared scaler on all rows)
X_std = stdsc.fit_transform(X)

# Train Model On Entire Data Set.
# Fit on the standardized matrix, not raw X: KNN is distance based, and the
# unscaled duration_ms column (hundreds of thousands) would swamp the 0-1 features.
knn.fit(X_std, y)

# Save Model to Disk
# NOTE(review): only the classifier is persisted; anything that loads it must
# apply the same standardization before predicting. Consider saving the fitted
# scaler alongside the model.
joblib.dump(knn, 'final_model.sav')

# Load Model from Disk
finished_model = joblib.load('final_model.sav')

# Use Model as Before, scaling the inputs the same way as the training data.
# The test rows were part of the final fit above, so this figure checks the
# save/load round trip rather than estimating accuracy on unseen songs.
accuracy = finished_model.score(stdsc.transform(X_test), y_test)

# Convert to a percentage before rounding to avoid float artifacts such as 95.12000000000001
print("Final Test Accuracy: " + str(round(accuracy * 100, 2)) + "%")