# PCA USING Spotipy Web API

Files to be installed:

pip install spotipy

pip install plotly

pip install chart_studio

The implementation through the Spotipy Web API needs a unique client_id and client_secret IDs that can be found for free by creating a developer spotify account on the link below:
https://developer.spotify.com/documentation/web-api/quick-start/


In [1]:
import matplotlib.pyplot as plt
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import math
%matplotlib inline

In [2]:
from collections import defaultdict
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

In [3]:
# Need to follow the link on the reference and create a free developer account on spotify for the unique keys below
client_id = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"         #private
client_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxx"     #private

# Set up query with credentials
sp = spotipy.Spotify(
            client_credentials_manager=SpotifyClientCredentials(
            client_id=client_id,
            client_secret=client_secret))

url_pop = 'spotify:playlist:37i9dQZF1DXcBWIGoYBM5M'
url_rnb = 'spotify:playlist:37i9dQZF1DX0XUsuxWHRQd'

In [4]:
track = 'Levels' #input("Enter a song name:")
artist = 'Avicii' #input("Enter the artist:")

In [5]:
def find_song(track, artist):
    
    song_data = defaultdict()
    results = sp.search(q='artist:' + artist + ' track:' + track, type='track')
    
    if results['tracks']['items'] == []:
        return None
 
    track_result = results['tracks']['items'][0]
    
    track_id = track_result['id']
    name = track_result['name']
    artist = track_result['artists'][0]['name']
    album = track_result["album"]["name"]
    
    audio_features = sp.audio_features(track_id)[0]
    for key, value in audio_features.items():
        song_data[key] = value

    df_user = pd.DataFrame({"Song Name": [name], "Artist": artist, "Album": album, "ID": track_id})
    df_user_song = pd.DataFrame({"Danceability": [(song_data["danceability"])],
                                 "Mode": (song_data["mode"]),
                                 "Energy": (song_data["energy"]),
                                 "Key": (song_data["key"]),
                                 "Loudness": (song_data["loudness"]),
                                 "Speechiness": (song_data["speechiness"]),
                                 "Acousticness": (song_data["acousticness"]),
                                 "Instrumentalness": (song_data["instrumentalness"]),
                                 "Liveness": (song_data["liveness"]),
                                 "Valence": (song_data["valence"]),
                                 "Tempo": (song_data["tempo"]),
                                 "Duration (ms)": (song_data["duration_ms"]),
                                 "ID_CHECK": (song_data["id"]),
                                })   
    
    df_combined = df_user_song.join(df_user)
    
    return df_combined

In [6]:
song_stuff = find_song(track,artist)
song_stuff

Couldn't read cache at: .cache
Couldn't write token to cache at: .cache
Couldn't read cache at: .cache
Couldn't write token to cache at: .cache


Unnamed: 0,Danceability,Mode,Energy,Key,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration (ms),ID_CHECK,Song Name,Artist,Album,ID
0,0.602,0,0.834,1,-6.573,0.0358,0.0297,0.887,0.34,0.353,125.99,338867,6Kz3vnWQJUy57NhlP1TF51,Levels - Original Version,Avicii,Top 100 Wedding Reception Songs,6Kz3vnWQJUy57NhlP1TF51


In [7]:
def analyse_playlist(url):
    """Retrieves all songs from an identified playlist, and takes the ID, song, album, artist 
    and puts this all into a DataFrame"""

   # SONG NAMES

    offset = 0
    name = []

    while True:
        response = sp.playlist_tracks(url,
                                      offset=offset,
                                      fields=['items.track.name,total'])

        name.append(response["items"])
        offset = offset + len(response['items'])
        
        if len(response['items']) == 0:
            break

    name_list = [b["track"]["name"] for a in name for b in a]
    len(name_list)
    
    

    # ALBUM

    offset=0
    album = []

    while True:
        response = sp.playlist_tracks(url,
                                      offset=offset,
                                      fields=['items.track.album.name,total'])

        album.append(response["items"])
        offset = offset + len(response['items'])
        
        if len(response['items']) == 0:
            break

    album_list = [b["track"]["album"]["name"] for a in album for b in a]

    
    
   # ARTIST

    offset=0
    artist = []

    while True:
        response = sp.playlist_tracks(url,
                                      offset=offset,
                                      fields=['items.track.album.artists.name,total'])

        artist.append(response["items"])
        offset = offset + len(response['items'])
        
        if len(response['items']) == 0:
            break

    artist_list = [b["track"]["album"]["artists"][0]["name"] for a in artist for b in a]

    
    # ID
    
    offset = 0
    identifier = []

    while True:
        response = sp.playlist_tracks(url,
                                      offset=offset,
                                      fields=['items.track.id,total'])

        identifier.append(response["items"])
        offset = offset + len(response['items'])
        
        if len(response['items']) == 0:
            break

    identifier_list= [b["track"]["id"] for a in identifier for b in a]
    len(identifier_list)

    #Get audio features
    features = [sp.audio_features(identifier) for identifier in identifier_list]
    
    # Get each invidividual feature
    danceability = [(b["danceability"]) for a in features for b in a]    
    mode = [(b["mode"]) for a in features for b in a]
    energy = [(b["energy"]) for a in features for b in a]
    key = [(b["key"]) for a in features for b in a]        
    loudness = [(b["loudness"]) for a in features for b in a]       
    speechiness = [(b["speechiness"]) for a in features for b in a]
    acousticness = [(b["acousticness"]) for a in features for b in a]        
    instrumentalness = [(b["instrumentalness"]) for a in features for b in a] 
    liveness = [(b["liveness"]) for a in features for b in a]
    valence = [(b["valence"]) for a in features for b in a]        
    tempo = [(b["tempo"]) for a in features for b in a] 
    duration_ms = [(b["duration_ms"]) for a in features for b in a] 
    identifier_ = [(b["id"]) for a in features for b in a] 
    
    ## DataFrame (saved with current time)

    df = pd.DataFrame({"Song Name": name_list, "Artist": artist_list, "Album": album_list, "ID": identifier_list})
    df_2 = pd.DataFrame({"Danceability":danceability,
                         "Mode":mode,
                         "Energy":energy,
                         "Key":key,
                         "Loudness":loudness,
                         "Speechiness":speechiness,
                         "Acousticness":acousticness,
                         "Instrumentalness":instrumentalness,
                         "Liveness":liveness,
                         "Valence":valence,
                         "Tempo":tempo,
                         "Duration (ms)": duration_ms,
                         "ID_CHECK":identifier_
                               })

    df_combined = df_2.join(df)
    df_combined.to_excel("file" + str(count) + ".xlsx")
    
    # this adds user chosen song to the end
    df_combined = df_combined.append(song_stuff)
    df_combined.to_excel("newfile" + str(count) + ".xlsx")

    return df_combined.tail()

In [8]:
analyse_playlist(url_pop, "pop")

TypeError: analyse_playlist() takes 1 positional argument but 2 were given

In [None]:
analyse_playlist(url_rnb, "rnb")

## Data Visualization

### Histograms

In [None]:
def plot_figures(excel_file, colour):
    """Takes data from an excel file and plots histogram in subplots"""

    # Read excel file and put into DataFrame

    df = pd.read_excel(excel_file, index_col=0)

    # Initiate plot
    
    plt.figure(1,figsize=(30,25))
    
    # Create subplots for each column in the excel file

    plt.subplot(4,3,1)
    sns.distplot(df["Danceability"], color=colour)

    plt.subplot(4,3,2)
    sns.distplot(df["Energy"], color=colour)

    plt.subplot(4,3,3)
    sns.distplot(df["Tempo"], color=colour)

    plt.subplot(4,3,4)
    sns.distplot(df["Loudness"], color=colour)

    plt.subplot(4,3,5)
    sns.distplot(df["Speechiness"], color=colour)

    plt.subplot(4,3,6)
    sns.distplot(df["Acousticness"], color=colour)

    plt.subplot(4,3,7)
    sns.distplot(df["Instrumentalness"], color=colour)

    plt.subplot(4,3,8)
    sns.distplot(df["Liveness"], color=colour)

    plt.subplot(4,3,9)
    sns.distplot(df["Valence"], color=colour)

    plt.subplot(4,3,10)
    sns.distplot(df["Duration (ms)"], color=colour)

    plt.subplot(4,3,11)
    sns.distplot(df["Key"], color=colour)

    plt.subplot(4,3,12)
    sns.distplot(df["Mode"], color=colour)



In [None]:
plot_figures("newfilepop.xlsx", colour="darkcyan")

In [None]:
plot_figures("newfilernb.xlsx",colour="crimson")

### Radar Charts

In [None]:
def plot_radar_chart(excel_file):
    """Takes data from excel file and scales the values so that they all lie between zero and 1.
    Then plots a radar chart."""

    df = pd.read_excel(excel_file, index_col = 0)
    #or df = pd.read_excel(excel_file)
    
    # Get the specific columns
    radar = df.loc[:,"Danceability":"Duration (ms)"]

    # Create another df for our altered values (scale all values between 0 and 1)
    df_z = radar
    df_z["Key"] = (df["Key"] / df["Key"].max())
    df_z["Duration (ms)"] = (df["Duration (ms)"] / df["Duration (ms)"].max())
    df_z["Tempo"] = (df["Tempo"] / df["Tempo"].max())
    df_z["Loudness"] = (df["Loudness"] / df["Loudness"].min())

    # Prepare data for plotting with plotly.express
    labels = list(df_z.columns)

    values = df_z.mean().values

    df_radar = pd.DataFrame(dict(r=values, theta=labels))

    fig = px.line_polar(df_radar, r="r", theta="theta", line_close=True)

    fig.update_traces(fill="toself")
    fig.show()
    return df_radar

In [None]:
pop_radar = plot_radar_chart("newfilepop.xlsx")

In [None]:
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

In [None]:
rnb_radar = plot_radar_chart("newfilernb.xlsx")

In [None]:
# PCA Decomposition
def PCA(excel_file):

    from sklearn.decomposition import PCA

    df = pd.read_excel(excel_file, index_col=0)

    df_scaled = pd.DataFrame()

    for col in df.loc[:,"Danceability":"Duration (ms)"]:
        df_scaled[col] = (df[col] - df[col].mean() / df[col].std())

    df_scaled


    # Initialize PCA
    pca = PCA(n_components = len(df_scaled.columns))

    # Fit PCA
    pca_series = pca.fit_transform(df_scaled).T

    df_pca = pd.DataFrame({"PC1":pca_series[0], "PC2":pca_series[1]})
    print(df_pca)
    df_pca.to_excel("DF_PCA_" + f'{excel_file}')

    explained_variance = pca.explained_variance_ratio_

    print("\n explained variance = {}".format(explained_variance))

    eigenvector_loadings = pca.singular_values_
    print("\n eigenvector_loadings = {}".format(eigenvector_loadings))



In [None]:
PCA("newfilepop.xlsx")

In [None]:
PCA("newfilernb.xlsx")

# Scatterplot

In [None]:
fig = plt.figure(figsize=(15,15))
ax = fig.add_subplot(111)

ax.set_xlabel("First Principal Component", fontsize=15)
ax.set_ylabel("Second Principal Component", fontsize=15)
ax.set_title("Principal Components of both genres", fontsize=18)

pop_components = pd.read_excel("DF_PCA_newfilepop.xlsx", index_col=0)
rnb_components = pd.read_excel("DF_PCA_newfilernb.xlsx", index_col=0)

ax = plt.scatter(x = pop_components["PC1"], y= pop_components["PC2"], label="Pop", color="darkcyan")
ax = plt.scatter(x = rnb_components["PC1"][:1861], y= rnb_components["PC2"][:1861], label="R&B", color="crimson")
plt.xlim(-250000,400000)

plt.grid(True)
plt.legend(prop = {"size":18}, loc="lower right")

# RECOMMENDATION

## Perform KNN algorithm

In [None]:
# Label the class of each genre, pop = 0, rnb = 1

#AK - insert 'new' at beginning of file name to include user choice in dataset - ex. "newfilepop.xlsx"
labelled_pop = pd.read_excel("filepop.xlsx", index_col=0, usecols=[0,1,2,3,4,5,6,7,8,9,10,11])
labelled_rnb = pd.read_excel("filernb.xlsx", index_col=0, usecols=[0,1,2,3,4,5,6,7,8,9,10,11])
labelled_pop["Class"] = 0
labelled_rnb["Class"] = 1

# Combine to create a full dataframe
full_data = labelled_pop.append(labelled_rnb[:200], ignore_index=True) #1861

# Scale between 0 and 1
full_data["Key"] = (full_data["Key"] / full_data["Key"].max())
full_data["Tempo"] = (full_data["Tempo"] / full_data["Tempo"].max())
full_data["Loudness"] = (full_data["Loudness"] / full_data["Loudness"].min())

full_data_random = full_data.sample(frac=1)
full_data_random

### Training data

In [None]:
# Get x training data
x_train = full_data_random[:3000]

# Target values
y_train = x_train["Class"].values

# Remove target values from x_data
x_train = x_train.drop("Class", axis=1)
x_train

### Test Data

In [None]:
x_test = full_data_random[70:]

# Randomise the data
x_test = x_test.sample(frac=1)

y_test = x_test["Class"].values

x_test = x_test.drop("Class", axis=1)
x_test

# TEST KNN VALUES

In [None]:
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [None]:
# test knn values to show optimal accuracy
def test_accuracy(x_train, x_test, y_train, y_test):
    
    accuracy_scores = []
    
    for i in range(2,10):
        knn_i = KNeighborsClassifier(n_neighbors = i).fit(x_train, y_train)
        y = knn_i.predict(x_test)
        accuracy_scores.append((i,(metrics.accuracy_score(y_test, y))))
        
    df_accuracy_scores = pd.DataFrame(accuracy_scores).rename(columns = {0:'KNN Value',1:'Accuracy Score'})
    df_accuracy_scores = df_accuracy_scores.style.hide_index().set_properties(**{'text-align': 'center'})
    df_accuracy_scores = df_accuracy_scores.set_table_styles([dict(selector = 'th', props=[('text-align', 'center')])])
    
    return df_accuracy_scores

In [None]:
accuracy_results = test_accuracy(x_train, x_test, y_train, y_test)
accuracy_results

In [None]:
accuracy_results = accuracy_results.data
accuracy_results.plot(x='KNN Value',y='Accuracy Score')

In [None]:
accuracy_results = accuracy_results.sort_values('Accuracy Score', ascending=False)
optimal_knn = accuracy_results['KNN Value'].iloc[0]

print(" The optimal KNN value is ", optimal_knn)

### Fit the KNN model

In [None]:
knn = KNeighborsClassifier(n_neighbors=optimal_knn)

#cross_validate
cv_scores = cross_val_score(knn, x_train, y_train, cv=10)
knn.fit(x_train, y_train)

In [None]:
cv_scores_mean = np.mean(cv_scores)
print(cv_scores , "\n\n""mean =" ,"{:.2f}".format(cv_scores_mean))

In [None]:
predictions = knn.predict(x_test)
predictions

In [None]:
accuracy_score = knn.score(x_test, y_test)
print("Accuracy score = ""{:.2f}".format(accuracy_score))

In [None]:
from sklearn.metrics import confusion_matrix

confusion_matrix = confusion_matrix(y_test, predictions)
confusion_matrix

In [None]:
# song_prediction_data
song_pred_data = song_stuff.iloc[:, 0:11]
song_pred_data

In [None]:
song_pred_data["Key"] = (song_pred_data["Key"] / full_data["Key"].max())
song_pred_data["Tempo"] = (song_pred_data["Tempo"] / full_data["Tempo"].max())
song_pred_data["Loudness"] = (song_pred_data["Loudness"] / full_data["Loudness"].min())
song_pred_data

In [None]:
user_prediction = knn.predict(song_pred_data)

if user_prediction == 0:
    print(" The chosen song is closest to pop.")
    user_genre = 0
else:
    print(" The chosen song is closest to RnB.")
    user_genre = 1

In [None]:
# 0 = pop, 1 = rnb --- used for automatic recommendation route if we decide to do that
if user_genre == 0:
    og_excel_file = "filepop.xlsx"
    pl_components = pop_components
else:
    og_excel_file = "filernb.xlsx"
    pl_components = rnb_components

In [None]:
# ("filepop.xlsx" or "filernb.xlsx", pop_components or rnb_components, rec_amount)
def recommendation(original_excel_file, playlist_components, rec_amount):
    
    distances = []
    for i in range(len(playlist_components)-1):
        distances.append(math.dist(playlist_components.iloc[50],playlist_components.iloc[i]))
        
    rec_data = ['Song Name', 'Artist', 'Album']  
    
    distance_df = pd.DataFrame(distances)    
    
    playlist_info = pd.read_excel(original_excel_file, index_col=None, usecols=rec_data)
    playlist_info_df = pd.DataFrame(playlist_info)
    
    comb_df = playlist_info_df.join(distance_df)
    comb_df = comb_df.rename(columns = {0:'Distance'})
    
    
    recommendations = comb_df.sort_values('Distance').head(rec_amount)[rec_data].style.set_properties(**{'text-align': 'left'})
    recommendations = recommendations.hide_index()
    recommendations = recommendations.set_table_styles([dict(selector = 'th', props=[('text-align', 'left')])])
    
    return recommendations

In [None]:
# manual pop - Levels by Avicii
recommendation("filepop.xlsx", pop_components, 5)

In [None]:
# manual rnb - Levels by Avicii
recommendation("filernb.xlsx", rnb_components, 5)

In [None]:
# automatic based on user_genre prediction
recommendation(og_excel_file, pl_components, 5)

# Reference:

    https://towardsdatascience.com/using-k-nearest-neighbours-to-predict-the-genre-of-spotify-tracks-796bbbad619f
   
    https://developer.spotify.com/documentation/web-api/quick-start/