LOADING LIBRARIES


In [40]:
import os
import numpy as np
import pandas as pd
import json
import seaborn as sns
import plotly.express as px 
import matplotlib.pyplot as plt
%matplotlib inline
import requests
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist

import warnings
warnings.filterwarnings("ignore")


LOADING DATASET


In [42]:
data=pd.read_csv('tracks.csv')
number_cols = ['valence', 'acousticness', 'danceability', 'energy',
 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'popularity', 'speechiness', 'tempo']
X=data[number_cols]
X.head()

Unnamed: 0,valence,acousticness,danceability,energy,instrumentalness,key,liveness,loudness,mode,popularity,speechiness,tempo
0,0.127,0.674,0.645,0.445,0.744,0,0.151,-13.338,1,6,0.451,104.851
1,0.655,0.797,0.695,0.263,0.0,0,0.148,-22.136,1,0,0.957,102.009
2,0.457,0.994,0.434,0.177,0.0218,1,0.212,-21.18,1,0,0.0512,130.418
3,0.397,0.995,0.321,0.0946,0.918,7,0.104,-27.961,1,0,0.0504,169.98
4,0.196,0.989,0.402,0.158,0.13,3,0.311,-16.9,0,0,0.039,103.22


CORRELATION VISUALIZATION

In [None]:
pd.plotting.scatter_matrix(data[['energy','valence','acousticness','danceability','tempo','instrumentalness','liveness','loudness','speechiness']],figsize=(20,12))

CORRELATION MATRIX

In [None]:
corre=data.corr()
corre.style.background_gradient()

PREPROCESSING OF DATA



In [43]:
# Standard Scaling of data to normalize it.
scaler=StandardScaler()
scaler_data=scaler.fit_transform(X) 

In [None]:
from sklearn.decomposition import PCA
pca=PCA()
pca.fit(scaler_data)
exp_variance=pca.explained_variance_ratio_
fig, ax = plt.subplots()
ax.bar(range(pca.n_components_), exp_variance)
ax.set_xlabel('Principal Component #')

In [None]:
import numpy as np
cum_exp_variance = np.cumsum(exp_variance)
fig, ax = plt.subplots()
ax.plot(cum_exp_variance)
ax.axhline(y=0.85, linestyle='--')

In [46]:
pca = PCA(n_components=6, random_state=10)
train_pca = pca.fit_transform(scaler_data)

In [14]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


In [None]:
wcss=[]

for i in range(1,11): 
     kmeans = KMeans(n_clusters=i, init ='k-means++', max_iter=1000,  n_init=10,random_state=0 )
     kmeans.fit(train_pca)
     wcss.append(kmeans.inertia_)

plt.plot(range(1,11),wcss)
plt.title('The Elbow Method Graph')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

KMeans Model

In [47]:
model=KMeans(n_clusters=5)
model.fit(train_pca)

KMeans(n_clusters=5)

In [48]:
data['label']=model.labels_

KMeans Clustering Visualization

In [None]:

pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])
song_embedding = pca_pipeline.fit_transform(X)
projection = pd.DataFrame(columns=['x', 'y'], data=song_embedding)
projection['title'] = data['name']
projection['cluster'] = data['label']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

IMPORTING DATA FROM SPOTIFY API


In [5]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [22]:
def auth_token(client_id,client_secret):
    url='https://accounts.spotify.com/api/token'
    auth_response = requests.post(url, {
        'grant_type': 'client_credentials',
        'client_id': client_id,
        'client_secret': client_secret,
    })

    auth_response=auth_response.json()
    access_token=auth_response['access_token']
    print(access_token)
    return access_token

In [23]:
client_id='12a744de578f414897c0e94a7f349ae9'
client_secret='afda3c39c0d44576a7af3bd5c3af0f35'
access_token=auth_token(client_id,client_secret)

BQA1SOxmuz8ZzJaH3FTqfNPxtnBOX3Cxq5lPDuhzbIguFijPK9kkqxJtgjTeulsCQm0o94UKfHcxBnkCvdcfEJ7aFAEjf4wtqgSqw0PyOJVodJhNBgg


In [24]:
def get_playlist(access_token,playlist_id):
    header={'Authorization':"Bearer {}".format(access_token)}
    url_base='https://api.spotify.com/v1/'
    playlist=requests.get(url_base+ 'playlists/' + playlist_id+ '/tracks',headers=header)
    playlist=playlist.json()
    return playlist

In [25]:
def get_track_features(track_id):
    header={'Authorization':"Bearer {}".format(access_token)}
    url_base='https://api.spotify.com/v1/'
    track_feature=requests.get(url_base+'audio-features/' + track_id, headers=header)
    track_feature=track_feature.json()
    return track_feature

In [26]:
def get_unique_track(track_id):
    header={'Authorization':"Bearer {}".format(access_token)}
    url_base='https://api.spotify.com/v1/'
    unique_track=requests.get(url_base+ 'tracks/' + track_id, headers=header)
    unique_track=unique_track.json()
    return unique_track

In [27]:
def get_unique_artist(artist_id):
    header= {'Authorization': "Bearer {}".format(access_token)}
    url_base = 'https://api.spotify.com/v1/'
 
    
    artist_feature = requests.get(url_base + 'artists/' + artist_id, headers=header)
    artist_feature = artist_feature.json()
    return artist_feature

In [28]:
def create_tracks_dataframe(playlist,df):
    for item in playlist['tracks']['items']:
        track_id= item['track']['id']
        track_name=item['track']['name']
        track_popularity=item['track']['popularity']
        track_explicit=item['track']['explicit']
        track=get_unique_track(track_id)
        track_features=get_track_features(track_id)

        artists=[]
        a_id=[]
        genres=[]
        popularity=[]
        for artist in track['artists']:
            artists.append(artist['name'])
            a_id.append(artist['id'])
        
        for artist in range(len(a_id)):
            artist=get_unique_artist(a_id[artist])
            popularity.append(artist['popularity'])
            genres.append(artist['genres'])

        album=track['album']['name']
        album_id=track['album']['id']

        df = df.append({'Song_id': track_id, 'Song': track_name, 'Artist': artists, 'Album': album, 'Album_id': album_id,'acousticness': track_features['acousticness'], 'danceability': track_features['danceability'], 
                        'energy': track_features['energy'], 'popularity': track_popularity, 'explicit': track_explicit, 
                        'key': track_features['key'],
                        'mode': track_features['mode'], 
                        'instrumentalness': track_features['instrumentalness'], 
                        'valence': track_features['valence'],'liveness': track_features['liveness'],'loudness': track_features['loudness'],'speechiness': track_features['speechiness'],'duration': track_features['duration_ms'],'tempo': track_features['tempo'],'genres':genres,'artist_pop': popularity}, ignore_index=True)
            
        
    return df
        


    

In [None]:
playlist_id='37i9dQZEVXbMDoHDwVN2tF?si=f0993d2406bc4ea4'
playlist=get_playlist(access_token,playlist_id)
df = pd.DataFrame(columns=['Song_id', 'Song', 'Artist', 'Album', 'Album_id','acousticness', 'danceability', 'energy', 'key', 'mode', 'instrumentalness', 'valence','tempo','duration','loudness','liveness','speechiness'])
tracks=create_tracks_dataframe(playlist,df)
tracks.head()


A NAIVE APPROACH FOR RECOMMENDATION

In [50]:
X=tracks[number_cols]
scaler_data=scaler.transform(X)
train_pca=pca.transform(scaler_data)
model.fit(train_pca)
tracks['label']=model.labels_

In [None]:
tracks.head()

In [None]:
tracks['label'].value_counts().plot(kind='bar')

In [53]:
tracks_label=tracks[tracks['label']==1]
tracks_label.sample(12)

Unnamed: 0,Song_id,Song,Artist,Album,Album_id,acousticness,danceability,energy,key,mode,...,tempo,duration,loudness,liveness,speechiness,artist_pop,explicit,genres,popularity,label
31,5Eax0qFko2dh7Rl2lYs3bx,Efecto,[Bad Bunny],Un Verano Sin Ti,3RQQmkQEvNCY4prGKE6oc5,0.141,0.801,0.475,7,0,...,98.047,213061,-8.797,0.0639,0.0516,[98],0.0,"[[reggaeton, trap latino]]",94.0,1
15,70UV1HmppYUxBI6yCev4d5,Superhero (Heroes & Villains) [with Future & C...,"[Metro Boomin, Future, Chris Brown]",HEROES & VILLAINS,4gR3h0hcpE1iJH0v5bVv78,0.136,0.715,0.587,5,0,...,116.68,182667,-5.387,0.201,0.21,"[84, 88, 87]",1.0,"[[hip hop, rap, trap], [atl hip hop, rap, sout...",77.0,1
18,1IHWl5LamUGEuP4ozKQSXZ,Tití Me Preguntó,[Bad Bunny],Un Verano Sin Ti,3RQQmkQEvNCY4prGKE6oc5,0.0993,0.65,0.715,5,0,...,106.672,243717,-5.198,0.126,0.253,[98],0.0,"[[reggaeton, trap latino]]",95.0,1
22,5IgjP7X4th6nMNDh4akUHb,Under The Influence,[Chris Brown],Indigo (Extended),3okhA6w5uau6ZNhnVpwVww,0.0635,0.733,0.69,9,0,...,116.992,184613,-5.529,0.105,0.0427,[87],1.0,"[[dance pop, pop, r&b]]",95.0,1
38,3k3NWokhRRkEPhCzPmV8TW,Ojitos Lindos,"[Bad Bunny, Bomba Estéreo]",Un Verano Sin Ti,3RQQmkQEvNCY4prGKE6oc5,0.08,0.647,0.686,3,0,...,79.928,258299,-5.745,0.528,0.0413,"[98, 76]",0.0,"[[reggaeton, trap latino], [cumbia, latin alte...",93.0,1
14,1bDbXMyjaUIooNwFE9wn0N,Rich Flex,"[Drake, 21 Savage]",Her Loss,5MS3MvWHJ3lOZPLiMxzOU6,0.0503,0.561,0.52,11,0,...,153.15,239360,-9.342,0.355,0.244,"[97, 92]",1.0,"[[canadian hip hop, canadian pop, hip hop, rap...",95.0,1
33,73vIOb4Q7YN6HeJTbscRx5,Miss You,"[Oliver Tree, Robin Schulz]",Miss You,32G4vFNwLJQjpzkOoGEUUo,0.0128,0.587,0.742,6,0,...,145.007,206000,-6.64,0.146,0.0529,"[82, 81]",1.0,"[[alternative hip hop], [dance pop, deep euro ...",94.0,1
11,4uUG5RXrOk84mYEfFvj3cK,I'm Good (Blue),"[David Guetta, Bebe Rexha]",I'm Good (Blue),7M842DMhYVALrXsw3ty7B3,0.00383,0.561,0.965,7,0,...,128.04,175238,-3.673,0.371,0.0343,"[88, 83]",1.0,"[[big room, dance pop, edm, pop, pop dance], [...",97.0,1
32,4DGrMHTVjxecZbYStawUK1,Too Many Nights (feat. Don Toliver & with Future),"[Metro Boomin, Future, Don Toliver]",HEROES & VILLAINS,4gR3h0hcpE1iJH0v5bVv78,0.148,0.683,0.713,7,0,...,88.008,199920,-4.375,0.113,0.0464,"[84, 88, 79]",1.0,"[[hip hop, rap, trap], [atl hip hop, rap, sout...",75.0,1
40,31De8hk4QgDsbS26w06h21,Umbrella (with 21 Savage & Young Nudy),"[Metro Boomin, 21 Savage, Young Nudy]",HEROES & VILLAINS,4gR3h0hcpE1iJH0v5bVv78,0.0923,0.825,0.78,11,0,...,111.004,222000,-3.705,0.361,0.142,"[84, 92, 71]",1.0,"[[hip hop, rap, trap], [atl hip hop, rap], [hi...",75.0,1


RECCOMENDATION THROUGH EUCLIDEAN DISTANCE


In [34]:
from sklearn.metrics import euclidean_distances
from scipy.spatial.distance import cdist
import difflib

In [54]:
def get_mean_vector(song_list, spotify_data):
    
    song_vectors = []
    
    for index,row in song_list.iterrows():
        song_vector = row[number_cols].values
        song_vectors.append(song_vector)  

    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)



In [55]:

def recommend_songs( song_list, spotify_data, n_songs=10):
    song_center = get_mean_vector(song_list, spotify_data)
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = cdist(scaled_song_center, scaled_data, 'cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    rec_songs = spotify_data.iloc[index]
    return rec_songs


In [56]:
rec_songs_distance=recommend_songs(tracks,data,10)
rec_songs_distance.head(10)

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,label
93816,1diS6nkxMQc3wwC4G1j0bh,We're Good,91,165507,0,['Dua Lipa'],['6M2wZ9GZgrQXHCFfjv46we'],2021-02-11,0.722,0.588,...,-5.932,1,0.0544,0.0319,0.0,0.183,0.59,134.01,4,4
93814,5YaskwnGDZFDRipaqzbwQx,Your Love (9PM),91,150053,0,"['ATB', 'Topic', 'A7S']","['7jZM5w05mGhw6wTB1okhD9', '0u6GtibW46tFX7koQ6...",2021-01-15,0.669,0.784,...,-5.603,1,0.112,0.194,6e-06,0.115,0.517,125.993,4,4
93804,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,98,132780,0,['Masked Wolf'],['1uU7g3DNSbsu0QjSEqZtEd'],2021-01-06,0.778,0.695,...,-6.865,0,0.0913,0.175,0.0,0.15,0.472,149.996,4,4
93173,1daDRI9ahBonbWD8YcxOIB,Angels Like You,82,196453,0,['Miley Cyrus'],['5YGY8feqx7naU7z4HrwZM6'],2020-11-27,0.672,0.642,...,-4.035,1,0.0313,0.0981,0.0,0.1,0.494,121.981,4,4
93213,7C1trmcQQ5n5RNy4l6ziCv,Some Say - Felix Jaehn Remix,81,186878,0,"['Nea', 'Felix Jaehn']","['7nqlScm2smydSRl13eaP8E', '4bL2B6hmLlMWnUEZno...",2020-01-10,0.682,0.7,...,-5.591,1,0.0397,0.406,0.0,0.174,0.637,120.03,4,4
88955,3a1lNhkSLSkpJE4MSHpDu9,Congratulations,83,220293,1,"['Post Malone', 'Quavo']","['246dkjvS1zLTtiykXe5h60', '0VRj0yCOv2FXJNP47X...",2016-12-09,0.63,0.804,...,-4.183,1,0.0363,0.215,0.0,0.253,0.492,123.146,4,4
92829,6ft4hAq6yde8jPZY2i5zLr,Paradise (feat. Dermot Kennedy),92,167903,0,"['MEDUZA', 'Dermot Kennedy']","['0xRXCcSX89eobfrshSVdyu', '5KNNVgR6LBIABRIomy...",2020-10-30,0.632,0.595,...,-7.644,0,0.0401,0.0689,0.0,0.209,0.435,124.114,4,4
93822,31qCy5ZaophVA81wtlwLc4,Anyone,90,190779,0,['Justin Bieber'],['1uNFoZAHBGtllmzznpCI3s'],2021-01-01,0.686,0.538,...,-8.026,1,0.0345,0.181,3e-06,0.113,0.584,115.884,4,4
93031,5ri4zqtWhG07hIuNNDWP76,Fly Away,84,178157,0,['Tones And I'],['2NjfBq1NflQcKSeiDooVjY'],2020-11-13,0.806,0.513,...,-6.208,1,0.0685,0.217,0.0,0.217,0.503,124.988,4,4
93019,5cpJFiNwYyWwFLH0V6B3N8,Del Mar,84,214507,1,"['Ozuna', 'Doja Cat', 'Sia']","['1i8SpTcr7yvPOmcqrbnVXY', '5cj0lLjcoR7YOSnhnX...",2020-09-04,0.759,0.636,...,-5.585,1,0.0369,0.0224,0.000143,0.166,0.536,109.976,4,4


RECOMMENDATIONS THROUGH COSINE SIMILARITY

In [77]:
from sklearn.metrics.pairwise import cosine_similarity

In [71]:

def generate_sumvector_nonplaylist(complete_data, playlist_df):
    complete_nonplaylist = complete_data[~complete_data['id'].isin(playlist_df['Song_id'].values)]
    summarize_vector = playlist_df[number_cols]
    return summarize_vector.sum(axis = 0), complete_nonplaylist


In [72]:
summarize_vector, complete_nonplaylist = generate_sumvector_nonplaylist(data,tracks)

In [80]:

def generate_recommendations(df,summarize_vector, nonplaylist_data):
    non_playlist_df = df[df['id'].isin(nonplaylist_data['id'].values)]
    non_playlist_df['sim'] = cosine_similarity(nonplaylist_data[number_cols].values, summarize_vector.values.reshape(1, -1))[:,0]
    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(40)
    
    return non_playlist_df_top_40

In [None]:
recommend = generate_recommendations(data,summarize_vector,complete_nonplaylist)
recommend.head(10)


ACCURACY:

One of the biggest problems with this model is that there are almost no metrics to evaluate whether the recommendation is good or bad. To compare the results of these different models we used a spotify playlist 'Top Retro' and asked for recommendations through thses models on the basis of features of this playlists songs. After analysis of results of these results we concluded that the results and recommendations are having the same label.  Here, the two methods are the content-based filtering approach (Cosine_Similarity) and the clustering method using KMeans. The discrepancy in the result could indicate two possibilities. The most probable reason is that all two models are well, both aren't performing well or either one of them is performing well. This could be because of the dataset size, lack of hyperparameter tuning, or model constraints. However, there is also a possibility that one of the models is performing the best, while the other one is not keeping up. Regardless of which, this shows the problem of not having a proper metric to train the model, resulting in a lack of technique to measure success that can help improve the model.It is hard to measure the success of your system without deploying and receiving feedback from the users. In terms of song recommendation, this can be the number of users adding recommended songs to their playlist. By looking at the metrics, we can perform A/B testing to see which model or parameters perform the best and update the model accordingly. 