In [None]:
#  For project clustering steps:
# 1- Load the Pandas data frame songs of the audio features (at least 1000 songs as different as possible).🎹 ----DONE
# 2- Standardize the data using Standardscaler ----DONE
# 3- Save the scaler for future use on the new user input song, using Pickle. (tomorrow)
# 4- Select the number of clusters k.
# 5- Adjust k means cluster.
# 6- Build the elbow graph to find the best k. (tomorrow)
# 7- Use the model with the best k to assign each observation in your data frame to its cluster number using model.predict (add cluster columns to the pandas data frame).
# 8- Save the model with the best k as your final model using pickle. (tomorrow)

# 📍 For the final product tasks and the full scenario:
# 1- Get the song name from the user as input ---------- DONE
# 2- Play the input song in the music embed player. ---------- DONE
# 3- Get the audio features for this song from the Spotify API using sp.audio_features(trackid). Be careful to keep only the audio features columns.---------- DONE
# 4- Load the StandardScaler using Pickle and use it to scale the REQUESTED_SONG.(tomorrow)
# 5- Use kmeans_model.predict(new scaled audio dataset for the REQUESTED_SONG) to predict the cluster (label) for the REQUESTED_SONG.
# 6- Return a random song from the same cluster that the REQUESTED_SONG belongs to from your data frame and suggest it to the user.
# 7- Play it using the built-in music player.
# 8*- Try to Streamlit? ☀️ (optional) 

In [None]:
from stackapi import StackAPI
from bs4 import BeautifulSoup
import requests as re

import pandas as pd
import numpy as np

from random import randint    
from datetime import datetime, timedelta
from time import sleep

import json
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pprint

client_id = os.environ.get("SPOTIPY_CLIENT_ID")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET")

sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

from plotly import express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt

# ***************************** PART I *****************************

## Loading our Songs Database

In [None]:
df = pd.read_csv('Songs_Database.csv', sep=';')
# df.dropna(subset=["id"], inplace=True)
df

In [None]:
data = df.drop(["id"], axis=1)
data.head()

## Performing Scaling on our Songs_Database

In [None]:
# Create an instance of StandardScaler
scaler = StandardScaler()

# Standardize the data using StandardScaler
X_normalized = scaler.fit_transform(data[["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness","liveness","valence","tempo","duration_ms"]])

# Print the original data
print("Original data:")
# print(data.head().to_frame())
display(data.head())

# Create a DataFrame with the normalized data
data_normalized = pd.DataFrame(X_normalized, columns=["danceability", "energy", "loudness", "speechiness", "acousticness", "instrumentalness","liveness","valence","tempo","duration_ms"])

# Print the scaled data
print("\nScaled data:")
data_normalized.head()

In [None]:
# Perform PCA on the normalized data
pca = PCA(n_components=10)
pca.fit(X_normalized)

# Get the contribution of each principal component to the data variance
# En utilisant pca.explained_variance_ratio_, on peut obtenir une idée de la contribution de chaque composante principale à la variance des données.
# Cela peut aider à décider combien de composantes principales utiliser pour réduire la dimensionnalité ou pour d'autres analyses.
pca.explained_variance_ratio_

# Get the contribution of each principal component to the data variance
explained_variance_ratio = pca.explained_variance_ratio_

# Print the contribution of each principal component
print("Explained variance ratio:")
print(explained_variance_ratio)

# Calculate the cumulative sum of explained variance ratio
# Déterminer combien de composantes principales tu souhaites conserver, en fonction du pourcentage de variance que tu souhaites préserver.
# Par exemple, si la somme cumulative atteint 0,95, cela signifie que les composantes principales retenues capturent 95 % de la variance totale des données.
np.cumsum(pca.explained_variance_ratio_)

# Calculate the cumulative sum of explained variance ratio
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)

# Print the cumulative sum of explained variance ratio
print("\nCumulative variance ratio:")
print(cumulative_variance_ratio)

## Save SCALER in file with PICKLE

In [None]:
import pickle

#Ici, nous ouvrons un fichier nommé "scaler.pkl" en mode d'écriture binaire ('wb'), puis nous utilisons pickle.dump() pour écrire l'objet scaler dans le fichier.
with open('scaler.pkl', 'wb') as fichier:
    pickle.dump(scaler, fichier)

In [None]:
# with open("Songs_Database.csv", "rb") as f: #don't forget the correct path
#      scaler_new = pickle.load(fichier) 

## Finding out the best number of Clusters to use

In [None]:
K = range(5, 30) #range of the clusters
inertia = []

for k in K: #for every data in set build a inertia
    print("Training a K-Means model with {} clusters! ".format(k))
    print()
    kmeans = KMeans(n_clusters=k,
                    random_state=1234)
    kmeans.fit(data)
    inertia.append(kmeans.inertia_)

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.xticks(np.arange(min(K), max(K)+1, 1.0)) #changing the scale
plt.title('Elbow Method showing the optimal k') 

## PCA and KMeans

In [None]:
# Transform the normalized data using the principal components
X_pca = pca.transform(X_normalized)

# Apply K-means clustering to the transformed data
kmeans = KMeans(n_clusters=14, n_init=12)
kmeans.fit(X_pca)
cluster_ids = kmeans.predict(X_pca)
cluster_distances = kmeans.transform(X_pca)

# Print the cluster IDs and distances
print("\nCluster IDs:")
print(cluster_ids)
print("\nCluster distances:")
print(cluster_distances)

## Assigning a cluster to each song

In [None]:
data_normalized['cluster'] = cluster_ids
data_normalized

In [None]:
data.head()

In [None]:
data_normalized.reset_index(drop=True, inplace=True)  # Restaurer l'index d'origine
column = df.loc[:, "id"]
data_normalized["id"] = column

data_normalized

## 8- Saving model with best K as final model using Pickle

In [None]:
meilleur_k = 14  # Remplacez par le meilleur k que vous avez sélectionné
kmeans_final = KMeans(n_clusters=meilleur_k)

kmeans_final.fit(X_pca)  # Remplacez X_pca par vos données normalisées

# Définir le nom du fichier pour sauvegarder le modèle
nom_fichier_modele = "k_means_final_model.pkl"

# Sauvegarder le modèle avec Pickle
with open(nom_fichier_modele, 'wb') as fichier:
    pickle.dump(kmeans_final, fichier)

In [None]:
#Obtenir les identifiants uniques de cluster dans cluster_ids,
#puis il itère sur ces identifiants pour tracer un nuage de points pour chaque cluster.

for c in np.unique(cluster_ids):
    plt.scatter(X_pca[cluster_ids == c, 0], X_pca[cluster_ids == c, 1], marker='.', s=50)

In [None]:
#Utilisation de fit_predict de KMeans pour effectuer à la fois l'ajustement du modèle K-means 
# et la prédiction des clusters pour les données normalisées X_normalized.
cluster_ids = KMeans(n_clusters=9, n_init=100).fit_predict(X_normalized)

for c in np.unique(cluster_ids):
    plt.scatter(X_normalized[cluster_ids == c, 0], X_normalized[cluster_ids == c, 1], marker='.', s=50)

plt.show()

In [None]:
#Utilisation de fit_predict de KMeans pour effectuer à la fois l'ajustement du modèle K-means 
# et la prédiction des clusters pour les données normalisées X_normalized.

cluster_ids = KMeans(n_clusters=14, n_init=100).fit_predict(X_normalized)

for c in np.unique(cluster_ids):
    plt.scatter(X_normalized[cluster_ids == c, 0], X_normalized[cluster_ids == c, 1], marker='.', s=50)

plt.show()

# ***************************** PART II *****************************

## Getting the song from user

In [279]:
x = str(input("Please enter the name of a song you'd like to listen to: "))

## Creating the JSON

In [280]:
results = sp.search(q= x, limit=1, market="FR")

json_results = json.dumps(results, ensure_ascii=True)
print(json_results)

{"tracks": {"href": "https://api.spotify.com/v1/search?query=Blackpink&type=track&market=FR&offset=0&limit=1", "items": [{"album": {"album_type": "album", "artists": [{"external_urls": {"spotify": "https://open.spotify.com/artist/41MozSoPIsD1dJM0CLPjZF"}, "href": "https://api.spotify.com/v1/artists/41MozSoPIsD1dJM0CLPjZF", "id": "41MozSoPIsD1dJM0CLPjZF", "name": "BLACKPINK", "type": "artist", "uri": "spotify:artist:41MozSoPIsD1dJM0CLPjZF"}], "external_urls": {"spotify": "https://open.spotify.com/album/7jaSNQUBJbvfbZHLNFrV7P"}, "href": "https://api.spotify.com/v1/albums/7jaSNQUBJbvfbZHLNFrV7P", "id": "7jaSNQUBJbvfbZHLNFrV7P", "images": [{"height": 640, "url": "https://i.scdn.co/image/ab67616d0000b2734aeaaeeb0755f1d8a8b51738", "width": 640}, {"height": 300, "url": "https://i.scdn.co/image/ab67616d00001e024aeaaeeb0755f1d8a8b51738", "width": 300}, {"height": 64, "url": "https://i.scdn.co/image/ab67616d000048514aeaaeeb0755f1d8a8b51738", "width": 64}], "is_playable": true, "name": "BORN PINK

## Displaying JSON

In [281]:
results['tracks']["items"][0].keys() # All the dictionary
pprint.pprint(results['tracks']['items'][0])

results['tracks']['items'][0]['id'] # Getting song ID

for item in results['tracks']['items']:
    print("The name of song is: '{}' and the id is: {}".format(item['name'],item["id"]))

{'album': {'album_type': 'album',
           'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/41MozSoPIsD1dJM0CLPjZF'},
                        'href': 'https://api.spotify.com/v1/artists/41MozSoPIsD1dJM0CLPjZF',
                        'id': '41MozSoPIsD1dJM0CLPjZF',
                        'name': 'BLACKPINK',
                        'type': 'artist',
                        'uri': 'spotify:artist:41MozSoPIsD1dJM0CLPjZF'}],
           'external_urls': {'spotify': 'https://open.spotify.com/album/7jaSNQUBJbvfbZHLNFrV7P'},
           'href': 'https://api.spotify.com/v1/albums/7jaSNQUBJbvfbZHLNFrV7P',
           'id': '7jaSNQUBJbvfbZHLNFrV7P',
           'images': [{'height': 640,
                       'url': 'https://i.scdn.co/image/ab67616d0000b2734aeaaeeb0755f1d8a8b51738',
                       'width': 640},
                      {'height': 300,
                       'url': 'https://i.scdn.co/image/ab67616d00001e024aeaaeeb0755f1d8a8b51738',
              

## Understanding the JSON

In [282]:
print("The json file has the following keys: ", list(results.keys())) # We can see that we only have tracks
print("The 'tracks' key has the following child keys: ", list(results["tracks"].keys())) # Let's check the values
print("The query we made is: ", results["tracks"]["href"]) # Query we have searched 
print("The song's info is contained in: ", results["tracks"]["items"]) # items (actual tracks)
print("The limit of the query we've made is: ", results["tracks"]["limit"]) # Limit we have chosen
print("The next page if any: ", results["tracks"]["next"]) # Link to the next page (next 50 tracks)
print("The starting webpage: ", results["tracks"]["offset"]) # Actual offset (starting point)
print("Starting webpage: ", results["tracks"]["previous"]) # Previous search
print("Total number of results: ", results["tracks"]["total"]) # Number of matches

The json file has the following keys:  ['tracks']
The 'tracks' key has the following child keys:  ['href', 'items', 'limit', 'next', 'offset', 'previous', 'total']
The query we made is:  https://api.spotify.com/v1/search?query=Blackpink&type=track&market=FR&offset=0&limit=1
The song's info is contained in:  [{'album': {'album_type': 'album', 'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/41MozSoPIsD1dJM0CLPjZF'}, 'href': 'https://api.spotify.com/v1/artists/41MozSoPIsD1dJM0CLPjZF', 'id': '41MozSoPIsD1dJM0CLPjZF', 'name': 'BLACKPINK', 'type': 'artist', 'uri': 'spotify:artist:41MozSoPIsD1dJM0CLPjZF'}], 'external_urls': {'spotify': 'https://open.spotify.com/album/7jaSNQUBJbvfbZHLNFrV7P'}, 'href': 'https://api.spotify.com/v1/albums/7jaSNQUBJbvfbZHLNFrV7P', 'id': '7jaSNQUBJbvfbZHLNFrV7P', 'images': [{'height': 640, 'url': 'https://i.scdn.co/image/ab67616d0000b2734aeaaeeb0755f1d8a8b51738', 'width': 640}, {'height': 300, 'url': 'https://i.scdn.co/image/ab67616d00001

## Other Infos

In [283]:
print("Track infos")
print(results["tracks"]["items"][0]["artists"]) # Track artists

# print("Track artists")
# print(results["tracks"]["items"][0]["artists"]["0"]["name"]) # Track artists



print("\nTrack ID")
print(results["tracks"]["items"][0]["id"]) # Track ID
print("\nTrack name")
print(results["tracks"]["items"][0]["name"]) # Track name
print("\nPopularity index")
print(results["tracks"]["items"][0]["popularity"]) # Popularity index
print("\nBasically ID")
print(results["tracks"]["items"][0]["uri"]) # Basically ID


Track infos
[{'external_urls': {'spotify': 'https://open.spotify.com/artist/41MozSoPIsD1dJM0CLPjZF'}, 'href': 'https://api.spotify.com/v1/artists/41MozSoPIsD1dJM0CLPjZF', 'id': '41MozSoPIsD1dJM0CLPjZF', 'name': 'BLACKPINK', 'type': 'artist', 'uri': 'spotify:artist:41MozSoPIsD1dJM0CLPjZF'}]

Track ID
0L8LOav65XwLjCLS11gNPD

Track name
Typa Girl

Popularity index
80

Basically ID
spotify:track:0L8LOav65XwLjCLS11gNPD


## Playing the requested Song

In [284]:
from IPython.display import IFrame

track_id = results["tracks"]["items"][0]["id"]

def play_song(track_id):
    return IFrame(src="https://open.spotify.com/embed/track/"+track_id+"?autoplay=1",
       width="320",
       height="80",
       frameborder="0",
       allowtransparency="true",
       allow="encrypted-media",
      )

In [285]:
play_song(track_id)

## Track ID of the Song

In [286]:
song_id = results["tracks"]["items"][0]["id"]
song_id

'0L8LOav65XwLjCLS11gNPD'

## Getting the audio features of the Requested Song

In [287]:
print(sp.audio_features(song_id))
sp.audio_features(song_id)

[{'danceability': 0.915, 'energy': 0.621, 'key': 7, 'loudness': -6.519, 'mode': 1, 'speechiness': 0.1, 'acousticness': 0.0745, 'instrumentalness': 0, 'liveness': 0.628, 'valence': 0.527, 'tempo': 131.984, 'type': 'audio_features', 'id': '0L8LOav65XwLjCLS11gNPD', 'uri': 'spotify:track:0L8LOav65XwLjCLS11gNPD', 'track_href': 'https://api.spotify.com/v1/tracks/0L8LOav65XwLjCLS11gNPD', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0L8LOav65XwLjCLS11gNPD', 'duration_ms': 179173, 'time_signature': 4}]


[{'danceability': 0.915,
  'energy': 0.621,
  'key': 7,
  'loudness': -6.519,
  'mode': 1,
  'speechiness': 0.1,
  'acousticness': 0.0745,
  'instrumentalness': 0,
  'liveness': 0.628,
  'valence': 0.527,
  'tempo': 131.984,
  'type': 'audio_features',
  'id': '0L8LOav65XwLjCLS11gNPD',
  'uri': 'spotify:track:0L8LOav65XwLjCLS11gNPD',
  'track_href': 'https://api.spotify.com/v1/tracks/0L8LOav65XwLjCLS11gNPD',
  'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0L8LOav65XwLjCLS11gNPD',
  'duration_ms': 179173,
  'time_signature': 4}]

## Extract the Audio Features of the Requested Song in a Dataframe

In [288]:
print(sp.audio_features(results["tracks"]["items"][0]["id"]))

[{'danceability': 0.915, 'energy': 0.621, 'key': 7, 'loudness': -6.519, 'mode': 1, 'speechiness': 0.1, 'acousticness': 0.0745, 'instrumentalness': 0, 'liveness': 0.628, 'valence': 0.527, 'tempo': 131.984, 'type': 'audio_features', 'id': '0L8LOav65XwLjCLS11gNPD', 'uri': 'spotify:track:0L8LOav65XwLjCLS11gNPD', 'track_href': 'https://api.spotify.com/v1/tracks/0L8LOav65XwLjCLS11gNPD', 'analysis_url': 'https://api.spotify.com/v1/audio-analysis/0L8LOav65XwLjCLS11gNPD', 'duration_ms': 179173, 'time_signature': 4}]


In [289]:
requested_song_audio_features = pd.DataFrame(sp.audio_features(song_id))
requested_song_audio_features = requested_song_audio_features[["danceability","energy","loudness","speechiness","acousticness","instrumentalness","liveness","valence","tempo","id","duration_ms"]]

requested_song_audio_features

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,duration_ms
0,0.915,0.621,-6.519,0.1,0.0745,0,0.628,0.527,131.984,0L8LOav65XwLjCLS11gNPD,179173


In [290]:
simple_requested_song_audio_features = requested_song_audio_features[["danceability","energy","loudness","speechiness","acousticness","instrumentalness","liveness","valence","tempo","duration_ms"]]

simple_requested_song_audio_features

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,0.915,0.621,-6.519,0.1,0.0745,0,0.628,0.527,131.984,179173


### 4- Load the StandardScaler using Pickle and use it to scale the REQUESTED_SONG.

#### Load

In [291]:
# Définir le nom du fichier contenant le scaler
nom_fichier_scaler = "scaler.pkl"

# Charger le scaler à partir du fichier
with open(nom_fichier_scaler, 'rb') as fichier:
    scaler = pickle.load(fichier)


#### Scaling the REQUESTED_SONG

In [292]:
print(scaler.get_feature_names_out())

['danceability' 'energy' 'loudness' 'speechiness' 'acousticness'
 'instrumentalness' 'liveness' 'valence' 'tempo' 'duration_ms']


In [293]:
requested_song_scaled = scaler.transform(simple_requested_song_audio_features)
requested_song_scaled

array([[ 1.61094903, -0.08451154,  0.2289375 , -0.26222197, -0.72678385,
        -0.36789564,  3.60782351,  0.18786315,  0.44514399, -0.68496209]])

### 5- Use kmeans_model.predict (on the new scaled audio dataset for the REQUESTED_SONG) to predict the cluster (label) for the REQUESTED_SONG.

In [296]:
requested_song_cluster_label = kmeans_final.predict(requested_song_scaled)
requested_song_cluster_label

array([8])

### 6- Return a random song from the same cluster that the REQUESTED_SONG belongs to from your data frame and suggest it to the user.


#### Return all the songs of the corresponding cluster

In [297]:
songs_same_cluster = data_normalized[data_normalized['cluster'] == requested_song_cluster_label[0]]
display(songs_same_cluster)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,cluster,id
44,0.517939,-1.108496,-0.814436,1.091370,0.274586,-0.364628,-0.440196,0.841665,-1.374761,-1.133638,8,6ScHuFHv4QemKuuy6T6oUz
52,0.170163,-1.114249,-0.660258,4.575616,0.022834,-0.367896,-0.579053,-1.352300,0.871343,-2.700993,8,0Zf6XBkEVhs8TRaMaOpo8v
53,0.340502,-1.073980,-1.475987,2.453317,0.233253,-0.367160,-0.614355,-0.694111,-1.482286,-0.004022,8,152lZdxL1OR0ZMW6KquMif
58,-0.007274,-0.654031,-0.942055,1.968698,2.480229,-0.252150,-0.393126,-0.351852,1.856690,-0.995614,8,4DaXff4O24VWggRbnX2D7h
69,1.248978,0.266405,0.156160,2.127453,-0.225160,-0.367872,-0.534336,-0.948611,-0.185662,0.832193,8,38spM0LahLUfQhOMAqA7AI
...,...,...,...,...,...,...,...,...,...,...,...,...
2096,0.879910,-0.090264,-0.042512,2.152519,1.108747,-0.367896,-0.173466,0.907484,0.360774,-2.027441,8,5qkZJKTLBjzGyk2GnNSmCB
2099,0.972177,-0.584999,0.082348,1.041237,1.394316,-0.367896,-0.621416,-0.215826,-0.372875,-2.206024,8,0ZWzd4bD0dzdOzXnZIWQth
2106,0.496646,-0.556235,0.309648,1.818299,1.800124,-0.367896,-0.502956,0.319501,0.667108,-1.016670,8,3ekihSUQzDswLXm5w9UtPw
2108,-1.135772,-0.878387,-1.249376,1.567634,-0.089890,-0.354415,-0.495111,-1.339136,-0.873186,-1.107071,8,0V3up5IYUXpkOsbJxUFQ8x


#### Pick a random song in the list

In [302]:
random_song = songs_same_cluster.sample(n=1)
display(random_song)

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,cluster,id
659,0.461159,-0.751828,-0.151161,1.032881,0.99978,-0.367884,-0.502956,-0.286033,-1.042,-0.37332,8,0EcQcdcbQeVJn9fknj44Be


In [299]:
random_song_id = random_song['id'].values[0]
print(random_song_id)

5Evht83kIur1NX0crj1b3N


In [300]:
play_song(random_song_id)

In [301]:
#  For project clustering steps:
# 1- Load the Pandas data frame songs of the audio features (at least 1000 songs as different as possible).🎹 
# 2- Standardize the data using Standardscaler
# 3- Save the scaler for future use on the new user input song, using Pickle. (tomorrow)
# 4- Select the number of clusters k.
# 5- Adjust k means cluster.
# 6- Build the elbow graph to find the best k. (tomorrow)
# 7- Use the model with the best k to assign each observation in your data frame to its cluster number using model.predict (add cluster columns to the pandas data frame).
# 8- Save the model with the best k as your final model using pickle. (tomorrow)

# 📍 For the final product tasks and the full scenario:
# 1- Get the song name from the user as input ---------- DONE
# 2- Play the input song in the music embed player. ---------- DONE
# 3- Get the audio features for this song from the Spotify API using sp.audio_features(trackid). Be careful to keep only the audio features columns.---------- DONE
# 4- Load the StandardScaler using Pickle and use it to scale the REQUESTED_SONG.
# 5- Use kmeans_model.predict(new scaled audio dataset for the REQUESTED_SONG) to predict the cluster (label) for the REQUESTED_SONG.
# 6- Return a random song from the same cluster that the REQUESTED_SONG belongs to from your data frame and suggest it to the user.
# 7- Play it using the built-in music player.
# 8*- Try to Streamlit? ☀️ (optional) 