## WebScraping III: revenge of the prototype

Following the prototype discussed in class, build an MVP in which the client inputs a song and the app checks whether that song is in the Billboard Hot 100 list. If it is, recommend another song from the Billboard Hot 100; if it isn't, recommend a random song (for now) from another website that publishes music charts.

In [48]:
import requests
from bs4 import BeautifulSoup

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
# Shared Spotify API client used by every function in this notebook.
# NOTE(review): the client id and secret are hard-coded in source, so anyone who can
# read this file can use them. Consider rotating them and loading them from the
# environment instead — TODO confirm with the credential owner.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id="d151bde53dbd4ba0a64a51af0beb8ba1",
                                                           client_secret="2c1899472adb4da58dfad53428c612fb"))
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances_argmin_min
pd.set_option('display.max_rows', None)


### Generate The Hot 100 Billboard Data Frame 

In [49]:
def billboard_100(url):
    """Scrape a Billboard chart page into a DataFrame of artists and songs.

    Parameters
    ----------
    url : str
        Address of the chart page to download.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``artist`` and ``song``, one row per chart entry in the
        order the entries appear in the page. If the page contains none of
        the expected ``<span>`` elements the frame is empty.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.Timeout
        If the server does not answer within the timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently parsing an error page into an empty chart.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    songs = [
        tag.get_text()
        for tag in soup.find_all("span", class_="chart-element__information__song")
    ]
    artists = [
        tag.get_text()
        for tag in soup.find_all("span", class_="chart-element__information__artist")
    ]

    song_df = pd.DataFrame(songs, columns=["song"])
    artist_df = pd.DataFrame(artists, columns=["artist"])
    # Column-wise concat pairs artist and song by position on the page.
    return pd.concat([artist_df, song_df], axis=1)

### Merge all Playlists into one DataFrame 

In [30]:
 def get_playlist_tracks_lupo(playlist_url):
     """Collect tracks and their audio features from several Spotify playlists.

     Parameters
     ----------
     playlist_url : iterable of str
         Playlist URLs (or ids/URIs) to read through the module-level ``sp`` client.

     Returns
     -------
     pandas.DataFrame
         One row per track with the columns ``song_ids``, ``song_name`` and
         ``artist`` (first listed artist) followed by the audio-feature fields
         returned by ``sp.audio_features``. The index runs from 0 across all
         playlists. An empty DataFrame is returned when nothing was collected.
     """
     frames = []
     for url in playlist_url:
         results = sp.playlist_tracks(url)
         items = list(results['items'])
         # Follow pagination until the API reports no further page.
         while results['next']:
             results = sp.next(results)
             items.extend(results['items'])

         # Entries without a track object or without an id cannot be looked up
         # for audio features, so they are skipped up front.
         tracks = [
             item['track']
             for item in items
             if item.get('track') and item['track'].get('id')
         ]
         song_ids = [track['id'] for track in tracks]

         # The audio-features endpoint is queried in batches of 100 ids.
         playlist_audio = []
         for start in range(0, len(song_ids), 100):
             playlist_audio.extend(sp.audio_features(song_ids[start:start + 100]))

         # Pair every track with its own feature record by position and drop
         # tracks for which no features came back (None entries), so rows
         # never shift against each other.
         rows = [
             {
                 'song_ids': track['id'],
                 'song_name': track['name'],
                 'artist': track['artists'][0]['name'],
                 **features,
             }
             for track, features in zip(tracks, playlist_audio)
             if features is not None
         ]
         if rows:
             frames.append(pd.DataFrame(rows))

     if not frames:
         return pd.DataFrame()
     # DataFrame.append was removed in pandas 2.0; concat once at the end instead.
     return pd.concat(frames, ignore_index=True)

# Playlist source: https://www.hypebot.com/hypebot/2020/11/20-most-popular-playlists-on-spotify.html


### Song recommender: closest song within the user's K-Means cluster

In [54]:
def song_recommender_closest():
    """Prompt for a song and recommend another one.

    If the entered title is on the Billboard Hot 100 (compared ignoring case
    and surrounding whitespace), a random *other* chart entry is recommended.
    Otherwise the song is looked up on Spotify, the playlist catalogue is
    clustered with KMeans on scaled audio features, and the catalogue song
    closest to the user's song inside the same cluster is recommended.

    Returns
    -------
    tuple of (str, pandas.DataFrame)
        When the song is on the chart: a label and a one-row sample of the chart.
    str
        Otherwise: a sentence naming the track Spotify matched and the
        recommended track, or an explanatory message when no recommendation
        can be made (empty input, no search hit, no audio features).

    Raises
    ------
    ValueError
        If the catalogue is too small to evaluate any cluster count.
    """
    song_name = input("Enter your favourite song: ")
    wanted = song_name.strip().casefold()
    if not wanted:
        return 'Please enter a song name.'

    billboard = billboard_100('https://www.billboard.com/charts/hot-100')
    chart_titles = billboard['song'].astype(str).str.strip().str.casefold()
    on_chart = chart_titles == wanted
    if on_chart.any():
        # Recommend a different chart entry whenever one exists.
        others = billboard[~on_chart]
        pool = billboard if others.empty else others
        return 'The following recommendation is from the billboard hot-100:', pool.sample()

    # Resolve the user's song on Spotify and fetch its audio features.
    results = sp.search(q='track:' + song_name, limit=1)
    hits = results['tracks']['items']
    if not hits:
        return f'No Spotify track found for "{song_name}".'
    user_track = hits[0]
    user_song_id = user_track['id']
    user_song_name = user_track['name']
    user_artists = [artist['name'] for artist in user_track['artists']]
    artist_names = ', '.join(user_artists)
    first_artist = user_artists[0] if user_artists else ''

    user_song_features = sp.audio_features(user_song_id)
    if not user_song_features or user_song_features[0] is None:
        return f'Spotify has no audio features for "{user_song_name}", so no recommendation can be made.'
    user_song_df = pd.DataFrame.from_dict(user_song_features)

    playlist_df = get_playlist_tracks_lupo(list(_PLAYLIST_COLLECTION))

    # Keep only the numeric audio features; rows with gaps cannot be scaled.
    X = playlist_df.drop(columns=_TRACK_INFO_COLUMNS + _NON_FEATURE_COLUMNS)
    complete = X.notna().all(axis=1)
    playlist_df = playlist_df[complete].reset_index(drop=True)
    X = X[complete].reset_index(drop=True)
    # Same columns in the same order as the matrix the scaler is fitted on.
    X_user = user_song_df[X.columns]

    scaler = StandardScaler()
    X_prep = scaler.fit_transform(X)
    X_prep_user = scaler.transform(X_user)

    # Reuse the very model whose silhouette score won, rather than refitting
    # with another seed and getting a clustering the score does not describe.
    kmeans = _fit_best_kmeans(X_prep, range(2, 20))
    clusters = kmeans.predict(X_prep)
    user_cluster = kmeans.predict(X_prep_user)[0]

    # Every catalogue row that is the user's own song: same Spotify id, or
    # same title by the same first artist (the song can sit in several playlists).
    same_id = playlist_df['song_ids'] == user_song_id
    same_title = playlist_df['song_name'].astype(str).str.casefold() == user_song_name.casefold()
    same_artist = playlist_df['artist'].astype(str).str.casefold() == first_artist.casefold()
    is_user_song = (same_id | (same_title & same_artist)).to_numpy()

    candidates = np.flatnonzero((clusters == user_cluster) & ~is_user_song)
    if candidates.size == 0:
        # The cluster holds nothing but the user's song; widen to the whole catalogue.
        candidates = np.flatnonzero(~is_user_song)
    if candidates.size == 0:
        return f'No other song is available to recommend for "{user_song_name}".'

    # `closest` indexes into `candidates`, which maps back to catalogue rows,
    # so the looked-up row is always the one the distance was computed for.
    closest, _ = pairwise_distances_argmin_min(X_prep_user, X_prep[candidates])
    match = playlist_df.iloc[candidates[closest[0]]]
    recommended_song = match['song_name']
    recommended_artist = match['artist']

    # Echo the track Spotify matched so the user can verify the search result.
    return f'User input: Artist: {artist_names} Track name: {user_song_name}. Model recommendation: {recommended_song} by {recommended_artist}'


def _fit_best_kmeans(X_prep, k_values):
    """Fit KMeans for each k and return the fitted model with the highest silhouette score."""
    best_model = None
    best_score = None
    for k in k_values:
        # Silhouette needs fewer clusters than samples.
        if k >= len(X_prep):
            break
        model = KMeans(n_clusters=k, random_state=42)
        model.fit(X_prep)
        score = silhouette_score(X_prep, model.predict(X_prep))
        # Strict comparison keeps the smallest k on ties.
        if best_score is None or score > best_score:
            best_model, best_score = model, score
    if best_model is None:
        raise ValueError('Not enough songs in the catalogue to evaluate any cluster count.')
    return best_model


# Identifying columns added by get_playlist_tracks_lupo; not used as features.
_TRACK_INFO_COLUMNS = ['song_ids', 'song_name', 'artist']

# Audio-feature fields that are identifiers/metadata, plus time_signature, excluded from clustering.
_NON_FEATURE_COLUMNS = ['id', 'type', 'uri', 'track_href', 'analysis_url', 'time_signature']

# Playlists that make up the recommendation catalogue.
_PLAYLIST_COLLECTION = (
    'https://open.spotify.com/playlist/37i9dQZF1DXcBWIGoYBM5M',
    "https://open.spotify.com/playlist/5ABHKGoOzxkaa28ttQV9sE",
    "https://open.spotify.com/playlist/37i9dQZEVXbMDoHDwVN2tF",
    "https://open.spotify.com/playlist/37i9dQZF1DX0XUsuxWHRQd",
    "https://open.spotify.com/playlist/37i9dQZF1DX10zKzsJ2jva",
    "https://open.spotify.com/playlist/37i9dQZF1DWY7IeIP1cdjF",
    "https://open.spotify.com/playlist/37i9dQZF1DWWMOmoXKqHTD",
    "https://open.spotify.com/playlist/37i9dQZF1DX4o1oenSJRJd",
    "https://open.spotify.com/playlist/37i9dQZF1DWXRqgorJj26U",
    "https://open.spotify.com/playlist/37i9dQZF1DX4UtSsGT1Sbe",
    "https://open.spotify.com/playlist/37i9dQZF1DX76Wlfdnj7AP",
    "https://open.spotify.com/playlist/37i9dQZF1DXbTxeAdrVG2l",
    "https://open.spotify.com/playlist/37i9dQZF1DX4WYpdgoIcn6",
    "https://open.spotify.com/playlist/37i9dQZF1DX4sWSpwq3LiO",
    "https://open.spotify.com/playlist/37i9dQZF1DX1lVhptIYRda",
    "https://open.spotify.com/playlist/37i9dQZF1DWY4xHQp97fN6",
    "https://open.spotify.com/playlist/37i9dQZF1DX3rxVfibe1L0",
    "https://open.spotify.com/playlist/37i9dQZF1DWSqmBTGDYngZ",
    "https://open.spotify.com/playlist/37i9dQZF1DX4dyzvuaRJ0n",
    "https://open.spotify.com/playlist/37i9dQZF1DXdSjVZQzv2tl",
    "https://open.spotify.com/playlist/37i9dQZF1DXdPec7aLTmlC",
    "https://open.spotify.com/playlist/37i9dQZF1DWWEJlAGA9gs0",
    "https://open.spotify.com/playlist/37i9dQZF1DXbITWG1ZJKYt",
    "https://open.spotify.com/playlist/5eYZGmjBvg3kpIUVpRCUhE",
    "https://open.spotify.com/playlist/37i9dQZF1DXaKIA8E7WcJj",
    "https://open.spotify.com/playlist/37i9dQZF1DWTJ7xPn4vNaz",
    "https://open.spotify.com/playlist/37i9dQZF1DX5Ejj0EkURtP",
    "https://open.spotify.com/playlist/37i9dQZF1DWSV3Tk4GO2fq",
    "https://open.spotify.com/playlist/37i9dQZF1DWWOaP4H0w5b0",
    "https://open.spotify.com/playlist/2PjVPkj4a9kBvQIXaZ6UUt",
    "https://open.spotify.com/playlist/6mtYuOxzl58vSGnEDtZ9uB",
    "https://open.spotify.com/playlist/37i9dQZF1DX8FwnYE6PRvL",
    # Entries that were already commented out in the notebook, kept for reference:
    # "https://open.spotify.com/playlist/3vxotOnOGDlZXyzJPLFnm2"
    # "https://open.spotify.com/playlist/3IZZM4l5jdMwrLeKegeQRH"
)
        
        


# Run the recommender; it prompts for a song title and the cell displays the returned value.
song_recommender_closest()

Enter your favourite song: Enter Sandman


'User input: Artist: Metallica Track name: Enter Sandman. Model recommendation: Sweet Home Alabama by Lynyrd Skynyrd'

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
import matplotlib.pyplot as plt


# Candidate cluster counts to evaluate with KMeans.
K = range(2, 20)

# Silhouette score for each k in K, appended in the same order by the loop further down.
silhouette = []

def get_playlist_tracks(playlist_url):
    """Return every item of one playlist, following pagination.

    The playlist is read through the module-level ``sp`` client. The returned
    list is the first page's ``items`` list, extended in place with the items
    of each following page.
    """
    page = sp.playlist_tracks(playlist_url)
    collected = page['items']

    while True:
        # Stop as soon as the current page reports no successor.
        if not page['next']:
            return collected
        page = sp.next(page)
        collected.extend(page['items'])

# Fetch one playlist and collect the Spotify ids of its tracks. Entries without
# a track object or id are skipped because they cannot be looked up.
tracks = get_playlist_tracks("https://open.spotify.com/playlist/5ABHKGoOzxkaa28ttQV9sE")
track_ids = [
    item['track']['id']
    for item in tracks
    if item.get('track') and item['track'].get('id')
]

# The audio-features endpoint takes a limited number of ids per request, so
# query in batches of 100 and drop entries for which no features came back.
playlist_audio = []
for start in range(0, len(track_ids), 100):
    playlist_audio.extend(sp.audio_features(track_ids[start:start + 100]))
playlist_audio = [features for features in playlist_audio if features is not None]

data = pd.DataFrame.from_dict(playlist_audio)
# Drop identifier/metadata columns so only numeric audio features are scaled.
X = data.drop(columns=['id', 'type', 'uri', 'track_href', 'analysis_url', 'time_signature'])
scaler = StandardScaler()
scaler.fit(X)
X_prep = scaler.transform(X)

# Record the silhouette score of a KMeans fit for every candidate k.
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_prep)
    silhouette.append(silhouette_score(X_prep, kmeans.predict(X_prep)))



# Plot the silhouette score against the number of clusters, one x tick per candidate k.
tick_positions = np.arange(min(K), max(K) + 1, 1.0)

plt.figure(figsize=(16, 8))
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.xticks(tick_positions)
plt.show()

In [None]:
# Display the silhouette scores collected for each candidate k.
silhouette

In [None]:
# Pick the k with the highest silhouette score. Indexing K directly keeps this
# correct if the start of the candidate range ever changes (no hard-coded offset).
k = K[silhouette.index(max(silhouette))]
k