## Daily-updated Top 200 Global Spotify playlist

### 0. Import libraries

In [1]:
from dotenv import load_dotenv
import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

import logging
import pandas as pd
from fuzzywuzzy import fuzz
import re

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from chromedriver_config.chromedriver_config import chrome_options, user_agent_string_override_command

### 1. Custom functions

In [2]:
def scrape_billboard_global_200(driver: webdriver,
                                class_name: str,
                                load_time: int = 20,
                                amount: int = 200) -> list:
    """Collect the chart-row elements from the page currently loaded in ``driver``.

    Waits until elements with the given class are present, parses the page
    source with BeautifulSoup and returns every ``<div>`` carrying that class.
    The driver is always quit before returning, whether or not scraping
    succeeded, so it cannot be reused afterwards.

    Args:
        driver: Selenium driver that has already navigated to the chart page.
        class_name: CSS class identifying one chart row container.
        load_time: Timeout passed to ``WebDriverWait`` while waiting for the
            rows to appear.
        amount: Currently unused by this function; kept so existing callers
            keep working.

    Returns:
        The matching BeautifulSoup elements, or an empty list when scraping
        failed (the error is printed).
    """
    try:
        WebDriverWait(driver, load_time).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, class_name))
        )
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        return soup.find_all('div', class_=class_name)
    except Exception as e:
        print(f'Scraping Error: {e}')
        # Return an empty result rather than falling through to an implicit
        # None, so callers that take len() of the result fail their own
        # element-count check instead of raising an unrelated TypeError.
        return []
    finally:
        # Release the browser no matter how the scrape ended.
        driver.quit()
        

In [3]:
def filter_names_artists_pos(unfiltered_songs: list[str],
                             song_html_id: str,
                             artist_class: str,
                             pos_class: str,
                             df_columns: list[str],
                             pattern: str) -> pd.DataFrame:
    """Turn scraped chart-row elements into a position/title/artist table.

    For every element the chart position is read from the ``<span>`` with
    class ``pos_class``, the title from the ``<h3>`` with id ``song_html_id``
    and the artist from the ``<span>`` with class ``artist_class``. Only the
    part of the artist text before the first match of ``pattern`` is kept.

    Args:
        unfiltered_songs: Parsed elements exposing ``find(...)`` (one per row).
        song_html_id: ``id`` attribute of the title heading.
        artist_class: Class of the artist span.
        pos_class: Class of the position span.
        df_columns: Column names for the resulting frame, in the order
            position, title, artist.
        pattern: Regular expression marking where additional artists start.

    Returns:
        A DataFrame with one row per element.
    """
    def _stripped_text(node, tag, **attrs):
        # Text of the first matching descendant, without surrounding whitespace.
        return node.find(tag, **attrs).text.strip()

    def _to_record(node):
        rank = int(_stripped_text(node, 'span', class_=pos_class))
        song_title = _stripped_text(node, 'h3', id=song_html_id)
        performers = _stripped_text(node, 'span', class_=artist_class)
        # Keep only what precedes the first separator match.
        lead_artist = re.split(pattern, performers, maxsplit=1)[0]
        return [rank, song_title, lead_artist]

    records = [_to_record(node) for node in unfiltered_songs]
    return pd.DataFrame(data=records, columns=df_columns)

In [4]:
def vaildate_top_200_df(top_200_df: pd.DataFrame, required_columns: list[str]) -> str:
    """Run the quality checks on the chart table and return ``'OK'`` if all pass.

    Checks, in order: required columns exist, there are exactly 200 non-null
    positions, positions are numeric, no value is missing, positions are
    sorted and unique, titles are unique, and no record or song is duplicated.

    Args:
        top_200_df: Table with at least the columns ``pos``, ``title`` and
            ``artist``.
        required_columns: Column names that must be present.

    Returns:
        The string ``'OK'`` when every check passes.

    Raises:
        AssertionError: On the first failed check, with a message naming it.
    """
    def _require(condition, message: str) -> None:
        # Raised explicitly instead of via ``assert`` so the checks are still
        # enforced when Python runs with -O (which strips assert statements).
        if not condition:
            raise AssertionError(message)

    # Checked first so a missing column yields this message rather than a
    # KeyError from the column lookups below.
    _require(all(col in top_200_df.columns for col in required_columns), "Missing required columns.")
    _require(top_200_df['pos'].count() == 200, 'Number of records is not 200.')
    # errors='coerce' maps unparsable values to NaN so the check below reports
    # them; errors='raise' would throw a ValueError before the message is used.
    _require(pd.to_numeric(top_200_df['pos'], errors='coerce').notnull().all(), "'pos' has non-numeric values.")
    _require(top_200_df.notnull().all().all(), "There are missing values!")
    _require(top_200_df['pos'].is_monotonic_increasing, "'pos' is not sorted!")
    _require(top_200_df['pos'].is_unique, "'pos' is not unique!")
    _require(top_200_df['title'].is_unique, "'title' is not unique!")
    _require(not top_200_df[['pos', 'title', 'artist']].duplicated().any(), "A record is duplicated.")
    _require(not top_200_df[['title', 'artist']].duplicated().any(), "A song is duplicated.")

    return 'OK'

In [5]:
def search_for_track(row, sp: spotipy.Spotify):
    """Find the Spotify track URI that best matches one chart row.

    Searches Spotify for ``"<title> <artist>"`` (up to 5 tracks) and scores
    each returned track by the mean of two fuzzy token-set ratios: chart title
    vs. track name, and chart artist vs. the track's comma-joined artist names.

    Args:
        row: Mapping-like row providing ``'title'`` and ``'artist'``.
        sp: Spotify client used for the search.

    Returns:
        The URI of the highest-scoring track, or ``None`` when the search
        returned no tracks.
    """
    query = f"{row['title']} {row['artist']}"
    result = sp.search(q=query, type='track', limit=5)

    candidates = result['tracks']['items']
    if not candidates:
        return None

    def _match_score(track) -> float:
        # Average similarity of title and artist between chart row and track.
        spotify_artists = ", ".join(artist['name'] for artist in track['artists'])
        title_score = fuzz.token_set_ratio(row['title'], track['name'])
        artist_score = fuzz.token_set_ratio(row['artist'], spotify_artists)
        return (title_score + artist_score) / 2

    # max() keeps the first of several equally scored candidates, i.e. the one
    # listed earliest in the search response. Storing URIs in a dict keyed by
    # score would let a later candidate overwrite an earlier one with the same
    # score.
    best_track = max(candidates, key=_match_score)
    return best_track['uri']


In [6]:
def get_songs_ids_from_spotify(top_200_df: pd.DataFrame, sp: spotipy.Spotify) -> pd.DataFrame:
    """Add a ``'uri'`` column holding the Spotify match for every chart row.

    Each row is passed to ``search_for_track`` together with ``sp``. The frame
    is modified in place and also returned.

    Args:
        top_200_df: Chart table; one lookup is made per row.
        sp: Spotify client forwarded to ``search_for_track``.

    Returns:
        The same DataFrame object, now with the ``'uri'`` column.
    """
    matched_uris = top_200_df.apply(lambda chart_row: search_for_track(chart_row, sp), axis=1)
    top_200_df['uri'] = matched_uris
    return top_200_df

In [7]:
def update_top_200_global(top_200_df: pd.DataFrame, sp: spotipy.Spotify, playlist_name: str) -> pd.DataFrame:
    """Replace the contents of the user's playlist with the chart's tracks.

    The playlist is looked up by exact name among all of the current user's
    playlists, emptied, and refilled with the first 200 URIs of
    ``top_200_df['uri']`` in order. Rows without a URI are skipped.

    Args:
        top_200_df: Chart table with a ``'uri'`` column of Spotify track URIs.
        sp: Spotify client used for the playlist calls.
        playlist_name: Exact name of the playlist to refresh.

    Returns:
        ``top_200_df``, unchanged.

    Raises:
        ValueError: If the current user has no playlist named
            ``playlist_name``. Nothing is modified in that case.
    """
    max_tracks = 200  # the playlist is capped at the chart size
    batch_size = 100  # Spotify accepts at most 100 items per add request

    # Songs with no Spotify match carry a null URI; drop them rather than
    # sending null entries to the API.
    tracks = top_200_df['uri'].iloc[:max_tracks].dropna().to_list()

    # Walk every page of the user's playlists; the first response only holds
    # one page. As before, the last playlist with a matching name wins.
    top_200_id = ""
    playlists = sp.current_user_playlists()
    while playlists:
        for playlist in playlists['items']:
            if playlist['name'] == playlist_name:
                top_200_id = playlist['id']
        # sp.next() returns None once there is no further page.
        playlists = sp.next(playlists)

    if not top_200_id:
        # Fail before touching anything instead of calling the API with an
        # empty playlist id.
        raise ValueError(f"Playlist {playlist_name!r} was not found among the current user's playlists.")

    # Empty the playlist, then refill it batch by batch; empty batches are
    # never sent.
    sp.playlist_replace_items(top_200_id, [])
    for start in range(0, len(tracks), batch_size):
        sp.playlist_add_items(top_200_id, tracks[start:start + batch_size])

    return top_200_df

### 2. Environment variables

In [8]:
# Load variables from a local .env file into the process environment, then
# read the three values handed to SpotifyOAuth further down (client id,
# client secret, redirect URI). Each is None when the variable is not set.
load_dotenv()
# NOTE(review): `id` shadows the builtin of the same name; it is kept because
# later cells refer to it.
id, secret, uri = (os.environ.get(env_key) for env_key in ('ID', 'SECRET', 'URI'))

In [9]:
# Page that is scraped and the local chromedriver binary used to open it.
global_200_url = 'https://www.billboard.com/charts/billboard-global-200/'
chromedriver_path = "./chromedriver-win64/chromedriver.exe"

# Selectors used when parsing the chart page: the row container class, the
# position span class, the title heading id and the artist span class.
html_class = "o-chart-results-list-row-container"
pos_class = "c-label a-font-primary-bold-l u-font-size-32@tablet u-letter-spacing-0080@tablet"
song_html_id = "title-of-a-story"
artist_class = "a-no-trucate"  # spelled exactly as it is matched against the page

# Column layout of the chart table and the name of the playlist to refresh.
top_200_df_columns = ["pos", "title", "artist"]
top_200_playlist_name = "TOP 200 GLOBAL"

# Literal strings that separate the first artist from any further artists.
separators = [
    " Featuring ",
    " & ",
    " X ",
    " feat. ",
    " featuring ",
    " with ",
    ", ",
]

# One alternation regex over the escaped separators.
pattern = "|".join(re.escape(separator) for separator in separators)

### 3. Run the code

In [10]:
# Spotify client authorised through OAuth with the credentials read from the
# environment; the scope string lists the permissions requested.
sp = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        client_id=id,
        client_secret=secret,
        redirect_uri=uri,
        scope="user-library-read playlist-modify-public playlist-modify-private playlist-read-private playlist-read-collaborative",
    )
)

# Chrome driver started from the local chromedriver binary with the imported
# option set.
service = Service(chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

In [11]:
# SCRAPE THE TOP 200 GLOBAL CHART FROM BILLBOARD
# Override the user agent before loading the page.
driver.execute_cdp_cmd('Network.setUserAgentOverride', {'userAgent' : user_agent_string_override_command})

driver.get(global_200_url)

songs = scrape_billboard_global_200(driver, html_class)

# A failed scrape may hand back nothing at all; report that clearly instead of
# letting len() fail with an unrelated TypeError.
if songs is None:
    raise RuntimeError('Scraping failed: no html elements were returned.')

assert len(songs) == 200, 'Number of html elements is not 200.'

# TRANSFORM THE DATA INTO A DATAFRAME AND RUN QUALITY CHECKS
top_200_df = filter_names_artists_pos(songs, 
                                      song_html_id, 
                                      artist_class, 
                                      pos_class, 
                                      top_200_df_columns, 
                                      pattern)

assert vaildate_top_200_df(top_200_df, top_200_df_columns) == 'OK'

# GET SONGS FROM SPOTIFY
top_200_df = get_songs_ids_from_spotify(top_200_df, sp)

# REFRESH TOP 200 GLOBAL PLAYLIST
top_200_df = update_top_200_global(top_200_df, sp, top_200_playlist_name)