Documentation:

https://developer.spotify.com/documentation/web-api/reference/#category-tracks

https://spotipy.readthedocs.io/en/2.19.0/

In [9]:
from bs4 import BeautifulSoup
import requests
import re

In [10]:
# Fetch the latest global daily chart page with a browser-like User-Agent.
# The parsed page (`soup`) is searched for <li> entries in the next cell.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = 'https://spotifycharts.com/regional/global/daily/latest'
# Without a timeout, requests waits forever if the server never answers.
req = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(req.content, 'html.parser')

In [11]:
# scrape the available countries
# Keep only the <li> elements that carry a `data-value` attribute and whose
# label contains at least one ASCII letter.
li_list = soup.find_all('li')
labelled_items = [
    item for item in li_list
    if item.has_attr('data-value') and re.search('[a-zA-Z]+', item.text)
]

codes = [item['data-value'] for item in labelled_items]
names = [item.text for item in labelled_items]

# Map each data-value code to the text shown for it on the page.
countries_dict = dict(zip(codes, names))

In [12]:
# Step 1: scrape the daily top-200 chart from spotifycharts.com with BeautifulSoup
#         (see get_daily_chart).

# Step 2: enrich each scraped track with its audio features via sp.audio_features
#         (see get_audio_features).

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [14]:
def get_daily_chart(date, country):
    """Scrape one daily regional chart page from spotifycharts.com.

    Parameters
    ----------
    date : str
        Value placed in the URL's date slot (the notebook passes
        'YYYY-MM-DD' strings).
    country : str
        Value placed in the URL's region slot, e.g. 'global' or 'us'.

    Returns
    -------
    pandas.DataFrame
        One row per <tr> of the chart table with the columns
        ['id', 'position', 'track', 'artist', 'streams']. All values are the
        strings found in the page; thousands separators are removed from
        'streams'. A table without rows gives an empty frame with the same
        columns.

    Raises
    ------
    ValueError
        If the page contains no <tbody> (no chart table to parse).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    url = f'https://spotifycharts.com/regional/{country}/daily/{date}'
    # A timeout prevents one unresponsive request from stalling a long crawl.
    req = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(req.content, 'html.parser')

    columns = ['id', 'position', 'track', 'artist', 'streams']

    tbody = soup.find('tbody')
    if tbody is None:
        raise ValueError(f'no chart table found at {url}')

    # Collect plain dicts and build the frame once: appending row by row
    # copies the whole frame each time, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    for tr in tbody.find_all('tr'):
        track_cell = tr.find('td', {'class': 'chart-table-track'})

        # The track id is the part of the cover link that follows "track/".
        track_id = tr.find('td', {'class': 'chart-table-image'}).find('a').get('href').split('track/')[1]

        # Strip only the leading "by " label; replacing every "by " would
        # also damage artist names that contain it (e.g. "Standby Me").
        artist = track_cell.find('span').text.strip()
        if artist.startswith('by '):
            artist = artist[len('by '):].strip()

        records.append({
            'id': track_id,
            'position': tr.find('td', {'class': 'chart-table-position'}).text,
            'track': track_cell.find('strong').text,
            'artist': artist,
            'streams': tr.find('td', {'class': 'chart-table-streams'}).text.replace(',', ''),
        })

    return pd.DataFrame(records, columns=columns)

In [15]:
# need to sign up for credentials
# https://developer.spotify.com/dashboard/login
#
# The client id/secret are read from the SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET environment variables so real keys never have to be
# saved inside the notebook. The '###' placeholders are used only when a
# variable is not set.

client_id = os.environ.get('SPOTIPY_CLIENT_ID', '###')
client_secret = os.environ.get('SPOTIPY_CLIENT_SECRET', '###')

client_credentials_manager = SpotifyClientCredentials(client_id=client_id,
                                                      client_secret=client_secret)
# `sp` is the shared client used by get_audio_features.
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [16]:
def get_audio_features(ids):
    """Fetch audio features for any number of track ids.

    The ids are sent to `sp.audio_features` in batches of at most 100 (the
    per-call maximum noted in the original code), so lists longer than 200
    are handled as well. Entries returned as None are skipped.

    Parameters
    ----------
    ids : iterable of str
        Track ids to look up.

    Returns
    -------
    pandas.DataFrame
        One row per track for which features were returned, without the
        'analysis_url', 'track_href', 'uri' and 'type' columns. When nothing
        is returned the frame is empty but still has an 'id' column, so it
        can be merged on 'id'.
    """
    batch_size = 100  # max is 100 ids at a time
    ids = list(ids)

    records = []
    for start in range(0, len(ids), batch_size):
        batch = sp.audio_features(ids[start:start + batch_size])
        # Skip None placeholders; a None inside the list would break the
        # DataFrame construction below.
        records.extend(item for item in (batch or []) if item is not None)

    if not records:
        return pd.DataFrame(columns=['id'])

    df = pd.DataFrame(records)
    # errors='ignore' keeps this robust if a response lacks one of the columns.
    return df.drop(columns=['analysis_url', 'track_href', 'uri', 'type'], errors='ignore')

In [17]:
# Full list of region codes, kept commented out for reference. Only the
# active `countries` assignment below is processed by the download loop;
# it currently holds the last six codes of the full list.
# countries = ['global', 'us', 'gb', 'ae', 'ar', 'at', 'au', 'be', 'bg', 'bo', 'br', 'ca', 'ch', 'cl', 'co', 'cr',
#              'cy', 'cz', 'de', 'dk', 'do', 'ec', 'ee', 'eg', 'es', 'fi', 'fr', 'gr', 'gt', 'hk', 'hn', 'hu', 
#              'id', 'ie', 'il', 'in', 'is', 'it', 'jp', 'kr', 'lt', 'lu', 'lv', 'ma', 'mx', 'my', 'ni', 'nl', 
#              'no', 'nz', 'pa', 'pe', 'ph', 'pl', 'pt', 'py', 'ro', 'ru', 'sa', 'se', 'sg', 'sk', 'sv', 'th', 
#              'tr', 'tw', 'ua', 'uy', 'vn', 'za']
countries = ['tr', 'tw', 'ua', 'uy', 'vn', 'za']

In [18]:
# Every calendar day from 2017-01-01 through 2021-09-30 (inclusive),
# formatted as 'YYYY-MM-DD' strings for use in the chart URLs.
dates = (
    pd.date_range(start='2017-01-01', end='2021-09-30')
    .strftime('%Y-%m-%d')
    .tolist()
)
len(dates)
# 1734

1734

In [19]:
# Download and enrich the daily chart for every (country, date) pair and
# write one CSV per day to ./data/<country>/regional-<country>-daily-<date>.csv.
for country in countries:
    outdir = f'./data/{country}'
    # makedirs also creates the parent ./data folder. With os.mkdir a missing
    # parent raised an error that the old bare `except` silently swallowed,
    # so no file was ever written.
    os.makedirs(outdir, exist_ok=True)

    for date in dates:
        try:
            df_basic = get_daily_chart(date, country)
            ids = list(df_basic['id'])
            df_features = get_audio_features(ids)
            # Default (inner) merge: chart rows without a features row are dropped.
            df_out = df_basic.merge(df_features, on='id')

            filename = f'regional-{country}-daily-{date}.csv'
            path = os.path.join(outdir, filename)

            df_out.to_csv(path, index=False)
        except Exception as exc:
            # Still best-effort (one bad day must not stop the crawl), but the
            # failure is reported, and KeyboardInterrupt is no longer caught,
            # so the long-running loop can be interrupted.
            print(f'skipped {country} {date}: {exc!r}')