In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time

In [3]:
# Download the Wikipedia page listing the ISO 3166-1 alpha-2 country codes.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
country_codes_html = requests.get('https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2', timeout=30)
country_codes_html.raise_for_status()

# The code table is taken to be the third table on the page (index 2); this
# depends on the current page layout and has to be adjusted if the page changes.
# keep_default_na=False keeps cells such as the literal code 'NA' as text rather
# than letting pandas convert them into a missing value.
country_codes_df = pd.read_html(
    country_codes_html.text,
    flavor='html5lib',
    header=0,
    keep_default_na=False,
)[2]

# NOTE: despite its name, this list holds the country NAMES, not the codes.
country_code_list = country_codes_df['Country name (using title case)'].tolist()

In [None]:
# Spot-check a single entry of the scraped list (the list holds country names).
country_code_list[42]

In [None]:
# Fetch the Spotify Charts landing page; its <li> elements are scanned further
# below for country names and chart dates.
# The timeout avoids an indefinite hang, and raise_for_status() fails loudly on an
# HTTP error so that the following cells do not silently work on an error page.
spotify_charts = requests.get('https://spotifycharts.com/regional', timeout=30)
spotify_charts.raise_for_status()
spotify_soup = BeautifulSoup(spotify_charts.text, "html5lib")

In [None]:
# Keep the text of every <li> element on the charts page that exactly matches
# one of the country names taken from the ISO table.
spotify_countries = [
    list_item.text
    for list_item in spotify_soup.find_all('li')
    if list_item.text in country_code_list
]

In [None]:
# Restrict the ISO table to the rows whose country name was found on the charts page.
is_spotify_country = country_codes_df['Country name (using title case)'].isin(spotify_countries)
sub_df = country_codes_df.loc[is_spotify_country]

In [None]:
# Two-letter codes of the matched countries, as a plain Python list.
spotify_country_codes = sub_df['Code'].tolist()

In [None]:
# Spot-check the code stored at position 8 of the matched country codes.
spotify_country_codes[8]

In [None]:
# Try to interpret the text of every <li> element on the charts page as a date.
# Entries that parse are stored as 'YYYY-MM-DD' strings; everything else is
# recorded as NaN so that date_list stays aligned with the <li> elements.
date_list = []

for list_item in spotify_soup.find_all('li'):

    try:
        date = pd.to_datetime(list_item.text).strftime('%Y-%m-%d')

    # `except Exception` (rather than a bare `except:`) still treats any parsing
    # failure as "not a date", but no longer swallows KeyboardInterrupt/SystemExit,
    # so the cell can be interrupted normally.
    except Exception:
        date = np.nan  # np.nan instead of np.NaN: the latter alias was removed in NumPy 2.0

    date_list.append(date)

# Only the successfully formatted dates are strings; the NaN placeholders are floats.
date_list_clean = [entry for entry in date_list if isinstance(entry, str)]

In [None]:
# Inspect the cleaned date list. Only the value of the last expression in a cell
# is rendered, so this cell shows the number of dates, not the list itself.
date_list_clean
len(date_list_clean)

In [33]:
# Preview the one-element slice of country codes that the download loop below iterates over.
spotify_country_codes[8:9]

['CH']

In [45]:
# ---------------------------------------------------------------------------
# Download the daily charts for the selected countries and dates.
#
# For every (country, date) pair one chart page is requested, parsed into a
# dataframe and appended to `date_df_list`. After each successful page the
# merged dataframe is written to "Latest_<country>.csv" as a checkpoint, and
# once a country is finished its merged dataframe is appended to
# `country_df_list`.
# ---------------------------------------------------------------------------

seed_url = 'https://spotifycharts.com/regional/'
frequency = 'daily'
request_timeout_seconds = 30   # give up on a stalled connection instead of hanging forever
request_pause_seconds = 0.5    # pause before every request so the server is not flooded


def _split_title_artist(title_artist):
    """Split one 'Title by Artist' cell into a (title, artist) tuple.

    The split is done at the LAST ' by ' so that a title which itself contains
    ' by ' is not cut short (an artist name containing ' by ' would still be
    split incorrectly). Non-string cells give (NaN, NaN); a string without the
    separator is returned unchanged as the title together with a NaN artist.
    """
    if not isinstance(title_artist, str):
        return np.nan, np.nan

    title, separator, artist = title_artist.rpartition(" by ")

    if not separator:
        return title_artist, np.nan

    return title, artist


def _extract_track_links(page_html):
    """Return (track_links, track_ids) for every <a> whose href contains 'track'.

    The track id is the last path segment of the link. Anchors without an href,
    or with an href that does not contain 'track', are skipped.
    """
    soup_response = BeautifulSoup(page_html, "html5lib")

    track_links = []
    track_ids = []

    for link in soup_response.find_all('a'):
        href = link.get('href')

        if isinstance(href, str) and 'track' in href:
            track_links.append(href)
            track_ids.append(href.split('/')[-1])

    return track_links, track_ids


def _parse_chart_page(page_html, country, date):
    """Convert the HTML of one chart page into a cleaned dataframe.

    The returned dataframe has the columns 'Rank', 'Title_Artist', 'Streams',
    'Track_URL', 'Track_ID', 'Track title', 'Artist', 'country_code' and 'date'.
    Rows without a 'Title_Artist' value are dropped.
    """
    track_links, track_ids = _extract_track_links(page_html)

    # The first table on the page is read as the chart table; it is expected to
    # have five columns, which are renamed here.
    df = pd.read_html(page_html, flavor='html5lib', header=0)[0]
    df.columns = ['0', 'Rank', '1', 'Title_Artist', 'Streams']

    has_title = pd.notna(df['Title_Artist'])
    rows_with_title = [position for position, keep in enumerate(has_title) if keep]

    # .copy() so that the column assignments below act on an independent frame
    # rather than on a filtered view (avoids SettingWithCopyWarning).
    df = df[has_title].copy()

    if len(track_links) != len(df):
        # The number of track links differs from the number of kept rows: pick
        # the links at the positions of the rows that were kept.
        df['Track_URL'] = list(np.array(track_links)[rows_with_title])
        df['Track_ID'] = list(np.array(track_ids)[rows_with_title])
    else:
        df['Track_URL'] = track_links
        df['Track_ID'] = track_ids

    df = df[['Rank', 'Title_Artist', 'Streams', 'Track_URL', 'Track_ID']].copy()

    title_artist_pairs = [_split_title_artist(value) for value in df['Title_Artist']]

    df['Track title'] = [pair[0] for pair in title_artist_pairs]
    df['Artist'] = [pair[1] for pair in title_artist_pairs]
    df['country_code'] = country
    df['date'] = date

    return df


country_df_list = []

for country in spotify_country_codes[8:9]:

    date_df_list = []

    for date in date_list_clean[:261]:

        # It is good practice not to send request after request: too many
        # queries in a short time can get the IP blocked by the server.
        time.sleep(request_pause_seconds)

        # Concatenate the base url with the variable components.
        url = seed_url + country.lower() + '/' + frequency + '/' + date

        try:
            response = requests.get(url, timeout=request_timeout_seconds)
        except requests.RequestException as error:
            # A network problem for one day must not abort the whole download.
            print("Request failed for " + country + ", " + date + ": " + str(error))
            continue

        if response.status_code == 200:

            print("Data available for " + country + ", " + date)

            date_df_list.append(_parse_chart_page(response.text, country, date))

            # Checkpoint: merge everything collected so far for this country and
            # overwrite the previously saved .csv, so that an interrupted run
            # keeps its progress without storing one file per day.
            date_df_merged = pd.concat(date_df_list)
            date_df_merged.to_csv("Latest_" + country + ".csv")

        elif response.status_code == 404:

            # A missing day is simply skipped; the days collected so far are kept.
            print("No data available for " + country + ", " + date)

        else:

            print("Unexpected status " + str(response.status_code) + " for " + country + ", " + date)

    # Only countries with at least one downloaded day are added. `date_df_merged`
    # is refreshed after every successful page, so at this point it holds exactly
    # the merged data of the current country.
    if date_df_list:
        country_df_list.append(date_df_merged)

print('Data retrieval finished!')

Data available for CH, 2020-09-17
Data available for CH, 2020-09-16
Data available for CH, 2020-09-15
Data available for CH, 2020-09-14
Data available for CH, 2020-09-13
Data available for CH, 2020-09-12
Data available for CH, 2020-09-11
Data available for CH, 2020-09-10
Data available for CH, 2020-09-09
Data available for CH, 2020-09-08
Data available for CH, 2020-09-07
Data available for CH, 2020-09-06
Data available for CH, 2020-09-05
Data available for CH, 2020-09-04
Data available for CH, 2020-09-03
Data available for CH, 2020-09-02
Data available for CH, 2020-09-01
Data available for CH, 2020-08-31
Data available for CH, 2020-08-30
Data available for CH, 2020-08-29
Data available for CH, 2020-08-28
Data available for CH, 2020-08-27
Data available for CH, 2020-08-26
Data available for CH, 2020-08-25
Data available for CH, 2020-08-24
Data available for CH, 2020-08-23
Data available for CH, 2020-08-22
Data available for CH, 2020-08-21
Data available for CH, 2020-08-20
Data available

Data available for CH, 2020-01-20
Data available for CH, 2020-01-19
Data available for CH, 2020-01-18
Data available for CH, 2020-01-17
Data available for CH, 2020-01-16
Data available for CH, 2020-01-15
Data available for CH, 2020-01-14
Data available for CH, 2020-01-13
Data available for CH, 2020-01-12
Data available for CH, 2020-01-11
Data available for CH, 2020-01-10
Data available for CH, 2020-01-09
Data available for CH, 2020-01-08
Data available for CH, 2020-01-07
Data available for CH, 2020-01-06
Data available for CH, 2020-01-05
Data available for CH, 2020-01-04
Data available for CH, 2020-01-03
Data available for CH, 2020-01-02
Data available for CH, 2020-01-01
Data retrieval finished!


'C'