In [2]:
import re
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import pandas as pd

In [3]:
page_url = "https://kworb.net/spotify/"

# Download the index page. The context manager closes the HTTP response even
# if reading or parsing raises, instead of leaking the connection.
with uReq(page_url) as uClient:
    page_soup = soup(uClient.read(), "html.parser")  # parse the HTML into a navigable tree

country_soup = page_soup.find("table")  # first <table> element on the page

country_soup_list = country_soup.find_all("tr")  # every row of that table

# Map each row's name (text of the row's first child node) to the list of
# relative hrefs found in the row's third child node.
country_dict_links = {}

for entry in country_soup_list:
    country_name = entry.contents[0].string  # country name
    country_links = [link.get('href') for link in entry.contents[2].find_all("a")]
    country_dict_links[country_name] = country_links

In [9]:
# Inspect the scraped mapping: country name -> list of relative chart links.
country_dict_links

{'Global': ['country/global_daily.html',
  'country/global_daily_totals.html',
  'country/global_weekly.html',
  'country/global_weekly_totals.html'],
 'United States': ['country/us_daily.html',
  'country/us_daily_totals.html',
  'country/us_weekly.html',
  'country/us_weekly_totals.html'],
 'United Kingdom': ['country/gb_daily.html',
  'country/gb_daily_totals.html',
  'country/gb_weekly.html',
  'country/gb_weekly_totals.html'],
 'Andorra': ['country/ad_daily.html',
  'country/ad_daily_totals.html',
  'country/ad_weekly.html',
  'country/ad_weekly_totals.html'],
 'Argentina': ['country/ar_daily.html',
  'country/ar_daily_totals.html',
  'country/ar_weekly.html',
  'country/ar_weekly_totals.html'],
 'Australia': ['country/au_daily.html',
  'country/au_daily_totals.html',
  'country/au_weekly.html',
  'country/au_weekly_totals.html'],
 'Austria': ['country/at_daily.html',
  'country/at_daily_totals.html',
  'country/at_weekly.html',
  'country/at_weekly_totals.html'],
 'Belarus': ['co

In [11]:
# Nest each country's link list under a "links" key so more per-country
# fields can sit next to it in the same dict.
country_top_info = {
    name: {"links": links}
    for name, links in country_dict_links.items()
}

{'Global': {'links': ['country/global_daily.html',
   'country/global_daily_totals.html',
   'country/global_weekly.html',
   'country/global_weekly_totals.html']},
 'United States': {'links': ['country/us_daily.html',
   'country/us_daily_totals.html',
   'country/us_weekly.html',
   'country/us_weekly_totals.html']},
 'United Kingdom': {'links': ['country/gb_daily.html',
   'country/gb_daily_totals.html',
   'country/gb_weekly.html',
   'country/gb_weekly_totals.html']},
 'Andorra': {'links': ['country/ad_daily.html',
   'country/ad_daily_totals.html',
   'country/ad_weekly.html',
   'country/ad_weekly_totals.html']},
 'Argentina': {'links': ['country/ar_daily.html',
   'country/ar_daily_totals.html',
   'country/ar_weekly.html',
   'country/ar_weekly_totals.html']},
 'Australia': {'links': ['country/au_daily.html',
   'country/au_daily_totals.html',
   'country/au_weekly.html',
   'country/au_weekly_totals.html']},
 'Austria': {'links': ['country/at_daily.html',
   'country/at_daily

In [28]:
# Two-column frame: one row per country, with its list of links kept as a
# single object-valued cell.
country_link_pairs = list(country_dict_links.items())
country_top_info_df = pd.DataFrame(country_link_pairs, columns=['Country', 'Links'])

In [29]:
# Derive each country's short code from its first (daily) chart link,
# e.g. 'country/us_daily.html' -> 'us'.
# The dot is escaped so it only matches a literal '.', and the pattern is
# compiled once instead of on every iteration.
daily_link_pattern = re.compile(r'country/(\w+)_daily\.html')

country_abbv = {}
for country, links in country_dict_links.items():
    # Guard against rows that carry no links at all.
    match = daily_link_pattern.search(links[0]) if links else None
    if match is None:
        # No recognisable daily link: skip the entry rather than crash with
        # an opaque AttributeError on `None.group`.
        continue
    country_abbv[country] = match.group(1)

In [38]:
# Column-wise construction of the country -> abbreviation lookup frame.
abbv_df = pd.DataFrame({
    'Country': list(country_abbv.keys()),
    'Abbreviation': list(country_abbv.values()),
})

In [39]:
# Show the column labels of the country/links frame.
country_top_info_df.columns

Index(['Country', 'Links'], dtype='object')

In [40]:
# Display the link frame joined with the abbreviations. With no explicit key,
# pandas merges on the columns the two frames share. The result is only
# displayed here, not stored.
pd.merge(country_top_info_df, abbv_df)

Unnamed: 0,Country,Links,Abbreviation
0,Global,"[country/global_daily.html, country/global_dai...",global
1,United States,"[country/us_daily.html, country/us_daily_total...",us
2,United Kingdom,"[country/gb_daily.html, country/gb_daily_total...",gb
3,Andorra,"[country/ad_daily.html, country/ad_daily_total...",ad
4,Argentina,"[country/ar_daily.html, country/ar_daily_total...",ar
...,...,...,...
72,Ukraine,"[country/ua_daily.html, country/ua_daily_total...",ua
73,United Arab Emirates,"[country/ae_daily.html, country/ae_daily_total...",ae
74,Uruguay,"[country/uy_daily.html, country/uy_daily_total...",uy
75,Venezuela,"[country/ve_daily.html, country/ve_daily_total...",ve


In [92]:
# Map every country to its list of chart paths by pairing the two columns
# of the country/links frame.
daily_paths = dict(
    zip(country_top_info_df['Country'], country_top_info_df['Links'])
)

help

In [93]:
def generate_dataset(path, country):
    """Download one kworb.net Spotify chart page and return it as a DataFrame.

    Parameters
    ----------
    path : str
        Chart path relative to ``https://kworb.net/spotify/``, for example
        ``"country/us_daily.html"``.
    country : str
        Label written into the ``Country`` column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per chart entry. The columns are the header texts of the
        page's first table (cell values kept as strings), followed by
        ``Artist`` and ``Title`` (split out of ``"Artist and Title"``) and
        ``Country``.

    Raises
    ------
    ValueError
        If the page contains no ``<table>`` element.
    KeyError
        If the table has no ``"Pos"`` or ``"Artist and Title"`` column.
    """
    page_url = f"https://kworb.net/spotify/{path}"

    # Close the HTTP response deterministically instead of leaking it.
    with uReq(page_url) as uClient:
        page_soup = soup(uClient.read(), "html.parser")

    spotify_entry_soup = page_soup.find("table")  # first table on the page
    if spotify_entry_soup is None:
        raise ValueError(f"no <table> element found at {page_url}")

    # Header texts come from the <th> cells of the first row.
    headings = [th.get_text() for th in spotify_entry_soup.find("tr").find_all("th")]

    # Collect cell texts column-wise. zip() pairs each <td> with its heading
    # and stops at the shorter of the two, so rows without <td> cells add
    # nothing.
    dict_data = {}
    for row in spotify_entry_soup.find_all("tr")[1:]:
        for attribute, td in zip(headings, row.find_all("td")):
            dict_data.setdefault(attribute, []).append(td.get_text())

    # Create a DataFrame from the dictionary
    df = pd.DataFrame(dict_data)
    # Park "Pos" in the index so the whitespace clean-up below skips it.
    df = df.set_index("Pos")

    # Split on the " - " separator rather than a bare "-", so hyphenated
    # artist names (e.g. "Jay-Z") stay intact. `n` is passed by keyword
    # because pandas 2.0 no longer accepts it positionally.
    df[['Artist', 'Title']] = df["Artist and Title"].str.split(" - ", n=1, expand=True)

    # Trim surrounding whitespace in every text column.
    df_obj = df.select_dtypes('object')
    df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

    df = df.reset_index()

    df["Country"] = country

    return df

In [94]:
# Build one chart DataFrame per country from the first path in its list
# (the daily chart page).
dfs = []
failed_countries = {}  # country -> exception raised while loading its chart
for country in daily_paths:
    try:
        dfs.append(generate_dataset(daily_paths[country][0], country))
    except Exception as exc:
        # Best-effort scrape: keep going, but record and report the failure
        # instead of hiding it. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt / SystemExit still stop the cell.
        failed_countries[country] = exc
        print(f"Skipping {country}: {exc!r}")

In [87]:
# Number of per-country frames collected in `dfs`.
len(dfs)

76

In [96]:
# Preview the first collected frame.
dfs[0]

Unnamed: 0,Pos,P+,Artist and Title,Days,Pk,(x?),Streams,Streams+,7Day,7Day+,Total,Artist,Title,Country
0,1,+2,Benson Boone - Beautiful Things,22,1,(x4),5613842,+798921,34942734,+872488,81671921,Benson Boone,Beautiful Things,Global
1,2,=,Xavi - La Diabla,64,1,(x20),5194103,+225839,35787503,-396929,286191130,Xavi,La Diabla,Global
2,3,-2,Tate McRae - greedy,148,1,(x34),5101565,-639,35178152,-259951,743490391,Tate McRae,greedy,Global
3,4,=,"The Weeknd - One Of The Girls (w/ JENNIE, Lily...",125,2,(x2),4709776,-76845,33488250,-338453,375564881,The Weeknd,"One Of The Girls (w/ JENNIE, Lily-Rose Depp)",Global
4,5,=,Taylor Swift - Cruel Summer,369,2,(x17),4658897,+141806,30698216,+272342,1271909550,Taylor Swift,Cruel Summer,Global
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,196,-5,Bad Bunny - Me Porto Bonito (w/ Chencho Corleone),573,2,(x5),1271931,+27364,2516498,+1271931,1603148852,Bad Bunny,Me Porto Bonito (w/ Chencho Corleone),Global
196,197,-1,Milo j - M.A.I,12,141,,1268202,+27843,8881285,-156271,15408948,Milo j,M.A.I,Global
197,198,RE,The Weeknd - The Hills,1233,3,(x8),1267948,,7417058,-69406,1130011581,The Weeknd,The Hills,Global
198,199,RE,Maroon 5 - Maps,516,5,,1267049,,5021073,-24774,283949831,Maroon 5,Maps,Global


In [None]:
def export(df, path="top_200_songs_by_country.csv"):
    """Write ``df`` to a CSV file without the index column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str or path-like, optional
        Destination file. Defaults to ``"top_200_songs_by_country.csv"``.

    Returns
    -------
    None
    """
    # The original body was the bare attribute `df.to_csv`, which never
    # called the method and so wrote nothing.
    df.to_csv(path, index=False)

In [98]:
# Stack all per-country frames into one long frame with a fresh 0..n-1 index.
top_200_songs_by_country = pd.concat(objs=dfs, ignore_index=True)

TODO: maybe add today's streams and total streams.

In [99]:
# Persist the combined chart data, omitting the index column.
output_csv = "top_200_songs_by_country.csv"
top_200_songs_by_country.to_csv(output_csv, index=False)