In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

In [2]:
soup = BeautifulSoup(requests.get('https://kworb.net/spotify/').text)

In [3]:
link_list = [l['href'] for l in soup.find('table').find_all('a', href=True) 
 if ('weekly' in l['href']) and ('totals' not in l['href'])]

In [4]:
link_list[1:]

['country/us_weekly.html',
 'country/gb_weekly.html',
 'country/ad_weekly.html',
 'country/ar_weekly.html',
 'country/au_weekly.html',
 'country/at_weekly.html',
 'country/by_weekly.html',
 'country/be_weekly.html',
 'country/bo_weekly.html',
 'country/br_weekly.html',
 'country/bg_weekly.html',
 'country/ca_weekly.html',
 'country/cl_weekly.html',
 'country/co_weekly.html',
 'country/cr_weekly.html',
 'country/cy_weekly.html',
 'country/cz_weekly.html',
 'country/dk_weekly.html',
 'country/do_weekly.html',
 'country/ec_weekly.html',
 'country/eg_weekly.html',
 'country/sv_weekly.html',
 'country/ee_weekly.html',
 'country/fi_weekly.html',
 'country/fr_weekly.html',
 'country/de_weekly.html',
 'country/gr_weekly.html',
 'country/gt_weekly.html',
 'country/hn_weekly.html',
 'country/hk_weekly.html',
 'country/hu_weekly.html',
 'country/is_weekly.html',
 'country/in_weekly.html',
 'country/id_weekly.html',
 'country/ie_weekly.html',
 'country/il_weekly.html',
 'country/it_weekly.html',
 

## 🧼 Cleaning the data 

Cleaning the data frame to get the information that we need: Position, Artist_name, Song_title

In [5]:
for link in link_list[1:3]:
    df = pd.read_html(f'https://kworb.net/spotify/{link}')[0]

    if 'Artist and Title' in df.columns: 
        df[['artist_name', 'song_title']] = df['Artist and Title'].str.split('-', n=1, expand = True) 
        df.drop(columns=['Artist and Title'], inplace=True)
        df.drop(columns=['P+', 'Wks', 'Pk', '(x?)', 'Streams', 'Streams+', 'Total'], inplace = True) 

    display(df) 

Unnamed: 0,Pos,artist_name,song_title
0,1,¥$,"CARNIVAL (w/ Kanye West, Ty Dolla $ign)"
1,2,Noah Kahan,Stick Season
2,3,Benson Boone,Beautiful Things
3,4,21 Savage,redrum
4,5,Zach Bryan,I Remember Everything (w/ Kacey Musgraves)
...,...,...,...
195,196,Luke Combs,Beautiful Crazy
196,197,Travis Scott,"Nightcrawler (w/ Swae Lee, Chief Keef)"
197,198,J. Cole,Wet Dreamz
198,199,Nicki Minaj,Everybody (w/ Lil Uzi Vert)


Unnamed: 0,Pos,artist_name,song_title
0,1,Noah Kahan,Stick Season
1,2,Benson Boone,Beautiful Things
2,3,Sophie Ellis,Bextor - Murder On The Dancefloor
3,4,Teddy Swims,Lose Control
4,5,YG Marley,Praise Jah In The Moonlight
...,...,...,...
195,196,Dylan Gossett,Coal
196,197,The Cranberries,Zombie
197,198,Declan McKenna,Brazil
198,199,Andrew Underberg,"Stayed Gone (w/ Sam Haft, Christian Borle, Am..."


Only getting the data that we need: Position, Artist Name, Song Title. 

In [6]:
for link in link_list[1:2]:
    df = pd.read_html(f'https://kworb.net/spotify/{link}')[0]

    if 'Artist and Title' in df.columns: 
        df[['artist_name', 'song_title']] = df['Artist and Title'].str.split('-', n=1, expand = True) 
        df.drop(columns=['Artist and Title'], inplace=True)
        df.drop(columns=['P+', 'Wks', 'Pk', '(x?)', 'Streams', 'Streams+', 'Total'], inplace = True) 
    
     

In [36]:
td_tags = soup.find_all('td', class_="mp text")
filters = ['Global'] 
country_names_list = []

for i in range(0, len(td_tags), 2):
    country_name = td_tags[i].get_text()
    
    if country_name not in filters:
        country_names_list.append(country_name) 

print(country_names_list)

['United States', 'United Kingdom', 'Andorra', 'Argentina', 'Australia', 'Austria', 'Belarus', 'Belgium', 'Bolivia', 'Brazil', 'Bulgaria', 'Canada', 'Chile', 'Colombia', 'Costa Rica', 'Cyprus', 'Czech Republic', 'Denmark', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia', 'Finland', 'France', 'Germany', 'Greece', 'Guatemala', 'Honduras', 'Hong Kong', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Kazakhstan', 'Latvia', 'Lithuania', 'Luxembourg', 'Malaysia', 'Malta', 'Mexico', 'Morocco', 'Netherlands', 'New Zealand', 'Nicaragua', 'Nigeria', 'Norway', 'Pakistan', 'Panama', 'Paraguay', 'Peru', 'Philippines', 'Poland', 'Portugal', 'Romania', 'Russia', 'Saudi Arabia', 'Singapore', 'Slovakia', 'South Africa', 'South Korea', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'Ukraine', 'United Arab Emirates', 'Uruguay', 'Venezuela', 'Vietnam']


In [59]:
combined_df = pd.DataFrame()

for link, country_name in zip(link_list[1:], country_names_list):
    df = pd.read_html(f'https://kworb.net/spotify/{link}')[0]

    if 'Artist and Title' in df.columns: 
        df[['artist_name', 'song_title']] = df['Artist and Title'].str.split('-', n=1, expand = True) 
        df.drop(columns=['Artist and Title', 'P+', 'Wks', 'Pk', '(x?)', 'Streams', 'Streams+', 'Total'], inplace = True) 
        
    df['country'] = country_name
    
    combined_df = combined_df.append(df, ignore_index = True) 

combined_df.to_csv('country_charts.csv', index=False)

  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, ignore_index = True)
  combined_df = combined_df.append(df, i