### Spotify Top 200 Artist_US Daily Chart

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin

KWORB_DAILY_URL = "https://kworb.net/spotify/country/us_daily.html"
BASE = "https://kworb.net"

In [2]:
def normalize_url(href):
    """
    Normalize track or artist href from Kworb.
    Ensures:
    - no "../"
    - includes /spotify/
    - full absolute URL
    """
    if not href:
        return None

    # convert relative ‚Üí absolute (removes ../)
    url = urljoin(BASE + "/", href)

    # auto fix missing /spotify/ for tracks
    if "/track/" in url and "/spotify/" not in url:
        url = url.replace("/track/", "/spotify/track/")

    # auto fix missing /spotify/ for artists
    if "/artist/" in url and "/spotify/" not in url:
        url = url.replace("/artist/", "/spotify/artist/")

    return url

In [3]:
def extract_links(td):
    links = td.find_all("a")
    artist_url = None
    track_url = None

    for a in links:
        raw = a.get("href", "")
        if not raw:
            continue

        norm = normalize_url(raw)

        if "/track/" in norm:
            track_url = norm
        elif "/artist/" in norm:
            artist_url = norm

    return artist_url, track_url

In [4]:
def get_artist_songs_url_from_track(track_url):
    try:
        r = requests.get(track_url)
        soup = BeautifulSoup(r.text, "html.parser")

        a = soup.find("a", href=lambda x: x and "/artist/" in x)
        if not a:
            return None

        href = a.get("href", "")
        norm = normalize_url(href)

        artist_id = norm.split("/")[-1].replace(".html", "")

        final_url = f"{BASE}/spotify/artist/{artist_id}_albums.html"
        return final_url

    except:
        return None

In [5]:
def get_us_daily_tracks():
    r = requests.get(KWORB_DAILY_URL)
    soup = BeautifulSoup(r.text, "html.parser")

    table = soup.find("table")
    rows = table.find_all("tr")[1:]  # skip header

    data = []

    for row in rows:
        cols = row.find_all("td")

        if len(cols) < 7:
            continue

        rank = cols[0].text.strip()

        # The "Artist and Title" column e.g. "Taylor Swift - The Fate of Ophelia"
        artist_title = cols[2].text.strip()

        # split on " - "
        if " - " in artist_title:
            artist_name, track_name = artist_title.split(" - ", 1)
        else:
            artist_name = artist_title
            track_name = ""

        # raw links
        _, track_url = extract_links(cols[2])

        # true artist_songs_url
        artist_albums_url = get_artist_songs_url_from_track(track_url)
        
        # Strer
        daily_streams = cols[6].text.strip()
        streams_plus = cols[7].text.strip() if len(cols) > 7 else None
        seven_day = cols[8].text.strip() if len(cols) > 8 else None
        seven_day_plus = cols[9].text.strip() if len(cols) > 9 else None
        total_streams = cols[10].text.strip()

        data.append({
            "rank": rank,
            "artist_name": artist_name,
            "track_name": track_name,
            "daily_streams": daily_streams,
            "streams_plus": streams_plus,
            "seven_day": seven_day,
            "seven_day_plus": seven_day_plus,
            "total_streams": total_streams,
            "artist_url": artist_albums_url,
            "track_url": track_url
        })

    return pd.DataFrame(data)

df_chart = get_us_daily_tracks()
df_chart.head(10)

Unnamed: 0,rank,artist_name,track_name,daily_streams,streams_plus,seven_day,seven_day_plus,total_streams,artist_url,track_url
0,1,Taylor Swift,The Fate of Ophelia,1341674,-56235,10048135,-144921,137000080,https://kworb.net/spotify/artist/06HL4z0CvFAxy...,https://kworb.net/spotify/track/53iuhJlwXhSER5...
1,2,HUNTR/X,"Golden (w/ Ejae, AUDREY NUNA, REI AMI, KPop De...",1289935,68272,9225980,-31950,249189200,https://kworb.net/spotify/artist/2yNNYQBChuox9...,https://kworb.net/spotify/track/1CPZ5BxNNd0n0n...
2,3,Olivia Dean,Man I Need,1261178,18782,8635735,120751,88428560,https://kworb.net/spotify/artist/00x1fYSGhdqSc...,https://kworb.net/spotify/track/1qbmS6ep2hbBRa...
3,4,Olivia Dean,So Easy (To Fall In Love),954986,1780,6450213,127544,36909384,https://kworb.net/spotify/artist/00x1fYSGhdqSc...,https://kworb.net/spotify/track/6sGIMrtIzQjdzN...
4,5,Taylor Swift,Opalite,902607,15398,6797306,-122781,106247004,https://kworb.net/spotify/artist/06HL4z0CvFAxy...,https://kworb.net/spotify/track/3yWuTOYDztXjZx...
5,6,Alex Warren,Ordinary,857833,84007,5755462,60335,285591972,https://kworb.net/spotify/artist/0fTSzq9jAh4c3...,https://kworb.net/spotify/track/2RkZ5LkEzeHGRs...
6,7,sombr,back to friends,829226,-42387,5942307,-22904,244046523,https://kworb.net/spotify/artist/4G9NDjRyZFDlJ...,https://kworb.net/spotify/track/0FTmksd2dxiE5e...
7,8,The Neighbourhood,Sweater Weather,723460,-8070,5017498,31029,976280232,https://kworb.net/spotify/artist/77SW9BnxLY8rJ...,https://kworb.net/spotify/track/2QjOHCTQ1Jl3za...
8,9,The Goo Goo Dolls,Iris,702524,14587,4524340,58388,297272926,https://kworb.net/spotify/artist/2sil8z5kiy4r7...,https://kworb.net/spotify/track/6Qyc6fS4DsZjB2...
9,10,Brenda Lee,Rockin' Around The Christmas Tree,691084,8144,4697247,-74798,469459111,https://kworb.net/spotify/artist/4cPHsZM98sKzm...,https://kworb.net/spotify/track/2EjXfH91m7f8Hi...


In [6]:
from datetime import datetime

today = datetime.now().strftime("%Y%m%d")

df_chart.to_csv(f"outputs_kworb/spotify_us_daily_artist_{today}.csv", index=False)

# ==== SAVE JSON ====
json_path = f"outputs_kworb/spotify_us_daily_artists_{today}.json"
df_chart.to_json(json_path, orient="records", force_ascii=False, indent=2)

print("üéâ JSON saved:", json_path)

üéâ JSON saved: outputs_kworb/spotify_us_daily_artists_20251120.json


In [1]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

In [2]:
# === Step 1: Load your existing Kworb daily artist file ===
today = datetime.now().strftime("%Y%m%d")

input_path = f"outputs_kworb/kworb_us_daily_artist_{today}.csv"
df_chart = pd.read_csv(input_path)

print("Loaded file:", input_path)
print("Total artists:", len(df_chart))

Loaded file: outputs_kworb/kworb_us_daily_artist_20251119.csv
Total artists: 200


In [3]:
BASE = "https://kworb.net"

def normalize_url(href):
    if not href:
        return None

    url = urljoin(BASE + "/", href)

    # Fix missing /spotify/
    if "/track/" in url and "/spotify/" not in url:
        url = url.replace("/track/", "/spotify/track/")
    if "/artist/" in url and "/spotify/" not in url:
        url = url.replace("/artist/", "/spotify/artist/")

    return url

In [4]:
def extract_artist_id(artist_url):
    """
    Extract Spotify artist_id from URL:
    https://kworb.net/spotify/artist/<artist_id>_songs.html
    """
    if not artist_url:
        return None
    filename = artist_url.split("/")[-1]
    artist_id = filename.replace("_songs.html", "").strip()
    return artist_id

In [5]:
def scrape_artist_top_songs(artist_name, artist_id, artist_url):
    print(f"   ‚Üí Scraping Top Songs for {artist_name}")

    url = artist_url
    r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if r.status_code != 200:
        print(f"     ‚ö†Ô∏è Failed: HTTP {r.status_code}")
        return pd.DataFrame()

    soup = BeautifulSoup(r.text, "html.parser")

    # Kworb ÁöÑ Top Songs ÊòØÈ°µÈù¢‰∏äÁöÑÁ¨¨2‰∏™ table
    tables = soup.find_all("table")
    if len(tables) < 2:
        print("     ‚ö†Ô∏è ERROR: No Top Songs table found.")
        return pd.DataFrame()

    top_song_table = tables[2]   # Á¨¨ 2 ‰∏™Ë°®Ê†º

    rows = top_song_table.find_all("tr")[1:]  # Ë∑≥Ëøá header

    data = []
    for row in rows[:10]:  # Top 10 Songs
        cols = row.find_all("td")
        if len(cols) < 4:
            continue

        rank = cols[0].text.strip()
        song_title = cols[1].text.strip()
        streams_total = cols[2].text.strip()
        daily_streams = cols[3].text.strip()

        data.append({
            "artist_name": artist_name,
            "artist_id": artist_id,
            "rank": rank,
            "song_title": song_title,
            "streams_total": streams_total,
            "daily_streams": daily_streams,
            "artist_url": artist_url
        })

    return pd.DataFrame(data)

In [6]:
def scrape_all_artists_top10(df_chart):
    all_data = []

    artists = df_chart.dropna(subset=["artist_url"]).drop_duplicates("artist_url")

    print("Total unique artists:", len(artists))

    for _, row in artists.iterrows():
        artist_name = row["artist_name"]
        artist_url = normalize_url(row["artist_url"])
        artist_id = extract_artist_id(artist_url)

        try:
            df_artist = scrape_artist_top_songs(artist_name, artist_id, artist_url)
            if len(df_artist) > 0:
                all_data.append(df_artist)
        except Exception as e:
            print(f"‚ö†Ô∏è Error scraping {artist_name}: {e}")

    if len(all_data) == 0:
        return pd.DataFrame()

    return pd.concat(all_data, ignore_index=True)


In [7]:
df_top10 = scrape_all_artists_top10(df_chart)

output_path = f"outputs_kworb/artist_top10_songs_{today}.csv"
df_top10.to_csv(output_path, index=False)

print("üéâ DONE! File saved:", output_path)
print("Total rows:", len(df_top10))

Total unique artists: 115
   ‚Üí Scraping Top Songs for Taylor Swift
‚ö†Ô∏è Error scraping Taylor Swift: list index out of range
   ‚Üí Scraping Top Songs for HUNTR/X
‚ö†Ô∏è Error scraping HUNTR/X: list index out of range
   ‚Üí Scraping Top Songs for Olivia Dean
‚ö†Ô∏è Error scraping Olivia Dean: list index out of range
   ‚Üí Scraping Top Songs for Alex Warren
‚ö†Ô∏è Error scraping Alex Warren: list index out of range
   ‚Üí Scraping Top Songs for sombr
‚ö†Ô∏è Error scraping sombr: list index out of range
   ‚Üí Scraping Top Songs for The Neighbourhood
‚ö†Ô∏è Error scraping The Neighbourhood: list index out of range
   ‚Üí Scraping Top Songs for The Goo Goo Dolls
‚ö†Ô∏è Error scraping The Goo Goo Dolls: list index out of range
   ‚Üí Scraping Top Songs for Brenda Lee
‚ö†Ô∏è Error scraping Brenda Lee: list index out of range
   ‚Üí Scraping Top Songs for Radiohead
‚ö†Ô∏è Error scraping Radiohead: list index out of range
   ‚Üí Scraping Top Songs for The Mar√É¬≠as
‚ö†Ô∏è Error scraping

KeyboardInterrupt: 

In [8]:
import requests
from bs4 import BeautifulSoup

url = "https://kworb.net/spotify/artist/06HL4z0CvFAxyc27GXpf02_songs.html"
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

tables = soup.find_all("table")
print("Tables found:", len(tables))

print("\n===== first 500 chars =====")
print(r.text[:500])


Tables found: 2

===== first 500 chars =====
<!DOCTYPE html>
<html lang="en"><head><title>Taylor Swift - Spotify Top Songs</title>
<link rel="stylesheet" type="text/css" href="/css/standard0002.css" />
<link rel="stylesheet" type="text/css" href="/css/tables0010.css" />
<link rel="stylesheet" type="text/css" href="/css/menus0001.css" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="description" content="A website that collects and analyzes music data from around the world. All of the charts, sales and stre


In [1]:
pip install selenium webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time

def fetch_html_selenium(url):
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    driver.get(url)
    time.sleep(2)  # allow JS rendering

    html = driver.page_source
    driver.quit()
    return html

In [3]:
url = "https://kworb.net/spotify/artist/06HL4z0CvFAxyc27GXpf02_songs.html"
html = fetch_html_selenium(url)

soup = BeautifulSoup(html, "html.parser")
tables = soup.find_all("table")

print("Tables found:", len(tables))

Tables found: 2


In [4]:
def scrape_artist_top_songs_selenium(artist_name, artist_id, artist_url):
    print(f"Scraping Top Songs for {artist_name} ...")

    html = fetch_html_selenium(artist_url)
    soup = BeautifulSoup(html, "html.parser")
    tables = soup.find_all("table")

    if len(tables) < 3:
        print("‚ö†Ô∏è Top Songs table missing!")
        return pd.DataFrame()

    top_table = tables[2]
    rows = top_table.find_all("tr")[1:]

    songs = []
    for row in rows[:10]:
        cols = row.find_all("td")
        if len(cols) < 4:
            continue

        songs.append({
            "artist_name": artist_name,
            "artist_id": artist_id,
            "rank": cols[0].text.strip(),
            "song_title": cols[1].text.strip(),
            "streams_total": cols[2].text.strip(),
            "daily_streams": cols[3].text.strip(),
            "artist_url": artist_url
        })

    return pd.DataFrame(songs)

In [10]:
import requests
from bs4 import BeautifulSoup

url = "https://kworb.net/spotify/artist/06HL4z0CvFAxyc27GXpf02_songs.html"

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36",
    "Accept": "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cookie": "DSID=AEhM4Mc962PI6BW1oNIw0j0cJXb0OYMDAUmU3XDRL4V7Pwmn1S40p2QByuGRiP2QSpPVEug_3RqsqXnNYZzW0DcbD-OPiKm6Uco8cmPpISSBY2KaqMk9r1ozwl16wDSVUEmR7ofD_4uF5vKOph-Y72R9R68HaBPQYRE-WpQUYwlROEjdZvpgLqpMa3f2X0iiP5pvPKuBmm9ieF0FcFVriQBWmruIPb-lskp0mEMF6Rq4gi9Rbcz3I1XmGCO1TNzFw74aQKNB4dvBmK-CLFk0fQEoyFWXOV4x-ZjivrdTapuiIIpGlNjA858; ar_debug=1; IDE=AHWqTUmvIq06TXInFKPSC59elllUkCbFx00g4qGZeN8nao0SP0UF7ec58PQSD_mT4_I; APC=AfxxVi5DECpIQVYAsPqu7dPM-yGt3fS3XQKATpM6eGG8w1s-PFib6A",   # ÂøÖÈ°ª
}


In [13]:
r = requests.get(url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

print("Tables found:", len(soup.find_all("table")))

Tables found: 2


In [None]:
### Top 10 Albums

### Spotify Top 10 Albums per each artist in US daily chart (200 artists)

In [7]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from datetime import datetime

In [8]:
# === Step 1: Load your existing Kworb daily artist file ===
today = datetime.now().strftime("%Y%m%d")

input_path = f"outputs_kworb/spotify_us_daily_artist_{today}.csv"
df_chart = pd.read_csv(input_path)

print("Loaded file:", input_path)
print("Total artists:", len(df_chart))

Loaded file: outputs_kworb/spotify_us_daily_artist_20251120.csv
Total artists: 200


In [9]:
BASE = "https://kworb.net"

def normalize_url(href):
    if not href:
        return None

    url = urljoin(BASE + "/", href)

    # Fix missing /spotify/
    if "/track/" in url and "/spotify/" not in url:
        url = url.replace("/track/", "/spotify/track/")
    if "/artist/" in url and "/spotify/" not in url:
        url = url.replace("/artist/", "/spotify/artist/")

    return url

In [10]:
def extract_artist_id(artist_url):
    """
    Extract Spotify artist_id from URL:
    https://kworb.net/spotify/artist/<artist_id>_albums.html
    """
    if not artist_url:
        return None
    filename = artist_url.split("/")[-1]
    artist_id = filename.replace("_albums.html", "").strip()
    return artist_id


In [11]:
def scrape_artist_top10(artist_name, artist_id, artist_url):
    try:
        r = requests.get(artist_url, headers={"User-Agent": "Mozilla/5.0"})
        soup = BeautifulSoup(r.text, "html.parser")

        # Âè™‰ºöÊâæÂà∞‰∏ÄÂº† tableÔºåËøôÊòØÊ≠£Á°ÆÁöÑ
        table = soup.find("table")
        if table is None:
            print(f"‚ö†Ô∏è No table for {artist_name}")
            return pd.DataFrame([])

        rows = table.find_all("tr")[1:]  # skip header

        results = []
        for tr in rows[:10]:
            tds = tr.find_all("td")

            album_title = tds[0].text.strip()
            streams = tds[1].text.strip()
            daily = tds[2].text.strip()

            results.append({
                "artist_name": artist_name,
                "artist_id": artist_id,
                "album_title": album_title,
                "streams": streams,
                "daily": daily,
                "artist_url": artist_url
            })

        return pd.DataFrame(results)

    except Exception as e:
        print(f"‚ùó Error scraping {artist_name}: {e}")
        return pd.DataFrame([])


In [12]:
def scrape_all_artists_top10(df_chart):
    all_rows = []

    # Ensure deduplication by artist_url
    artists = df_chart.dropna(subset=["artist_url"]).drop_duplicates("artist_url")

    print("Total unique artist URLs:", len(artists))

    for _, row in artists.iterrows():
        artist_name = row["artist_name"]
        artist_url = normalize_url(row["artist_url"])
        artist_id = extract_artist_id(artist_url)

        print(f"üéµ Scraping Top 10 for {artist_name} ({artist_id})...")

        df_artist = scrape_artist_top10(artist_name, artist_id, artist_url)
        all_rows.append(df_artist)

    return pd.concat(all_rows, ignore_index=True)


In [13]:
df_top10 = scrape_all_artists_top10(df_chart)

output_path = f"outputs_kworb/spotify_artist_top10_ablums_{today}.csv"
df_top10.to_csv(output_path, index=False)

print("üéâ DONE! File saved:", output_path)
print("Total rows:", len(df_top10))

Total unique artist URLs: 115
üéµ Scraping Top 10 for Taylor Swift (06HL4z0CvFAxyc27GXpf02)...
üéµ Scraping Top 10 for HUNTR/X (2yNNYQBChuox9A5Ka93BIn)...
üéµ Scraping Top 10 for Olivia Dean (00x1fYSGhdqScXBRpSj3DW)...
üéµ Scraping Top 10 for Alex Warren (0fTSzq9jAh4c36UVb4V7CB)...
üéµ Scraping Top 10 for sombr (4G9NDjRyZFDlJKMRL8hx3S)...
üéµ Scraping Top 10 for The Neighbourhood (77SW9BnxLY8rJ0RciFqkHh)...
üéµ Scraping Top 10 for The Goo Goo Dolls (2sil8z5kiy4r76CRTXxBCA)...
üéµ Scraping Top 10 for Brenda Lee (4cPHsZM98sKzmV26wlwD2W)...
üéµ Scraping Top 10 for Radiohead (4Z8W4fKeB5YxbusRsdQVPb)...
üéµ Scraping Top 10 for The Mar√É¬≠as (2sSGPbdZJkaSE2AbcGOACx)...
üéµ Scraping Top 10 for Mariah Carey (4iHNK0tOyZPYnBU7nGAgpQ)...
üéµ Scraping Top 10 for Tame Impala (5INjqkS1o8h1imAzPqGZBb)...
üéµ Scraping Top 10 for Wham! (5lpH0xAS4fVfLkACg9DAuM)...
üéµ Scraping Top 10 for Billie Eilish (6qqNVTkY8uBg9cP3Jd7DAH)...
üéµ Scraping Top 10 for Fleetwood Mac (08GQAI4eElDnROBrJRGE0

In [15]:
# ==== SAVE JSON ====
json_path = f"outputs_kworb/spotify_artist_top10_albums_{today}.json"
df_top10.to_json(json_path, orient="records", force_ascii=False, indent=2)

print("üéâ JSON saved:", json_path)

üéâ JSON saved: outputs_kworb/spotify_artist_top10_albums_20251120.json


### Spotify top artists by monthly listeners

In [8]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime

URL = "https://kworb.net/spotify/listeners.html"
HEADERS = {"User-Agent": "Mozilla/5.0"}

def scrape_monthly_listeners():
    r = requests.get(URL, headers=HEADERS)
    soup = BeautifulSoup(r.text, "html.parser")

    table = soup.find("table")
    if table is None:
        raise ValueError("‚ùó No table found on listeners page!")

    rows = table.find_all("tr")[1:]  # skip header

    data = []
    for tr in rows:
        tds = tr.find_all("td")
        if len(tds) < 6:
            continue

        rank = tds[0].text.strip()

        # Artist name is inside <a> tag
        artist_name = tds[1].text.strip()
        artist_url = tds[1].find("a")["href"] if tds[1].find("a") else None

        listeners = tds[2].text.strip()
        daily_change = tds[3].text.strip()
        peak_rank = tds[4].text.strip()
        peak_listeners = tds[5].text.strip()

        data.append({
            "rank": rank,
            "artist_name": artist_name,
            "artist_url": "https://kworb.net/spotify/" + artist_url if artist_url else None,
            "listeners": listeners,
            "daily_change": daily_change,
            "peak_rank": peak_rank,
            "peak_listeners": peak_listeners
        })

    return pd.DataFrame(data)


# ==== RUN & SAVE ====
df = scrape_monthly_listeners()

today = datetime.now().strftime("%Y%m%d")
output_path = f"outputs_kworb/spotify_monthly_listeners_{today}.csv"

df.to_csv(output_path, index=False)

print("üéâ DONE! File saved:", output_path)
print("Total rows:", len(df))


üéâ DONE! File saved: outputs_kworb/spotify_monthly_listeners_20251120.csv
Total rows: 2500


In [9]:
# ==== SAVE JSON ====
json_path = f"outputs_kworb/spotify_monthly_listeners_{today}.json"
df.to_json(json_path, orient="records", force_ascii=False, indent=2)

print("üéâ JSON saved:", json_path)

üéâ JSON saved: outputs_kworb/spotify_monthly_listeners_20251120.json
