In [1]:
!pip install tqdm
import logging
import os
import time
from datetime import datetime

import pandas as pd
import requests
from tqdm import tqdm



In [2]:
# 1. Spotify API Auth

In [3]:
# Spotify app credentials are read from the environment so that secrets are
# never stored in the notebook itself. Export SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# saved with the notebook and should be rotated in the Spotify dashboard.
CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "")
CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "")

if not (CLIENT_ID and CLIENT_SECRET):
    # Warn here, next to the configuration, rather than only failing later
    # when the credentials are actually sent.
    logging.warning(
        "SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET are not set; "
        "requesting an access token will fail."
    )

def get_token(client_id, client_secret, timeout=30):
    """Request an access token from the Spotify accounts service.

    Posts a ``client_credentials`` grant to the token endpoint, sending the
    client ID and secret as HTTP Basic credentials.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the endpoint before giving up, so a
            stalled connection cannot hang the kernel indefinitely.

    Returns:
        The value of ``access_token`` in the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status.
        requests.RequestException: On connection failure or timeout.
        KeyError: If the response JSON has no ``access_token`` field.
    """
    url = "https://accounts.spotify.com/api/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {"grant_type": "client_credentials"}

    response = requests.post(
        url,
        headers=headers,
        data=data,
        auth=(client_id, client_secret),
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()["access_token"]

# Fetch one bearer token up front; every API helper below sends it through
# the shared `headers` dict.
token = get_token(client_id=CLIENT_ID, client_secret=CLIENT_SECRET)
headers = {"Authorization": "Bearer " + token}

In [4]:
# 2. API functions

In [5]:
def _spotify_get(url, params=None, timeout=30, max_retries=3, max_wait=60):
    """GET a Spotify Web API URL and return the decoded JSON body, or None.

    The request is authorized with the module-level ``headers`` dict.

    Args:
        url: Full endpoint URL.
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for each request.
        max_retries: How many times to retry after a 429 or a network error.
        max_wait: Longest ``Retry-After`` delay (seconds) worth waiting for;
            a longer delay is treated as a failure.

    Returns:
        The parsed JSON on HTTP 200; None when the request ultimately fails.
        Every failure is logged so that gaps in the results can be traced.
    """
    for attempt in range(max_retries + 1):
        try:
            r = requests.get(url, headers=headers, params=params, timeout=timeout)
        except requests.RequestException as exc:
            if attempt < max_retries:
                time.sleep(1 + attempt)  # brief, growing pause before retrying
                continue
            logging.warning("Spotify GET %s failed: %s", url, exc)
            return None

        if r.status_code == 200:
            return r.json()

        if r.status_code == 429 and attempt < max_retries:
            # Rate limited: wait for the period the server asks for.
            try:
                wait = int(r.headers.get("Retry-After", 1))
            except ValueError:
                wait = 1
            if wait <= max_wait:
                time.sleep(wait)
                continue

        logging.warning("Spotify GET %s returned HTTP %s", url, r.status_code)
        return None

    # Only reachable when max_retries is negative (the loop never runs).
    return None


def get_artist_info(artist_id):
    """Return the artist object for ``artist_id`` as a dict, or None on failure."""
    return _spotify_get(f"https://api.spotify.com/v1/artists/{artist_id}")


def get_artist_top_tracks(artist_id, market="US"):
    """Return the artist's top tracks for ``market``.

    Returns:
        The ``tracks`` list from the response (an empty list when the key is
        absent), or None when the request failed.
    """
    payload = _spotify_get(
        f"https://api.spotify.com/v1/artists/{artist_id}/top-tracks",
        params={"market": market},
    )
    if payload is None:
        return None
    return payload.get("tracks", [])


def get_artist_albums(artist_id, limit=10):
    """Return up to ``limit`` of the artist's releases in the ``album`` group.

    Returns:
        The ``items`` list from the response (an empty list when the key is
        absent), or None when the request failed.
    """
    payload = _spotify_get(
        f"https://api.spotify.com/v1/artists/{artist_id}/albums",
        params={"include_groups": "album", "limit": limit},
    )
    if payload is None:
        return None
    return payload.get("items", [])

In [6]:
# 3. Load Kworb CSV

In [7]:
# The Kworb export is date-stamped (YYYYMMDD); load the file written today.
today = f"{datetime.now():%Y%m%d}"
kworb_csv_path = "outputs_kworb/spotify_monthly_listeners_" + today + ".csv"
df = pd.read_csv(kworb_csv_path)

In [8]:
# 4. Extract Spotify Artist ID from Kworb URL

In [9]:
def extract_artist_id(kworb_url):
    """Extract the Spotify artist ID from a Kworb artist page URL.

    Kworb format:
    https://kworb.net/artist/<artist_id>_albums.html
    https://kworb.net/artist/<artist_id>_songs.html

    Args:
        kworb_url: The URL, or a missing value (None / NaN).

    Returns:
        The artist ID string, or None when the value is missing or what is
        left after stripping the known suffix is not a plain ASCII
        alphanumeric token (empty string, other page types, stray
        punctuation). Returning None lets the caller drop such rows with
        ``dropna`` instead of sending junk IDs to the API.
    """
    if pd.isna(kworb_url):
        return None

    # Ignore any query string or fragment after the page name.
    page_url = str(kworb_url).strip().split("?", 1)[0].split("#", 1)[0]
    filename = page_url.split("/")[-1]  # e.g. 1Xyo4u8uXC1ZmMpatF05PJ_albums.html

    for suffix in ("_albums.html", "_songs.html"):
        if filename.endswith(suffix):
            filename = filename[: -len(suffix)]
            break

    artist_id = filename.strip()
    if not (artist_id.isascii() and artist_id.isalnum()):
        return None
    return artist_id

# Derive the Spotify ID for each row and keep only the rows that have one.
df = (
    df.assign(artist_id=df["artist_url"].apply(extract_artist_id))
      .dropna(subset=["artist_id"])
)

# Distinct IDs, in order of first appearance.
artist_ids = df["artist_id"].drop_duplicates().tolist()
print("Total valid artist IDs:", len(artist_ids))


Total valid artist IDs: 2500


In [10]:
# 5. Containers

In [11]:
# Row accumulators (lists of dicts) filled by the artist loop:
# one row per artist, per top track, and per album respectively.
artist_info_list, top_tracks_list, albums_list = [], [], []

In [12]:
# 6. Loop Artists

In [13]:
# (artist_id, lookup_name) for every lookup that came back empty-handed, so
# that artists missing from the exports can be audited or retried instead of
# vanishing silently.
failed_lookups = []

for artist_id in tqdm(artist_ids):

    # --- Artist Info ---
    info = get_artist_info(artist_id)
    if info:
        artist_info_list.append({
            "artist_id": artist_id,
            "name": info.get("name"),
            "genres": info.get("genres"),
            # `or {}` also covers an explicit null, not just a missing key
            "followers": (info.get("followers") or {}).get("total"),
            "popularity": info.get("popularity"),
        })
    else:
        failed_lookups.append((artist_id, "artist_info"))

    # --- Top Tracks (at most 10 per artist) ---
    tracks = get_artist_top_tracks(artist_id)
    if tracks is None:  # None means the request failed; [] means no tracks
        failed_lookups.append((artist_id, "top_tracks"))
    for t in (tracks or [])[:10]:
        album = t.get("album") or {}
        top_tracks_list.append({
            "artist_id": artist_id,
            "track_id": t.get("id"),
            "track_name": t.get("name"),
            "popularity": t.get("popularity"),
            "album_name": album.get("name"),
            "release_date": album.get("release_date"),
        })

    # --- Albums (at most 10 per artist, in the order the API returns them) ---
    albums = get_artist_albums(artist_id)
    if albums is None:
        failed_lookups.append((artist_id, "albums"))
    for a in (albums or [])[:10]:
        albums_list.append({
            "artist_id": artist_id,
            "album_id": a.get("id"),
            "album_name": a.get("name"),
            "release_date": a.get("release_date"),
            "total_tracks": a.get("total_tracks"),
        })

    time.sleep(0.1)  # pause between artists to avoid the API rate limit

if failed_lookups:
    failed_artists = {failed_id for failed_id, _ in failed_lookups}
    print(
        f"{len(failed_lookups)} lookups failed across {len(failed_artists)} "
        "artists; details are in failed_lookups"
    )

100%|██████████| 2500/2500 [12:18<00:00,  3.38it/s]


In [None]:
# 7. Save Results

In [14]:
today = datetime.now().strftime("%Y%m%d")

# Create the output folder if needed so the first write cannot fail on a
# fresh checkout.
output_dir = "outputs_spotify"
os.makedirs(output_dir, exist_ok=True)

# (file stem, rows) for each export; every table is written as CSV and JSON.
exports = [
    ("spotify_artist_info", artist_info_list),
    ("spotify_artist_tracks", top_tracks_list),
    ("spotify_artist_albums", albums_list),
]

for stem, records in exports:
    frame = pd.DataFrame(records)  # build once, write in both formats
    base_path = f"{output_dir}/{stem}_{today}"
    frame.to_csv(f"{base_path}.csv", index=False)
    frame.to_json(f"{base_path}.json", orient="records", indent=2)

print("🎉 ALL DONE — All Spotify data exported!")

🎉 ALL DONE — All Spotify data exported!


In [19]:
# Number of artists for which an info record was collected.
len(artist_info_list)

1668

In [20]:
# Number of top-track rows collected (at most 10 per artist).
len(top_tracks_list)

16595

In [21]:
# Number of album rows collected (at most 10 per artist).
len(albums_list)

10740