In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials


import os

# Credentials are read from the environment when available so real secrets
# never have to be pasted into (and saved with) the notebook. The literal
# placeholders remain as the fallback and must be replaced or overridden
# before any request can succeed.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ.get("SPOTIPY_CLIENT_ID", "client_id"),
        client_secret=os.environ.get("SPOTIPY_CLIENT_SECRET", "client_secret")
    ),
    # Per-request timeout handed to the Spotify client.
    requests_timeout=5
)

In [None]:
import requests
import time
from typing import Callable


class ApiExecutor:
    """
    A reusable API executor with rate limiting and retry mechanism.

    The delay between requests is derived from ``requests_per_second`` every
    time a request is executed, so assigning a new value to that attribute
    after construction changes the pacing of subsequent calls.
    """

    def __init__(self, requests_per_second: float = 1, max_retries: int = 3):
        """
        Initialize the executor with rate limiting and retry settings.

        Args:
            requests_per_second (float): Maximum allowed requests per second.
                Fractions are allowed (0.5 means one request every 2 seconds).
            max_retries (int): Maximum number of retries for a rate-limited request.

        Raises:
            ZeroDivisionError: If ``requests_per_second`` is 0.
        """
        self.requests_per_second = requests_per_second
        self.max_retries = max_retries
        # Evaluate the delay once so a zero rate still fails at construction time.
        _ = self.__request_delay

    @property
    def __request_delay(self) -> float:
        """Seconds to wait before each request, computed from the current rate."""
        return 1 / self.requests_per_second

    @staticmethod
    def _status_code_of(error: Exception):
        """Return the HTTP status carried by ``error``, or None if it has none.

        Supports exceptions exposing ``http_status`` as well as exceptions
        exposing ``response.status_code``.
        """
        status = getattr(error, 'http_status', None)
        if status is None:
            response = getattr(error, 'response', None)
            status = getattr(response, 'status_code', None)
        return status

    @staticmethod
    def _retry_after_seconds(error: Exception, default: int = 30) -> int:
        """Return the ``Retry-After`` wait in whole seconds, or ``default``."""
        headers = getattr(error, 'headers', None)
        if not headers:
            response = getattr(error, 'response', None)
            headers = getattr(response, 'headers', None)
        raw_value = (headers or {}).get('Retry-After', default)
        try:
            # Clamp at zero: time.sleep rejects negative durations.
            return max(0, int(raw_value))
        except (TypeError, ValueError):
            # Header present but not an integer number of seconds.
            return default

    def execute_request(self, api_function: Callable, *args, **kwargs):
        """
        Execute an API function with rate limiting and retries.

        Only HTTP 429 (Too Many Requests) responses are retried; any other
        failure is printed and ends the attempt.

        Args:
            api_function (Callable): A function that performs the API request.
            *args: Positional arguments for the API function.
            **kwargs: Keyword arguments for the API function.

        Returns:
            Any: The response from the API function, or None if the call
            failed or all retries were exhausted.
        """
        retries = 0
        while retries <= self.max_retries:
            # Read the rate on every attempt so later changes take effect.
            delay = self.__request_delay
            try:
                # Apply rate limiting
                time.sleep(delay)
                # Execute the API function
                return api_function(*args, **kwargs)
            except Exception as e:
                status = self._status_code_of(e)
                if status == 429:  # Too Many Requests
                    retry_after = self._retry_after_seconds(e)  # Default to 30 seconds
                    print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                    time.sleep(retry_after)
                    retries += 1
                elif status is not None:
                    print(f"HTTP error: {e}")
                    break
                else:
                    print(f"Unexpected error: {e}")
                    break
        return None

In [27]:
# Shared executor used by the fetch cells below: 1 request per second, at most 3 retries.
executor = ApiExecutor(requests_per_second=1, max_retries=3)

In [28]:
import csv
import os


# Append data to CSV
def append_to_csv(file_path: str, data: dict):
    """
    Append a single row of data to the CSV file.

    The header row (the keys of ``data``) is written first when the file does
    not exist yet or exists but is empty, so the result always starts with a
    header line.

    Args:
        file_path (str): Path to the CSV file.
        data (dict): Data to append; keys are the column names.
    """
    # A zero-byte file (e.g. left by an interrupted run) still needs its header,
    # otherwise the first data row would later be read as the column names.
    needs_header = not os.path.isfile(file_path) or os.path.getsize(file_path) == 0
    with open(file_path, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=list(data.keys()))
        if needs_header:
            writer.writeheader()
        writer.writerow(data)

In [None]:
def replace_in_csv(file_path: str, data: dict, match_condition: Callable[[dict, dict], bool]):
    """
    Replace a row in the CSV file if it matches the specified condition.

    Every row for which ``match_condition`` returns True is replaced by
    ``data``; when no row matches, ``data`` is appended. If the file does not
    exist it is created containing only ``data``. The rewrite goes through a
    ``<file_path>.tmp`` file that replaces the original once complete.

    Args:
        file_path (str): Path to the CSV file.
        data (dict): The new data to write.
        match_condition (Callable[[dict, dict], bool]): A function that takes two dictionaries (existing row and new data)
                                                        and returns True if the row should be replaced.
    """
    temp_file_path = f"{file_path}.tmp"

    # Nothing to merge with: create the file from the new row alone.
    if not os.path.isfile(file_path):
        with open(file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=list(data.keys()))
            writer.writeheader()
            writer.writerow(data)
        return

    replaced_original = False
    try:
        with open(file_path, mode='r', newline='', encoding='utf-8') as file, \
                open(temp_file_path, mode='w', newline='', encoding='utf-8') as temp_file:
            reader = csv.DictReader(file)
            header = list(reader.fieldnames or [])
            # Keep the file's column order and add any column only the new row has,
            # so every row (replaced, kept or appended) lines up with the header.
            fieldnames = header + [key for key in data if key not in header]
            writer = csv.DictWriter(temp_file, fieldnames=fieldnames)

            writer.writeheader()
            updated = False
            for row in reader:
                if match_condition(row, data):
                    writer.writerow(data)  # Replace the row with new data
                    updated = True
                else:
                    writer.writerow(row)  # Keep the original row

            # If no match was found, append the new data with the same writer
            if not updated:
                writer.writerow(data)

        # Replace the original file with the updated file
        os.replace(temp_file_path, file_path)
        replaced_original = True
    finally:
        # Do not leave a half-written temp file behind when something failed.
        if not replaced_original and os.path.exists(temp_file_path):
            os.remove(temp_file_path)


def replace_or_append_rows_in_csv(
    file_path: str,
    rows: list[dict],
    match_condition: Callable[[dict, dict], bool],
    max_retries: int = 3
):
    """
    Replace or append multiple rows in the CSV file based on a matching condition.

    Each new row replaces the first stored row it matches, otherwise it is
    appended. The output keeps the file's existing column order and adds any
    column that only appears in ``rows``; values a row does not have are left
    empty. An empty ``rows`` list is a no-op.

    Args:
        file_path (str): Path to the CSV file.
        rows (List[dict]): A list of rows (dictionaries) to write.
        match_condition (Callable[[dict, dict], bool]): A function that takes two dictionaries
                                                        (existing row and new data) and returns
                                                        True if the row should be replaced.
        max_retries (int): Maximum number of retries if file access is denied.

    Raises:
        PermissionError: If file access is still denied after ``max_retries`` retries.
    """
    if not rows:
        return  # Nothing to merge; leave the file untouched.

    temp_file_path = f"{file_path}.tmp"
    retries = 0

    while retries <= max_retries:
        try:
            existing_rows = []
            header = []

            # Read the existing file if it exists
            if os.path.isfile(file_path):
                with open(file_path, mode='r', newline='', encoding='utf-8') as file:
                    reader = csv.DictReader(file)
                    header = list(reader.fieldnames or [])
                    existing_rows = list(reader)

            for new_row in rows:
                for i, existing_row in enumerate(existing_rows):
                    if match_condition(existing_row, new_row):
                        existing_rows[i] = new_row  # Replace existing row
                        break
                else:
                    # If no match, append as a new row
                    existing_rows.append(new_row)

            # Union of columns: file header first, then columns new to this batch.
            fieldnames = list(header)
            for new_row in rows:
                for key in new_row:
                    if key not in fieldnames:
                        fieldnames.append(key)

            # Write all rows to the temporary file
            with open(temp_file_path, mode='w', newline='', encoding='utf-8') as temp_file:
                writer = csv.DictWriter(temp_file, fieldnames=fieldnames)
                writer.writeheader()
                writer.writerows(existing_rows)

            # Replace the original file with the updated file
            os.replace(temp_file_path, file_path)
            break  # Exit the loop if successful

        except PermissionError as e:
            retries += 1
            print(f"Permission error: {e}. Retrying {retries}/{max_retries}...")
            if retries > max_retries:
                raise  # Reraise the exception after exhausting retries


def match_by_id_group_list_name(existing_row, new_data):
    """Return True when both rows share the same 'id' and 'group_wiki_list_name'.

    Uses boolean ``and`` (not bitwise ``&``), so the name comparison is only
    evaluated when the ids already match.
    """
    return (
        existing_row['id'] == new_data['id']
        and existing_row['group_wiki_list_name'] == new_data['group_wiki_list_name']
    )


def match_by_id(existing_row, new_data):
    """Return whether the stored row and the incoming row carry the same 'id'."""
    stored_id = existing_row['id']
    incoming_id = new_data['id']
    return stored_id == incoming_id

In [51]:
from datetime import datetime, timezone


# Fetch artist data from Spotify
def fetch_artist(search_name: str, sp: spotipy.Spotify, market: str | None = 'KR', genre: str | None = 'k-pop', limit: int = 1) -> dict | None:
    """
    Search for an artist on Spotify and fetch their data.

    Only the first search hit is used, whatever ``limit`` is.

    Args:
        search_name (str): Name of the artist to search.
        sp (spotipy.Spotify): Spotify client instance.
        market (str | None): Spotify market code (e.g., 'KR' for South Korea). Default is 'KR'.
            A falsy value searches without a market restriction.
        genre (str | None): Genre filter for the search. Default is 'k-pop'.
        limit (int): Number of results to request. Default is 1.

    Returns:
        dict | None: Artist data for the first hit, or None if nothing was found.
    """
    # Combine search filters for precision
    query = f'artist:{search_name}'
    if genre:
        query += f' genre:{genre}'

    # spotipy's search() defaults market to None, so passing None is the same
    # as omitting the argument; one call covers both cases.
    results = sp.search(q=query, type='artist', limit=limit, market=market or None)

    items = results['artists']['items']
    if not items:
        return None

    artist = items[0]
    return {
        'url': artist['external_urls']['spotify'],
        'followers': artist['followers'].get('total', 0),
        'genres': artist.get('genres', []),
        'id': artist['id'],
        'name': artist['name'],
        'popularity': artist['popularity'],
        # Keep the name we searched with so results can be joined back to the source list.
        'group_wiki_list_name': search_name,
        'updated_at': datetime.now(timezone.utc)
    }

In [None]:
import pandas as pd

# Load the group list that drives the Spotify lookups; its 'group' column supplies the search names.
group_wiki_df = pd.read_csv('./data/group_info_from_2000.csv')
group_wiki_df.head()

Unnamed: 0,start_year,group,wiki_url,decade,group_wiki_name,bg_start_year,bg_end_year,bg_company,bg_hanja,bg_hangul,bg_members
0,2000,Chakra,https://en.wikipedia.org/wiki/Chakra_(group),2000s,Chakra_(group),2000.0,2006.0,,,,"['Hwangbo', 'Bona', 'Eun', 'Ryeowon', 'Eani']"
1,2000,Papaya,https://en.wikipedia.org/wiki/Papaya_(group),2000s,Papaya_(group),2000.0,,,,,"['Kang Kyoung-ah', 'Joo Yeun-jung', 'Cho Hye-k..."
2,2000,UN,https://en.wikipedia.org/wiki/UN_(band),2000s,UN_(band),2000.0,2005.0,NH Planning,,,[]
3,2001,5tion,https://en.wikipedia.org/wiki/5tion,2000s,5tion,2001.0,,,,,"['Il Kwon', 'Chang Woo', 'Ju Ho', 'Jun Young',..."
4,2001,Jewelry,https://en.wikipedia.org/wiki/Jewelry_(group),2000s,Jewelry_(group),2001.0,,Star Empire,,,"['Kim Ye-won', 'Baby J', 'Kim Eunjung', 'Jung ..."


In [None]:
# Fetch artist info from spotify
artist_data_path = './data/spotify_artist_data.csv'
spotify_artist_data = pd.read_csv(artist_data_path) if os.path.isfile(artist_data_path) else None
artist_names = group_wiki_df['group'].drop_duplicates().copy()

# The saved table can only be used as a cache when it has the timestamp column
# the freshness check reads; otherwise every artist is fetched again.
cache_usable = spotify_artist_data is not None and 'updated_at' in spotify_artist_data
if cache_usable:
    spotify_artist_data['updated_at'] = pd.to_datetime(spotify_artist_data['updated_at'])
    # Built once: the table is not modified in memory while the loop runs.
    cached_names = set(spotify_artist_data['group_wiki_list_name'])
else:
    cached_names = set()

for index, artist_name in enumerate(artist_names, start=1):
    print(f'Processing data for {artist_name} ({index} / {len(artist_names)})...')
    if artist_name in cached_names:
        artist_id = spotify_artist_data[spotify_artist_data['group_wiki_list_name'] == artist_name]['id'].iloc[0]
        # Several list names can resolve to the same Spotify artist.
        same_artist_id_data = spotify_artist_data[spotify_artist_data['id'] == artist_id]
        # Rows refreshed within the last hour are reused instead of re-fetched.
        if same_artist_id_data['updated_at'].iloc[0] >= pd.to_datetime(datetime.now(timezone.utc)) - pd.Timedelta(hours=1):
            if same_artist_id_data['group_wiki_list_name'].iloc[0] != artist_name:
                print(f"Copy same data for {artist_name} just change group wili list name...")
                artist_data = same_artist_id_data.iloc[0,:].copy().to_dict()
                artist_data['group_wiki_list_name'] = artist_name
                replace_in_csv(artist_data_path, artist_data, match_by_id_group_list_name)
            else:
                print("Will not be updated, because no more ago have been updated")
            continue

    print(f"Fetching data for {artist_name}...")
    artist_data = executor.execute_request(fetch_artist, artist_name, sp)
    if artist_data:
        replace_in_csv(artist_data_path, artist_data, match_by_id_group_list_name)
        print(f"Data for '{artist_name}' saved.")
    else:
        print(f"No data found for '{artist_name}'.")

Processing data for Chakra (1 / 511)...
Will not be updated, because no more ago have been updated
Processing data for Papaya (2 / 511)...
Fetching data for Papaya...
No data found for 'Papaya'.
Processing data for UN (3 / 511)...
Will not be updated, because no more ago have been updated
Processing data for 5tion (4 / 511)...
Will not be updated, because no more ago have been updated
Processing data for Jewelry (5 / 511)...
Will not be updated, because no more ago have been updated
Processing data for jtL (6 / 511)...
Will not be updated, because no more ago have been updated
Processing data for K'Pop (7 / 511)...
Fetching data for K'Pop...
No data found for 'K'Pop'.
Processing data for KISS (8 / 511)...
Will not be updated, because no more ago have been updated
Processing data for M.I.L.K (9 / 511)...
Fetching data for M.I.L.K...
No data found for 'M.I.L.K'.
Processing data for Black Beat (10 / 511)...
Fetching data for Black Beat...
No data found for 'Black Beat'.
Processing data fo

In [None]:
import pandas as pd

# Reload the artist table from disk so the album cells below work from the saved rows.
spotify_artist_data = pd.read_csv('./data/spotify_artist_data.csv')
spotify_artist_data.head()

Unnamed: 0,url,followers,genres,id,name,popularity,group_wiki_list_name,updated_at
0,https://open.spotify.com/artist/0yn2cFHoVqRiZO...,6,['classic konkani pop'],0yn2cFHoVqRiZOgkVs58Uc,Deb Chakravarty,0,Chakra,2024-12-04 11:49:38.034351+00:00
1,https://open.spotify.com/artist/0jhgI3xU8n2o1W...,12572,"['k-indie', 'korean city pop']",0jhgI3xU8n2o1W6EOw9dIf,Light & Salt,30,UN,2024-12-04 11:49:40.147774+00:00
2,https://open.spotify.com/artist/6MJCgDOw7EfTJJ...,2068,['k-pop boy group'],6MJCgDOw7EfTJJtKPHXAcD,5tion,10,5tion,2024-12-04 11:49:41.219361+00:00
3,https://open.spotify.com/artist/01iyQzyns6Ab0L...,7688,['classic k-pop'],01iyQzyns6Ab0LxjYvHcg9,Jewelry,10,Jewelry,2024-12-04 11:49:42.297991+00:00
4,https://open.spotify.com/artist/1QxDhdpQDAxevR...,9118,"['classic k-pop', 'k-rap']",1QxDhdpQDAxevRaOWUDjPC,JTL,30,jtL,2024-12-04 11:49:43.379053+00:00


In [None]:
def fetch_artist_albums(
    artist_id: str,
    sp: spotipy.Spotify,
    executor: ApiExecutor,
    include_groups: list[str] = ['single', 'album'],
    limit: int = 20,
    album_data_path: str = 'artist_albums.csv'
) -> pd.DataFrame:
    """
    Fetch the albums of an artist using Spotify API and append new ones to a CSV.

    Albums whose id is already present in ``album_data_path`` are skipped. The
    whole fetch is skipped when the API's reported total equals the number of
    rows already stored. Any error ends the fetch early; it is printed, not raised.

    Args:
        artist_id (str): Spotify artist ID.
        sp (spotipy.Spotify): Spotify client instance.
        executor (ApiExecutor): API executor for rate limiting and retries.
        include_groups (list): Album types to include, default ['single', 'album'].
        limit (int): Number of results to fetch per API call, default is 20.
        album_data_path (str): Filepath for saving results, default 'artist_albums.csv'.

    Returns:
        pd.DataFrame: The albums newly added during this call (empty when
        nothing new was found); previously stored albums are not included.
    """
    offset = 0
    all_albums = []
    processed_album_ids = set()
    total_albums = 0
    new_albums_count = 0  # Track number of new albums added

    # Load existing CSV data to avoid duplicates
    try:
        existing_albums = pd.read_csv(album_data_path)
        processed_album_ids = set(existing_albums['id'])
        total_albums = len(existing_albums)
        print(f"Loaded existing data with {len(processed_album_ids)} albums.")
    except FileNotFoundError:
        print("No existing album data found. Starting fresh.")

    while True:
        try:
            # Execute API request with rate limiting and retries
            response = executor.execute_request(
                sp.artist_albums,
                artist_id,
                include_groups=','.join(include_groups),
                limit=limit,
                offset=offset
            )

            # The executor returns None when the call failed or retries ran out.
            if response is None:
                print(f"Error fetching albums for artist {artist_id} at offset {offset}: no response from the API")
                break

            albums = response.get('items', [])
            api_total = response.get('total', 0)
            next_page = response.get('next', None)

            # Same count on disk as on Spotify: treat the stored file as complete.
            if api_total == total_albums:
                print("No new albums detected. Skipping fetch.")
                break

            if not albums:
                print("No more albums to fetch.")
                break  # Exit the loop if no more results

            for album in albums:
                if album['id'] in processed_album_ids:
                    continue  # Skip already processed albums

                # Extract album data
                album_data = {
                    'url': album['external_urls'].get('spotify'),
                    'id': album['id'],
                    'name': album['name'],
                    'album_type': album['album_type'],
                    'total_tracks': album['total_tracks'],
                    'available_markets': album['available_markets'],
                    'release_date': album['release_date'],
                    'release_date_precision': album['release_date_precision'],
                    'artist_id': artist_id,
                    'updated_at': datetime.now(timezone.utc)
                }
                all_albums.append(album_data)
                processed_album_ids.add(album['id'])
                new_albums_count += 1  # Increment new albums count

                # Save album data to CSV immediately so progress survives an interruption
                append_to_csv(album_data_path, album_data)

            print(f"Fetched {len(albums)} albums (offset: {offset}).")
            if not next_page:
                print("No next page available. Stopping fetch.")
                break

            offset += limit

        except Exception as e:
            print(f"Error fetching albums for artist {artist_id} at offset {offset}: {e}")
            break

    # Print the number of new albums added
    print(f"Finished fetching albums. Added {new_albums_count} new albums.")

    # Convert the newly added albums to a DataFrame and return
    return pd.DataFrame(all_albums)

In [None]:
# Because the amount of data is relatively large, the required speed is reduced.
# Rebuild the executor so the slower rate is guaranteed to apply to its request delay.
executor = ApiExecutor(requests_per_second=0.7, max_retries=executor.max_retries)

# Several list names can map to the same Spotify artist; fetch each artist once.
filtered_spotify_artist_data = spotify_artist_data.drop_duplicates('id', keep='first')

albums_dir = './data/albums'
# The per-artist CSVs are appended to inside fetch_artist_albums, which prints
# (rather than raises) failures, so make sure the directory exists up front.
os.makedirs(albums_dir, exist_ok=True)

album_frames = []
for index, (_, row) in enumerate(filtered_spotify_artist_data.iterrows(), start=1):
    artist_id = row['id']
    artist_name = row['name']
    print(f'Feching {artist_name} ({artist_id}) albums ({index}/{len(filtered_spotify_artist_data)})...')

    album_data_path = os.path.join(
        albums_dir,
        f'{artist_id}.csv'
    )
    albums_df = fetch_artist_albums(
        artist_id=artist_id,
        sp=sp,
        executor=executor,
        include_groups=['single', 'album'],
        album_data_path=album_data_path
    )
    album_frames.append(albums_df)

# Concatenate once instead of growing a DataFrame inside the loop.
all_albums_df = pd.concat(album_frames) if album_frames else pd.DataFrame()
all_albums_df

Feching Deb Chakravarty (0yn2cFHoVqRiZOgkVs58Uc) albums (1/394)...
No existing album data found. Starting fresh.
No new albums detected. Skipping fetch.
Finished fetching albums. Added 0 new albums.
Feching Light & Salt (0jhgI3xU8n2o1W6EOw9dIf) albums (2/394)...
Loaded existing data with 15 albums.
No new albums detected. Skipping fetch.
Finished fetching albums. Added 0 new albums.
Feching 5tion (6MJCgDOw7EfTJJtKPHXAcD) albums (3/394)...
Loaded existing data with 22 albums.
No new albums detected. Skipping fetch.
Finished fetching albums. Added 0 new albums.
Feching Jewelry (01iyQzyns6Ab0LxjYvHcg9) albums (4/394)...
Loaded existing data with 7 albums.
No new albums detected. Skipping fetch.
Finished fetching albums. Added 0 new albums.
Feching JTL (1QxDhdpQDAxevRaOWUDjPC) albums (5/394)...
Loaded existing data with 5 albums.
No new albums detected. Skipping fetch.
Finished fetching albums. Added 0 new albums.
Feching KISS OF LIFE (4TEK9tIkcoxib4GxT3O4ky) albums (6/394)...
Loaded exist

Unnamed: 0,url,id,name,album_type,total_tracks,available_markets,release_date,release_date_precision,artist_id
0,https://open.spotify.com/album/4LNAKF3T84CEVfJ...,4LNAKF3T84CEVfJuzgra0E,쫄깃쫄깃,album,9,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2015-07-08,day,1V7MmQ36ihG1f3qe46L50n
1,https://open.spotify.com/album/1INc7cTUIzcZUzh...,1INc7cTUIzcZUzhlB1N7AI,COLOR ME RAD,single,4,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2016-07-07,day,1V7MmQ36ihG1f3qe46L50n
2,https://open.spotify.com/album/3yELvJ2jFxIvmrf...,3yELvJ2jFxIvmrfn8Pgb3d,머피와 샐리,single,4,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2015-11-12,day,1V7MmQ36ihG1f3qe46L50n
3,https://open.spotify.com/album/0jXyB4zABzmie79...,0jXyB4zABzmie79OTZJgB0,D.Holic Dark With Dignity,single,3,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2014-10-23,day,1V7MmQ36ihG1f3qe46L50n
0,https://open.spotify.com/album/2rhq834A24z7ucY...,2rhq834A24z7ucY39moYdb,GOOD BOY,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2014-11-21,day,3HJVw8aEtFqoc3raJVE8am
...,...,...,...,...,...,...,...,...,...
1,https://open.spotify.com/album/3pix2U5QbFUSwOe...,3pix2U5QbFUSwOeF1M9Eim,The 1st Mini Album 'WE UNIS',single,5,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2024-03-27,day,48xyu8QHo1IhsQZGlgNGYZ
0,https://open.spotify.com/album/1Au58twsgHlicnh...,1Au58twsgHlicnh8IfOJ1T,Sweet Tape,album,9,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2024-07-16,day,1JHWiZqATDzLpkKLOC88pW
1,https://open.spotify.com/album/4M2nKDFzIDHO7cS...,4M2nKDFzIDHO7cScymLji5,Mission of School,album,10,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2024-01-08,day,1JHWiZqATDzLpkKLOC88pW
2,https://open.spotify.com/album/66dIUNZgjWgAVNu...,66dIUNZgjWgAVNuGh99Zai,DASH,single,1,"[AR, AU, AT, BE, BO, BR, BG, CA, CL, CO, CR, C...",2023-12-01,day,1JHWiZqATDzLpkKLOC88pW


Note: the same album can appear more than once under the same name when it has been released separately in different markets. Decide whether to exclude these duplicates based on your requirements.

In [None]:
import pandas as pd
import os

# Rebuild the combined album table from the per-artist CSV files.
albums_dir = './data/albums'
album_frames = []
for file_name in sorted(os.listdir(albums_dir)):
    # Skip anything that is not an album CSV (e.g. checkpoint folders).
    if not file_name.endswith('.csv'):
        continue
    file_path = os.path.join(albums_dir, file_name)
    df = pd.read_csv(file_path)
    # Backfill the timestamp only for files that lack the column, so re-running
    # this cell does not overwrite the time each album was actually fetched.
    if 'updated_at' not in df.columns:
        df['updated_at'] = datetime.now(timezone.utc)
        df.to_csv(file_path, index=False)
    album_frames.append(df)

# Concatenate once instead of growing a DataFrame inside the loop.
all_albums_df = pd.concat(album_frames, ignore_index=True) if album_frames else pd.DataFrame()
all_albums_df

Unnamed: 0,url,id,name,album_type,total_tracks,available_markets,release_date,release_date_precision,artist_id,updated_at
0,https://open.spotify.com/album/0EDyYMjPenoiODb...,0EDyYMjPenoiODb5PYipb6,Sapta Sagaradaache Ello - Side B (Original Sou...,album,33,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-11-19,day,002yVW3Yn595KWy74buQ1k,2024-12-04 12:32:13.366236+00:00
1,https://open.spotify.com/album/1jPfpf7xs7P8vnJ...,1jPfpf7xs7P8vnJV5LtK1o,Bheema (Original Score),album,18,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-11-08,day,002yVW3Yn595KWy74buQ1k,2024-12-04 12:32:13.366236+00:00
2,https://open.spotify.com/album/3l7Z9jGnQkVjZc7...,3l7Z9jGnQkVjZc7oTPq0dy,Bheema (Original Background Score),album,18,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-11-08,day,002yVW3Yn595KWy74buQ1k,2024-12-04 12:32:13.366236+00:00
3,https://open.spotify.com/album/4aTSrfvWihHGW7i...,4aTSrfvWihHGW7iUr8ryaK,Sapta Sagaralu Dhaati - Side B (Original Motio...,album,7,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-09-30,day,002yVW3Yn595KWy74buQ1k,2024-12-04 12:32:13.366236+00:00
4,https://open.spotify.com/album/68knDSn2a6pQgqL...,68knDSn2a6pQgqLpBGXKHS,Sapta Sagaradaache Ello- Side B (Original Moti...,album,7,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-09-30,day,002yVW3Yn595KWy74buQ1k,2024-12-04 12:32:13.366236+00:00
...,...,...,...,...,...,...,...,...,...,...
6233,https://open.spotify.com/album/6iR6OCxdsSYShgs...,6iR6OCxdsSYShgs3e2kYi3,DAYTOUR 2,single,4,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-08-13,day,7xuXBG13gHSpdAoXNWQ8Se,2024-12-04 12:32:17.207526+00:00
6234,https://open.spotify.com/album/17J3JX1Bj85RwLW...,17J3JX1Bj85RwLWWWOj0ih,DAYTOUR,single,2,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2024-03-02,day,7xuXBG13gHSpdAoXNWQ8Se,2024-12-04 12:32:17.207526+00:00
6235,https://open.spotify.com/album/4gVkK0mL0fOcKvA...,4gVkK0mL0fOcKvAR0Vclwl,Lovey Dovey,single,2,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2023-05-31,day,7xuXBG13gHSpdAoXNWQ8Se,2024-12-04 12:32:17.207526+00:00
6236,https://open.spotify.com/album/4mg68SLOjp5W527...,4mg68SLOjp5W527ufbMuN8,Salamat,single,2,"['AR', 'AU', 'AT', 'BE', 'BO', 'BR', 'BG', 'CA...",2023-04-05,day,7xuXBG13gHSpdAoXNWQ8Se,2024-12-04 12:32:17.207526+00:00


In [None]:
# Persist the combined album table as a single CSV (row index omitted).
all_albums_df.to_csv('./data/spotify_albums_data.csv', index=False, encoding='utf-8')

In [None]:
def fetch_album_tracks(
    album_id: str,
    sp: spotipy.Spotify,
    executor: ApiExecutor,
    limit: int = 50,
    track_data_path: str = 'album_tracks.csv'
) -> pd.DataFrame:
    """
    Fetch all tracks of a specific album using Spotify API, saving data to a CSV file.

    Tracks whose id is already present in ``track_data_path`` are skipped.
    Paging stops as soon as the response has no ``next`` link, so no extra
    request is spent on an empty trailing page. Any error ends the fetch
    early; it is printed, not raised.

    Args:
        album_id (str): Spotify album ID.
        sp (spotipy.Spotify): Spotify client instance.
        executor (ApiExecutor): API executor for rate limiting and retries.
        limit (int): Number of results to fetch per API call (default 50).
        track_data_path (str): Filepath for saving track data (default 'album_tracks.csv').

    Returns:
        pd.DataFrame: The tracks newly added during this call; tracks that
        were already stored are not included.
    """
    offset = 0
    all_tracks = []
    processed_track_ids = set()

    # Load existing CSV data to avoid duplicates
    try:
        existing_tracks = pd.read_csv(track_data_path)
        processed_track_ids = set(existing_tracks['id'])
        print(f"Loaded existing data with {len(processed_track_ids)} tracks.")
    except FileNotFoundError:
        print("No existing track data found. Starting fresh.")

    while True:
        try:
            # Execute API request with rate limiting and retries
            response = executor.execute_request(
                sp.album_tracks,
                album_id,
                limit=limit,
                offset=offset
            )

            # The executor returns None when the call failed or retries ran out.
            if response is None:
                print(f"Error fetching tracks for album {album_id} at offset {offset}: no response from the API")
                break

            tracks = response.get('items', [])
            if not tracks:
                break  # Exit the loop if no more results

            for track in tracks:
                if track['id'] in processed_track_ids:
                    continue  # Skip already processed tracks

                # Extract artist IDs
                artist_ids = [artist['id'] for artist in track['artists']]

                # Extract track data
                track_data = {
                    'disc_number': track['disc_number'],
                    'track_number': track['track_number'],
                    'duration_ms': track['duration_ms'],
                    'explicit': track['explicit'],
                    'url': track['external_urls'].get('spotify'),
                    'id': track['id'],
                    'name': track['name'],
                    'is_local': track['is_local'],
                    'artist_ids': artist_ids,
                    'album_id': album_id,
                    'updated_at': datetime.now(timezone.utc)
                }
                all_tracks.append(track_data)
                processed_track_ids.add(track['id'])

                # Save track data to CSV immediately so progress survives an interruption
                append_to_csv(track_data_path, track_data)

            print(f"Fetched {len(tracks)} tracks (offset: {offset}).")

            # No 'next' link means this was the last page.
            if not response.get('next'):
                break

            offset += limit

        except Exception as e:
            print(f"Error fetching tracks for album {album_id} at offset {offset}: {e}")
            break

    # Convert the newly added tracks to a DataFrame and return
    return pd.DataFrame(all_tracks)

In [None]:
all_albums_tracks_df = pd.DataFrame()

album_tracks_dir = './data/album_tracks'
# fetch_album_tracks appends to per-album files and prints (rather than raises)
# failures, so make sure the target directory exists before the loop starts.
os.makedirs(album_tracks_dir, exist_ok=True)

for index, album_id in enumerate(all_albums_df['id']):
    print(f'Fetching {album_id} ({index + 1} / {len(all_albums_df)})...')
    album_tracks_data_path = os.path.join(
        album_tracks_dir,
        f'{album_id}.csv'
    )
    # An existing file marks the album as already fetched in an earlier run.
    if os.path.isfile(album_tracks_data_path):
        print('Album track file is exists, pass...')
        continue
    fetch_album_tracks(album_id, sp=sp, executor=executor, track_data_path=album_tracks_data_path)

Fetching 7EhwJywfGN4vOD0uqBCirC (1 / 6613)...
No existing track data found. Starting fresh.
Fetched 10 tracks (offset: 0).
Fetching 3d25Sxkvc5QLomfDfUJFdl (2 / 6613)...
No existing track data found. Starting fresh.
Fetched 14 tracks (offset: 0).
Fetching 2jc65r5YsDS9OmGnvY9T3w (3 / 6613)...
No existing track data found. Starting fresh.
Fetched 11 tracks (offset: 0).
Fetching 7qy4orsHqdwoLzf7cfIFSf (4 / 6613)...
No existing track data found. Starting fresh.
Fetched 10 tracks (offset: 0).
Fetching 6kl9ejsgIVeQosQz73VCS4 (5 / 6613)...
No existing track data found. Starting fresh.
Fetched 8 tracks (offset: 0).
Fetching 2nv7hhqj0o86CqKhQ3JTRB (6 / 6613)...
No existing track data found. Starting fresh.
Fetched 9 tracks (offset: 0).
Fetching 3WZKixaKhFcQLEVnInPSeE (7 / 6613)...
No existing track data found. Starting fresh.
Fetched 11 tracks (offset: 0).
Fetching 6O4PjajVTcGFApUJfrsVRw (8 / 6613)...
No existing track data found. Starting fresh.
Fetched 8 tracks (offset: 0).
Fetching 4I5P7duaNV

In [None]:
def fetch_tracks_details(
    track_ids: list[str],
    output_csv_path: str,
    sp: spotipy.Spotify,
    executor: ApiExecutor,
    batch_size: int = 50,
) -> pd.DataFrame:
    """
    Fetch track details in batches using the Spotify /tracks API.

    After each batch only that batch's rows are merged into ``output_csv_path``
    (replacing rows with the same id, appending the rest). A failing batch is
    reported, followed by a 30 second pause, and processing continues with the
    next batch.

    Args:
        track_ids (List[str]): Track IDs to query (any iterable of IDs is accepted).
        output_csv_path (str): Path to save the track details.
        sp (spotipy.Spotify): Spotify client instance.
        executor (ApiExecutor): API executor for rate limiting and retries.
        batch_size (int): Number of track IDs to query per API call (max 50).

    Returns:
        pd.DataFrame: A DataFrame containing the details fetched during this call.
    """
    # Materialize once so slicing is positional regardless of the input container.
    track_ids = list(track_ids)
    total_batches = (len(track_ids) + batch_size - 1) // batch_size

    # Prepare to store track data
    track_data_list = []

    # Batch processing
    for batch_number, start in enumerate(range(0, len(track_ids), batch_size), start=1):
        batch_ids = track_ids[start:start + batch_size]
        print(f"Fetching batch {batch_number} of {total_batches}")
        batch_track_datas = []
        try:
            # Fetch track details for the batch
            track_details = executor.execute_request(sp.tracks, batch_ids)
            if track_details is None:
                # The executor returns None when the call failed or retries ran out.
                raise RuntimeError("no response from the API")

            for track in track_details['tracks']:
                # NOTE(review): the API is expected to return null for ids it
                # cannot resolve; skip those entries instead of failing the batch.
                if track is None:
                    continue
                track_data = {
                    'id': track['id'],
                    'popularity': track.get('popularity', 0),
                    'isrc': (track.get('external_ids') or {}).get('isrc', None),
                    'updated_at': datetime.now(timezone.utc)
                }
                track_data_list.append(track_data)
                batch_track_datas.append(track_data)

            # Save only this batch; earlier batches are already in the file.
            if batch_track_datas:
                replace_or_append_rows_in_csv(output_csv_path, batch_track_datas, match_by_id)
        except Exception as e:
            print(f"Error fetching batch: {e}")
            time.sleep(30)  # Backoff in case of an error
    # Return as a DataFrame
    return pd.DataFrame(track_data_list)

In [None]:
import pandas as pd

# Combine every per-album track CSV into one table.
album_tracks_dir = './data/album_tracks'
track_frames = [
    pd.read_csv(os.path.join(album_tracks_dir, file_name))
    for file_name in sorted(os.listdir(album_tracks_dir))
    if file_name.endswith('.csv')  # ignore anything that is not a track CSV
]
# Concatenate once instead of growing a DataFrame inside a loop.
track_info_df = pd.concat(track_frames, ignore_index=True) if track_frames else pd.DataFrame()
track_info_df

Unnamed: 0,disc_number,track_number,duration_ms,explicit,url,id,name,is_local,artist_ids,album_id
0,1,1,198276,False,https://open.spotify.com/track/2eh4Rl2corOdrHS...,2eh4Rl2corOdrHSmwoIFLD,"Bad Manners Title Track (from ""Bad Manners"")",False,['63gvl4egwBtz2czz3aENGa'],000a1ThXw3yBMgqoK20rQb
1,1,2,189698,False,https://open.spotify.com/track/6KUlXAg1gCPCkCT...,6KUlXAg1gCPCkCTCZlu4hO,"Saraayi Kududre Jhum Anthade (from ""Bad Manners"")",False,"['002yVW3Yn595KWy74buQ1k', '0AEPcRWWVLW0T1TLBD...",000a1ThXw3yBMgqoK20rQb
2,1,3,188339,False,https://open.spotify.com/track/7ocknIa9LHTuNFB...,7ocknIa9LHTuNFBhieGGdT,"Oga Oga (from ""Bad Manners"")",False,"['0nMjhemqRwrboQGcs92fh2', '3bLpM0uZQGjGyxtXgJ...",000a1ThXw3yBMgqoK20rQb
3,1,4,205751,False,https://open.spotify.com/track/0bPx8YowpNi1dtw...,0bPx8YowpNi1dtwoSFYCXz,"Jeeva (from""Bad Manners"")",False,"['4s8zgh93hZcl4GbVFgdW1H', '002yVW3Yn595KWy74b...",000a1ThXw3yBMgqoK20rQb
4,1,5,186915,False,https://open.spotify.com/track/344Y5pjalizVER3...,344Y5pjalizVER38YjdUes,I'M IN Love,False,['002yVW3Yn595KWy74buQ1k'],000a1ThXw3yBMgqoK20rQb
...,...,...,...,...,...,...,...,...,...,...
26635,1,17,270626,False,https://open.spotify.com/track/7iLHfjqEF9jGe3g...,7iLHfjqEF9jGe3gMOa9HzV,If It's the Same,False,['7AVa6rcpTQWVqgy91llPP5'],7zXb9k3ezLLT5BAgHeSleh
26636,1,18,332426,False,https://open.spotify.com/track/6uCPGq3jtrEjm8G...,6uCPGq3jtrEjm8Gv979tLy,My Story,False,['7AVa6rcpTQWVqgy91llPP5'],7zXb9k3ezLLT5BAgHeSleh
26637,1,19,242080,False,https://open.spotify.com/track/4xlGgKCa187ln1N...,4xlGgKCa187ln1Ny7xahfA,For Your Love (2007 Live Ver.) (Bonus Track),False,['7AVa6rcpTQWVqgy91llPP5'],7zXb9k3ezLLT5BAgHeSleh
26638,1,1,219200,False,https://open.spotify.com/track/0eazHAdJ2hz5G2L...,0eazHAdJ2hz5G2LWyFewa3,Hide and Seek,False,['5YPW3OmiqnqnQaFjloAvA7'],7zZFWGDvrY7n6N8EhCT7RX


In [None]:
# Persist the combined track table as a single CSV (row index omitted).
track_info_df.to_csv('./data/spotify_track_data.csv', index=False, encoding='utf-8')

In [None]:
track_detail_path = './data/spotify_track_detail_data.csv'
# Tracks without an id cannot be queried, so drop them along with duplicates.
track_ids = track_info_df['id'].dropna().drop_duplicates()

# Slow down further for this long run. Rebuild the executor so the slower rate
# is guaranteed to apply to its request delay.
executor = ApiExecutor(requests_per_second=0.5, max_retries=executor.max_retries)

# Only query tracks whose details have not been saved yet.
filtered_track_ids = track_ids
if os.path.isfile(track_detail_path):
    track_detail_df = pd.read_csv(track_detail_path)
    filtered_track_ids = track_ids[~track_ids.isin(track_detail_df['id'])]

# Pass a plain list of ids, as fetch_tracks_details documents.
df = fetch_tracks_details(filtered_track_ids.tolist(), track_detail_path, sp, executor)

Fetching batch 1 of 355
