In [1]:
import pandas as pd
import requests
import spotipy
from dotenv import load_dotenv
import os
from spotipy.oauth2 import SpotifyOAuth
import json
import time
import datetime as dt
from requests.exceptions import ReadTimeout, HTTPError
import base64

# Load environment variables
load_dotenv()

CLIENT_ID = os.getenv('client_id')
CLIENT_SECRET = os.getenv('client_secret')

# Set up Spotipy with your credentials
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    redirect_uri='http://localhost:8888/callback',
    scope="user-library-read"
))

global NUM_CALLS
NUM_CALLS = 0

def api_call_wrapper(func, *args, **kwargs):
    """
    Wrapper for Spotify API calls to handle rate limiting and retries.
    """
    max_retries=3
    delay=1
    backoff_factor=2,
    retries = 0
    current_delay = delay
    while retries < max_retries:
        try:
            # Make the API call
            response = func(*args, **kwargs)
            global NUM_CALLS
            NUM_CALLS += 1
            return response
        
        except HTTPError as e:
            # Handle rate-limiting or server errors (status code 429 or 5xx)
            if e.response.status_code == 429:
                retry_after = int(e.response.headers.get("Retry-After", 1))  # Spotify may provide Retry-After
                print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
                time.sleep(retry_after)
            else:
                print(f"HTTP error: {e.response.status_code}. Retrying in {current_delay} seconds...")
                time.sleep(current_delay)
                retries += 1
                current_delay *= backoff_factor  # Exponential backoff
            
        except ReadTimeout:
            # Handle timeout errors
            print(f"Request timed out. Retrying in {current_delay} seconds...")
            time.sleep(current_delay)
            retries += 1
            current_delay *= backoff_factor
        
        except Exception as e:
            # Handle unexpected errors
            print(f"An error occurred: {e}. Retrying in {current_delay} seconds...")
            time.sleep(current_delay)
            retries += 1
            current_delay *= backoff_factor
    
    print(f"Max retries reached for {func.__name__}.")
    return None  # Return None if the API call fails

def search_artists_by_genre(genre, limit=50):
    """
    Search for artists by genre.
    
    :param genre: The genre to search for.
    :param limit: Max number of artist IDs to return.
    """
    artist_ids = set()
    results = api_call_wrapper(sp.search, q=f'genre:{genre}', type='artist', limit=50)
    
    while results and len(artist_ids) < limit:
        for artist in results['artists']['items']:
            artist_ids.add(artist['id'])
        
        # Check if there's another page and update search results
        if results['artists']['next']:
            results = api_call_wrapper(sp.next, results['artists'])
        else:
            results = None

    return list(artist_ids)


# Function to get all albums for an artist (including pagination)
def get_artist_albums(artist_id, headers):
    albums_url = f'https://api.spotify.com/v1/artists/{artist_id}/albums'
    all_albums = []
    params = {
        'limit': 50,  # Max limit per request
        'offset': 0,
        'include_groups': 'album'  # Only fetch albums (exclude singles, compilations, etc.)
    }
    
    while True:
        response = requests.get(albums_url, headers=headers, params=params)
        # print(response)
        time.sleep(.4)
        if response.status_code == 200:
            data = response.json()
            all_albums.extend(data['items'])  # Append fetched albums to the list
            break
        elif response.status_code == 429:
            # Handle rate-limiting
            retry_after = int(response.headers.get("Retry-After", 1))
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            # time.sleep(retry_after)
            return "rate limited"
        else:
            # Handle other error cases
            print(f"Failed to fetch albums for artist {artist_id}. Status code: {response.status_code}")
            print("Response:", response.text)
            break

    return all_albums

# Function to download all albums for a list of artists
def download_all_albums_for_artists(artist_list, headers):
    """
    Download all albums for a list of artists.

    :param artist_list: List of artist names or artist IDs.
    :return: Pandas DataFrame with album details.
    """
    all_albums_data = []
    print('before artist loop')
    
    for artist_id in artist_list:
        print(f"Fetching albums for artist ID: {artist_id}")
        albums = get_artist_albums(artist_id, headers)  # Fetch albums for the artist
        if albums:  # If albums were found, process them
            for album in albums:
                all_albums_data.append({
                    'artist_name': album['artists'][0]['name'],
                    'artist_id': album['artists'][0]['id'],
                    'album_id': album['id'],
                    'album_name': album['name'],
                    'release_date': album['release_date'],
                    'total_tracks': album['total_tracks'],
                    'album_type': album['album_type']
                })
        elif albums == "rate_limited":
            print("Rate limited")
            break
    
    # Convert the results to a Pandas DataFrame
    df_albums = pd.DataFrame(all_albums_data)
    
    # Optional: Handle empty data
    if df_albums.empty:
        print("No albums found for the given artists.")
    else:
        print(f"Found {len(df_albums)} albums.")

    return df_albums


# Spotify token URL
def get_auth_token():
    auth_url = 'https://accounts.spotify.com/api/token'

    # Encode the client ID and client secret into base64
    auth_header = base64.b64encode(f"{CLIENT_ID}:{CLIENT_SECRET}".encode()).decode()

    # Define the headers and body for the authentication request
    auth_headers = {
        "Authorization": f"Basic {auth_header}",
        "Content-Type": "application/x-www-form-urlencoded"
    }

    auth_data = {
        "grant_type": "client_credentials"
    }

    # Get the access token
    auth_response = requests.post(auth_url, headers=auth_headers, data=auth_data)

    # Check if authentication was successful
    if auth_response.status_code != 200:
        print("Failed to authenticate. Status code:", auth_response.status_code)
        print("Response:", auth_response.text)
    else:
        auth_response_data = auth_response.json()
        access_token = auth_response_data['access_token']

    # Define the headers for subsequent API requests
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    return headers


def get_related_artists(artist_id, header):
    related_artist_url = f'https://api.spotify.com/v1/artists/{artist_id}/related-artists'

    while True:
        # Adding a delay to avoid hitting rate limits
        time.sleep(.4)
        
        # Make request to the related artists endpoint
        related_response = requests.get(related_artist_url, headers=header)
        
        # Check if the request was successful
        if related_response.status_code == 200:
            related_data = related_response.json()  # Get the JSON response
            break
        
        elif related_response.status_code == 429:
            # Handle rate-limiting
            retry_after = int(related_response.headers.get("Retry-After", 1))
            print(f"Rate limited. Retrying after {retry_after} seconds...")
            # time.sleep(retry_after)
            return None
        else:
            # Handle other errors
            print("Failed to fetch related artists. Status code:", related_response.status_code)
            print("Response:", related_response.text)
            return pd.DataFrame()  # Return an empty DataFrame if the request fails

    # Function to format each artist's data
    def format_artist_data(artist):
        return {
            "name": artist.get("name"),
            "artist_id": artist.get("id"),
            "genres": ", ".join(artist.get("genres", [])),
            "followers": artist.get("followers", {}).get("total", 0),
            "popularity": artist.get("popularity"),
            "spotify_url": artist.get("external_urls", {}).get("spotify"),
        }
    
    # Extract the list of artists from the related data
    formatted_artists = [format_artist_data(artist) for artist in related_data.get('artists', [])]
    
    # Create a DataFrame from the list of formatted artist data
    related_artist_df = pd.DataFrame(formatted_artists)
    
    return related_artist_df


Current path for getting tracks is get genres, get artists by genres, get related artists of current artists, get artists top tracks, get audio features for each individual track.

Keep current path for getting artists, and batch audio features of the top tracks to reduce api calls

Get genres, get top artists, get related artists, get artists albums, get songs from albums,

Create Album Df, contains album name, artist, release date, included songs,

In [36]:

start_time = time.time()
genre_seeds = sp.recommendation_genre_seeds()
NUM_CALLS += 1
print(NUM_CALLS, time.time() - start_time)

all_artist_ids = set()

for genre in genre_seeds['genres']:
    all_artist_ids.update(search_artists_by_genre(genre, limit=50))
print(NUM_CALLS, time.time() - start_time)
print(len(list(all_artist_ids)))



1 0.1177375316619873
239 81.43151473999023
4305


In [37]:
headers = get_auth_token()
artist_df = pd.DataFrame()
for artist in list(all_artist_ids):
    if artist not in related_artist_df["artist_id"].unique():
        related_artist_df = get_related_artists(artist, headers)
        artist_df = pd.concat([artist_df,related_artist_df])
print(NUM_CALLS, time.time() - start_time)
print(len(list(all_artist_ids)))
artist_df = artist_df.reindex()
artist_df.drop_duplicates(inplace=True)

KeyboardInterrupt: 

In [40]:
artist_df.reset_index(inplace=True)
artist_df.drop_duplicates(inplace=True)

In [41]:
artist_df.to_csv(r"C:\Projects\CIS598_Project\fall-2024-shack02\data\artists.csv")

In [42]:
headers = get_auth_token()
artist_id = list(all_artist_ids)[0]
album_df = download_all_albums_for_artists(artist_df["artist_id"].values, headers)
album_df

before artist loop
Fetching albums for artist ID: 2ynylCO9SRPTKjgNEH0Y2a
Fetching albums for artist ID: 2x48WoJGRLCpCWHEKXMZoB
Fetching albums for artist ID: 3nS4tSuT4VwGiZH6BtlJfC
Fetching albums for artist ID: 7FAAkDlPg6pg0860CIDzmu
Fetching albums for artist ID: 6q4AmzK3GzCuEzkurnYuEQ
Fetching albums for artist ID: 71N7xIvnaOFGetZL5nhWsl
Fetching albums for artist ID: 6e0QWfEFmK6AguLy02mlqi
Fetching albums for artist ID: 0q6u5HyVK4zwGuzEtqjHqa
Fetching albums for artist ID: 5ZPr0RHsR3DrAhtsYMsfHR
Fetching albums for artist ID: 4AfTOzBubFP6STibJPTxwt
Fetching albums for artist ID: 7tjMOkm52H3Qiz3ty2tbNw
Fetching albums for artist ID: 2Tl7uZui4u9a3nXUM9VMei
Fetching albums for artist ID: 4wf6GGNBqaU79839E6yjfn
Fetching albums for artist ID: 16QCJENzcdhwka9bTKYMVB
Fetching albums for artist ID: 1fRv9jiRIN7zAOSpOfRP73
Fetching albums for artist ID: 4qrHkx5cgWIslciLXUMrYw
Fetching albums for artist ID: 6qtECqesbU29iftyeWmldK
Fetching albums for artist ID: 0znuUIjvP0LXEslfaq0Nor
Fetching 

In [1]:
album_df.head()

NameError: name 'album_df' is not defined

In [25]:
album_df.to_csv(r"C:\Projects\CIS598_Project\fall-2024-shack02\data\albums.csv")

In [17]:
album_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\albums.csv")
artists_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\artists.csv")

In [18]:
len(album_df)

75474

In [4]:
unique_artist_df = artists_df[artists_df["artist_id"].unique]

TypeError: Series.unique() takes 1 positional argument but 2 were given

In [19]:
unique_artist_df = artists_df.drop_duplicates(subset="artist_id")
len(unique_artist_df)
# unique_artist_df.drop(["Unnamed: 0","level_0","index"], axis=1, inplace=True)
# unique_artist_df.to_csv(r"C:\Projects\fall-2024-shack02\data\artists.csv")

13099

In [6]:
# album_df.drop(['index'],axis=1,inplace=True)
artists_with_albums = album_df["artist_id"].unique()
len(artists_with_albums)

10104

In [20]:
artists = unique_artist_df['artist_id'].unique()
artists_need_albums = [artist for artist in artists if not artist in artists_with_albums]


In [21]:
print(len(artists_need_albums),len(artists))

5670 13099


In [22]:
new_albums_df_list = []

In [23]:
headers = get_auth_token()

def segment_list(input_list, segment_size=1000):
    return [input_list[i:i + segment_size] for i in range(0, len(input_list), segment_size)]

segmented_new_artist_list = segment_list(artists_need_albums, segment_size = 1000)


for i,artist_segment in enumerate(segmented_new_artist_list):
    print(f"Segment {i}")
    new_albums_df_list.append(download_all_albums_for_artists(artist_segment, headers))
    print(f"Segment done")
    time.sleep(60)


Segment 0
before artist loop
Fetching albums for artist ID: 7Hy3BcTukq6HpDzVFrnJ8L
Fetching albums for artist ID: 1A5kGvmKIVtX7NhcbtTZJY
Fetching albums for artist ID: 0Ymz1ExCtpFOHsNDa5JeyL
Fetching albums for artist ID: 14IQ7niDNXIIrOSjr32E7O
Fetching albums for artist ID: 6tLHGlt7L7raSf6vr96hWi
Fetching albums for artist ID: 5VeyQ48fHEJP9CVFc4YJ5X
Fetching albums for artist ID: 422QvLrp4rn4VaTnb810uJ
Fetching albums for artist ID: 4Cp8MAVITZ6a8qjTVvhV28
Fetching albums for artist ID: 4ffqFO4lp2UOv2cnG9ka0J
Fetching albums for artist ID: 51NwQz5MA32kigUGYjdA3z
Fetching albums for artist ID: 4OHSon9N7JAfxkjlH8nKDb
Fetching albums for artist ID: 4PTQtiKISN5iGNpbRVv02B
Fetching albums for artist ID: 4x0yIdlIelBumJlBFgT2jt
Fetching albums for artist ID: 088wXCnKN6e54SL0zuVXR3
Fetching albums for artist ID: 75vuZwnDSzjDlz6OOnkfjJ
Fetching albums for artist ID: 5cSF6ZUFVfUIjOUGKl67f5
Fetching albums for artist ID: 20rP9JAY0JpEb4EDNNivoy
Fetching albums for artist ID: 4BU9SHOTANVwAifirXPu5u

TypeError: string indices must be integers, not 'str'

In [24]:
print(len(new_albums_df_list))

5


In [25]:
new_albums_df = pd.DataFrame()
for df in new_albums_df_list:
    new_albums_df = pd.concat([new_albums_df, df])

In [26]:
new_albums_df
all_albums_df = pd.concat([album_df,new_albums_df])

In [27]:
all_albums_df
all_albums_df_dropped = all_albums_df.drop_duplicates("album_id")

In [28]:
print(len(all_albums_df),len(all_albums_df_dropped))
all_albums_df_dropped.to_csv(r"C:\Projects\fall-2024-shack02\data\albums.csv")

104120 103118


In [36]:
all_albums_df_dropped[all_albums_df_dropped["artist_name"].str.contains(":")]

Unnamed: 0,index,artist_name,artist_id,album_id,album_name,release_date,total_tracks,album_type
1684,,The Academy: Segunda Misión,4C3ZmIqca9MWwThLaYbfXu,4PEk2u6PH21dZcFOjdG29R,The Academy: Segunda Misión,2024-03-28,9,album
1687,,The Academy: Segunda Misión,4C3ZmIqca9MWwThLaYbfXu,2de63hNYGRgcNTXCtXQGvu,The Academy: Segunda Misión,2024-03-28,9,album
1688,,The Academy: Segunda Misión,4C3ZmIqca9MWwThLaYbfXu,7HPtZ8XfUirtDilX4LZ9Cg,The Academy: Segunda Misión,2024-03-28,9,album
6858,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,1avheiH128OzFBIsBpCaU8,Future Sound Of Cambridge 3,2008-11-03,11,album
7793,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,3ho4CrNuUCiHLpt2wYTDTr,Little Spaces,2021-02-26,14,album
7794,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,0lTlHYoTV5VFC3k2ijTsDW,Hospitality House Party (DJ Mix),2020-07-31,32,album
7795,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,51aRANPYxGLgsjTCMUPGOE,Sick Music 2020 (DJ Mix),2020-01-31,25,album
7796,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,1mNaO5s2S2qLDyWWI8c5Y2,Hospitality On The Beach 2019 (DJ Mix),2019-05-31,33,album
7797,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,2AOdXwm2IXRha9CC8ZHQpK,Future History,2014-11-10,14,album
7798,,Nu:Tone,7pDBRy9uWy1zq5b0uXIABQ,2ZCJ2YAcRZQ7Ymx2zckbaf,Words And Pictures,2011-02-25,14,album


In [37]:
len(all_albums_df_dropped["artist_name"].unique())

4899

In [13]:
artists_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,name,artist_id,genres,followers,popularity,spotify_url
0,0,0,Angel Du$t,2ynylCO9SRPTKjgNEH0Y2a,dreamo,63023,41,https://open.spotify.com/artist/2ynylCO9SRPTKj...
1,1,1,Trapped Under Ice,2x48WoJGRLCpCWHEKXMZoB,"chaotic hardcore, hardcore, melodic hardcore, ...",75031,42,https://open.spotify.com/artist/2x48WoJGRLCpCW...
2,2,2,Incendiary,3nS4tSuT4VwGiZH6BtlJfC,"long island punk, metallic hardcore, nyhc",81448,40,https://open.spotify.com/artist/3nS4tSuT4VwGiZ...
3,3,3,DRAIN,7FAAkDlPg6pg0860CIDzmu,california hardcore,91964,45,https://open.spotify.com/artist/7FAAkDlPg6pg08...
4,4,4,Drug Church,6q4AmzK3GzCuEzkurnYuEQ,"dreamo, emo, indie punk, modern melodic hardcore",81260,45,https://open.spotify.com/artist/6q4AmzK3GzCuEz...
...,...,...,...,...,...,...,...,...
13094,13094,22012,Monika Santucci,5E1FAGKopxyWqEREQblCDA,,4808,38,https://open.spotify.com/artist/5E1FAGKopxyWqE...
13095,13095,22013,Luma,29siAJ78u7y79BYOyh0lbp,melodic dubstep,11287,43,https://open.spotify.com/artist/29siAJ78u7y79B...
13096,13096,22019,MEDZ,61c8McUZCtrU9WOjvFkyiL,melodic dubstep,4623,35,https://open.spotify.com/artist/61c8McUZCtrU9W...
13097,13097,22021,lama,01M9LokQdmZvlAuwBLsYYH,melodic dubstep,3053,34,https://open.spotify.com/artist/01M9LokQdmZvlA...


In [14]:
songs_df = pd.read_csv(r"C:\Projects\fall-2024-shack02\data\downloaded_songs_with_track_features_cleaned_copy_2.csv")

In [16]:
songs_df

ValueError: Cannot mask with non-boolean array containing NA / NaN values