In [40]:
# Imports
import spotipy
import pandas as pd
import numpy as np
import time
import random
import base64
import requests
from spotipy.oauth2 import SpotifyClientCredentials
from collections import deque

## Helper Functions

In [23]:
def get_artists_by_search(client, genre, limit=1000):
    '''
    Retrieves artists that fall under a particular genre by a direct search.

    Results are requested 50 at a time. Paging stops as soon as `limit` artists
    have been collected, a page comes back empty, or the 1000-result search
    window has been walked through.

    Parameters:
    client (spotipy.client.Spotify object): Client to perform the search with.
    genre (str): Genre of artists to obtain.
    limit (int): Max number of artists to retrieve. Defaults to 1000.

    Returns:
    artists (pandas.DataFrame): A dataframe containing artists and their associated genres
        (columns `id`, `query_genre`, `all_genres`). It has at most `limit` rows and keeps
        the same three columns even when no artist was found.
    '''
    page_size = 50     # Spotify API maxes out at 50 results per request
    max_offset = 1000  # Search returns a maximum of 1000 results
    columns = ['id', 'query_genre', 'all_genres']
    # See https://developer.spotify.com/documentation/web-api/reference/search/ for search documentation.
    query = f'genre:{genre}'

    artists = []
    offset = 0
    while len(artists) < limit and offset < max_offset:
        results = client.search(q=query, offset=offset, type='artist', limit=page_size)
        time.sleep(1) # Respect rate limits
        items = results['artists']['items']
        if not items:
            # The genre has no further results. Stop here instead of requesting
            # (and sleeping through) every remaining empty page.
            break
        artists.extend(
            {'id': artist['id'], 'query_genre': genre, 'all_genres': artist['genres']}
            for artist in items
        )
        offset += page_size

    # The final page can overshoot `limit`, so trim to honour the documented maximum.
    return pd.DataFrame(artists[:limit], columns=columns)

In [10]:
def get_track_features(client, track_ids):
    '''
    Extracts audio features from a list of track IDs.

    Parameters:
    client (spotipy.client.Spotify object): Client to perform the query with.
    track_ids (list[str]): A list of track IDs for which to get audio features.

    Returns:
    track_features (pd.DataFrame): A dataframe containing each track's audio features along with its ID.
        Tracks for which the API returned no features are left out, so the frame can have
        fewer rows than `track_ids`.
    '''
    batch_size = 50  # IDs sent per request
    track_features = []
    for start in range(0, len(track_ids), batch_size):
        batch_ids = track_ids[start:start + batch_size]
        # Query through the client that was passed in, not a notebook-level global.
        features = client.audio_features(batch_ids) or []
        # The API can answer with `None` in place of a track it has no features for;
        # such placeholders would make the DataFrame constructor fail, so skip them.
        track_features.extend(entry for entry in features if entry is not None)
        time.sleep(1)  # To respect rate limits
    return pd.DataFrame(track_features)

In [11]:
def calculate_statistics(client, features):
    '''
    Computes statistics for audio features.

    For each of the nine audio features (danceability, energy, loudness,
    acousticness, valence, tempo, instrumentalness, liveness, speechiness) the
    mean, median, minimum and maximum are computed over all rows.

    Parameters:
    client (spotipy.client.Spotify object): Unused; kept so existing calls keep working.
    features (pd.DataFrame): A dataframe with track IDs and their features.

    Returns:
    stats (pd.DataFrame): A one-row dataframe containing aggregated statistics, with columns
        named `<feature>_avg`, `<feature>_median`, `<feature>_min` and `<feature>_max`.
        Returns None when `features` is None or has no rows.
    '''
    # `not features` raises ValueError on a DataFrame (ambiguous truth value),
    # so test for "nothing to aggregate" explicitly.
    if features is None or len(features) == 0:
        return None

    feature_names = (
        'danceability', 'energy', 'loudness', 'acousticness', 'valence',
        'tempo', 'instrumentalness', 'liveness', 'speechiness',
    )
    aggregations = (('avg', np.mean), ('median', np.median), ('min', np.min), ('max', np.max))

    stats = {
        f'{name}_{suffix}': aggregate(features[name])
        for name in feature_names
        for suffix, aggregate in aggregations
    }
    # Every value is a scalar, so wrap the dict in a list to build a single-row
    # frame; passing the bare dict raises "must pass an index".
    return pd.DataFrame([stats])

## Main Code

In [4]:
# Authenticate spotipy
# The credentials file holds the client ID on its first line and the client secret on its second.
with open("spotify_credentials.txt") as f:
    # strip() drops the line terminator whether it is "\n", "\r\n" or missing altogether.
    # Slicing off the last character would cut a real character from a line without a
    # newline, and would leave the secret's own trailing newline in place.
    client_id = f.readline().strip()
    client_secret = f.readline().strip()

client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [26]:
# Ask Spotify for the genre seeds it accepts for recommendations and keep the list of names
genre_seed_response = sp.recommendation_genre_seeds()
genres = genre_seed_response['genres']

In [27]:
# Show the genre seeds fetched in the previous cell (rendered as the cell's output)
genres

['acoustic',
 'afrobeat',
 'alt-rock',
 'alternative',
 'ambient',
 'anime',
 'black-metal',
 'bluegrass',
 'blues',
 'bossanova',
 'brazil',
 'breakbeat',
 'british',
 'cantopop',
 'chicago-house',
 'children',
 'chill',
 'classical',
 'club',
 'comedy',
 'country',
 'dance',
 'dancehall',
 'death-metal',
 'deep-house',
 'detroit-techno',
 'disco',
 'disney',
 'drum-and-bass',
 'dub',
 'dubstep',
 'edm',
 'electro',
 'electronic',
 'emo',
 'folk',
 'forro',
 'french',
 'funk',
 'garage',
 'german',
 'gospel',
 'goth',
 'grindcore',
 'groove',
 'grunge',
 'guitar',
 'happy',
 'hard-rock',
 'hardcore',
 'hardstyle',
 'heavy-metal',
 'hip-hop',
 'holidays',
 'honky-tonk',
 'house',
 'idm',
 'indian',
 'indie',
 'indie-pop',
 'industrial',
 'iranian',
 'j-dance',
 'j-idol',
 'j-pop',
 'j-rock',
 'jazz',
 'k-pop',
 'kids',
 'latin',
 'latino',
 'malay',
 'mandopop',
 'metal',
 'metal-misc',
 'metalcore',
 'minimal-techno',
 'movies',
 'mpb',
 'new-age',
 'new-release',
 'opera',
 'pagode',

In [28]:
# Get maximum amount of artists for each genre
# Gather one frame per genre and concatenate once at the end: calling pd.concat
# inside the loop re-copies every previously collected row on each iteration.
artist_columns = ['id', 'query_genre', 'all_genres']
genre_frames = []
for genre in genres:
    new_df = get_artists_by_search(sp, genre, limit=1000)
    genre_frames.append(new_df)
    print(f'Finished collecting artists for genre: {genre}. Total artists found: {len(new_df)}')

# Leave out genres that returned nothing so only frames with real rows are concatenated.
non_empty_frames = [frame for frame in genre_frames if not frame.empty]
if non_empty_frames:
    artist_df = pd.concat(non_empty_frames, ignore_index=True)
else:
    # Nothing was found at all: keep the expected (empty) schema.
    artist_df = pd.DataFrame(columns=artist_columns)

Finished collecting artists for genre: acoustic. Total artists found: 839
Finished collecting artists for genre: afrobeat. Total artists found: 644
Finished collecting artists for genre: alt-rock. Total artists found: 1000
Finished collecting artists for genre: alternative. Total artists found: 1000
Finished collecting artists for genre: ambient. Total artists found: 1000
Finished collecting artists for genre: anime. Total artists found: 1000
Finished collecting artists for genre: black-metal. Total artists found: 1000
Finished collecting artists for genre: bluegrass. Total artists found: 539
Finished collecting artists for genre: blues. Total artists found: 1000
Finished collecting artists for genre: bossanova. Total artists found: 0
Finished collecting artists for genre: brazil. Total artists found: 1000
Finished collecting artists for genre: breakbeat. Total artists found: 136
Finished collecting artists for genre: british. Total artists found: 1000
Finished collecting artists for g

In [35]:
# Persist the artists collected so far, so the slow collection step does not have to be repeated
artist_ids_path = 'artist_ids.csv'
artist_df.to_csv(artist_ids_path, index=False)

For some reason, certain functions in the spotipy package don't tell you when you've been rate limited. So, for the massive amount of data I'm about to query, I'll call the Spotify API manually with the requests package to get the information I need. This way, I'll be able to tell when I've been rate limited and can wait the required amount of time before continuing.

In [41]:
# Encode client ID and client secret into base64
# Strip stray whitespace first: a newline left over from reading the credentials
# file would be encoded into the Basic auth header and make the token request fail.
client_creds = f"{client_id.strip()}:{client_secret.strip()}"
client_creds_b64 = base64.b64encode(client_creds.encode()).decode()

# Get spotify API token
token_url = "https://accounts.spotify.com/api/token"
token_data = {
    "grant_type": "client_credentials"
}
token_headers = {
    "Authorization": f"Basic {client_creds_b64}"
}

# The timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.post(token_url, data=token_data, headers=token_headers, timeout=30)
# Surface rejected credentials or server errors as an HTTP error here, rather
# than as a confusing KeyError on "access_token" below.
r.raise_for_status()
token_response_data = r.json()
access_token = token_response_data["access_token"]

## Deprecated

These are functions that I tried to implement to get more data, but they are just too slow due to the rate limit on the Spotify API.

In [None]:
def get_seed_artists(genres, limit=1, client=None):
    '''
    Finds high popularity artists in a list of genres.

    For every genre a single search page (50 artists, the most the search
    endpoint returns per request) is fetched, and the `limit` most popular
    artists on that page are kept.

    Parameters:
    genres (list[str]): A list of genres for which to get seed artists.
    limit (int): Number of seed artists to get for each genre. At most 50 candidates are
        considered per genre, so values above 50 yield no more than 50 artists per genre.
    client (spotipy.client.Spotify object): Client to perform the search with. Defaults to
        the notebook-level `sp` client.

    Returns:
    seed_artists (set[str]): A set of artist ids.
    '''
    if client is None:
        client = sp  # Fall back to the notebook-level client
    search_page_size = 50  # Spotify API maxes out at 50 results per search request

    seed_artists = set()
    for genre in genres:
        results = client.search(q=f'genre:{genre}', type='artist', limit=search_page_size)
        candidates = results['artists']['items']
        # Rank the page by popularity directly; no need to grow a DataFrame row by row.
        most_popular = sorted(candidates, key=lambda artist: artist['popularity'], reverse=True)
        seed_artists.update(artist['id'] for artist in most_popular[:limit])
        time.sleep(0.5) # To respect rate limits
    return seed_artists

In [7]:
def get_all_tracks_by_artist(artist_id, client=None):
    '''
    Gets all tracks by a given artist.

    Only releases of type `album` are listed (the query passes `album_type='album'`),
    and the tracks of each of those albums are collected. If fetching the tracks of an
    album fails, the error is printed and that album is skipped; tracks gathered from
    the other albums are still returned.

    Parameters:
    artist_id (str): An artist id.
    client (spotipy.client.Spotify object): Client to perform the queries with. Defaults to
        the notebook-level `sp` client.

    Returns:
    track_ids (list[str]): A list of track_ids by the given artist.
    '''
    if client is None:
        client = sp  # Fall back to the notebook-level client

    def paged_items(page):
        '''Yields the items of `page` and of every page that follows it.'''
        while True:
            yield from page['items']
            if not page['next']:
                return
            page = client.next(page)
            time.sleep(1)  # To respect rate limits

    # Ask for the largest page size (50) so fewer requests are needed to list the albums.
    first_album_page = client.artist_albums(artist_id, album_type='album', limit=50)
    albums = list(paged_items(first_album_page))

    track_ids = []
    for album in albums:
        try:
            first_track_page = client.album_tracks(album['id'])
            track_ids.extend(track['id'] for track in paged_items(first_track_page))
        except spotipy.SpotifyException as e:
            print(f"Error fetching tracks for album {album['id']}: {e}")
            # Skip only the failing album rather than throwing away every track
            # already collected for this artist.
            continue

    return track_ids

In [4]:
def get_related_artists(seed_artists, max_depth=2, max_artists=100000, client=None):
    '''
    Finds related artists through BFS from a set of seed artists.

    Seeds are at depth 0. An artist is only expanded while its depth is below
    `max_depth`, so no returned artist is deeper than `max_depth`. Progress is
    printed every 100 artists, and every 1000 artists the ids found so far are
    written to `artist_ids.txt` (overwriting the previous checkpoint).

    Parameters:
    seed_artists (set[str]): A set of seed artist IDs for which to get related artists.
    max_depth (int): Maximum depth to perform BFS.
    max_artists (int): Maximum number of artists to return.
    client (spotipy.client.Spotify object): Client to perform the queries with. Defaults to
        the notebook-level `sp` client.

    Returns:
    all_artists (list[tuple]): A list of artist ids together with the depth at which they were found.
    '''
    if client is None:
        client = sp  # Fall back to the notebook-level client

    visited = set(seed_artists)
    all_artists = [(artist_id, 0) for artist_id in seed_artists]
    queue = deque(all_artists)
    while queue:
        current_artist_id, depth = queue.popleft()
        if depth >= max_depth:
            # Artists at the maximum depth stay in the result but are not expanded;
            # expanding them would produce artists at depth max_depth + 1.
            continue
        try:
            related_artists = client.artist_related_artists(current_artist_id)['artists']
        except spotipy.SpotifyException as e:
            print(f"Error fetching related artists for artist {current_artist_id}: {e}")
            continue

        for related in related_artists:
            artist_id = related['id']
            if artist_id in visited:
                continue
            visited.add(artist_id)
            all_artists.append((artist_id, depth + 1))
            queue.append((artist_id, depth + 1))
            if len(visited) % 100 == 0:
                print(f'Artists found: {len(visited)}. Current depth: {depth}')
            if len(visited) % 1000 == 0:
                # Periodic checkpoint of every id found so far.
                with open('artist_ids.txt', 'w') as f:
                    for visited_id in visited:
                        f.write(f'{visited_id}\n')
            if len(all_artists) >= max_artists:
                return all_artists
        time.sleep(0.5) # To respect rate limits
    return all_artists