In [1]:
import json
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import librosa
import math
import time

In [2]:
#Load the api client id and secret from file
f = open('data/apikeys/apikeys.json')
apikeys = json.load(f)
CLIENT_ID = apikeys['clientId']
CLIENT_SECRET = apikeys['clientSecret']

In [3]:
#get access token
def authenticate_token():
    AUTH_URL = 'https://accounts.spotify.com/api/token'

    auth_response = requests.post(AUTH_URL, {
        'grant_type': 'client_credentials',
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
    })

    # convert the response to JSON
    auth_response_data = auth_response.json()

    # save the access token
    access_token = auth_response_data['access_token']

    headers = {
        'Authorization': f'Bearer {access_token}'
    }
    return headers

headers = authenticate_token()

In [4]:
# base URL of all Spotify API endpoints
BASE_URL = 'https://api.spotify.com/v1/'

genre_seeds = requests.get(BASE_URL + 'recommendations/available-genre-seeds', headers=headers)

In [5]:
genre_seeds = genre_seeds.json()['genres']

In [6]:
from pyarrow import feather
feather.write_feather(pd.DataFrame(genre_seeds, columns=['genre']), 'data/genre_seeds.feather')

In [7]:
results =[]
for idx, genre in enumerate(genre_seeds):
    params = {
        'seed_genres':genre,
        'limit':100
    }

    recs = requests.get(BASE_URL + 'recommendations', params=params, headers=headers)
    rec_tracks = recs.json()['tracks']
    for track in rec_tracks:
        artist = track['artists'][0]
        name = artist['name']
        id = artist['id']
        result = {'artist_name':name, 'artist_id':id}
        results.append(result)
    print(f'{idx+1} / {len(genre_seeds)}', end='\r')

126 / 126

In [8]:
genre_artists_df = pd.DataFrame(results)

In [9]:
genre_artists_df = genre_artists_df.drop_duplicates().reset_index(drop=True)

In [10]:
genre_artists_df.artist_id

0       0OdUWJ0sBjDrqHygGUXeCF
1       1SImpQO0GbjRgvlwCcCtFo
2       5o206eFLx38glA2bb4zqIU
3       3gd8FJtBJtkRxdfbTu19U2
4       53XhwfbYqKCa1cC15pYq2q
                 ...          
5667    5C0gCCG8N5Dh5dZytIgzLX
5668    1N1LP4R1T5AIr33dtUB00h
5669    64X9DGiFHjK6Xn0ebXRNwy
5670    2rmMyZC0sUD1a3jkDxp7iY
5671    1U5zgr455OGyIkLNXvDdrf
Name: artist_id, Length: 5672, dtype: object

In [11]:
chunk_size = math.ceil(len(genre_artists_df) / 50)

In [12]:
chunk_size

114

In [13]:
genre_artists_df['genres'] = float('nan')
genre_artists_df['popularity'] = float('nan')

In [14]:
genre_artists_df

Unnamed: 0,artist_name,artist_id,genres,popularity
0,Band of Horses,0OdUWJ0sBjDrqHygGUXeCF,,
1,The Red Jumpsuit Apparatus,1SImpQO0GbjRgvlwCcCtFo,,
2,Bright Eyes,5o206eFLx38glA2bb4zqIU,,
3,Mumford & Sons,3gd8FJtBJtkRxdfbTu19U2,,
4,Imagine Dragons,53XhwfbYqKCa1cC15pYq2q,,
...,...,...,...,...
5667,Os Mutantes,5C0gCCG8N5Dh5dZytIgzLX,,
5668,Aurelio,1N1LP4R1T5AIr33dtUB00h,,
5669,Khadja Nin,64X9DGiFHjK6Xn0ebXRNwy,,
5670,Balkan Beat Box,2rmMyZC0sUD1a3jkDxp7iY,,


In [15]:
genre_artists_full_results = []
for artists in np.array_split(genre_artists_df, chunk_size):
    params = {'ids' : ','.join(list(artists.artist_id))}
    several_artists = requests.get(BASE_URL+'artists/', params=params, headers=headers)
    for i in artists.index:
        j = i - artists.index[0]
        result = {
            'artist_name': genre_artists_df.loc[i, 'artist_name'],
            'artist_id': genre_artists_df.loc[i, 'artist_id'],
            'genres': several_artists.json()['artists'][j]['genres'],
            'popularity': several_artists.json()['artists'][j]['popularity']
        }
        genre_artists_full_results.append(result)
        print(f'{i+1} / {len(genre_artists_df)}', end= '\r')
        
genre_artists_df = pd.DataFrame(genre_artists_full_results)

5672 / 5672

In [16]:
genre_artists_df

Unnamed: 0,artist_name,artist_id,genres,popularity
0,Band of Horses,0OdUWJ0sBjDrqHygGUXeCF,"[chamber pop, indie folk, indie rock, indietro...",59
1,The Red Jumpsuit Apparatus,1SImpQO0GbjRgvlwCcCtFo,"[alternative metal, neon pop punk, pop punk, p...",63
2,Bright Eyes,5o206eFLx38glA2bb4zqIU,"[chamber pop, indie rock, melancholia, omaha i...",55
3,Mumford & Sons,3gd8FJtBJtkRxdfbTu19U2,"[modern folk rock, modern rock, neo mellow, st...",72
4,Imagine Dragons,53XhwfbYqKCa1cC15pYq2q,"[modern rock, pop, rock]",86
...,...,...,...,...
5667,Os Mutantes,5C0gCCG8N5Dh5dZytIgzLX,"[bossa nova, mpb, psicodelia brasileira, psych...",44
5668,Aurelio,1N1LP4R1T5AIr33dtUB00h,"[garifuna folk, punta]",18
5669,Khadja Nin,64X9DGiFHjK6Xn0ebXRNwy,[burundian pop],29
5670,Balkan Beat Box,2rmMyZC0sUD1a3jkDxp7iY,"[balkan beats, balkan brass, gypsy fusion, gyp...",37


In [17]:
related_dfs = [genre_artists_df]

In [None]:
# Define the maximum number of related artists to fetch per artist
MAX_RELATED_ARTISTS = 1  # Adjust this number as needed


    new_artists = []
    for idx, artist in related_dfs[i-1].iterrows():
        related = requests.get(BASE_URL + f'artists/{artist.artist_id}/related-artists', headers=headers,timeout=10)
        while not related.ok:
            if related.status_code == 401:
                headers = authenticate_token()
            elif related.status_code == 429:
                time.sleep(30)
            else:
                break
            related = requests.get(BASE_URL + f'artists/{artist.artist_id}/related-artists', headers=headers,timeout=10)
        
        # Fetch only a limited number of related artists per artist
        if related.ok:
            for new_artist in related.json().get('artists', [])[:MAX_RELATED_ARTISTS]:
                new_row = {
                    'artist_name': new_artist['name'],
                    'artist_id': new_artist['id'],
                    'genres': new_artist['genres'],
                    'popularity': new_artist['popularity']
                }
                new_artists.append(new_row)
        print(f'{idx+1} / {len(related_dfs[i-1])}', end='\r')
    
    print('\n')
    related_dfs.append(pd.DataFrame(new_artists))
    related_dfs[i] = related_dfs[i].drop_duplicates(subset=['artist_id'])


In [18]:
all_artists = pd.concat([df for df in related_dfs])

In [19]:
all_artists = all_artists.drop_duplicates(subset=['artist_id']).reset_index(drop=True)

In [20]:
all_tracks = []

In [25]:
for idx, artist in all_artists.iterrows():
    top_tracks = requests.get(BASE_URL+'artists/'+artist.artist_id+'/top-tracks?market=US', headers=headers,timeout=10)
    for track in top_tracks.json()['tracks']:
        track_info = {
            'track_id': track['id'],
            'track_name': track['name'],
            'track_preview_link': track['preview_url'],
            'track_popularity': track['popularity'],
            'track_uri': track['uri'],
            'release_date':track['album']['release_date'],
            'artist_name': artist.artist_name,
            'artist_id': artist.artist_id,
            'artist_genres': artist.genres,
            'artist_popularity': artist.popularity
        }
        all_tracks.append(track_info)
    print(f'{idx+1} / {len(all_artists)}', end='\r')
all_tracks_df = pd.DataFrame(all_tracks)

3940 / 5671

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [29]:
track_chunk_size

NameError: name 'track_chunk_size' is not defined

In [30]:
all_tracks_df = pd.DataFrame(all_tracks)
track_chunk_size = math.ceil(len(all_tracks_df) / 50)

release_dates = []
for tracks in np.array_split(all_tracks_df, track_chunk_size):
    params = {'ids' : ','.join(list(tracks.track_id)),
             'market': 'US'}
    several_tracks = requests.get(BASE_URL+'tracks/', params=params, headers=headers)
    for i in tracks.index:
        j = i - tracks.index[0]
        result = {
            'track_id': several_tracks.json()['tracks'][j]['id'],
            'release_date': several_tracks.json()['tracks'][j]['album']['release_date']
        }
        release_dates.append(result)
        print(f'{i+1} / {len(all_tracks_df)}', end= '\r')

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [31]:
len(release_dates)

NameError: name 'release_dates' is not defined

In [31]:
from pyarrow import feather
feather.write_feather(all_tracks_df, 'data/all_tracks.feather')
feather.write_feather(all_artists, 'data/all_artists.feather')