# Acquisition of Audio Features for a Given Tracklist

## Import Modules and Set Constants

In [2]:
import os
import pandas as pd
import pickle

DIR_DATA = os.path.join('..', 'data')
DIR_PROCESSED = os.path.join(DIR_DATA, 'processed')
DIR_RAW = os.path.join(DIR_DATA, 'raw')
DIR_SNAPSHOTS = os.path.join(DIR_RAW, 'snapshots')
DIR_INPUTS = 'inputs'

BASENAME_OUT = 'track_audio_features'
EXT_OUT = '.csv'
FILENAME_OUT = BASENAME_OUT + EXT_OUT
PATH_OUT = os.path.join(DIR_RAW, FILENAME_OUT)

def PATH_OUT_SNAPSHOT(i: int):
    return os.path.join(DIR_SNAPSHOTS, f'{BASENAME_OUT}{i}{EXT_OUT}')

FILENAME_CREDENTIALS = 'app_credentials.csv'
PATH_CREDENTIALS = os.path.join(DIR_INPUTS, FILENAME_CREDENTIALS)

FILENAME_TRACKLIST = 'unique_track_ids1000000.pkl'
PATH_TRACKLIST = os.path.join(DIR_PROCESSED, FILENAME_TRACKLIST)

AUDIO_FEATURES = ['tempo', 'key', 'mode', 'loudness', 'danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'duration_ms', 'time_signature']
COL_INDEX = 'id'
COL_NAMES = [COL_INDEX] + AUDIO_FEATURES


## Create/Load Table

Read existing table:

In [3]:
df = pd.read_csv(PATH_OUT)
df.set_index(COL_INDEX, inplace=True)

display(df)

Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0000uJA4xCdxThagdLkkLR,184.913,5.0,1.0,-5.621,0.458,0.5910,0.0326,0.5680,0.000015,0.2860,0.6540,161187.0,3.0
0002yNGLtYSYtc0X6ZnFvp,182.345,8.0,1.0,-11.572,0.455,0.6230,0.0523,0.7970,0.903000,0.6340,0.9510,220293.0,4.0
00039MgrmLoIzSpuYKurn9,132.064,1.0,1.0,-5.632,0.742,0.7530,0.0364,0.0178,0.000000,0.1330,0.2630,222727.0,4.0
0005rgjsSeVLp1cze57jIN,133.158,1.0,0.0,-6.141,0.507,0.4460,0.0276,0.7990,0.000000,0.3190,0.4180,213960.0,4.0
0006Rv1e2Xfh6QooyKJqKS,89.048,2.0,0.0,-9.190,0.295,0.4980,0.0301,0.7950,0.944000,0.1070,0.0445,189639.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zzptITgTKf4HpJM8ye47v,81.071,1.0,0.0,-6.398,0.447,0.7240,0.0372,0.7880,0.202000,0.2420,0.9400,215813.0,4.0
7zzpwV2lgKsLke68yFoZdp,129.996,7.0,1.0,-2.558,0.497,0.6980,0.0317,0.1270,0.000000,0.1160,0.5520,233933.0,4.0
7zzrzbrb14URUZlmSrCGfM,98.463,9.0,0.0,-25.031,0.347,0.0127,0.0453,0.9490,0.000033,0.0478,0.1510,340560.0,3.0
7zzwFo2lPCgXphtN89XmLk,124.999,0.0,1.0,-10.112,0.637,0.7890,0.0538,0.0500,0.563000,0.2330,0.8600,297520.0,4.0


OR: Create new table:

In [None]:
df = pd.DataFrame(columns=COL_NAMES)
df.set_index('id', inplace=True)

display(df)

## Load Tracklist

In [4]:
with open(PATH_TRACKLIST, 'rb') as file_tracklist:
    track_ids = pickle.load(file_tracklist)

print(f'Loaded {len(track_ids)} track IDs!')

Loaded 1205287 track IDs!


The table may already contain audio features for some songs but not for others.
Thereby, the list of track IDs can be filtered to only contain track IDs that do not yet have an entry with audio features in the table.

In [6]:
track_ids_missing = [id for id in track_ids if id not in df.index]
print(f'The current table is missing data for {len(track_ids_missing)} entries of the tracklist.\nUpdating tracklist to only contain IDs of tracks with missing data.')
track_ids = track_ids_missing

The current table is missing data for 44 entries of the tracklist.
Updating tracklist to only contain IDs of tracks with missing data.


## Extend Table
Inserting new rows into a table comes with a significant performance overhead due to the needed memory reallocation. 
Thereby, extending the table once to already have a row for each track speeds the insert for each iteration by orders of magnitude.

In [37]:
len_init = len(df)
print(f'Table currently contains: {len_init} rows')
new_index = pd.Index.union(df.index, track_ids)
df = df.reindex(new_index)
len_new = len(df)
print(f'Table filled with {len_new - len_init} empty rows for all missing indices. New length: {len_new}')

Table currently contains: 1238655 rows
Table filled with 44 empty rows for all missing indices. New length: 1238699


## Load Credentials 

Spotify really does not like us bombarding their API with audio feature requests.
From experience, an account/app gets (temporarily?) blocked for that api endpoint after about 2,000 requests.

Thereby, we simply give a list containing credentials to different accounts/apps such that the script can automatically move on to the next one after one is blocked.

In [2]:
def get_credentials_table():
    return pd.read_csv(PATH_CREDENTIALS)

credentials = get_credentials_table()
display(credentials)

Unnamed: 0,email,id,secret
0,lbz43312@zslsz.com,b17dc7e294854b0f88329a34cfb540d3,449841ef10a04b9fb7b5b6e3d020764e
1,ezy22961@zslsz.com,55e75f4579fe46259223aa2a23bf83e0,fa2cbd49bcdf48d0ac564ec3d82d825a
2,mui32951@omeie.com,39255ad14e0b4f7e9ece44928cf45be5,469a2c559a524dcf8663f2bea7f45581
3,wkb49017@zslsz.com,d357546d9002442b8d09c5ca8561ad0f,10b65e285a8f4170afbe26d190598456
4,sxy09806@nezid.com,7a17222ff7004be69c10d8c776f5e2f1,b9d6f0d57d50467d97de209ade1db329
5,wch48879@omeie.com,37682dfc69e3464388e3927cdc5c9b9a,6bbedf13313d4ab58f899864dede141e
6,dzs03393@zbock.com,54d7b6dc416c40df882e6e4000d34e86,3c2a0f103b2a4a0b810882eaf55cf08a
7,dnh54136@zbock.com,7826618f8b99464c9d77d133d66da7b9,644feb750fbb44ebad0e98d1bac15640
8,uya20645@zbock.com,7f688f5c1ee343aabe1fc703f768d076,e0864d7e36a74733b3c7be7379154289
9,smw99794@zslsz.com,0242fa806fc14655b5a09bae1fc4905b,b1b1834e9cf649ae8dbc77f2bc70f79e


## Get Audio Features

### Define API Functions and Classes

API requests need an accompanying authorization token that has to be requested separately.
Such a token is only valid for a given time. If clear_cached is not set, the functions will first try to find a cached access token from the last request and only if that fails request (and cache) a new access token

In [3]:
import requests
import time
import base64
import pickle
import os

class SpotifyAppCredentials:
    email: str
    id: str
    secret: str

    def __init__(self, row: pd.Series):
        self.email = row['email']
        self.id = row['id']
        self.secret = row['secret']

    def __str__(self):
        return f'email: {self.email}, id: {self.id}, secret: {self.secret}'

class SpotifyAuthToken:
    token: str
    expires_at: float

    def __init__(self, response: requests.models.Response):
        self.token = response.json()['access_token']
        self.expires_at = time.time() + int(response.json()['expires_in'])

URL_API = "https://api.spotify.com/v1/"
URL_ACCOUNTS = 'https://accounts.spotify.com/'

EP_TOKEN = 'api/token/'
EP_AUDIO_FEATURES = "audio-features/"

FILE_AUTH = 'spotify_auth_token.pkl'
DIR_CACHE = 'cached_files'

THRESH_EXPIRATION = 60

def get_track_features(track_ids: list[str], timeout: int, credentials: SpotifyAppCredentials, clear_cached: bool = False):

    auth_token = get_auth_token(credentials, clear_cached)

    headers = {
        'Authorization': 'Bearer ' + auth_token.token
    }

    url = URL_API + EP_AUDIO_FEATURES + '?ids=' + ','.join(track_ids)
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()

    return response.json()
        

def get_auth_token(credentials: SpotifyAppCredentials, clear_cached: bool = False) -> SpotifyAuthToken:

    os.makedirs(DIR_CACHE, exist_ok=True)
    path_auth_token = os.path.join(DIR_CACHE, FILE_AUTH)

    auth_token = None

    if (not clear_cached) and os.path.exists(path_auth_token):
        with open(path_auth_token, 'rb') as file_auth_token:
            auth_token = pickle.load(file_auth_token)

    if (auth_token and auth_token.expires_at <= time.time() - THRESH_EXPIRATION)\
        or not auth_token:
        auth_token = api_request_auth_token(credentials)
    
        with open(path_auth_token, 'wb') as file_auth_token:
            pickle.dump(auth_token, file_auth_token)
    
    return auth_token
    

def api_request_auth_token(credentials: SpotifyAppCredentials) -> SpotifyAuthToken:
    
    credentials = base64.b64encode(f'{credentials.id}:{credentials.secret}'.encode()).decode('utf-8')
    headers = {
        'Authorization': 'Basic ' + credentials
    }
    data = {
        'grant_type': 'client_credentials'
    }
    url = URL_ACCOUNTS + EP_TOKEN
    response = requests.post(url, headers=headers, data=data)

    return SpotifyAuthToken(response)
    

### Set Initial Parameters

In [38]:
ids_processed = 0
row_credentials = 0
iterations_since_save = 0
reload_auth = True

### Actually do the Work

The API can be queried for multiple (up to 100 song IDs) in a single request. 
Excessively short intervals between multiple requests resulted in account suspension after only a few hundred requests, thereby, a target request interval and a minimum cool down can be set.

Intermediate results are saved periodically to avoid data loss in the case of program or machine failure.

In [39]:
import time
import requests
from IPython.display import clear_output
from datetime import datetime

SIZE_CHUNKS = 100   # max 100

REQ_COOLDOWN = 0.1
REQ_INTERVAL = 0.75
REQ_TIMEOUT = 5

SAVE_INTERVAL = 200

BAD_RESPONSE_COOLDOWN = 10
NO_MORE_CREDS_COOLDOWN = 60

def curr_time():
    return datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]

def log(msg: str):
    print(f'[{curr_time()}] {msg}')

curr_creds = SpotifyAppCredentials(credentials.iloc[row_credentials])

while (ids_processed < len(track_ids)):
    clear_output(True)
    start_time = time.time()
    try:
        log(f'Current credentials: {row_credentials + 1} of {len(credentials)}, {curr_creds}')
        log(f'Iterations since last save: {iterations_since_save} of {SAVE_INTERVAL}')
        track_ids_chunk = track_ids[ids_processed:ids_processed + SIZE_CHUNKS]
        log(f'Sending request for chunk [{ids_processed}:{ids_processed + len(track_ids_chunk) - 1}]')

        features_chunk = get_track_features(track_ids_chunk, REQ_TIMEOUT, curr_creds, reload_auth)
        log(f'Response received: {features_chunk}')
        reload_auth = False

        response_track_count = 0
        
        for track_features in features_chunk['audio_features']:
            if (track_features != None):
                response_track_count += 1
                track_features_sorted = [track_features[feat] for feat in AUDIO_FEATURES if feat in track_features]
                df.loc[track_features['id']] = track_features_sorted

        log(f'Added {response_track_count} entries to the table!')

        iterations_since_save += 1

        if (iterations_since_save >= SAVE_INTERVAL):
            log(f'{iterations_since_save} iterations since last save - saving partial data!')
            df.to_csv(PATH_OUT_SNAPSHOT(ids_processed))
            iterations_since_save = 0
            
        time_taken = time.time() - start_time
        sleep_duration = max(0, REQ_INTERVAL - time_taken) + REQ_COOLDOWN
        log(f'Sleeping for {sleep_duration:.3f} seconds...')
        time.sleep(sleep_duration)
        ids_processed += len(track_ids_chunk)

    except requests.Timeout as err:
        log(f'Request timed out.')

    except requests.HTTPError as err:
        log(f'HTTP Error at {ids_processed}: {err.response}')
        
        if (err.response.status_code == 429):

            log('Account limit reached - moving on to the next one!')
            row_credentials += 1
            reload_auth = True

            while (row_credentials >= len(credentials)):
                credentials = get_credentials_table()
                if (row_credentials >= len(credentials)):
                    time.sleep(NO_MORE_CREDS_COOLDOWN)

            curr_creds = SpotifyAppCredentials(credentials.iloc[row_credentials])

        else:
            time.sleep(BAD_RESPONSE_COOLDOWN)

    except OSError as err:
        log(f'OSError at {ids_processed}: {err}')
        time.sleep(REQ_INTERVAL)

display(df)
print(time.time())

[2024-02-03 17:26:16.977] Current credentials: 1 of 11, email: lbz43312@zslsz.com, id: b17dc7e294854b0f88329a34cfb540d3, secret: 449841ef10a04b9fb7b5b6e3d020764e
[2024-02-03 17:26:16.977] Iterations since last save: 0 of 200
[2024-02-03 17:26:16.977] Sending request for chunk [0:43]
[2024-02-03 17:26:17.141] Response received: {'audio_features': [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None]}
[2024-02-03 17:26:17.141] Added 0 entries to the table!
[2024-02-03 17:26:17.142] Sleeping for 0.686 seconds...


Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0000uJA4xCdxThagdLkkLR,184.913,5.0,1.0,-5.621,0.458,0.5910,0.0326,0.5680,0.000015,0.2860,0.6540,161187.0,3.0
0002yNGLtYSYtc0X6ZnFvp,182.345,8.0,1.0,-11.572,0.455,0.6230,0.0523,0.7970,0.903000,0.6340,0.9510,220293.0,4.0
00039MgrmLoIzSpuYKurn9,132.064,1.0,1.0,-5.632,0.742,0.7530,0.0364,0.0178,0.000000,0.1330,0.2630,222727.0,4.0
0005rgjsSeVLp1cze57jIN,133.158,1.0,0.0,-6.141,0.507,0.4460,0.0276,0.7990,0.000000,0.3190,0.4180,213960.0,4.0
0006Rv1e2Xfh6QooyKJqKS,89.048,2.0,0.0,-9.190,0.295,0.4980,0.0301,0.7950,0.944000,0.1070,0.0445,189639.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zzptITgTKf4HpJM8ye47v,81.071,1.0,0.0,-6.398,0.447,0.7240,0.0372,0.7880,0.202000,0.2420,0.9400,215813.0,4.0
7zzpwV2lgKsLke68yFoZdp,129.996,7.0,1.0,-2.558,0.497,0.6980,0.0317,0.1270,0.000000,0.1160,0.5520,233933.0,4.0
7zzrzbrb14URUZlmSrCGfM,98.463,9.0,0.0,-25.031,0.347,0.0127,0.0453,0.9490,0.000033,0.0478,0.1510,340560.0,3.0
7zzwFo2lPCgXphtN89XmLk,124.999,0.0,1.0,-10.112,0.637,0.7890,0.0538,0.0500,0.563000,0.2330,0.8600,297520.0,4.0


1706977577.838779


## Filter and Save Results

Rows without data were added to the table for performance reasons. 
However, they should not be part of the output, so they are filtered out here.

In [33]:
df_filtered = df.dropna(how='all')
print(f'Filtered out {len(df) - len(df_filtered)} empty rows.')
print(f'Total number of rows remaining: {len(df_filtered)}')

Filtered out 0 empty rows.
Total number of rows remaining: 1238655


**Warning: overwrites existing file!**

In [34]:
df_filtered.to_csv(PATH_OUT)

## Misc


In [33]:
print(ids_processed)
display(df)

451800


Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3zyYVItCMCjFzBHTyjrxPK,79.227,8.0,1.0,-3.406,0.4660,0.8560,0.3180,0.257000,0.000000,0.6750,0.5310,292223.0,4.0
4jSy0HTIoC9yiwZ8OVyTCW,164.207,1.0,1.0,-7.972,0.5100,0.8490,0.1190,0.000546,0.000748,0.6890,0.8870,315067.0,4.0
4zyqBSUFNkJ20mw1FB68gt,83.947,4.0,0.0,-22.867,0.3080,0.1140,0.0321,0.958000,0.902000,0.0853,0.3030,350906.0,4.0
63B3TtwUzOoJoe3unMteVa,93.696,1.0,0.0,-4.166,0.6600,0.9430,0.2770,0.129000,0.000000,0.5570,0.5990,210733.0,4.0
7y9iMe8SOB6z3NoHE2OfXl,118.384,0.0,1.0,-3.539,0.6750,0.7510,0.0296,0.060400,0.000000,0.0893,0.6120,181279.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19y7Eh2ORRo6spiEcr4aTU,120.083,9.0,1.0,-10.959,0.7970,0.3120,0.0608,0.345000,0.000000,0.1120,0.2680,201834.0,4.0
5zlEDnvh0Pg0xgibb5p2Gl,114.050,7.0,1.0,-8.319,0.6330,0.5410,0.0337,0.023300,0.001000,0.1050,0.6580,157760.0,4.0
5yBph8a8SsrBAwgdgv1V5j,49.986,2.0,0.0,-27.900,0.0913,0.0229,0.0372,0.957000,0.911000,0.0847,0.0395,113747.0,4.0
6fi9MNGmRZXg5G7Z8JwUjT,115.978,6.0,0.0,-8.669,0.6910,0.6180,0.0690,0.168000,0.008240,0.1050,0.2610,202800.0,4.0


416836
