##### Import relevant libraries/modules and get access token

In [1]:
import pandas as pd
import numpy as np
from api_functions import *

In [2]:
# Obtain the value that is passed as `access_token_header` to every endpoint call below.
# CLIENT_ID and CLIENT_SECRET are not defined in this notebook — presumably they come
# from the star import of api_functions (TODO confirm).
access_token_header = get_access_token(CLIENT_ID, CLIENT_SECRET)

In [3]:
# Id of the playlist whose tracks are fetched below.
# The commented-out id is presumably an alternative playlist kept for quick switching.
# playlist_id = '1ulFK6boXuJRlRy5kH0pEX'
playlist_id = '6VaNNtZuGdbQ3GMNnhPl9e'

##### Get playlist data (i.e. all tracks from playlist)

In [4]:
# First call to get playlist information.
# The response's 'next' field is followed further down to fetch the remaining pages,
# and its 'total' field is used by the sanity checks on the resulting dataframe.
playlist_response = playlist_tracks_endpoint(playlist_id, access_token_header)

In [5]:
# A single call only covers part of a playlist with more than 100 songs,
# so keep following the 'next' link until the latest response has none.
all_responses = [playlist_response]

while all_responses[-1]['next'] is not None:
    following_page = general_endpoint(all_responses[-1]['next'], access_token_header)
    all_responses.append(following_page)

##### Create dataframe with only relevant info from API call(s)

In [6]:
# Map of track id -> {'artists': [...], 'name': ...}, one entry per unique track
cleaned_tracks = {}
# Ids seen more than once in the playlist (one entry per repeated occurrence)
duplicates = []
# Playlist items that cannot be queried later because they have no track object or no id
skipped_items = []

for response in all_responses:
    for item in response['items']:
        track = item['track']
        # NOTE(review): the Spotify Web API can return a null track (e.g. a track that is no
        # longer available) or a null id (local files). Such items would crash this loop or
        # put None in the index and break the id join later, so set them aside instead.
        # They presumably still count towards playlist_response['total'] — confirm before
        # relying on the row-count check when skipped_items is non-empty.
        if track is None or track['id'] is None:
            skipped_items.append(item)
            continue

        track_id = track['id']
        # Keeping track of any duplicate tracks in the playlist
        if track_id in cleaned_tracks:
            duplicates.append(track_id)
        cleaned_tracks[track_id] = {
            'artists': [artist['name'] for artist in track['artists']],
            'name': track['name'],
        }

# Track ids become the dataframe index
all_tracks_df = pd.DataFrame.from_dict(cleaned_tracks, orient='index')

all_tracks_df.tail()

Unnamed: 0,artists,name
3BwkldhE22DdxOcXegF2Kv,[Akon],Bananza (Belly Dancer)
3koCCeSaVUyrRo3N2gHrd8,"[Earth, Wind & Fire]",Let's Groove
7FwBtcecmlpc1sLySPXeGE,[Bruce Springsteen],Dancing In the Dark
5tdKaKLnC4SgtDZ6RlWeal,[Whitney Houston],How Will I Know
2lDODk7inZnmUHbIjUnIwP,"[VANO 3000, BADBADNOTGOOD, Samuel T. Herring]",Running Away


##### Ensuring dataframe is correct

In [7]:
# The dataframe should hold one row per unique track: the total reported by the API
# minus every repeated occurrence recorded in `duplicates`.
expected_rows = playlist_response['total'] - len(duplicates)
len(all_tracks_df) == expected_rows

True

##### Create comma-delimited strings of ids (at most 100 ids each) to query Audio Features

In [8]:
# The dataframe index holds the track ids (it was built from the keys of cleaned_tracks)
track_ids = all_tracks_df.index

def prepare_ids_for_query(ids, max_len=100):
    """Group ids into comma-delimited strings holding at most ``max_len`` ids each.

    Parameters
    ----------
    ids : sequence of str
        Ids to group, in the order they should appear in the output.
    max_len : int, default 100
        Maximum number of ids per comma-delimited string.

    Returns
    -------
    list of str
        One comma-delimited string per group, preserving input order. Every group
        except possibly the last holds exactly ``max_len`` ids. An empty ``ids``
        yields ``[""]`` so the result always has at least one element.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 1; such a value can never consume the ids
        (the previous implementation looped forever in that case).
    """
    if max_len < 1:
        raise ValueError(f"max_len must be at least 1, got {max_len}")

    track_ids = np.array(ids)
    if len(track_ids) == 0:
        # Keep the historical result for empty input: a single empty string
        return [""]

    # Step through the ids in fixed-size slices instead of repeatedly deleting
    # the head of the array, which copied the remainder on every pass
    return [
        ",".join(track_ids[start:start + max_len])
        for start in range(0, len(track_ids), max_len)
    ]

# One comma-delimited string per group of ids, using the default group size (max_len=100)
ids_list = prepare_ids_for_query(track_ids)

##### Checking ids have been formatted correctly

In [9]:
# The last group should contain whatever remains of the playlist total once the
# ids in all earlier groups and the duplicate occurrences are subtracted.
last_group_size = len(ids_list[-1].split(","))
earlier_groups_size = sum(len(group.split(",")) for group in ids_list[:-1])
last_group_size == playlist_response['total'] - earlier_groups_size - len(duplicates)

True

##### Get audio features data for all tracks in playlist

In [10]:
# One audio-features request per comma-delimited group of ids, responses kept in group order
all_audio_features_list = []
for id_group in ids_list:
    all_audio_features_list.append(multiple_audio_features_endpoint(id_group, access_token_header))

##### Create dataframe of audio features

In [11]:
# Flatten the per-group responses into one list of per-track feature dicts.
# NOTE(review): the Spotify Web API can return null in place of a track's features;
# those entries are skipped here because they carry no 'id' to index by. Tracks
# skipped this way drop out of the inner merge with the tracks dataframe.
track_features = [
    features
    for tracks_features in all_audio_features_list
    for features in tracks_features['audio_features']
    if features is not None
]

# Build the dataframe once (indexed by track id) rather than concatenating inside a loop,
# then drop irrelevant columns. errors='ignore' keeps the empty case (no features, hence
# no columns) from raising.
all_audio_features_df = pd.DataFrame(
    track_features,
    index=[features['id'] for features in track_features],
).drop(columns=['type', 'id', 'uri', 'track_href', 'analysis_url'], errors='ignore')

all_audio_features_df.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
3BwkldhE22DdxOcXegF2Kv,0.877,0.702,10,-5.899,0,0.127,0.0334,9e-06,0.743,0.69,104.833,238493,4
3koCCeSaVUyrRo3N2gHrd8,0.869,0.648,11,-8.698,0,0.0633,0.121,2.2e-05,0.126,0.9,125.035,339320,4
7FwBtcecmlpc1sLySPXeGE,0.527,0.942,1,-5.64,0,0.0366,0.0115,0.0,0.188,0.495,148.723,241307,4
5tdKaKLnC4SgtDZ6RlWeal,0.832,0.544,6,-12.697,1,0.0442,0.201,0.000139,0.632,0.928,119.49,275533,4
2lDODk7inZnmUHbIjUnIwP,0.616,0.694,7,-10.898,1,0.0411,0.664,0.858,0.145,0.732,93.298,111787,4


##### Merge dataframes together

In [12]:
# Join track metadata and audio features on their shared track-id index
# (default inner join: only ids present in both dataframes are kept)
tracks_with_features = all_tracks_df.merge(
    all_audio_features_df,
    left_index=True,
    right_index=True,
)

tracks_with_features

Unnamed: 0,artists,name,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
6dtWKqqdveI3YvdYJQKWWn,[ZAYN],iT's YoU,0.611,0.319,5,-10.743,1,0.0357,0.81500,0.015100,0.1080,0.1760,124.920,226653,4
73Qlt0XuhuYzEigjitm2ze,[Demi Lovato],Cool for the Summer,0.587,0.620,5,-5.612,0,0.0366,0.00504,0.000181,0.0762,0.2860,114.080,214740,4
5jXNOgH5A7yI76BU9kaP6N,"[Kanye West, JAY-Z, Rick Ross, Nicki Minaj, Bo...",Monster,0.622,0.679,11,-5.621,0,0.2450,0.00275,0.006830,0.6060,0.0878,123.380,378893,5
3MxJGBnYaP4yQYUv90HLqZ,[One Direction],Moments,0.474,0.739,2,-4.937,1,0.0451,0.02830,0.000000,0.2990,0.2530,149.894,262640,4
1wHZx0LgzFHyeIZkUydNXq,[Travis Scott],Antidote,0.713,0.526,1,-5.046,1,0.0320,0.00767,0.000148,0.1240,0.1310,131.050,262693,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3BwkldhE22DdxOcXegF2Kv,[Akon],Bananza (Belly Dancer),0.877,0.702,10,-5.899,0,0.1270,0.03340,0.000009,0.7430,0.6900,104.833,238493,4
3koCCeSaVUyrRo3N2gHrd8,"[Earth, Wind & Fire]",Let's Groove,0.869,0.648,11,-8.698,0,0.0633,0.12100,0.000022,0.1260,0.9000,125.035,339320,4
7FwBtcecmlpc1sLySPXeGE,[Bruce Springsteen],Dancing In the Dark,0.527,0.942,1,-5.640,0,0.0366,0.01150,0.000000,0.1880,0.4950,148.723,241307,4
5tdKaKLnC4SgtDZ6RlWeal,[Whitney Houston],How Will I Know,0.832,0.544,6,-12.697,1,0.0442,0.20100,0.000139,0.6320,0.9280,119.490,275533,4


##### Export data to file

In [16]:
# Persist the merged dataframe as a pickle. The path is relative to the working directory,
# so the ../data folder is expected to exist already. Pickle files should only be loaded
# back from trusted sources.
tracks_with_features.to_pickle("../data/spotify_dataset.pkl")