### Creating dataset of songs using Spotify's API

##### Import relevant libraries/modules and get access token

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from api_functions import *

In [2]:
# Obtain the header object that is passed to every endpoint call in this notebook.
# NOTE(review): CLIENT_ID / CLIENT_SECRET presumably arrive via the star import
# from api_functions — confirm they are loaded from the environment rather than hardcoded.
access_token_header = get_access_token(CLIENT_ID, CLIENT_SECRET)

In [3]:
# Id of the playlist whose tracks are requested from playlist_tracks_endpoint
playlist_id = '6VaNNtZuGdbQ3GMNnhPl9e'

##### Get playlist data (i.e. all tracks from playlist)

In [4]:
# First call to get playlist information; its 'next' and 'total' fields are
# read by later cells (pagination and the sanity checks respectively)
playlist_response = playlist_tracks_endpoint(playlist_id, access_token_header)

In [5]:
# A playlist with more than 100 songs is spread over several responses, so keep
# following the 'next' link of the latest response until there is none left
all_responses = [playlist_response]

next_page = all_responses[-1]['next']
while next_page is not None:
    all_responses.append(general_endpoint(next_page, access_token_header))
    next_page = all_responses[-1]['next']

##### Create dataframe with only relevant info from API call(s)

In [6]:
cleaned_tracks = {}
duplicates = []
# Playlist items that carry no usable track id (see below); kept so they can be inspected
skipped_items = []

for response in all_responses:
    for item in response['items']:
        track = item['track']
        # A playlist item can come back without a track object, or with a track
        # whose id is null (NOTE(review): e.g. removed or local tracks — confirm
        # against the API reference). Such entries cannot be queried by id later,
        # so they are set aside instead of crashing here or poisoning the id list.
        # The length check further down compares against playlist_response['total'],
        # so it will report False whenever anything was skipped.
        if track is None or track['id'] is None:
            skipped_items.append(item)
            continue
        track_id = track['id']
        # Keeping track of any duplicate tracks in the playlist
        if track_id in cleaned_tracks:
            duplicates.append(track_id)
        cleaned_tracks[track_id] = {
            'artists': [artist['name'] for artist in track['artists']],
            'name': track['name'],
        }
# One row per unique track id, with the id as the index
all_tracks_df = pd.DataFrame.from_dict(cleaned_tracks, orient='index')

all_tracks_df.tail()

Unnamed: 0,artists,name
1OxcIUqVmVYxT6427tbhDW,[Lenny Kravitz],Fly Away
6LtHYDgYHRCHoKK3snfr2w,"[Labrinth, Zendaya]","I'm Tired - From ""Euphoria"" An HBO Original Se..."
0Z2J91b2iTGLVTZC4fKgxf,"[2Pac, Outlawz]",Hit 'Em Up - Single Version
2sY0M3k69oAYz7EOPVRFjo,[MIKA],Big Girl (You Are Beautiful)
1FTSo4v6BOZH9QxKc3MbVM,[Blur],Song 2 - 2012 Remaster


##### Ensuring dataframe is correct

In [7]:
# Each unique track should occupy exactly one row, so the dataframe's row count
# must equal the playlist total from the API response minus the duplicates found
expected_track_count = playlist_response['total'] - len(duplicates)
len(all_tracks_df) == expected_track_count

True

##### Create comma-delimited strings of ids to query the Audio Features and Track Information endpoints

In [8]:
# The dataframe was built with orient='index' from a dict keyed by track id,
# so its index is the collection of track ids
track_ids = all_tracks_df.index

def prepare_ids_for_query(ids, max_len=100):
    """Split ids into comma-delimited strings of at most ``max_len`` ids each.

    Parameters
    ----------
    ids : iterable of str
        Ids to group; their order is preserved.
    max_len : int, default 100
        Maximum number of ids per comma-delimited string. Must be at least 1.

    Returns
    -------
    list of str
        One comma-delimited string per group, in input order. Empty input
        yields ``[""]`` (a single empty string).

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        # A non-positive group size can never consume the ids, so fail fast
        # instead of looping forever
        raise ValueError(f"max_len must be at least 1, got {max_len}")

    track_ids = list(ids)
    # Step through the ids one group at a time; slicing avoids re-copying the
    # remaining ids after every group
    split_ids = [
        ",".join(track_ids[start:start + max_len])
        for start in range(0, len(track_ids), max_len)
    ]
    # With no ids at all there is still one (empty) group
    return split_ids or [""]

# Audio Features accepts up to 100 ids per request whereas Get Several Tracks
# accepts only 50, so prepare one grouping of the ids for each limit
ids_list_100 = prepare_ids_for_query(track_ids, max_len=100)
ids_list_50 = prepare_ids_for_query(track_ids, max_len=50)

##### Checking ids have been formatted correctly

In [9]:
# The last group should hold whatever remains of the playlist total once the
# ids in every earlier group and the duplicates have been subtracted
ids_in_earlier_groups_100 = sum(len(ids.split(",")) for ids in ids_list_100[:-1])
expected_last_group_size_100 = playlist_response['total'] - ids_in_earlier_groups_100 - len(duplicates)
len(ids_list_100[-1].split(",")) == expected_last_group_size_100

True

In [10]:
# Same check for the groups of 50: the last group should hold the playlist
# total minus the ids in every earlier group and minus the duplicates
ids_in_earlier_groups_50 = sum(len(ids.split(",")) for ids in ids_list_50[:-1])
expected_last_group_size_50 = playlist_response['total'] - ids_in_earlier_groups_50 - len(duplicates)
len(ids_list_50[-1].split(",")) == expected_last_group_size_50

True

##### Get track information for all tracks in playlist

In [11]:
# One request per comma-delimited group of (at most) 50 ids; the responses are kept in group order
all_tracks_info_list = [multiple_tracks_info_endpoint(ids, access_token_header) for ids in ids_list_50]

In [12]:
# Collect the fields of every track first and build the dataframe once.
# Growing a dataframe with pd.concat inside the loop re-copies all earlier rows
# on each pass, and concatenating onto an empty seed dataframe can trigger a
# FutureWarning in recent pandas versions.
tracks_ids = []
tracks_popularity = []
tracks_release_year = []

# For each length-50 grouping of tracks
for tracks in all_tracks_info_list:
    for data in tracks['tracks']:
        # NOTE(review): the API may return a null entry for an id it cannot
        # resolve — confirm against the API reference. Such entries are skipped
        # instead of raising on data['id'].
        if data is None:
            continue
        tracks_ids.append(data['id'])
        tracks_popularity.append(data['popularity'])
        # Only the year of the album's release date is kept
        tracks_release_year.append(pd.to_datetime(data['album']['release_date']).year)

# One row per track, indexed by track id
all_tracks_info_df = pd.DataFrame(
    {'popularity': tracks_popularity, 'release_year': tracks_release_year},
    index=tracks_ids,
)

all_tracks_info_df.tail()

Unnamed: 0,popularity,release_year
1OxcIUqVmVYxT6427tbhDW,75,1998
6LtHYDgYHRCHoKK3snfr2w,90,2022
0Z2J91b2iTGLVTZC4fKgxf,85,1998
2sY0M3k69oAYz7EOPVRFjo,56,2006
1FTSo4v6BOZH9QxKc3MbVM,80,1997


##### Get audio features data for all tracks in playlist

In [13]:
# One request per comma-delimited group of (at most) 100 ids; the responses are kept in group order
all_audio_features_list = [multiple_audio_features_endpoint(ids, access_token_header) for ids in ids_list_100]

##### Create dataframe of audio features

In [14]:
# Flatten the length-100 groupings into one list of per-track feature records
# so the dataframe is built once, rather than concatenating onto an initially
# empty dataframe for every grouping.
# NOTE(review): the API may return a null entry for a track it has no audio
# features for — confirm against the API reference. Such entries are left out
# here instead of raising on data['id'].
audio_features_records = [
    features
    for tracks_features in all_audio_features_list
    for features in tracks_features['audio_features']
    if features is not None
]

# One row per track, indexed by track id
all_audio_features_df = pd.DataFrame(
    audio_features_records,
    index=[features['id'] for features in audio_features_records],
)

# Drop irrelevant columns (there are no columns to drop when nothing was returned)
if audio_features_records:
    all_audio_features_df = all_audio_features_df.drop(
        columns=['type', 'id', 'uri', 'track_href', 'analysis_url']
    )

all_audio_features_df.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1OxcIUqVmVYxT6427tbhDW,0.587,0.872,7,-5.066,1,0.0484,0.018,0.0,0.622,0.735,159.925,221333,4
6LtHYDgYHRCHoKK3snfr2w,0.375,0.133,0,-10.624,0,0.0352,0.651,0.0,0.0695,0.192,131.721,187944,4
0Z2J91b2iTGLVTZC4fKgxf,0.916,0.844,7,-3.967,1,0.236,0.0394,0.0,0.0778,0.586,95.19,312627,4
2sY0M3k69oAYz7EOPVRFjo,0.797,0.75,11,-6.324,1,0.0773,0.0384,0.0,0.232,0.808,116.005,248000,4
1FTSo4v6BOZH9QxKc3MbVM,0.674,0.789,8,-6.903,1,0.0676,0.00178,0.0071,0.0754,0.918,129.804,121160,4


##### Merge dataframes together

In [15]:
# Inner-join on the track-id index so every remaining row carries both the
# track information and the audio features
info_with_features_df = all_tracks_info_df.merge(
    all_audio_features_df,
    how='inner',
    left_index=True,
    right_index=True,
)

info_with_features_df.tail()

Unnamed: 0,popularity,release_year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
1OxcIUqVmVYxT6427tbhDW,75,1998,0.587,0.872,7,-5.066,1,0.0484,0.018,0.0,0.622,0.735,159.925,221333,4
6LtHYDgYHRCHoKK3snfr2w,90,2022,0.375,0.133,0,-10.624,0,0.0352,0.651,0.0,0.0695,0.192,131.721,187944,4
0Z2J91b2iTGLVTZC4fKgxf,85,1998,0.916,0.844,7,-3.967,1,0.236,0.0394,0.0,0.0778,0.586,95.19,312627,4
2sY0M3k69oAYz7EOPVRFjo,56,2006,0.797,0.75,11,-6.324,1,0.0773,0.0384,0.0,0.232,0.808,116.005,248000,4
1FTSo4v6BOZH9QxKc3MbVM,80,1997,0.674,0.789,8,-6.903,1,0.0676,0.00178,0.0071,0.0754,0.918,129.804,121160,4


In [16]:
# Attach the artists and name of each track to its information and audio
# features, again inner-joining on the track-id index
tracks_df = all_tracks_df.merge(
    info_with_features_df,
    how='inner',
    left_index=True,
    right_index=True,
)

tracks_df

Unnamed: 0,artists,name,popularity,release_year,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
6dtWKqqdveI3YvdYJQKWWn,[ZAYN],iT's YoU,64,2016,0.611,0.319,5,-10.743,1,0.0357,0.81500,0.015100,0.1080,0.1760,124.920,226653,4
73Qlt0XuhuYzEigjitm2ze,[Demi Lovato],Cool for the Summer,0,2015,0.587,0.620,5,-5.612,0,0.0366,0.00504,0.000181,0.0762,0.2860,114.080,214740,4
5jXNOgH5A7yI76BU9kaP6N,"[Kanye West, JAY-Z, Rick Ross, Nicki Minaj, Bo...",Monster,0,2010,0.622,0.679,11,-5.621,0,0.2450,0.00275,0.006830,0.6060,0.0878,123.380,378893,5
3MxJGBnYaP4yQYUv90HLqZ,[One Direction],Moments,66,2012,0.474,0.739,2,-4.937,1,0.0451,0.02830,0.000000,0.2990,0.2530,149.894,262640,4
1wHZx0LgzFHyeIZkUydNXq,[Travis Scott],Antidote,78,2015,0.713,0.526,1,-5.046,1,0.0320,0.00767,0.000148,0.1240,0.1310,131.050,262693,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1OxcIUqVmVYxT6427tbhDW,[Lenny Kravitz],Fly Away,75,1998,0.587,0.872,7,-5.066,1,0.0484,0.01800,0.000000,0.6220,0.7350,159.925,221333,4
6LtHYDgYHRCHoKK3snfr2w,"[Labrinth, Zendaya]","I'm Tired - From ""Euphoria"" An HBO Original Se...",90,2022,0.375,0.133,0,-10.624,0,0.0352,0.65100,0.000000,0.0695,0.1920,131.721,187944,4
0Z2J91b2iTGLVTZC4fKgxf,"[2Pac, Outlawz]",Hit 'Em Up - Single Version,85,1998,0.916,0.844,7,-3.967,1,0.2360,0.03940,0.000000,0.0778,0.5860,95.190,312627,4
2sY0M3k69oAYz7EOPVRFjo,[MIKA],Big Girl (You Are Beautiful),56,2006,0.797,0.750,11,-6.324,1,0.0773,0.03840,0.000000,0.2320,0.8080,116.005,248000,4


##### Export data to file

In [17]:
# Stamp the filename with the export time (day-month-year_hour-minute-second)
export_timestamp_format = "%d-%m-%Y_%H-%M-%S"
current_time = datetime.now().strftime(export_timestamp_format)
export_path = f"../data/spotify_dataset_{current_time}.pkl"
tracks_df.to_pickle(export_path)