### Creating dataset of songs using Spotify's API

##### Import relevant libraries/modules and get access token

In [1]:
import pandas as pd
import numpy as np
from os.path import exists
from datetime import datetime
from api_functions import *

In [2]:
# Obtain the value that is passed as `access_token_header` to every API helper call below.
# NOTE(review): get_access_token, CLIENT_ID and CLIENT_SECRET are not defined in this notebook;
# presumably they come from the `api_functions` star-import — confirm.
access_token_header = get_access_token(CLIENT_ID, CLIENT_SECRET)

##### Get playlist IDs

In [3]:
# Genre label -> id of the Spotify playlist used as the source of tracks for that genre
playlist_ids = dict([
    ("classical", "3HYK6ri0GkvRcM6GkKh0hJ"),
    ("country", "4mijVkpSXJziPiOrK7YX4M"),
    ("edm", "3pDxuMpz94eDs7WFqudTbZ"),
    ("hip-hop", "6MXkE0uYF4XwU4VTtyrpfP"),
    ("jazz", "5EyFMotmvSfDAZ4hSdKrbx"),
    ("pop", "6gS3HhOiI17QNojjPuPzqc"),
    ("rap", "6s5MoZzR70Qef7x4bVxDO1"),
    ("rnb", "1rLnwJimWCmjp3f0mEbnkY"),
    ("rock", "7dowgSWOmvdpwNkGFMUs6e"),
])

In [4]:
# Genre to build the dataset for; must be one of the keys of `playlist_ids`.
# This is the only value to change when switching genre.
playlist_name = "rock"
# Look the id up from the name so the playlist queried and the name used for the
# exported file can never get out of sync.
playlist_id = playlist_ids[playlist_name]

##### Get playlist data (i.e. all tracks from playlist)

In [5]:
# First call to get playlist information.
# Later cells read this response's 'items' (the tracks), 'next' (following page, or None)
# and 'total' (used to sanity-check the dataframe length).
playlist_response = playlist_tracks_endpoint(playlist_id, access_token_header)

In [6]:
# A single response only holds one page of tracks, so keep requesting the URL found under
# 'next' of the most recent response until that field is None.
all_responses = [playlist_response]

next_page = all_responses[-1]['next']
while next_page is not None:
    all_responses.append(general_endpoint(next_page, access_token_header))
    next_page = all_responses[-1]['next']

##### Create dataframe with only relevant info from API call(s)

In [7]:
# track id -> {'artists': [...], 'name': ...} for every usable track in the playlist
cleaned_tracks = {}
# ids seen more than once (one entry per repeated occurrence)
duplicates = []

for response in all_responses:
    for item in response['items']:
        track = item['track']
        # NOTE(review): playlist items can apparently carry a null 'track' (removed/unavailable
        # entries) or a null 'id' (local files). Such entries cannot be queried by id later and
        # would otherwise crash here or put None into the index, so they are left out.
        # They are still counted in the playlist's 'total', so the length check below will
        # report False when anything was skipped.
        if track is None or track['id'] is None:
            continue
        track_id = track['id']
        # Keeping track of any duplicate tracks in the playlist
        if track_id in cleaned_tracks:
            duplicates.append(track_id)
        cleaned_tracks[track_id] = {
            'artists': [artist['name'] for artist in track['artists']],
            'name': track['name'],
        }

# One row per unique track id, with the id as the index
all_tracks_df = pd.DataFrame.from_dict(cleaned_tracks, orient='index')

all_tracks_df.tail()

Unnamed: 0,artists,name
6Qb7gtV6Q4MnUjSbkFcopl,[Paul Simon],50 Ways to Leave Your Lover
0y5a9zXjz7NSEkRCPKNZwP,[The Last Shadow Puppets],My Mistakes Were Made For You
0Q2dRzhPWFOex89dOjbq5Y,[Frank Zappa],Watermelon In Easter Hay
5eMjcHVRRh1tbimTgJPoGn,[Jackson Browne],Stay - 2018 Remaster
6E3NosMXYlGD21K7KqIxQ1,[The Vaccines],I Always Knew


##### Ensuring dataframe is correct

In [8]:
# The dataframe holds one row per unique track, so its row count should equal the number of
# tracks reported by the API once repeated occurrences are subtracted
len(all_tracks_df) == playlist_response['total'] - len(duplicates)

True

##### Create comma-delimited strings of ids (in batches) to query Audio Features and Track Information

In [9]:
# The dataframe index holds the unique track ids (it was built with orient='index' from a dict keyed by id)
track_ids = all_tracks_df.index

def prepare_ids_for_query(ids, max_len=100):
    """Split track ids into comma-delimited strings of at most `max_len` ids each.

    Parameters
    ----------
    ids : iterable of str
        Track ids, in the order they should be queried.
    max_len : int, default 100
        Largest number of ids allowed in a single string.

    Returns
    -------
    list of str
        One comma-delimited string per batch, preserving the input order. Every batch
        except possibly the last holds exactly `max_len` ids. An empty input gives an
        empty list, so callers looping over the result issue no request at all.

    Raises
    ------
    ValueError
        If `max_len` is smaller than 1 (no batch could ever make progress).
    """
    if max_len < 1:
        raise ValueError(f"max_len must be at least 1, got {max_len}")

    # Materialise once so any iterable (list, pandas Index, numpy array) can be sliced
    id_list = list(ids)
    # Step through the ids in strides of max_len; slicing past the end is safe and simply
    # yields the shorter final batch
    return [
        ",".join(id_list[start:start + max_len])
        for start in range(0, len(id_list), max_len)
    ]

# Batch sizes differ per endpoint: Get Several Tracks takes at most 50 ids per request,
# Audio Features takes at most 100
ids_list_50 = prepare_ids_for_query(track_ids, max_len=50)
ids_list_100 = prepare_ids_for_query(track_ids, max_len=100)

##### Checking ids have been formatted correctly

In [10]:
# The last group must contain exactly the ids left over once duplicates and the ids already
# placed in the earlier groups are subtracted from the playlist total
ids_list_100[-1].count(",") + 1 == playlist_response['total'] - len(duplicates) - sum(ids.count(",") + 1 for ids in ids_list_100[:-1])

True

In [11]:
# Same leftover check for the groups of (at most) 50 ids
ids_list_50[-1].count(",") + 1 == playlist_response['total'] - len(duplicates) - sum(ids.count(",") + 1 for ids in ids_list_50[:-1])

True

##### Get track information for all tracks in playlist

In [12]:
# One request per comma-delimited group of at most 50 ids; each response is read in the
# cells below through its 'tracks' key
all_tracks_info_list = [multiple_tracks_info_endpoint(ids, access_token_header) for ids in ids_list_50]

In [13]:
def get_release_date(data):
    """Return the release year of a track's album, or ``pd.NA`` when it cannot be determined.

    Parameters
    ----------
    data : dict
        Track object; the date is read from ``data['album']['release_date']``.

    Returns
    -------
    int or pd.NA
        The year of the parsed release date. ``pd.NA`` is returned when the field is
        missing, is None, cannot be parsed, or parses to NaT.
    """
    try:
        parsed = pd.to_datetime(data['album']['release_date'])
        # to_datetime gives None for a None input and NaT for e.g. an empty string; NaT.year
        # would be a float nan, so map both to the same missing marker used for failures
        if parsed is None or parsed is pd.NaT:
            return pd.NA
        return parsed.year
    except Exception:
        # Best-effort on purpose: any malformed track entry yields a missing year rather than
        # aborting the whole run. `Exception` (not a bare except) keeps KeyboardInterrupt and
        # SystemExit working.
        return pd.NA

In [14]:
# Collect one row per track across every group first and build the dataframe once afterwards;
# concatenating onto a growing dataframe inside the loop re-copies all earlier rows each time.
info_ids = []
info_rows = []

# For each length-50 grouping of tracks
for tracks in all_tracks_info_list:
    for data in tracks['tracks']:
        # NOTE(review): the endpoint presumably returns null for an id it cannot resolve;
        # such entries have no 'id'/'popularity' to read, so skip them — confirm.
        if data is None:
            continue
        info_ids.append(data['id'])
        info_rows.append({
            'release_year': get_release_date(data),
            'popularity': data['popularity'],
        })

# Indexed by track id; the explicit columns keep the frame well-formed even with no rows
all_tracks_info_df = pd.DataFrame(info_rows, index=info_ids, columns=['release_year', 'popularity'])

all_tracks_info_df.tail()

Unnamed: 0,release_year,popularity
6Qb7gtV6Q4MnUjSbkFcopl,1975,72
0y5a9zXjz7NSEkRCPKNZwP,2008,36
0Q2dRzhPWFOex89dOjbq5Y,1979,60
5eMjcHVRRh1tbimTgJPoGn,1977,64
6E3NosMXYlGD21K7KqIxQ1,2012,63


##### Get audio features data for all tracks in playlist

In [15]:
all_audio_features_list = [multiple_audio_features_endpoint(ids, access_token_header) for ids in ids_list_100]

##### Create dataframe of audio features

In [16]:
# Flatten every length-100 grouping into one list of per-track feature dicts.
# NOTE(review): the endpoint appears to return null for tracks that have no audio features;
# those entries are skipped here (reading their 'id' would raise) — confirm. Tracks skipped
# this way are simply absent from this dataframe.
feature_rows = [
    features
    for tracks_features in all_audio_features_list
    for features in tracks_features['audio_features']
    if features is not None
]

# Build the dataframe once, indexed by track id, instead of concatenating inside a loop
all_audio_features_df = pd.DataFrame(
    feature_rows,
    index=[features['id'] for features in feature_rows],
)
# Drop irrelevant columns (errors='ignore' so an empty result does not raise)
all_audio_features_df = all_audio_features_df.drop(
    columns=['type', 'id', 'uri', 'track_href', 'analysis_url'],
    errors='ignore',
)

all_audio_features_df.tail()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
6Qb7gtV6Q4MnUjSbkFcopl,0.815,0.372,7,-12.814,1,0.0752,0.166,0.000116,0.0767,0.293,101.684,217347,4
0y5a9zXjz7NSEkRCPKNZwP,0.445,0.913,2,-4.789,0,0.0434,0.0285,0.0,0.376,0.764,97.62,184547,4
0Q2dRzhPWFOex89dOjbq5Y,0.486,0.433,9,-13.21,1,0.0439,0.192,0.031,0.124,0.295,110.936,547960,4
5eMjcHVRRh1tbimTgJPoGn,0.603,0.765,7,-8.265,1,0.0275,0.0974,0.296,0.11,0.72,107.068,204464,4
6E3NosMXYlGD21K7KqIxQ1,0.393,0.872,2,-2.53,1,0.0401,0.000122,0.000407,0.312,0.393,150.097,214347,4


##### Merge dataframes together

In [17]:
# Combine release year / popularity with the audio features, matching rows on the track-id
# index of both frames (default inner join: only ids present in both are kept)
info_with_features_df = all_tracks_info_df.merge(
    all_audio_features_df,
    left_index=True,
    right_index=True,
)

info_with_features_df.tail()

Unnamed: 0,release_year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
6Qb7gtV6Q4MnUjSbkFcopl,1975,72,0.815,0.372,7,-12.814,1,0.0752,0.166,0.000116,0.0767,0.293,101.684,217347,4
0y5a9zXjz7NSEkRCPKNZwP,2008,36,0.445,0.913,2,-4.789,0,0.0434,0.0285,0.0,0.376,0.764,97.62,184547,4
0Q2dRzhPWFOex89dOjbq5Y,1979,60,0.486,0.433,9,-13.21,1,0.0439,0.192,0.031,0.124,0.295,110.936,547960,4
5eMjcHVRRh1tbimTgJPoGn,1977,64,0.603,0.765,7,-8.265,1,0.0275,0.0974,0.296,0.11,0.72,107.068,204464,4
6E3NosMXYlGD21K7KqIxQ1,2012,63,0.393,0.872,2,-2.53,1,0.0401,0.000122,0.000407,0.312,0.393,150.097,214347,4


In [33]:
# Attach artists and track name to the numeric columns (matched on the track-id index),
# then discard every row that has a missing value in any column
tracks_df = (
    all_tracks_df
    .merge(info_with_features_df, left_index=True, right_index=True)
    .dropna()
)

tracks_df

Unnamed: 0,artists,name,release_year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
3qiyyUfYe7CRYLucrPmulD,[The Who],Baba O'Riley,1971,79,0.489,0.724,5,-8.367,1,0.0352,0.313000,0.185000,0.2870,0.150,117.292,300400,4
1QEEqeFIZktqIpPI4jSVSF,[Boston],More Than a Feeling,1976,81,0.377,0.681,7,-8.039,1,0.0298,0.000880,0.002300,0.0504,0.285,108.789,285133,4
7N3PAbqfTjSEU1edb2tY8j,[Van Halen],Jump - 2015 Remaster,1984,81,0.572,0.835,0,-6.219,1,0.0317,0.171000,0.000377,0.0702,0.795,129.981,241600,4
1bp2IO61zbQrbWNmKKxg3f,[Steve Miller Band],The Joker,1973,78,0.596,0.448,5,-9.616,1,0.0396,0.365000,0.000005,0.2060,0.796,83.293,264504,4
5CQ30WqJwcep0pYcV4AMNc,[Led Zeppelin],Stairway to Heaven - Remaster,1971,82,0.338,0.340,9,-12.049,0,0.0339,0.580000,0.003200,0.1160,0.197,82.433,482830,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6Qb7gtV6Q4MnUjSbkFcopl,[Paul Simon],50 Ways to Leave Your Lover,1975,72,0.815,0.372,7,-12.814,1,0.0752,0.166000,0.000116,0.0767,0.293,101.684,217347,4
0y5a9zXjz7NSEkRCPKNZwP,[The Last Shadow Puppets],My Mistakes Were Made For You,2008,36,0.445,0.913,2,-4.789,0,0.0434,0.028500,0.000000,0.3760,0.764,97.620,184547,4
0Q2dRzhPWFOex89dOjbq5Y,[Frank Zappa],Watermelon In Easter Hay,1979,60,0.486,0.433,9,-13.210,1,0.0439,0.192000,0.031000,0.1240,0.295,110.936,547960,4
5eMjcHVRRh1tbimTgJPoGn,[Jackson Browne],Stay - 2018 Remaster,1977,64,0.603,0.765,7,-8.265,1,0.0275,0.097400,0.296000,0.1100,0.720,107.068,204464,4


##### Export data to file

In [20]:
# Write the final dataframe to a pickle file named after the selected playlist.
# NOTE(review): the path is relative to the notebook's working directory and the
# "../data_deprecated/" folder presumably has to exist already — confirm before running.
tracks_df.to_pickle(f"../data_deprecated/spotify_dataset_{playlist_name}.pkl")