## Constants

In [9]:
import os

# Dataset size: PLAYLIST_COUNT is the product of the two counts below and is
# used further down to build the names of the processed pickle files.
FILE_COUNT = 1000
PLAYLISTS_PER_FILE = 1000
PLAYLIST_COUNT = FILE_COUNT * PLAYLISTS_PER_FILE

# Raw and processed data directories, relative to the current working directory.
DIR_DATA_RAW, DIR_DATA_PROCESSED = (
    os.path.join("..", "data", subdir) for subdir in ("raw", "processed")
)

Load the audio features extracted from the Spotify API (CSV file). Further down they are merged with the playlists of the Spotify Million Playlist Dataset (pickle file).

In [10]:
import os
import pandas as pd

# Location of the audio-features CSV inside the raw data directory.
path_audio_features = os.path.join(DIR_DATA_RAW, "tracks", "track_audio_features.csv")

# Index the rows by the "id" column so that features can be looked up by id.
audio_features = (
    pd.read_csv(path_audio_features)
    .set_index("id")
)

display(audio_features)

Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3zyYVItCMCjFzBHTyjrxPK,79.227,8.0,1.0,-3.406,0.466,0.856,0.3180,0.257000,0.000000,0.6750,0.531,292223.0,4.0
4jSy0HTIoC9yiwZ8OVyTCW,164.207,1.0,1.0,-7.972,0.510,0.849,0.1190,0.000546,0.000748,0.6890,0.887,315067.0,4.0
4zyqBSUFNkJ20mw1FB68gt,83.947,4.0,0.0,-22.867,0.308,0.114,0.0321,0.958000,0.902000,0.0853,0.303,350906.0,4.0
63B3TtwUzOoJoe3unMteVa,93.696,1.0,0.0,-4.166,0.660,0.943,0.2770,0.129000,0.000000,0.5570,0.599,210733.0,4.0
7y9iMe8SOB6z3NoHE2OfXl,118.384,0.0,1.0,-3.539,0.675,0.751,0.0296,0.060400,0.000000,0.0893,0.612,181279.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0s9iofqbQdsmWZQJ1TZh1c,118.061,7.0,1.0,-13.766,0.550,0.351,0.0443,0.531000,0.006020,0.1500,0.237,162826.0,4.0
5Ozvg38CnCQKmPP8tgSRA6,96.520,0.0,1.0,-4.130,0.609,0.742,0.0477,0.001310,0.000002,0.0725,0.531,256227.0,4.0
2yWsZpLZNmap92YkGkzyDq,179.985,6.0,1.0,-5.899,0.373,0.584,0.0272,0.004810,0.449000,0.1630,0.403,214667.0,3.0
7JxA6bPIyXNohnrTw6CzS0,178.012,7.0,0.0,-7.096,0.473,0.829,0.3620,0.048900,0.000000,0.2530,0.583,270680.0,4.0


Create a function to compute relevant metrics about a playlist's audio features.

In [11]:
def get_playlist_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise every column of ``df`` with a fixed set of statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are reduced independently of each other.

    Returns
    -------
    pd.DataFrame
        A single-row frame with two-level columns ``(column, statistic)``,
        sorted by column name and then by statistic name. The statistics are
        "min", "q1", "mean", "median", "q3", "max", "standard deviation" and
        "variance".

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame)

    # Statistic label -> reducer applied to each column (a Series).
    reducers = {
        "min": pd.Series.min,
        "q1": lambda column: column.quantile(0.25),
        "mean": pd.Series.mean,
        "median": pd.Series.median,
        "q3": lambda column: column.quantile(0.75),
        "max": pd.Series.max,
        "standard deviation": pd.Series.std,
        "variance": pd.Series.var,
    }

    # Each reducer yields one value per column; turn that Series into a
    # one-row frame so the statistics can be placed side by side.
    one_row_per_statistic = {
        label: df.apply(reducer).to_frame().transpose()
        for label, reducer in reducers.items()
    }
    combined = pd.concat(one_row_per_statistic, axis=1)

    # concat puts the statistic label on the outer column level; swap so the
    # original column name comes first, then sort the columns.
    combined.columns = combined.columns.swaplevel(0, 1)
    return combined.sort_index(axis=1, level=0)

Combine each playlist's metadata with the metrics computed from its tracks' audio features into a single playlists frame.

In [15]:
import pickle

playlist_list_path = os.path.join(DIR_DATA_PROCESSED, "playlists" + str(PLAYLIST_COUNT) + ".pkl")

# NOTE(review): pickle.load can execute arbitrary code; only load files that
# were produced by this project's own preprocessing.
with open(playlist_list_path, "rb") as fin:
    playlist_data = pickle.load(fin)

# Hoisted out of the loop: the index is used for every membership test below.
known_track_ids = audio_features.index

# Collect one single-row frame per playlist and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop copies every accumulated
# row on each iteration, which is quadratic in the number of playlists.
playlist_rows = []

for i, playlist in enumerate(playlist_data):
    # Tracks without audio features are skipped.
    # TODO: handle cases when track id has no audio features
    present_track_ids = [track_id for track_id in playlist["track_ids"]
                         if track_id in known_track_ids]

    # One label-based lookup per playlist instead of one concat per track.
    # A track listed several times contributes several rows, as before.
    # NOTE(review): assumes ids are unique in audio_features -- TODO confirm.
    playlist_features = audio_features.loc[present_track_ids]

    # Every playlist entry except the track list becomes a "metadata" column.
    playlist_info = pd.DataFrame(data={k: [v] for k, v in playlist.items() if k != "track_ids"})
    playlist_info.columns = pd.MultiIndex.from_product([['metadata'], playlist_info.columns])

    playlist_metrics = pd.concat([playlist_info, get_playlist_metrics(playlist_features)], axis=1)
    playlist_rows.append(playlist_metrics)

    print("Processed playlists: {}/{}".format(i+1, len(playlist_data)), end="\r")

# pd.concat raises on an empty list, so keep the empty-frame result for no playlists.
playlists_metrics = pd.concat(playlist_rows, ignore_index=True) if playlist_rows else pd.DataFrame()

display(playlists_metrics)

Processed playlists: 245781/245781

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,acousticness,...,time_signature,time_signature,valence,valence,valence,valence,valence,valence,valence,valence
Unnamed: 0_level_1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,num_artists,max,...,standard deviation,variance,max,mean,median,min,q1,q3,standard deviation,variance
0,90s,false,4,1401667200,17,16,2,7,16,0.31100,...,,,0.757,0.757000,0.7570,0.7570,0.75700,0.75700,,
1,BOP,false,8,1508976000,46,37,2,21,23,0.02890,...,0.000000,0.000000,0.650,0.624000,0.6240,0.5980,0.61100,0.63700,0.036770,0.001352
2,abby,false,10,1509321600,72,60,2,36,40,0.70000,...,0.000000,0.000000,0.725,0.436667,0.4230,0.1810,0.26750,0.59500,0.220136,0.048460
3,mixtape,false,20,1509494400,14,9,3,11,6,0.05790,...,0.000000,0.000000,0.496,0.419500,0.4195,0.3430,0.38125,0.45775,0.108187,0.011704
4,fall '17,false,22,1509408000,42,39,2,15,37,0.25300,...,0.000000,0.000000,0.645,0.626000,0.6260,0.6070,0.61650,0.63550,0.026870,0.000722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245776,Baby Making Music,false,999985,1455753600,28,26,3,18,16,0.15300,...,,,0.515,0.515000,0.5150,0.5150,0.51500,0.51500,,
245777,Work,false,999986,1509321600,130,113,3,51,82,0.49800,...,0.000000,0.000000,0.863,0.461323,0.4460,0.0862,0.25100,0.64600,0.249748,0.062374
245778,Calm,false,999987,1506643200,182,157,2,97,112,0.97200,...,0.307794,0.094737,0.530,0.312500,0.3040,0.1170,0.25375,0.40025,0.110891,0.012297
245779,Jams,false,999988,1500336000,27,21,2,20,18,0.27400,...,0.000000,0.000000,0.370,0.292000,0.2810,0.2250,0.25300,0.32550,0.073123,0.005347


In [18]:
import pickle

# The playlist count is part of the file name of the saved metrics frame.
metrics_file_name = "playlists_metrics" + str(PLAYLIST_COUNT) + ".pkl"
PLAYLISTS_METRICS_PATH = os.path.join(DIR_DATA_PROCESSED, metrics_file_name)

# Serialise the combined frame with the newest pickle protocol available.
with open(PLAYLISTS_METRICS_PATH, "wb") as pickle_target:
    pickle.dump(playlists_metrics, pickle_target, protocol=pickle.HIGHEST_PROTOCOL)

In [19]:
# Read the pickle back in and show it, to confirm the saved frame can be loaded.
with open(PLAYLISTS_METRICS_PATH, "rb") as pickle_source:
    test = pickle.load(pickle_source)

display(test)

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,acousticness,...,time_signature,time_signature,valence,valence,valence,valence,valence,valence,valence,valence
Unnamed: 0_level_1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,num_artists,max,...,standard deviation,variance,max,mean,median,min,q1,q3,standard deviation,variance
0,90s,false,4,1401667200,17,16,2,7,16,0.31100,...,,,0.757,0.757000,0.7570,0.7570,0.75700,0.75700,,
1,BOP,false,8,1508976000,46,37,2,21,23,0.02890,...,0.000000,0.000000,0.650,0.624000,0.6240,0.5980,0.61100,0.63700,0.036770,0.001352
2,abby,false,10,1509321600,72,60,2,36,40,0.70000,...,0.000000,0.000000,0.725,0.436667,0.4230,0.1810,0.26750,0.59500,0.220136,0.048460
3,mixtape,false,20,1509494400,14,9,3,11,6,0.05790,...,0.000000,0.000000,0.496,0.419500,0.4195,0.3430,0.38125,0.45775,0.108187,0.011704
4,fall '17,false,22,1509408000,42,39,2,15,37,0.25300,...,0.000000,0.000000,0.645,0.626000,0.6260,0.6070,0.61650,0.63550,0.026870,0.000722
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245776,Baby Making Music,false,999985,1455753600,28,26,3,18,16,0.15300,...,,,0.515,0.515000,0.5150,0.5150,0.51500,0.51500,,
245777,Work,false,999986,1509321600,130,113,3,51,82,0.49800,...,0.000000,0.000000,0.863,0.461323,0.4460,0.0862,0.25100,0.64600,0.249748,0.062374
245778,Calm,false,999987,1506643200,182,157,2,97,112,0.97200,...,0.307794,0.094737,0.530,0.312500,0.3040,0.1170,0.25375,0.40025,0.110891,0.012297
245779,Jams,false,999988,1500336000,27,21,2,20,18,0.27400,...,0.000000,0.000000,0.370,0.292000,0.2810,0.2250,0.25300,0.32550,0.073123,0.005347
