## Constants

In [9]:
import os

# Dataset dimensions: number of input files and number of playlists in each file.
PLAYLISTS_PER_FILE = 1000
FILE_COUNT = 1000
PLAYLIST_COUNT = FILE_COUNT * PLAYLISTS_PER_FILE

# Data directories, given relative to the current working directory.
DIR_DATA_RAW = os.path.join("..", "data", "raw")
DIR_DATA_PROCESSED = os.path.join("..", "data", "processed")

Merge the audio-feature data extracted from the Spotify API (CSV file) with the playlist data from the Spotify Million Playlist Dataset (pickle file).

In [10]:
import os
import pandas as pd

# Location of the per-track audio features CSV inside the raw data directory.
path_audio_features = os.path.join(
    DIR_DATA_RAW, "tracks", "track_audio_features.csv"
)

# Read the CSV, then promote the "id" column to the index so that rows can be
# looked up by id with `.loc` / membership tests on the index.
audio_features = pd.read_csv(path_audio_features)
audio_features = audio_features.set_index("id")

display(audio_features)

Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0000uJA4xCdxThagdLkkLR,184.913,5.0,1.0,-5.621,0.458,0.5910,0.0326,0.5680,0.000015,0.2860,0.6540,161187.0,3.0
0002yNGLtYSYtc0X6ZnFvp,182.345,8.0,1.0,-11.572,0.455,0.6230,0.0523,0.7970,0.903000,0.6340,0.9510,220293.0,4.0
00039MgrmLoIzSpuYKurn9,132.064,1.0,1.0,-5.632,0.742,0.7530,0.0364,0.0178,0.000000,0.1330,0.2630,222727.0,4.0
0005rgjsSeVLp1cze57jIN,133.158,1.0,0.0,-6.141,0.507,0.4460,0.0276,0.7990,0.000000,0.3190,0.4180,213960.0,4.0
0006Rv1e2Xfh6QooyKJqKS,89.048,2.0,0.0,-9.190,0.295,0.4980,0.0301,0.7950,0.944000,0.1070,0.0445,189639.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zzptITgTKf4HpJM8ye47v,81.071,1.0,0.0,-6.398,0.447,0.7240,0.0372,0.7880,0.202000,0.2420,0.9400,215813.0,4.0
7zzpwV2lgKsLke68yFoZdp,129.996,7.0,1.0,-2.558,0.497,0.6980,0.0317,0.1270,0.000000,0.1160,0.5520,233933.0,4.0
7zzrzbrb14URUZlmSrCGfM,98.463,9.0,0.0,-25.031,0.347,0.0127,0.0453,0.9490,0.000033,0.0478,0.1510,340560.0,3.0
7zzwFo2lPCgXphtN89XmLk,124.999,0.0,1.0,-10.112,0.637,0.7890,0.0538,0.0500,0.563000,0.2330,0.8600,297520.0,4.0


Create a function to compute relevant metrics about a playlist's audio features.

In [11]:
def get_playlist_metrics(df: pd.core.frame.DataFrame):
    """Summarise every column of ``df`` with a fixed set of descriptive statistics.

    For each column the following are computed: min, first quartile (q1), mean,
    median, third quartile (q3), max, standard deviation and variance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised (one statistic set per column).

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) whose columns are a two-level
        MultiIndex of ``(column name, statistic name)``, sorted on both levels.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.core.frame.DataFrame)

    # Statistic label -> reducer applied to each column (a Series).
    reducers = {
        "min": pd.Series.min,
        "q1": lambda column: column.quantile(0.25),
        "mean": pd.Series.mean,
        "median": pd.Series.median,
        "q3": lambda column: column.quantile(0.75),
        "max": pd.Series.max,
        "standard deviation": pd.Series.std,
        "variance": pd.Series.var,
    }

    # Each reducer yields one value per column; reshape that Series into a
    # one-row frame so the statistics can be placed side by side.
    per_statistic = {
        label: df.apply(reducer).to_frame().transpose()
        for label, reducer in reducers.items()
    }
    combined = pd.concat(per_statistic, axis=1)

    # concat puts the statistic label on the outer level; swap so the column
    # name comes first, then sort so each column's statistics are contiguous.
    combined.columns = combined.columns.swaplevel(0, 1)
    return combined.sort_index(axis=1, level=0)

Combine each playlist's extracted metadata with its calculated audio-feature metrics into a single playlist DataFrame (one row per playlist).

In [20]:
import pickle

playlist_list_path = os.path.join(DIR_DATA_PROCESSED, "playlists" + str(PLAYLIST_COUNT) + ".pkl")

# Maximum number of playlists to process; set to None to process every loaded playlist.
# The default of 5 keeps the behaviour of the previous hard-coded slice.
PLAYLIST_LIMIT = 5

print("Loading playlist data from pickle file...")
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that were produced by this project.
with open(playlist_list_path, "rb") as fin:
    playlist_data = pickle.load(fin)

print("Start extracting playlist metrics...")
selected_playlists = playlist_data[:PLAYLIST_LIMIT]

# One single-row frame per playlist; concatenated once after the loop instead of
# re-copying an ever-growing frame on every iteration.
playlist_rows = []

for playlist_index, playlist in enumerate(selected_playlists):
    # Collect the ids that have audio features, reporting the ones that do not.
    # The track counter has its own name so it cannot clobber the playlist counter
    # used by the progress message below.
    known_track_ids = []
    for track_index, track_id in enumerate(playlist["track_ids"]):
        if track_id not in audio_features.index: # TODO: handle cases when track id has no audio features
            print("Playlist {} : missing audio features for track {}/{} with id = {}".format(playlist["pid"], track_index, playlist["num_tracks"], track_id))
            continue
        known_track_ids.append(track_id)

    # A single label-based lookup returns the rows in playlist order (repeated ids
    # yield repeated rows); an empty id list yields an empty frame with the same columns.
    playlist_features = audio_features.loc[known_track_ids]

    # Every playlist field except the track list becomes a column under a
    # top-level "metadata" label, matching the two-level columns of the metrics.
    playlist_info = pd.DataFrame(data={k: [v] for k, v in playlist.items() if k != "track_ids"})
    playlist_info.columns = pd.MultiIndex.from_product([['metadata'], playlist_info.columns])

    playlist_rows.append(pd.concat([playlist_info, get_playlist_metrics(playlist_features)], axis=1))

    # Progress is reported against the playlists actually being processed.
    print("Processed playlists: {}/{}".format(playlist_index + 1, len(selected_playlists)), end="\r")

# pd.concat raises on an empty list, so fall back to an empty frame in that case.
playlists_metrics = pd.concat(playlist_rows, ignore_index=True) if playlist_rows else pd.DataFrame()

display(playlists_metrics)

Loading playlist data from pickle file...
Start extracting playlist metrics...
Playlist 4 : missing audio features for track 5/17 with id = 3Y8Ff1nH44jFywAtpgmleZ
Playlist 4 : missing audio features for track 9/17 with id = 6G8fblD9DbcEmaKOKDy3XL
Playlist 4 : missing audio features for track 14/17 with id = 3XEtw7t4V5sfd2vtkp0ql7
Playlist 8 : missing audio features for track 6/46 with id = 6oXUzxFOfdXbD8ivAfAraN
Playlist 8 : missing audio features for track 8/46 with id = 5LlsD7LdSMkGV4Iu0a2Zq0
Playlist 8 : missing audio features for track 10/46 with id = 3dEFa9KjOLEZl980ctEEv1
Playlist 8 : missing audio features for track 13/46 with id = 5TGYo4MrNKqIvJqgx134py
Playlist 8 : missing audio features for track 22/46 with id = 0GO8y8jQk1PkHzS31d699N
Playlist 8 : missing audio features for track 39/46 with id = 7uEcCGtM1FBBGIhPozhJjv
Playlist 10 : missing audio features for track 5/72 with id = 3NJG6vMH1ZsectZkocMEm0
Playlist 10 : missing audio features for track 14/72 with id = 49GYdiLjYMrU

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,acousticness,...,time_signature,time_signature,valence,valence,valence,valence,valence,valence,valence,valence
Unnamed: 0_level_1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,num_artists,max,...,standard deviation,variance,max,mean,median,min,q1,q3,standard deviation,variance
0,90s,False,4,1401667200,17,16,2,7,16,0.454,...,0.0,0.0,0.783,0.448571,0.4835,0.144,0.287,0.55675,0.196927,0.03878
1,BOP,False,8,1508976000,46,37,2,21,23,0.969,...,0.158114,0.025,0.9,0.540075,0.569,0.0931,0.33575,0.756,0.233526,0.054534
2,abby,False,10,1509321600,72,60,2,36,40,0.826,...,0.438738,0.192491,0.883,0.478452,0.4485,0.123,0.31675,0.646,0.220566,0.048649
3,mixtape,False,20,1509494400,14,9,3,11,6,0.351,...,0.0,0.0,0.68,0.337754,0.313,0.0793,0.238,0.484,0.192852,0.037192
4,fall '17,False,22,1509408000,42,39,2,15,37,0.935,...,0.226294,0.051209,0.962,0.430063,0.409,0.0373,0.28075,0.5565,0.22385,0.050109


In [13]:
import pickle

# Destination of the pickled metrics frame; the file name embeds PLAYLIST_COUNT.
metrics_file_name = "playlists_metrics" + str(PLAYLIST_COUNT) + ".pkl"
PLAYLISTS_METRICS_PATH = os.path.join(DIR_DATA_PROCESSED, metrics_file_name)

# Serialise the metrics frame with the highest pickle protocol this interpreter supports.
with open(PLAYLISTS_METRICS_PATH, "wb") as metrics_out:
    pickle.dump(playlists_metrics, metrics_out, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Round-trip check: read the metrics pickle back from disk and show what was stored.
with open(PLAYLISTS_METRICS_PATH, "rb") as metrics_in:
    test = pickle.load(metrics_in)

display(test)

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,acousticness,...,time_signature,time_signature,valence,valence,valence,valence,valence,valence,valence,valence
Unnamed: 0_level_1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,num_artists,max,...,standard deviation,variance,max,mean,median,min,q1,q3,standard deviation,variance
0,90s,False,4,1401667200,17,16,2,7,16,0.454,...,0.0,0.0,0.783,0.448571,0.4835,0.144,0.287,0.55675,0.196927,0.03878
