## Constants

In [12]:
import os

# Dataset size: 1000 files x 1000 playlists per file = 1,000,000 playlists in total
PLAYLISTS_PER_FILE = 1000
FILE_COUNT = 1000
PLAYLIST_COUNT = FILE_COUNT * PLAYLISTS_PER_FILE

# Data directories, given relative to the current working directory
DIR_DATA_PROCESSED = os.path.join("..", "data", "processed")
DIR_DATA_RAW = os.path.join("..", "data", "raw")

# Pickle files in the processed directory; the playlist count is part of the file name
PATH_PLAYLIST_LIST = os.path.join(DIR_DATA_PROCESSED, "playlists{}.pkl".format(PLAYLIST_COUNT))
PATH_OUT = os.path.join(DIR_DATA_PROCESSED, "playlists_metrics{}.pkl".format(PLAYLIST_COUNT))

Merge the data extracted from the Spotify API (csv file) with that of the Spotify Million Playlist Dataset (pickle file)

In [3]:
import os
import pandas as pd

# Location of the CSV with the per-track audio features taken from the Spotify API
path_audio_features = os.path.join(DIR_DATA_RAW, "tracks", "track_audio_features.csv")

# Read the CSV, then index the rows by track id so tracks can be looked up by id
audio_features = pd.read_csv(path_audio_features)
audio_features = audio_features.set_index("id")

display(audio_features)

Unnamed: 0_level_0,tempo,key,mode,loudness,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,duration_ms,time_signature
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0000uJA4xCdxThagdLkkLR,184.913,5.0,1.0,-5.621,0.458,0.5910,0.0326,0.5680,0.000015,0.2860,0.6540,161187.0,3.0
0002yNGLtYSYtc0X6ZnFvp,182.345,8.0,1.0,-11.572,0.455,0.6230,0.0523,0.7970,0.903000,0.6340,0.9510,220293.0,4.0
00039MgrmLoIzSpuYKurn9,132.064,1.0,1.0,-5.632,0.742,0.7530,0.0364,0.0178,0.000000,0.1330,0.2630,222727.0,4.0
0005rgjsSeVLp1cze57jIN,133.158,1.0,0.0,-6.141,0.507,0.4460,0.0276,0.7990,0.000000,0.3190,0.4180,213960.0,4.0
0006Rv1e2Xfh6QooyKJqKS,89.048,2.0,0.0,-9.190,0.295,0.4980,0.0301,0.7950,0.944000,0.1070,0.0445,189639.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7zzptITgTKf4HpJM8ye47v,81.071,1.0,0.0,-6.398,0.447,0.7240,0.0372,0.7880,0.202000,0.2420,0.9400,215813.0,4.0
7zzpwV2lgKsLke68yFoZdp,129.996,7.0,1.0,-2.558,0.497,0.6980,0.0317,0.1270,0.000000,0.1160,0.5520,233933.0,4.0
7zzrzbrb14URUZlmSrCGfM,98.463,9.0,0.0,-25.031,0.347,0.0127,0.0453,0.9490,0.000033,0.0478,0.1510,340560.0,3.0
7zzwFo2lPCgXphtN89XmLk,124.999,0.0,1.0,-10.112,0.637,0.7890,0.0538,0.0500,0.563000,0.2330,0.8600,297520.0,4.0


## Calculate Playlist Statistics

Initially, the processing was done single-threaded (see [legacy code](#single-threaded-version-legacy)). 
However, that approach resulted in a runtime of ~2 hours.

The improved version splits the work over 8 workers (as separate *processes*, since *threads* cannot execute Python code in parallel because of CPython's global interpreter lock), concurrently processing parts of the list of playlists and only combining their results upon completion.
Overall, this reduced the runtime to ~22 minutes for the same number of playlists.

Processes of the multiprocessing module can't use functions that are defined as part of a notebook. Therefore, the worker function is in a separate Python file.

### Import Playlist List

In [None]:
import pickle

# Read the pickled playlist list from PATH_PLAYLIST_LIST
with open(PATH_PLAYLIST_LIST, mode="rb") as playlist_file:
    playlist_data = pickle.load(playlist_file)

### Process Playlists (Multiprocessing)

In [None]:
from multiprocessing import Pool
from process_playlists import get_playlist_metrics, process_playlists

# Number of worker processes the playlist list is split across
num_processes = 8
results = []

# Each worker gets one contiguous [start, end) slice of playlist_data; the last
# worker additionally takes the remainder left over by the integer division.
playlists_per_process = len(playlist_data) // num_processes
processes = []

# Used as a context manager, the pool is terminated when the block is left.
# This stops the workers on every exit path: normal completion, a
# KeyboardInterrupt, or an exception raised by one of the workers. The
# original exception and its traceback propagate unchanged.
with Pool(processes=num_processes) as pool:
    for i in range(num_processes):
        start = i * playlists_per_process
        end = (i + 1) * playlists_per_process

        if i == num_processes - 1:
            end = len(playlist_data)

        process = pool.apply_async(process_playlists, args=(playlist_data, audio_features, start, end, i))
        processes.append(process)

    # get() blocks until the task has finished and re-raises an exception
    # that occurred inside the worker.
    for process in processes:
        result = process.get()
        results.append(result)

# Concatenate the results
playlists_metrics = pd.concat(results, ignore_index=True)
display(playlists_metrics)


### Single Threaded Version (Legacy)

In [None]:
from process_playlists import get_playlist_metrics

# Legacy single-process version: computes the metrics for one playlist after
# the other and combines them into `playlists_metrics`.

# One single-row metrics frame per playlist; concatenated once after the loop
# instead of re-copying the growing result frame on every iteration.
playlist_rows = []

for i, playlist in enumerate(playlist_data):

    # Frame with the audio feature columns and one all-NaN placeholder row per
    # distinct track id of this playlist.
    # NOTE(review): the placeholder rows stay in the frame next to the feature
    # rows appended below; presumably get_playlist_metrics ignores NaN values - confirm.
    playlist_features = pd.DataFrame(columns = audio_features.columns.tolist())
    new_index = pd.Index.union(playlist_features.index, playlist['track_ids'])
    playlist_features = playlist_features.reindex(new_index)

    for j, track_id in enumerate(playlist["track_ids"]):
        if track_id not in audio_features.index:
            # Report the 1-based position of the track within the playlist
            # (the playlist index `i` was printed here before).
            print("Playlist {} : missing audio features for track {}/{} with id = {}".format(playlist["pid"], j + 1, playlist["num_tracks"], track_id))
            continue

        track_features = pd.DataFrame(audio_features.loc[track_id]).T # .loc gives series, so transpose is needed

        playlist_features = pd.concat([playlist_features if not playlist_features.empty else None,
                                       track_features], axis=0)

    # Playlist metadata (everything except the track ids) as a single row,
    # grouped under a top-level 'metadata' column.
    playlist_info = pd.DataFrame(data={k:[v] for k,v in playlist.items() if k != "track_ids"})
    playlist_info.columns = pd.MultiIndex.from_product([['metadata'], playlist_info.columns])
    playlist_metrics = pd.concat([playlist_info, get_playlist_metrics(playlist_features)], axis=1)

    playlist_rows.append(playlist_metrics)

    print("Processed playlists: {}/{}".format(i+1, len(playlist_data)), end="\r")

# pd.concat raises on an empty list, so fall back to an empty frame when there
# are no playlists.
playlists_metrics = pd.concat(playlist_rows, ignore_index=True) if playlist_rows else pd.DataFrame()

display(playlists_metrics)

## Save/View Results

In [10]:
import pickle

# Write the playlist metrics to PATH_OUT using the highest available pickle protocol
with open(PATH_OUT, mode="wb") as metrics_file:
    pickle.dump(playlists_metrics, metrics_file, protocol=pickle.HIGHEST_PROTOCOL)

### Check File Contents

In [14]:
# Read the pickle at PATH_OUT back from disk and display its contents
with open(PATH_OUT, mode="rb") as metrics_file:
    display(pickle.load(metrics_file))

Unnamed: 0_level_0,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,metadata,acousticness,...,time_signature,time_signature,valence,valence,valence,valence,valence,valence,valence,valence
Unnamed: 0_level_1,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,num_edits,num_artists,max,...,standard deviation,variance,max,mean,median,min,q1,q3,standard deviation,variance
0,90s,false,4,1401667200,17,16,2,7,16,0.948,...,0.242536,0.058824,0.964,0.490294,0.4980,0.1290,0.27900,0.60900,0.262911,0.069122
1,BOP,false,8,1508976000,46,37,2,21,23,0.969,...,0.147442,0.021739,0.900,0.539109,0.5690,0.0931,0.34875,0.71300,0.222164,0.049357
2,abby,false,10,1509321600,72,60,2,36,40,0.826,...,0.407289,0.165884,0.883,0.463861,0.4275,0.1230,0.30400,0.61225,0.216904,0.047048
3,mixtape,false,20,1509494400,14,9,3,11,6,0.351,...,0.000000,0.000000,0.704,0.363914,0.3280,0.0793,0.24350,0.49300,0.209552,0.043912
4,fall '17,false,22,1509408000,42,39,2,15,37,0.935,...,0.517409,0.267712,0.962,0.437533,0.4090,0.0373,0.28075,0.59500,0.229421,0.052634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245776,Baby Making Music,false,999985,1455753600,28,26,3,18,16,0.679,...,0.314970,0.099206,0.617,0.405579,0.4055,0.0812,0.31900,0.50075,0.135829,0.018450
245777,Work,false,999986,1509321600,130,113,3,51,82,0.869,...,0.087706,0.007692,0.949,0.479970,0.4825,0.0378,0.33650,0.62375,0.200056,0.040022
245778,Calm,false,999987,1506643200,182,157,2,97,112,0.979,...,0.295114,0.087092,0.803,0.306197,0.2760,0.0385,0.17650,0.40575,0.169598,0.028764
245779,Jams,false,999988,1500336000,27,21,2,20,18,0.330,...,0.000000,0.000000,0.845,0.417741,0.3980,0.1100,0.29650,0.51950,0.187114,0.035012
