Note: each of the cells can be run individually, as long as the prerequisite pickle files exist in the `/data/processed/` directory. Otherwise, executing all cells from scratch can take a substantial amount of time.

## Constants for the Notebook

In [None]:
import os

# How many raw slice files to read, and how many playlists each one holds.
file_count = 50   # TODO: change to 1000
playlists_per_file = 1000

# Number of playlists contained in the selected slice files (before filtering).
playlist_count = file_count * playlists_per_file

# Follower-count cut-off used when selecting playlists.
min_follower_threshold = 2

# Input and output folders, resolved relative to the current working directory.
dir_data_raw = os.path.join(os.pardir, "data", "raw")
dir_data_processed = os.path.join(os.pardir, "data", "processed")

## Extracting Playlist Data

The 1 000 000 Spotify playlists are stored in the `/data/raw/playlists/` folder, downloaded from [this source](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge/dataset_files). The playlists are stored in 1000 `.json` files with 1000 playlists each.

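Read all JSON files and merge their playlist information into a single list structure. Playlists with fewer than `min_follower_threshold` followers are skipped, and each remaining playlist keeps only the ids of its tracks instead of the full track records. The Python list is saved to a pickle file as an intermediate representation: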

In [88]:
import json
import pickle
import os

# Destination pickle; the file name encodes how many playlists were scanned.
playlist_list_path = os.path.join(dir_data_processed, "playlists" + str(playlist_count) + ".pkl")

# Raw slice files live in the "playlists" sub-folder of the raw data directory.
playlist_json_dir = os.path.join(dir_data_raw, "playlists")

# Length of the URI scheme that precedes the bare track id.
track_uri_prefix_length = len("spotify:track:")

playlist_list = []

for file_index in range(file_count):
    # Each slice file is named after the inclusive playlist-id range it holds.
    start_playlist_id = file_index * playlists_per_file
    end_playlist_id = start_playlist_id + playlists_per_file - 1
    playlist_json_path = os.path.join(
        playlist_json_dir,
        "mpd.slice." + str(start_playlist_id) + "-" + str(end_playlist_id) + ".json",
    )

    # Decode explicitly as UTF-8: relying on the platform default encoding can
    # fail or garble non-ASCII playlist names (e.g. on Windows).
    with open(playlist_json_path, "r", encoding="utf-8") as json_file:
        data = json.load(json_file)

    playlist_data = data.get("playlists", [])

    for playlist in playlist_data:
        # Skip playlists below the follower threshold.
        if playlist["num_followers"] < min_follower_threshold:
            continue

        # Keep only the id part of every "spotify:track:<id>" URI.
        track_id_list = [
            track["track_uri"][track_uri_prefix_length:]
            for track in playlist["tracks"]
        ]

        # Replace the full track records (and the duration field) with the id list.
        del playlist["duration_ms"]
        del playlist["tracks"]
        playlist["track_ids"] = track_id_list

        playlist_list.append(playlist)

# Data-specific sanity check: the expected value depends on the raw files and
# on the follower threshold in use.
assert playlist_list[3]['name'] == "mixtape"

# Make sure the output folder exists so a fresh checkout does not fail here.
os.makedirs(dir_data_processed, exist_ok=True)

with open(playlist_list_path, "wb") as fout:
    pickle.dump(playlist_list, fout, protocol=pickle.HIGHEST_PROTOCOL)


Generate an overview version of playlists without the track list:

In [89]:
import pickle
import os

# Input: the full playlist pickle. Output: the same records without track lists.
playlist_list_path = os.path.join(dir_data_processed, "playlists{}.pkl".format(playlist_count))
playlist_overview_path = os.path.join(dir_data_processed, "playlists_overview{}.pkl".format(playlist_count))

with open(playlist_list_path, "rb") as source:
    playlist_data = pickle.load(source)

# Drop the track id list from every record in place; a record lacking the key
# raises KeyError.
for entry in playlist_data:
    entry.pop("track_ids")

with open(playlist_overview_path, "wb") as target:
    pickle.dump(playlist_data, target, protocol=pickle.HIGHEST_PROTOCOL)

Generate a list of unique track ids across all playlists:

In [90]:
import pickle
import os

# Input: the full playlist pickle. Output: a pickled list of unique track ids.
playlist_list_path = os.path.join(dir_data_processed, "playlists" + str(playlist_count) + ".pkl")
track_uris_path = os.path.join(dir_data_processed, "unique_track_ids" + str(playlist_count) + ".pkl")

with open(playlist_list_path, "rb") as fin:
    playlist_data = pickle.load(fin)

# Collect every track id once; the set removes duplicates across playlists.
track_id_set = set()
for playlist in playlist_data:
    track_id_set.update(playlist["track_ids"])

# Data-specific sanity check: this id is expected to be present in the dataset.
assert "6I9VzXrHxO9rA9A5euc8Ak" in track_id_set

# Sort so the saved list has the same order on every run; the iteration order
# of a set of strings can change between interpreter sessions.
track_id_list = sorted(track_id_set)

with open(track_uris_path, "wb") as fout:
    pickle.dump(track_id_list, fout, protocol=pickle.HIGHEST_PROTOCOL)