In [None]:
# First attempt. I decided that multiprocessing wasn't giving enough of a benefit, so I chose to keep things simpler and remove it.

In [None]:
import os
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
import json
from joblib import Parallel, parallel_backend, delayed
import numpy as np

# Root directory of the dataset, taken from the SPOTIFY_DATASET environment variable.
# Fail with an explicit message when it is unset; Path(None) would otherwise raise
# an opaque TypeError.
if os.getenv("SPOTIFY_DATASET") is None:
    raise RuntimeError("The SPOTIFY_DATASET environment variable is not set")
path = Path(os.getenv("SPOTIFY_DATASET"))

# All JSON slice files under <path>/data, ordered numerically by the start index
# embedded in their "mpd.slice.<start>-<end>.json" names (a plain string sort
# would place "1000" before "200").
slices = sorted(glob(str(path/"data"/"*.json")), key=lambda x: int(x.split("mpd.slice.")[-1].split("-")[0]))

In [None]:
def extract_distinct_items(path):
    """Collect the distinct track, album and artist labels found in one slice file.

    Parameters
    ----------
    path : str or path-like
        JSON file with a top-level ``"playlists"`` list. Every playlist carries a
        ``"tracks"`` list of dicts providing the ``track_name``/``track_uri``,
        ``album_name``/``album_uri`` and ``artist_name``/``artist_uri`` keys.

    Returns
    -------
    tuple[set[str], set[str], set[str]]
        ``(tracks, albums, artists)``; each element is a set of labels formatted
        as ``"<name> (<uri>)"``.

    Raises
    ------
    KeyError
        If a playlist or track entry lacks one of the keys listed above.
    """
    # Context manager so the handle is closed as soon as parsing is done instead
    # of being left for the garbage collector (this runs once per slice file).
    with open(path, "rb") as fh:
        data = json.load(fh)

    tracks = set()
    albums = set()
    artists = set()
    for playlist in data["playlists"]:
        for track_info in playlist["tracks"]:
            # The URI is part of the label, so equal names with different URIs
            # remain distinct entries.
            tracks.add(f"{track_info['track_name']} ({track_info['track_uri']})")
            albums.add(f"{track_info['album_name']} ({track_info['album_uri']})")
            artists.add(f"{track_info['artist_name']} ({track_info['artist_uri']})")

    return tracks, albums, artists

In [None]:
def determine_categories(slices):
    """Gather the distinct track, album and artist labels across all slice files.

    Each slice is scanned by ``extract_distinct_items`` through joblib using the
    ``loky`` backend; the per-slice sets are then merged in the calling process.

    Parameters
    ----------
    slices : iterable of str
        Paths of the slice files to scan.

    Returns
    -------
    tuple[list[str], list[str], list[str]]
        ``(tracks, albums, artists)``, each a sorted list of unique labels.
    """
    tracks = set()
    albums = set()
    artists = set()

    with parallel_backend("loky"):
        # Inner progress bar: advances as slices are handed to joblib.
        per_slice = Parallel()(delayed(extract_distinct_items)(slc) for slc in tqdm(slices))

    # Outer progress bar: advances as per-slice results are merged.
    for _tracks, _albums, _artists in tqdm(per_slice):
        # Merge in place; ``a = a.union(b)`` would copy the whole accumulated
        # set on every iteration.
        tracks.update(_tracks)
        albums.update(_albums)
        artists.update(_artists)

    return sorted(tracks), sorted(albums), sorted(artists)

In [None]:
# Scan every slice and collect the sorted, distinct track/album/artist labels.
tracks, albums, artists = determine_categories(slices)

# Sanity check on the catalogue sizes: tracks >= albums >= artists.
assert len(tracks) >= len(albums) >= len(artists)

# Duplicate checks, currently disabled:
# assert len(tracks) == len(np.unique(tracks))
# assert len(albums) == len(np.unique(albums))
# assert len(artists) == len(np.unique(artists))

# Cell output: (number of tracks, number of albums, number of artists).
tuple(map(len, (tracks, albums, artists)))

In [None]:
# Label -> position lookups: each label maps to its index in the corresponding list.
track2idx = dict(zip(tracks, range(len(tracks))))
album2idx = dict(zip(albums, range(len(albums))))
artist2idx = dict(zip(artists, range(len(artists))))

In [None]:
def _get_rows_and_cols(path, playlist_idx, track2idx, album2idx, artist2idx):
    data = json.load(open(path, "rb"))
    
    rows = []
    track_cols = []
    album_cols = []
    artist_cols = []
    for i, playlist in enumerate(data["playlists"]):
        for track_info in playlist["tracks"]:
            rows.append(playlist_idx + i)
            
            track = f"{track_info['track_name']} ({track_info['track_uri']})"
            album = f"{track_info['album_name']} ({track_info['album_uri']})"
            artist = f"{track_info['artist_name']} ({track_info['artist_uri']})"
            
            track_cols.append(track2idx[track])
            album_cols.append(album2idx[album])
            artist_cols.append(artist2idx[artist])
        
    return rows, track_cols, album_cols, artist_cols

In [None]:
def _first_playlist_idx(slice_path):
    """Return the start index encoded in an ``mpd.slice.<start>-<end>.json`` file name."""
    return int(str(slice_path).split("mpd.slice.")[-1].split("-")[0])


def get_aggregated_rows_and_cols(slices, track2idx, album2idx, artist2idx):
    """Concatenate the row/column index lists of every slice file.

    Each slice is processed by ``_get_rows_and_cols`` through joblib using the
    ``loky`` backend, and the per-slice lists are appended in the order of
    ``slices``.

    Parameters
    ----------
    slices : iterable of str
        Slice file paths named ``mpd.slice.<start>-<end>.json``.
    track2idx, album2idx, artist2idx : dict[str, int]
        Lookups from ``"<name> (<uri>)"`` labels to column indices.

    Returns
    -------
    tuple[list[int], list[int], list[int], list[int]]
        ``(rows, track_cols, album_cols, artist_cols)``; all four lists have
        the same length, one entry per track occurrence in a playlist.

    Raises
    ------
    ValueError
        If a path does not follow the ``mpd.slice.<start>-`` naming scheme.
    """
    rows = []
    track_cols = []
    album_cols = []
    artist_cols = []

    with parallel_backend("loky"):
        # The row offset of a slice is the start index from its file name.
        # Using the slice's position in ``slices`` instead would give rows of
        # ``slice_no + i`` and make playlists from different slices share rows.
        per_slice = Parallel()(
            delayed(_get_rows_and_cols)(slc, _first_playlist_idx(slc), track2idx, album2idx, artist2idx)
            for slc in tqdm(slices)
        )

    for _rows, _track_cols, _album_cols, _artist_cols in tqdm(per_slice):
        rows.extend(_rows)
        track_cols.extend(_track_cols)
        album_cols.extend(_album_cols)
        artist_cols.extend(_artist_cols)

    # All four lists are returned; the caller unpacks four values.
    return rows, track_cols, album_cols, artist_cols

In [None]:
# Flatten all slices into parallel index lists (one entry per track occurrence in a playlist).
rows, track_cols, album_cols, artist_cols = get_aggregated_rows_and_cols(slices, track2idx, album2idx, artist2idx)