This file contains code for reducing the Million Playlist Dataset (MPD) to a smaller sample dataset. It assumes the JSON files containing the data are stored in the directory `./data`.

In [22]:
import networkx as nx
import pandas as pd
import numpy as np
import glob
import json
import tqdm
import random

In [23]:
# Build a playlist-track graph from a random sample of 25 data files.
# Playlist nodes are numbered from 0 upwards and track nodes from 2,000,000
# upwards; an edge links a playlist to every track it contains.

# seed random for reproducibility
random.seed(1)
# grab all MPD data files; sort them first because the order glob returns
# depends on the file system, and the seeded shuffle below only picks the
# same sample on every machine if it starts from the same ordering
files = sorted(glob.glob("data/*.json"))
playlists = {}    # playlist node id -> 'pid' of the playlist in the data
tracks = {}       # track uri -> track node id
track_names = {}  # track node id -> track name

# create an empty graph
G = nx.Graph()
# set the track counter to 2 million so we don't overlap with playlist ids
track_counter = 2000000

playlist_counter = 0
# shuffle the files so we don't get a bias towards the first files
random.shuffle(files)
for file in tqdm.tqdm(files[:25]):
    # read in the data; JSON is UTF-8, so don't rely on the platform default
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)['playlists']
    # loop through each playlist
    for playlist in data:
        pl_pid = playlist['pid']
        # only use playlists with at least 30 tracks
        if len(playlist['tracks']) < 30:
            continue

        playlists[playlist_counter] = pl_pid
        # add the playlist to the graph
        G.add_node(playlist_counter)
        for track in playlist['tracks']:
            track_uri = track['track_uri']
            # add the track to the graph if it doesn't exist; test membership
            # explicitly rather than relying on the stored id being truthy
            if track_uri not in tracks:
                tracks[track_uri] = track_counter
                track_names[track_counter] = track['track_name']
                G.add_node(track_counter)
                track_counter += 1
            # add the edge between the playlist and the track
            G.add_edge(playlist_counter, tracks[track_uri])
        playlist_counter += 1
# persist the track uri -> node id mapping so node ids can be traced back
with open('track_idx.json', 'w') as f:
    json.dump(tracks, f)


100%|██████████| 25/25 [00:10<00:00,  2.30it/s]


In [26]:
# Keep only the 25-core of G, i.e. the k-core computed with k=25, and save
# its edges to disk.
sG = nx.k_core(G, k=25)
nx.write_edgelist(sG, 'data/sample_graph.csv')
# Node ids at or above 2,000,000 are tracks; every remaining node is a
# playlist, so the playlist count is the node total minus the track count.
num_tracks = sum(1 for node in sG if node >= 2000000)
num_playlist = sG.number_of_nodes() - num_tracks
print(f"Number of tracks: {num_tracks}, Number of playlists: {num_playlist}")

(7990, 11233)