In [4]:
import pandas as pd
import numpy as np
import json
import os

# constants
# Relative directory that holds the HDF5 files this notebook writes and reads
# (the code below appends '/playlists.h5' and '/tracks.h5' to it).
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [None]:
# Reference note only: the bare string literal below is never assigned or used
# by any code in this cell.
# NOTE(review): it appears to list the field names of the raw playlist, track
# and test-playlist records -- confirm against the raw data.
'''
playlist_col = ['collaborative', 'duration_ms', 'modified_at', 
                'name', 'num_albums', 'num_artists', 'num_edits',
                'num_followers', 'num_tracks', 'pid']
    tracks_col = ['album_name', 'album_uri', 'artist_name', 'artist_uri', 
                  'duration_ms', 'track_name', 'track_uri'] 
    playlist_test_col = ['name', 'num_holdouts', 'num_samples', 'num_tracks', 'pid']
'''
def load_playlists(path, raw_dir="raw_data"):
    """Lazily yield one playlist dict per playlist found in the raw JSON files.

    Every ``*.json`` file directly inside ``raw_dir`` is parsed, and each entry
    of its top-level ``"playlists"`` list is yielded after its ``'tracks'``
    value has been replaced by the plain list of ``'track_uri'`` values.

    Parameters
    ----------
    path : object
        Unused. Kept only so existing ``load_playlists(path)`` calls keep
        working.
    raw_dir : str, default "raw_data"
        Directory containing the raw JSON files.

    Yields
    ------
    dict
        The playlist record, with ``'tracks'`` flattened to a list of URIs.

    Raises
    ------
    FileNotFoundError
        If ``raw_dir`` does not exist.
    KeyError
        If a file has no ``"playlists"`` key or a track has no ``'track_uri'``.
    """
    # os.listdir() gives no ordering guarantee; sorting makes the row order of
    # the resulting dataframe identical on every run.
    for file in sorted(os.listdir(raw_dir)):
        if not file.endswith(".json"):
            continue
        # Open explicitly as UTF-8 instead of relying on the platform's default
        # locale encoding.
        with open(os.path.join(raw_dir, file), encoding="utf-8") as f:
            js_slice = json.load(f)
        # The file is closed before yielding so no handle stays open while the
        # generator is suspended.
        for playlist in js_slice["playlists"]:
            playlist['tracks'] = [track['track_uri'] for track in playlist['tracks']]
            yield playlist
        print(f"Added playlists from {file}")

# Make sure the output directory exists before writing the HDF5 file.
os.makedirs(DATAFRAME_PATH, exist_ok=True)

# NOTE(review): load_playlists() does not use this argument to locate its
# input; the value is passed only because the function requires one
# positional argument.
df = pd.DataFrame(load_playlists(DATAFRAME_PATH + '/playlists.h5'))
# `key` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_hdf() other than the path.
df.to_hdf(DATAFRAME_PATH + '/playlists.h5', key='playlists')
print('Stored files to playlists.h5')

### Read the playlists & tracks dataframes

In [68]:
# Load both stored tables; each one lives in '<name>.h5' under the key '<name>'.
playlists_df, tracks_df = (
    pd.read_hdf(f'{DATAFRAME_PATH}/{table}.h5', key=table)
    for table in ('playlists', 'tracks')
)

In [69]:
# Rows of tracks_df that have at least one missing value.
# `axis` is passed by keyword: pandas 2.0 made it keyword-only, so the
# positional form `any(1)` raises TypeError there.
null_tracks = tracks_df[tracks_df.isnull().any(axis=1)]
# tracks_df is overwritten here, so re-running this cell on its own leaves
# null_tracks empty; re-run the cell that loads tracks_df first.
tracks_df = tracks_df.dropna()

In [81]:
# NOTE(review): assumes null_tracks has a 'tracks' column holding the same
# (hashable) identifiers that are stored in playlists_df['tracks'] -- confirm.
tracks_to_remove = null_tracks['tracks'].tolist()
# Set copy used for membership tests: constant-time lookups instead of
# scanning the list once per track.
tracks_to_remove_set = set(tracks_to_remove)
total_tracks_all_playlists = playlists_df['tracks'].apply(len).sum()

def remove_na_tracks(row):
    """Drop every track listed in ``tracks_to_remove`` from ``row['tracks']``.

    Parameters
    ----------
    row : pandas.Series
        One playlist row; must contain a ``'tracks'`` list.

    Returns
    -------
    pandas.Series
        The same row with ``'tracks'`` replaced by the filtered list
        (original order preserved).
    """
    row['tracks'] = [track for track in row['tracks'] if track not in tracks_to_remove_set]
    return row

playlists_df = playlists_df.apply(remove_na_tracks, axis=1)

# Computed outside the f-string: reusing the same quote character inside an
# f-string expression is a SyntaxError before Python 3.12.
removed_tracks_count = total_tracks_all_playlists - playlists_df['tracks'].apply(len).sum()
print(f'Removed {removed_tracks_count} tracks from playlists')

16661251


16661209

## Building the train and test sets

In [82]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and everything derived from it below) is the same
# on every Restart & Run All.
SPLIT_RANDOM_STATE = 42

# 80 % of the playlists go to X_train, 20 % to X_test.
X_train, X_test = train_test_split(
    playlists_df, test_size=0.2, random_state=SPLIT_RANDOM_STATE
)

In [None]:
# Modifying the test set
unique_tracks_train = pd.unique(X_train['tracks'].explode())
# Set copy used for membership tests: `in` on a numpy array compares against
# every element, a set lookup is constant-time.
seen_tracks_train = set(unique_tracks_train)

def remove_unseen_tracks(row):
    """Keep only the tracks of ``row['tracks']`` that occur in the train set.

    Parameters
    ----------
    row : pandas.Series
        One playlist row; must contain a ``'tracks'`` list.

    Returns
    -------
    pandas.Series
        The same row with ``'tracks'`` replaced by the filtered list
        (original order preserved).
    """
    row['tracks'] = [track for track in row['tracks'] if track in seen_tracks_train]
    return row

# Only test playlists with more than 20 tracks are kept. The length check
# happens before unseen tracks are removed, so a kept playlist can end up
# with fewer than 20 tracks afterwards.
X_test_incomplete = X_test[X_test['tracks'].apply(len) > 20]
# remove_unseen_tracks returns the whole row, so apply(axis=1) produces a full
# DataFrame; it has to replace the frame rather than be assigned to the single
# 'tracks' column (which raises).
X_test_incomplete = X_test_incomplete.apply(remove_unseen_tracks, axis=1)

In [None]:
# Create challenge set
from random import Random, sample
from functools import partial

# Number of tracks kept per playlist in the challenge set.
N_SEED_TRACKS = 10
# Dedicated, seeded generator so the sampled tracks are the same on every run
# without touching the global random state.
_challenge_rng = Random(42)

def sample_seed_tracks(tracks, k=N_SEED_TRACKS):
    """Return ``k`` tracks drawn at random, without replacement, from ``tracks``.

    Parameters
    ----------
    tracks : list
        Track identifiers of one playlist.
    k : int, default N_SEED_TRACKS
        Number of tracks to draw.

    Returns
    -------
    list
        A new list of ``min(k, len(tracks))`` tracks; a playlist shorter than
        ``k`` is returned in full (in random order) instead of raising
        ``ValueError``.
    """
    return _challenge_rng.sample(tracks, min(k, len(tracks)))

# The original `partial(sample, 10)` bound 10 as the *population* argument, so
# every call became sample(10, tracks) and raised TypeError. The sample size is
# now bound by keyword.
# assign() builds a new frame, so this also works when X_test_incomplete is a
# slice of X_test.
X_test_incomplete = X_test_incomplete.assign(
    tracks=X_test_incomplete['tracks'].apply(partial(sample_seed_tracks, k=N_SEED_TRACKS))
)