In [1]:
import pandas as pd
import numpy as np
import json
import os

# Constants
# Directory (a relative path) that every .h5 dataframe in this notebook is read from and written to.
DATAFRAME_PATH = 'dataframes'

### Create the playlists dataframe from raw JSON data

In [None]:
# Pre-load the pre-prepared all_tracks dataframe.
all_tracks_df = pd.read_hdf(DATAFRAME_PATH + '/all_tracks.h5', 'tracks')

# 'tracks' column of every row that has at least one missing value; load_playlists()
# consults it to decide which tracks to leave out.
# `axis` is passed by keyword: DataFrame.any() no longer accepts it positionally in pandas 2.x.
null_tracks = all_tracks_df[all_tracks_df.isnull().any(axis=1)]['tracks']

# Accumulates [track_uri, pid, pos] rows as a side effect of iterating load_playlists().
playlist_track_info = []

def load_playlists(path, raw_dir="raw_data"):
    """Yield one playlist-metadata dict per playlist found in the raw JSON slices.

    Every ``*.json`` file in ``raw_dir`` is parsed. For each playlist, a
    ``[track_uri, pid, pos]`` row is appended to the module-level
    ``playlist_track_info`` list for every track whose URI is not listed in the
    module-level ``null_tracks``. The playlist's ``'tracks'`` entry is then
    removed and the remaining dict is yielded.

    Args:
        path: Not used. Kept so existing calls that pass a path keep working.
        raw_dir: Directory holding the raw JSON slice files. Defaults to
            ``"raw_data"``, the location that used to be hard-coded.

    Yields:
        dict: The playlist as loaded from JSON, minus its ``'tracks'`` key.
    """
    # Membership has to be tested against the *values* of null_tracks. When it is a
    # pandas Series, `uri in null_tracks` checks the index labels instead, so tracks
    # with missing data were never actually filtered out. Building a set once also
    # makes each lookup O(1).
    # NOTE(review): assumes null_tracks holds hashable track URIs - confirm against
    # how all_tracks.h5 was built.
    excluded_uris = set(null_tracks)

    # Sorted so the row order of the resulting frames does not depend on the
    # arbitrary order returned by os.listdir().
    for file in sorted(os.listdir(raw_dir)):
        if not file.endswith(".json"):
            continue
        # Parse fully and close the file before yielding, so no handle stays open
        # while the consumer of this generator is working.
        with open(os.path.join(raw_dir, file)) as f:
            js_slice = json.load(f)
        for playlist in js_slice['playlists']:
            pid = playlist['pid']
            for track in playlist.pop('tracks'):
                if track['track_uri'] not in excluded_uris:
                    playlist_track_info.append([track['track_uri'], pid, track['pos']])
            yield playlist
        print(f"Added playlists from {file}")

# Iterating load_playlists() also fills playlist_track_info, so the playlists frame
# has to be built before the track-info frame. The argument passed here is ignored
# by load_playlists(), which reads the raw JSON slices from the raw_data directory.
main_playlist_df = pd.DataFrame(load_playlists(DATAFRAME_PATH + '/playlists.h5'))
playlist_track_info_df = pd.DataFrame(playlist_track_info, columns=['track_uri', 'pid', 'pos'])
# 'pid' stays a regular column on purpose: later cells filter with track_info_df['pid'].
# The bare set_index('pid') call that used to follow discarded its result, so it had
# no effect other than building a throw-away copy; it has been removed.

print('Storing h5 files...')
# `key` is passed by keyword: positional use is deprecated in pandas 2.x.
main_playlist_df.to_hdf(DATAFRAME_PATH + '/playlists.h5', key='playlists')
playlist_track_info_df.to_hdf(DATAFRAME_PATH + '/playlist_track_info.h5', key='track_info')
print('Stored files as h5')

### Read the playlists, tracks and track_info dataframes

In [2]:
# Load the three stored frames from DATAFRAME_PATH.
playlists_df = pd.read_hdf(f'{DATAFRAME_PATH}/playlists.h5', key='playlists')
tracks_df = pd.read_hdf(f'{DATAFRAME_PATH}/clean_tracks.h5', key='tracks')
track_info_df = pd.read_hdf(f'{DATAFRAME_PATH}/playlist_track_info.h5', key='track_info')

## Building the train and test sets

In [3]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 playlist split, and everything derived from it further
# down, is the same on every Restart & Run All instead of changing per run.
SPLIT_SEED = 1
X_train, X_test = train_test_split(playlists_df, test_size=0.2, random_state=SPLIT_SEED)

In [4]:
# Split the per-playlist track listing the same way the playlists were split.
in_train_playlists = track_info_df['pid'].isin(X_train['pid'].values)
in_test_playlists = track_info_df['pid'].isin(X_test['pid'].values)
X_train_track_info = track_info_df.loc[in_train_playlists]
X_test_track_info = track_info_df.loc[in_test_playlists]

# Ground truth for the test set: keep only test tracks that also occur in the training set.
X_train_track_uris = X_train_track_info['track_uri'].unique()
seen_in_training = X_test_track_info['track_uri'].isin(X_train_track_uris)
X_test_track_info_ground_truth = X_test_track_info.loc[seen_in_training]

In [6]:
# Heads-up: this cell can take a long time to run (~10 minutes).
# If the .h5 files are already available, use those instead.

# Accumulators that build_challenge_set() appends to.
new_test_set = list()
challenge_set_list = list()

def frac_to_sample(playlist_tracks):
    """Return the fraction of a playlist's tracks to sample, or None to skip it.

    Args:
        playlist_tracks: The rows (one per track) belonging to a single playlist.

    Returns:
        0.5 when the playlist has at least two tracks, otherwise None.
    """
    # len() counts rows, i.e. tracks. DataFrame.size is rows * columns, so with a
    # three-column frame a single-track playlist used to pass the ">= 2" check.
    if len(playlist_tracks) >= 2:
        return 0.5
    return None
        
def build_challenge_set():
    """Sample tracks for every test playlist and record the result.

    For each playlist in ``X_test`` whose ground-truth tracks get a positive
    fraction from ``frac_to_sample``, a random sample of those tracks is drawn
    (``random_state=1``). Two module-level lists are appended to:

    * ``new_test_set``: the playlist's ``X_test`` row followed by the number of
      sampled tracks and the number of tracks left out of the sample.
    * ``challenge_set_list``: one ``[track_uri, pid, pos]`` row per sampled track.

    Returns nothing; the results live in the two lists.
    """
    ground_truth = X_test_track_info_ground_truth

    # Row positions of each playlist's tracks, found in a single pass. Re-filtering
    # the whole frame for every pid made this cell scale with playlists * tracks.
    rows_by_pid = ground_truth.groupby('pid').indices
    no_rows = []

    # Playlist rows are materialised once instead of re-filtering X_test per pid.
    # NOTE(review): assumes each pid occurs once in X_test - confirm.
    for pid, playlist_row in zip(X_test['pid'], X_test.values.tolist()):
        playlist_tracks = ground_truth.iloc[rows_by_pid.get(pid, no_rows)]
        frac = frac_to_sample(playlist_tracks)
        if frac is not None and frac > 0:
            random_sample_df = playlist_tracks.sample(frac=frac, random_state=1)
            # Count rows with len(): DataFrame.size is rows * columns and inflated
            # both counts by the number of columns.
            num_sample = len(random_sample_df)
            num_left_out = len(playlist_tracks) - num_sample
            new_test_set.append(playlist_row + [num_sample, num_left_out])
            challenge_set_list.extend(
                random_sample_df[['track_uri', 'pid', 'pos']].values.tolist()
            )
    
# Fills new_test_set and challenge_set_list in place; the function itself returns nothing.
build_challenge_set()

In [13]:
# One row per challenge playlist: the X_test columns plus the two counts appended by
# build_challenge_set(). The 'num_witheld' spelling is left untouched because it is a
# stored column name; renaming it could break readers of the file.
X_challenge = pd.DataFrame(new_test_set, columns=list(X_test.columns) + ['num_of_seeds', 'num_witheld'])
X_challenge_track_info = pd.DataFrame(challenge_set_list, columns=list(X_test_track_info_ground_truth.columns))

# Write challenge set as hdf (`key` passed by keyword: positional use is deprecated in pandas 2.x)
X_challenge.to_hdf(DATAFRAME_PATH + '/challenge_playlists.h5', key='playlists')
X_challenge_track_info.to_hdf(DATAFRAME_PATH + '/challenge_playlist_track_info.h5', key='track_info')

In [15]:
# Read the stored challenge playlists and their track listing back from disk.
X_challenge_set = pd.read_hdf(f'{DATAFRAME_PATH}/challenge_playlists.h5', key='playlists')
X_challenge_set_track_info = pd.read_hdf(f'{DATAFRAME_PATH}/challenge_playlist_track_info.h5', key='track_info')