# Create a test and training split for the mpd.

Build a test data set from the mpd using the playlist distribution found in the official challenge set.

This extracts 10k playlists from the mpd as a test set substitute for the original challenge set. It saves the original mpd data files as a new training set with the test set removed. Keeping the structure of the original file set simplifies running code that expects that input.

The constructed splits will be named by a directory like mpd-split-<description> that contains the test-set.json and a data subdir with the mpd slices.
    
The challenge set will then need to be constructed from the test-set.json so that code can process a challenge set of withheld data. Additional downstream processing will rate results submitted against the split.

In [None]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd
import numpy as np

## Load the mpd slice files

Create one big data frame to make it simple to select the random samples.

In [None]:
# Module-level accumulators: one row per playlist and one row per track.
# Both start empty and are filled in by the slice loader.
playlists, tracks = pd.DataFrame(), pd.DataFrame()

In [None]:
# random state
seed = 1

# Threshold for quick mode: the loader stops once more than this many slice
# files have been read.
max_files_for_quick_processing = 20

# quick: stop loading early (see the threshold above) instead of reading
# every slice file.
quick = True

# debug: print progress messages while loading and sampling.
debug = True

In [None]:
def process_mpd(path):
    """Load mpd slice files from *path* into the global data frames.

    Every file named ``mpd.slice.*.json`` in *path* is read in sorted
    filename order. Its playlists are flattened into rows that are added to
    the module-level ``playlists`` frame, and its tracks are flattened into
    rows (tagged with the owning ``pid``) that are added to the module-level
    ``tracks`` frame. Each playlist row gets a ``slice`` column holding the
    slice id from the file header, to keep the association with the original
    training file.

    When the module-level ``quick`` flag is set, loading stops as soon as the
    number of files read exceeds ``max_files_for_quick_processing``.

    Args:
        path: Directory containing the mpd slice files.
    """
    global playlists, tracks

    # Collect the per-slice frames and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copied all previously loaded rows for every file.
    playlist_frames = []
    track_frames = []

    count = 0
    for filename in sorted(os.listdir(path)):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue

        fullpath = os.sep.join((path, filename))
        # JSON text is UTF-8; do not rely on the platform default encoding.
        with open(fullpath, encoding="utf-8") as f:
            mpd_slice = json.load(f)
        if debug: print("loaded {}:".format(fullpath))

        # Flatten data
        # extract slice info to keep association with original training files.
        slice_info = mpd_slice['info']['slice']
        slice_playlists = pd.json_normalize(mpd_slice, record_path=['playlists'])
        slice_playlists["slice"] = slice_info
        if debug: print("slice length {}:".format(len(slice_playlists)))
        slice_tracks = pd.json_normalize(mpd_slice['playlists'], record_path=['tracks'], meta=['pid'])

        # The 'tracks' column is deliberately kept on the playlist frame:
        # dropping it saves little and makes it harder to reconstruct the
        # playlist later.
        playlist_frames.append(slice_playlists)
        track_frames.append(slice_tracks)
        count += 1

        # NOTE: the comparison is '>', so quick mode reads one file more than
        # max_files_for_quick_processing before stopping.
        if quick and count > max_files_for_quick_processing:
            break

    if playlist_frames:
        # Keep rows from earlier calls, but leave a still-empty accumulator
        # out of the concat. As before, the index is not reset.
        previous_playlists = [] if playlists.empty else [playlists]
        previous_tracks = [] if tracks.empty else [tracks]
        playlists = pd.concat(previous_playlists + playlist_frames)
        tracks = pd.concat(previous_tracks + track_frames)


In [None]:
%%time
# Load the mpd slice files found in data/mpd/data into the global
# playlists and tracks data frames.
process_mpd("data/mpd/data")

In [None]:
# Number of playlist rows loaded so far.
len(playlists)

## Get Challenge set distribution

Just read the data distribution from the challenge set file directly.

In [None]:
# Load the challenge set with the Python JSON module.
# The file is opened explicitly as UTF-8 so decoding does not depend on the
# platform's default encoding, and it is parsed straight from the file handle.
with open('data/challenge_set.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Flatten the challenge set: one data frame row per entry under "playlists".
challenge_playlists = pd.json_normalize(data, record_path='playlists')

In [None]:
# Only the per-playlist metadata is needed from the challenge set, so the
# nested 'tracks' column is removed.
challenge_playlists = challenge_playlists.drop(columns=['tracks'])

In [None]:
# How many challenge playlists there are for each num_tracks value.
challenge_playlists.num_tracks.value_counts()

In [None]:
# Print the challenge distribution: playlist length and how many challenge
# playlists have that length.
# Series.iteritems() was removed in pandas 2.0; items() yields the same
# (index, value) pairs and is available in older versions as well.
for length, count in challenge_playlists.num_tracks.value_counts().items():
    print("len {} count {}".format(length,count))

## compare distributions between challenge and train data set

We can see that the challenge set is similar but has some boosted representation at the higher and lower ends, likely to accommodate the 4x use of 25 and 100 length playlists and the 2x use of 0, 1, 5 seed.

The spikes in the mpd might be due to the natural boundaries people see as playlist length at 50+, 100+ and 150+, or maybe there was some de facto limit imposed by Spotify for a time.

In [None]:
# Histogram of playlist lengths (num_tracks) in the loaded mpd playlists.
playlists.hist( column="num_tracks", bins=240)

In [None]:
# Histogram of playlist lengths (num_tracks) in the challenge set, using the
# same bin count as the mpd histogram so the two can be compared.
challenge_playlists.hist( column="num_tracks", bins=240)

## Extract test set from training based on challenge distribution

In [None]:
# Build the test set by sampling, for every playlist length that occurs in the
# challenge set, as many mpd playlists of that length as the challenge set has.
#
# Series.iteritems() and DataFrame.append() were both removed in pandas 2.0,
# so the samples are collected in a list and concatenated once at the end.
sampled_frames = []

for length, count in challenge_playlists.num_tracks.value_counts().items():
    if debug: print("len {} count {}".format(length,count))
    # Select the candidates of this length once and reuse the selection.
    candidates = playlists[playlists.num_tracks == length]
    num_avail = len(candidates)
    # If there aren't enough playlists of this length, sample all but 10% of
    # the available ones instead.
    # should only happen during dev when full data set not in use
    if num_avail < count:
        newcount = num_avail - int(num_avail * .10)
        print("WARNING: adjusted len {} count from {} to {}".format(length, count, newcount))
        count = newcount
    # An empty sample contributes no rows, so it is left out of the concat.
    if count:
        sampled_frames.append(candidates.sample(n=count, random_state=seed))
    #if debug: print("len(testset): {}".format(len(testset)))

# Fall back to an empty frame with the playlist columns when nothing was sampled.
testset = pd.concat(sampled_frames) if sampled_frames else playlists.iloc[0:0].copy()

In [None]:
# Number of playlists selected for the test set.
len(testset)

In [None]:
# Histogram of playlist lengths (num_tracks) in the sampled test set.
testset.hist( column="num_tracks", bins=240)

## Save test set as json

Need to pull out the playlists in the test set and build a challenge set. 
Need to remove the rows from the training set that are now for testing only.
Then build the training set without the test set.
Each file needs a header.


In [None]:
# Name of this split. It is used as the output directory name under data/
# and is recorded as the "slice" value in the test set file header.
testname = "ex1-from-21k-train"

In [None]:
# Timezone-aware "now" in UTC; recorded in the generated file header.
today = datetime.datetime.now(tz=datetime.timezone.utc)

In [None]:
# Show the ISO-8601 timestamp that goes into the output file header.
print(today.isoformat())

In [None]:
# Create the output directory for this split.
# makedirs also creates a missing parent directory, and exist_ok=True makes
# re-running the cell harmless when the directory is already there.
os.makedirs("data/" + testname, exist_ok=True)

In [None]:
# Remove the 'slice' column (the slice id copied from each mpd file header);
# the test set file gets its own header instead.
testset = testset.drop(columns=['slice'])

In [None]:
# add a custom info header
# fileinfo is the opening of the output JSON document: it opens the top-level
# object and writes a complete "info" member followed by a comma, so the
# "playlists" member can be appended directly after it.
# Doubled braces ({{ and }}) are literal braces for str.format; the two {}
# placeholders receive the generation timestamp and the split name.
fileinfo = '''{{
    "info": {{
       "generated_on": "{}", 
       "slice": "{}", 
       "version": "v1"
    }},\n'''.format(today.isoformat(), testname) 


In [None]:
# Serialize the test set as a JSON array with one object per playlist row,
# pretty-printed with a 4-space indent.
testjson = testset.to_json(orient="records", indent=4)

In [None]:
# Shift every line after the first right by four spaces so the array lines up
# as a member nested inside the top-level object of the final output.
testjson = testjson.replace('\n', '\n    ')

In [None]:
# Write the test set file: info header, then the "playlists" array, then the
# closing brace of the top-level object that the header opened. Without that
# brace the file is not valid JSON.
# The with-statement closes the file, so no explicit close() is needed.
with open('data/'+testname+'/testset.json', 'w', encoding='utf-8') as f:
    f.write(fileinfo + '    "playlists": ' + testjson + '\n}\n')

%%time

# ignore
Don't do it this way: leave the tracks in with the playlists, and then there is no need to reconstruct the playlist.

# Start with an empty string in the tracks column of every test playlist.
testset["tracks"] = ""

# For each test playlist, collect its rows from the flat tracks table, drop
# the pid column, and store the rows as a '"tracks": [...]' JSON fragment.
for playlist_id in testset.pid:
    belongs_to_playlist = tracks.pid == playlist_id
    tracks_json = tracks[belongs_to_playlist].drop(columns=['pid']).to_json(orient='records', indent=2)
    testset.loc[testset.pid == playlist_id, 'tracks'] = '"tracks": ' + tracks_json