# Create a test and training split for the mpd.

Build a test data set from the mpd using the playlist distribution found in the official challenge set.

This extracts 10k playlists from the mpd as a test set substitution for the original challenge set.  It saves the original mpd data files as a new training set with the test set removed. Keeping the structure of the original file set will simplify operation of codes that expect that input.

The constructed splits will be named by a directory like mpd-split-<description> that contains the test-set.json and a data subdir with the mpd slices.
    
The challenge set will then need to be constructed from the test-set.json so that codes can process a challenge set of withheld data. Additional downstream processing will rate results submitted against the split.

In [None]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd
import numpy as np

## Load the mpd slice files

Create one big data frame to make it simple to select the random samples.

In [None]:
playlists = pd.DataFrame()
tracks = pd.DataFrame()

In [None]:
# debug: print progress messages while loading, sampling and writing
debug = True
# quick: stop loading slice files early (see process_mpd) to keep dev runs short
quick = True
# number of slice files to load when quick is set
max_files_for_quick_processing = 20

# random state
# passed as random_state to every DataFrame.sample call below
seed = 1

In [None]:
def process_mpd(path):
    """Load the MPD slice files found in ``path`` into the global data frames.

    Every file named ``mpd.slice.*.json`` in ``path`` is read in sorted
    filename order.  The playlists of each slice are flattened into rows and
    added to the global ``playlists`` frame, together with a ``slice`` column
    holding the slice id from the file's ``info`` header so each playlist stays
    associated with its original training file.  The tracks of every playlist
    are flattened into rows of the global ``tracks`` frame with the owning
    ``pid`` attached.  The ``tracks`` column is kept on the playlist frame on
    purpose: it makes it easy to reconstruct a playlist later.

    When the global ``quick`` flag is set, loading stops as soon as
    ``max_files_for_quick_processing`` files have been read.

    path: directory that contains the slice files
    """
    global playlists, tracks

    playlist_frames = []
    track_frames = []

    for filename in sorted(os.listdir(path)):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue

        fullpath = os.path.join(path, filename)
        with open(fullpath, encoding="utf-8") as f:
            mpd_slice = json.load(f)
        if debug: print("loaded {}:".format(fullpath))

        # Flatten data
        slice_playlists = pd.json_normalize(mpd_slice, record_path=['playlists'])
        # keep the slice id to preserve the association with the original file
        slice_playlists["slice"] = mpd_slice['info']['slice']
        if debug: print("slice length {}:".format(len(slice_playlists)))
        slice_tracks = pd.json_normalize(mpd_slice['playlists'], record_path=['tracks'], meta=['pid'])

        playlist_frames.append(slice_playlists)
        track_frames.append(slice_tracks)

        # stop after exactly max_files_for_quick_processing files
        if quick and len(playlist_frames) >= max_files_for_quick_processing:
            break

    # DataFrame.append was removed in pandas 2.0; collecting the slices and
    # concatenating once also avoids re-copying the growing frame per file.
    # Empty frames (such as the initial globals) are left out of the concat.
    playlist_frames = [frame for frame in [playlists] + playlist_frames if not frame.empty]
    track_frames = [frame for frame in [tracks] + track_frames if not frame.empty]
    if playlist_frames:
        playlists = pd.concat(playlist_frames)
    if track_frames:
        tracks = pd.concat(track_frames)


In [None]:
%%time
process_mpd("data/mpd/data")

Set a new index for playlists so each row has a unique id. After reading the slice files the index values repeat for each slice.

Preference is to not use the pid as the index since that drops this data column.
Instead create a new column of integers for each row and then set that as the index.

In [None]:
# number the rows 0..len-1 in a helper column ...
playlists["newidx"]=range(len(playlists))

# ... and move that column into the index so every row has a unique label
playlists.set_index("newidx", inplace=True)

## Get Challenge set distribution

Just read the data distribution from the challenge set file directly.

In [None]:
# load data using Python JSON module
# parse the official challenge set straight from its file handle
with open('data/challenge_set.json','r') as challenge_file:
    data = json.load(challenge_file)

In [None]:
# Flatten data
challenge_playlists = pd.json_normalize(data, record_path=['playlists'])

In [None]:
challenge_playlists.drop(columns=['tracks'], inplace=True)

In [None]:
challenge_playlists.num_tracks.value_counts()

In [None]:
# Print how many challenge playlists exist for every playlist length.
# Series.iteritems() was removed in pandas 2.0; items() is the equivalent call.
for length, count in challenge_playlists.num_tracks.value_counts().items():
    print("len {} count {}".format(length,count))

## compare distributions between challenge and train data set

Can see that the challenge set is similar but has some boosted representation at the higher and lower ends, likely to accommodate the 4x use of 25 and 100 length playlists, and the 2x use of 0, 1, 5 seed.

The spikes in the mpd might be due to the natural boundaries people see as playlist length at 50+, 100+ and 150+, or maybe there was some de facto limit imposed by Spotify for a time.

In [None]:
playlists.hist( column="num_tracks", bins=240)

In [None]:
challenge_playlists.hist( column="num_tracks", bins=240)

## Extract test set from training based on challenge distribution

Sampling without replacement is implicit because the sample is taken from explicit playlist length subsets of the whole data set.
This prevents resampling of the same playlist across different calls to sample.

Could remove the sampled playlists from the original data set as we go along.
This wouldn't change the sample behavior because it is already occurring on filtered playlist length and then sampling from that subset.

In [None]:
# Draw, for every playlist length in the challenge set, as many playlists of
# that length from the mpd as the challenge set contains.
testset_parts = []

# Series.iteritems() was removed in pandas 2.0; items() is the equivalent call.
for length, count in challenge_playlists.num_tracks.value_counts().items():
    if debug: print("len {} count {}".format(length,count))
    candidates = playlists[playlists.num_tracks==length]
    # shrink count by 10% of available tracks if there aren't enough
    # should only happen during dev when full data set not in use
    num_avail = len(candidates)
    if (num_avail < count):
        newcount=num_avail - int(num_avail * .10)
        print("WARNING: adjusted len {} count from {} to {}".format(length, count, newcount))
        count=newcount
    if count > 0:
        testset_parts.append(candidates.sample(n=count, random_state=seed))

# DataFrame.append was removed in pandas 2.0; concatenate all samples at once.
testset = pd.concat(testset_parts) if testset_parts else pd.DataFrame()

In [None]:
len(testset)

In [None]:
testset.hist( column="num_tracks", bins=240)

## Save test set as json

Need to pull out the playlists in the test set and build a challenge set. 
Need to remove the rows from the training set that are now for testing only.
Then build the training set without the test set.
Each file needs a header.


In [None]:
testname="ex2-from-21k-train"

In [None]:
today=datetime.datetime.now(datetime.timezone.utc)

In [None]:
print(today.isoformat())

In [None]:
# Create the output directory for this split; exist_ok makes re-runs harmless
# and makedirs also creates "data" itself should it be missing.
os.makedirs("data/"+testname, exist_ok=True)

In [None]:
# drop the info header
testset.drop(columns=['slice'], inplace=True)

In [None]:
# add a custom info header
fileinfo = '''{{
    "info": {{
       "generated_on": "{}", 
       "slice": "{}", 
       "version": "v1"
    }},\n'''.format(today.isoformat(), testname) 


In [None]:
testjson = testset.to_json(orient="records", indent=4)

In [None]:
# add extra indent to the json so it fits into the final output
testjson = re.sub('\n', '\n    ', testjson)

In [None]:
# The info header opens the top-level JSON object; close it after the
# playlists array so the written file is valid JSON.
with open('data/'+testname+'/testset.json','w') as f:
    f.write(fileinfo + '    "playlists": ' + testjson + '\n}\n')

## Remove test set from training data

Removing the sampled set of play lists is easy with the [isin() filter](https://stackoverflow.com/a/27965417/8928529).


Confirm isin() filter removes correct count of testset playlists.

In [None]:
testset.shape

In [None]:
playlists.shape

In [None]:
playlists[~playlists.pid.isin(testset.pid)].shape

In [None]:
playlists.shape[0]-testset.shape[0]

In [None]:
trainset = playlists[~playlists.pid.isin(testset.pid)]

In [None]:
trainset.shape

## Save the new training set

Saving the data sets will be easy by just using the slice information to recreate the files and then adding the fileinfo header as above with slice and potentially version info named for test set.

trainset[trainset.slice.isin()]

In [None]:
# Create the data subdirectory that receives the training slices; exist_ok
# makes re-runs harmless and missing parent directories are created as well.
os.makedirs("data/"+testname+"/data", exist_ok=True)

In [None]:
# Write one training file per original mpd slice, each with its own info header.
# The loop variable is not called "slice" so the builtin stays usable.
for slice_id in trainset["slice"].unique():
    slice_header = '''{{
    "info": {{
       "generated_on": "{}", 
       "slice": "{}", 
       "version": "v1"
    }},\n'''.format(today.isoformat(), slice_id) 

    trainjson = trainset[trainset["slice"] == slice_id].to_json(orient="records", indent=4)
    # add extra indent to the json so it fits into the final output
    trainjson = re.sub('\n', '\n    ', trainjson)
    slicefile = 'data/'+testname+'/data/mpd.slice.'+slice_id+'.json'
    with open(slicefile,'w') as f:
        # close the top-level object opened by the header so the file is valid JSON
        f.write(slice_header + '    "playlists": ' + trainjson + '\n}\n')
    if debug: print("wrote slice {}".format(slice_id))

The test and training sets have now been created.

## Create challenge data

For each challenge task of 1000 playlists in the reference data:
  * loop through the subset
    * get a playlist of the requested length and remove it from the testset
    * save testable format of pid and actual trackids in order
    * format the dropout according to the challenge task
    * add it to the challenge set

In [None]:
challenge_playlists

In [None]:
challenge_playlists[1000:2000].num_tracks.hist()

In [None]:
data

# Create the challenge set organization from the test data.

In [None]:
test_challenge=pd.DataFrame()

In [None]:
testin = testset.copy()

Set a new unique test set index to accommodate oversampling

In [None]:
testin["newidx"]=range(len(testin))
testin.set_index("newidx", inplace=True)

In [None]:
def create_task(distribution, srcdata, trace=False):
    '''
    create a named challenge task from srcdata using the track length distribution

    For every entry of distribution.num_tracks one playlist with exactly that
    number of tracks is drawn from srcdata.
    the samples are removed from srcdata to implement without replacement across calls

    If srcdata has no playlist of the requested length left, the most recently
    drawn playlist is reused and nothing is removed.  If that happens before
    any playlist has been drawn, a playlist longer than requested is taken
    instead; that playlist stays in srcdata.

    distribution: data frame with a num_tracks column, one row per wanted playlist
    srcdata: data frame of candidate playlists (needs num_tracks and pid columns),
             modified in place
    trace: print every requested playlist length

    returns a data frame with one row per entry of the distribution
    raises ValueError when the very first request cannot be met and srcdata
           holds no longer playlist either
    '''
    picked = []
    # explicit state instead of probing locals() for an earlier sample
    last_sample = None

    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent call
    for i, length in distribution.num_tracks.items():
        if trace: print("playlist: {} len: {}".format(i, length))
        candidates = srcdata[srcdata.num_tracks==length]
        if not candidates.empty:
            sample = candidates.sample(n=1, random_state=seed)
            # without replacement: a drawn playlist cannot be drawn again
            srcdata.drop(sample.index, inplace=True)
        elif last_sample is not None:
            # if we don't have enough input data re-use the last sample
            if debug: print("WARNING: Playlist length {} not found reuse sample {}".format(length, last_sample.pid))
            sample = last_sample
        else:
            print ("ERROR: no first sample taking min matching sample outside of distribution")
            # pick a sample that is longer than the one in the distribution
            sample = srcdata[srcdata.num_tracks>length].sample(n=1, random_state=seed)
        last_sample = sample
        picked.append(sample)

    # DataFrame.append was removed in pandas 2.0; concatenate all samples at once
    return pd.concat(picked) if picked else pd.DataFrame()

In [None]:
def add_task_descriptor(dataset, name, num_samples, random, with_title):
    '''
    Tag every row of a challenge task with the parameters of that task.

    The four values are written as constant columns (task_name, num_samples,
    random, with_title) directly into dataset, which is also returned, so
    later processing steps can read the task setup from each row.
    '''
    descriptor = (
        ("task_name", name),
        ("num_samples", num_samples),
        ("random", random),
        ("with_title", with_title),
    )
    for column, value in descriptor:
        dataset[column] = value

    return dataset

In [None]:
testin

### Build title only task

In [None]:
challenge_playlists[0:1000].head()

In [None]:
# Task: title only, no seed tracks.
task_playlists = create_task(challenge_playlists[0:1000], testin)

task_playlists = add_task_descriptor(task_playlists, name="title_only", 
                                     num_samples=0, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 5 with Title

In [None]:
challenge_playlists[1000:2000].head()

In [None]:
# Task: title plus the first 5 tracks.
task_playlists = create_task(challenge_playlists[1000:2000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_5_title", 
                                     num_samples=5, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 5 without Title

In [None]:
challenge_playlists[2000:3000].head()

In [None]:
# Task: first 5 tracks, title withheld.
task_playlists = create_task(challenge_playlists[2000:3000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_5_wo_title", 
                                     num_samples=5, random=False, with_title=False)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 10 with Title

In [None]:
challenge_playlists[3000:4000].head()

In [None]:
# Task: title plus the first 10 tracks.
task_playlists = create_task(challenge_playlists[3000:4000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_10_title", 
                                     num_samples=10, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 10 without Title

In [None]:
challenge_playlists[4000:5000].head()

In [None]:
# Task: first 10 tracks, title withheld.
task_playlists = create_task(challenge_playlists[4000:5000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_10_wo_title", 
                                     num_samples=10, random=False, with_title=False)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 25 with Title

In [None]:
challenge_playlists[5000:6000].head()

In [None]:
# Task: title plus the first 25 tracks.
task_playlists = create_task(challenge_playlists[5000:6000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_25_title", 
                                     num_samples=25, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build Rand 25 with Title

In [None]:
challenge_playlists[6000:7000].head()

In [None]:
# Task: title plus 25 randomly chosen tracks.
task_playlists = create_task(challenge_playlists[6000:7000], testin)

task_playlists = add_task_descriptor(task_playlists, name="rand_25_title", 
                                     num_samples=25, random=True, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 100 with Title

In [None]:
challenge_playlists[7000:8000].head()

In [None]:
# Task: title plus the first 100 tracks.
task_playlists = create_task(challenge_playlists[7000:8000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_100_title", 
                                     num_samples=100, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build Rand 100 with Title

In [None]:
challenge_playlists[8000:9000].head()

In [None]:
# Task: title plus 100 randomly chosen tracks.
task_playlists = create_task(challenge_playlists[8000:9000], testin)

task_playlists = add_task_descriptor(task_playlists, name="rand_100_title", 
                                     num_samples=100, random=True, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

### Build First 1 with Title

In [None]:
challenge_playlists[9000:10000].head()

In [None]:
# Task: title plus the first track.
task_playlists = create_task(challenge_playlists[9000:10000], testin)

task_playlists = add_task_descriptor(task_playlists, name="first_1_title", 
                                     num_samples=1, random=False, with_title=True)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
test_challenge = pd.concat([test_challenge, task_playlists])

In [None]:
len(test_challenge)

In [None]:
len(testin)

All testset entries have been consumed and every task has been populated.

## Write challenge set 

This is the answer key for the corresponding testset.

In [None]:
test_challenge

In [None]:
def write_set(dataset, datasetname, timestamp, name="challengeset.json", tag=""): 
    '''Save dataframe to json format following MPD format convention with header

    The file is written to data/<datasetname>/<name> and holds one JSON object
    with an "info" header (generated_on, slice, version) followed by the rows
    of dataset as a list of records under "playlists".

    dataset: data frame to save, one playlist per row
    datasetname: name of the split; selects the output directory and is used
                 for the "slice" header field
    timestamp: datetime written (ISO format) to the "generated_on" header field
    name: file name inside the split directory
    tag: suffix appended to datasetname in the "slice" header field
    '''

    document = {
        "info": {
            "generated_on": timestamp.isoformat(),
            "slice": datasetname + tag,
            "version": "v1",
        },
        # let pandas serialize the rows, then parse them back so the whole
        # document is emitted by one encoder and is guaranteed to be valid
        # JSON (the hand-assembled text never closed the outer object)
        "playlists": json.loads(dataset.to_json(orient="records")),
    }

    directory = os.path.join("data", datasetname)
    # no-op when the split directory was already created
    os.makedirs(directory, exist_ok=True)

    if debug: print("write file data/{}/{}".format(datasetname, name))
    with open(os.path.join(directory, name), 'w') as f:
        json.dump(document, f, indent=4)
        f.write("\n")

In [None]:
write_set(test_challenge, testname, today)

## create challenge tasks

this is where we process the task attributes to create two outputs from the challengeset:
* the answer key - to compare results and compute the rating scores
* the withheld format of the challengeset

In [None]:
test_challenge.head()

In [None]:
# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent call
test_challenge[["num_samples", "random", "with_title" ]].items()

In [None]:
# Show the task parameters attached to every challenge playlist, row by row.
task_columns = test_challenge[["num_samples", "random", "with_title"]]
for index, (num_samples, random, with_title) in enumerate(task_columns.itertuples(index=False, name=None)):
    print("plist {}, num_samples {}, random {}, with_title {}".format(index, num_samples, random, with_title))
    

In [None]:
test_challenge.iloc[1].tracks

In [None]:
challenge_playlists

In [None]:
challenge_out = pd.DataFrame()

In [None]:
#test_challenge.iloc[0]['tracks'].sample(5)

In [None]:
#testtracks

In [None]:
test_challenge.shape

In [None]:
import random as rand
from operator import itemgetter

In [None]:
# for each playlist produce the correctly formatted output of withheld tracks
#
# Every row of test_challenge becomes one challenge entry that keeps only the
# seed tracks (the first num_samples tracks, or num_samples random ones kept
# in playlist order) and, depending on the task, the playlist title.

# a dedicated generator seeded with the notebook seed makes the random track
# selection reproducible from run to run
track_rng = rand.Random(seed)
entries = []

for index in range(test_challenge.shape[0]):
    row = test_challenge.iloc[index]
    requested = int(row['num_samples'])
    pick_random = bool(row['random'])
    with_title = bool(row['with_title'])
    if debug: print("plist {}, num_samples {}, random {}, with_title {}".format(index, requested, pick_random, with_title))

    playlist_tracks = row['tracks']
    # never ask for more seed tracks than the playlist holds; random.sample
    # would raise and the holdout count would turn negative
    num_samples = min(requested, len(playlist_tracks))

    if pick_random:
        chosen = track_rng.sample(playlist_tracks, num_samples)
        testtracks = sorted(chosen, key=itemgetter('pos'))
    else:
        testtracks = playlist_tracks[0:num_samples]

    num_tracks = int(row['num_tracks'])

    entries.append({
        'name': row['name'] if with_title else "",
        'num_holdouts': num_tracks - num_samples,
        # challenge pids are numbered from 2000000 in row order
        'pid': 2000000 + index,
        'num_tracks': num_tracks,
        'num_samples': num_samples,
        'tracks': testtracks,
    })

# DataFrame.append was removed in pandas 2.0; build the frame in one step
challenge_out = pd.DataFrame(
    entries,
    columns=['name', 'num_holdouts', 'pid', 'num_tracks', 'num_samples', 'tracks'],
    index=range(len(entries)),
)

In [None]:
challenge_out

In [None]:
write_set(challenge_out, testname, today, name="ex2-challengeset.json", tag="_challengeset")

The challenge set is created.

Fix: header in output to match following but loose json interpretation should ignore it.
```
    "date": "2018-01-16 08:47:28.198015", 
    "version": "v1", 
    "playlists": [
```

## Write out challenge answer key in submit format

This may make it easier to run the scoring function.

In [None]:
#challenge_out.iloc[6000].tracks[2]['track_uri']

In [None]:
#sorted(challenge_out.iloc[5001].tracks, key=itemgetter("pos"), reverse=True) #[3]["pos"]

In [None]:
test_challenge