## Score challenge submission

This notebook produces scores for a custom challenge set.  It scores each task and provides a summary score (the average over all tasks).

This is designed to help compare the quality of a solution against a challenge set under our control and demonstrate that the aicrowd score is consistent.

In [None]:
import collections
import datetime
import json
import math
import os
import re
import sys

import numpy as np
import pandas as pd

In [None]:
# Name of the directory under data/ that holds challenge_set_noholdout.json.
challenge="ex2-from-21k-train-with-pids"
# Directory containing the submission CSV.
# NOTE(review): absolute, user-specific path -- adjust when running on another machine.
submitdir="/home/jpr/projects/mpd-challenge-aicrowd/"
# Submission file to score; the commented-out line is an alternative submission.
#submissionfile="method-02-mympd-2nd-21k-2021-11-07.csv.gz"
submissionfile="method-01-mympd-2nd-21k-2021-11-14.csv.gz"

### Load the no holdouts challenge set

In [None]:
# Parse the challenge set (with no tracks held out) straight from the file handle.
challenge_path = 'data/{}/challenge_set_noholdout.json'.format(challenge)
with open(challenge_path, 'r') as f:
    data = json.load(f)

In [None]:
# Flatten the "playlists" records of the challenge JSON into one DataFrame row per playlist.
noholdout = pd.json_normalize(data,"playlists")

In [None]:
noholdout

### Load challenge submission

In [None]:
# Read the submission: the first line of the file is skipped, no header row is
# assumed, and the first CSV column becomes the row index.
# os.path.join avoids the doubled slash the hand-formatted path produced, since
# submitdir already ends in "/".
rectracks = pd.read_csv(os.path.join(submitdir, submissionfile), header=None, skiprows=1, index_col=0)


Make sure the pids match those of the challenge set.  In the case of mympd the pid range starts at 2000000 and increases sequentially in the order of the noholdout data set.

In [None]:
rectracks.head()

In [None]:
# Order the recommendation rows by key 0.
# NOTE(review): read_csv above used header=None with index_col=0, so 0 presumably
# names the index (the pid) rather than a data column -- confirm.
rectracks = rectracks.sort_values(by=0, axis='index')

In [None]:
rectracks.head()

## convert the noholdouts into track lists

In [None]:
# Base pid: the scoring loops below derive each playlist's pid as startpid + i.
startpid=2000000

In [None]:
[track["track_uri"] for track in noholdout.iloc[0].tracks[0:2]]

In [None]:
[track["track_uri"] for track in noholdout.iloc[0].tracks]

In [None]:
noholdout.iloc[0]

In [None]:
len(noholdout)

R-precision is the fraction of correctly recommended tracks in the ground truth playlist as described on [the challenge site](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge#evaluation)
    

Use the [R-precision calculation from the hello_world metrics](https://github.com/jprorama/spotify_recSys_challenge_2018/blob/f33d82715190a20fdbc998c9ff709bcabd62a55e/utils/metrics.py#L26)

In [None]:
def get_r_precision(answer, cand):
    """Return the R-precision of a ranked candidate list.

    R is the number of distinct ground-truth items.  The score is the share of
    those items that appear among the first R candidates.

    Parameters
    ----------
    answer : iterable of hashable
        Ground-truth items; duplicates are collapsed.
    cand : sequence of hashable
        Ranked candidates, best first.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 when ``answer`` is empty (nothing can be matched).
    """
    relevant = set(answer)
    if not relevant:
        # An empty ground truth would otherwise raise ZeroDivisionError.
        return 0.0
    top_r = cand[:len(relevant)]
    return len(relevant.intersection(top_r)) / len(relevant)

def get_ndcg(answer, cand):
    """Return the normalized discounted cumulative gain of a ranked candidate list.

    Every candidate found in the ground truth contributes ``1 / log2(rank + 1)``
    with 1-based rank.  The ideal DCG assumes all distinct ground-truth items
    occupy the top ranks.

    Parameters
    ----------
    answer : iterable of hashable
        Ground-truth items; duplicates are collapsed for the ideal DCG.
    cand : sequence of hashable
        Ranked candidates, best first.

    Returns
    -------
    float
        ``dcg / idcg``; 0.0 when ``answer`` is empty (the ideal DCG would be zero).
    """
    # A set gives constant-time membership checks instead of scanning the list
    # once per candidate.
    relevant = set(answer)
    if not relevant:
        return 0.0

    # Position p (0-based) is discounted by log2(p + 2).
    dcg = sum(1 / math.log(pos + 2, 2) for pos, item in enumerate(cand) if item in relevant)
    idcg = sum(1 / math.log(pos + 2, 2) for pos in range(len(relevant)))
    return dcg / idcg

In [None]:
[track["track_uri"] for track in noholdout.iloc[0].tracks]

In [None]:
rectracks.iloc[0].to_list()

In [None]:
import math

In [None]:
realtracks = pd.DataFrame()
rprec_sum, ndcg_sum = 0.0, 0.0
rprec_match, ndcg_match = 0, 0

# Walk the challenge playlists and the submission rows in lock-step by position:
# row i of noholdout is scored against row i of rectracks.
for i in range(len(noholdout)):
    pid = startpid + i
    gttracks = [entry["track_uri"] for entry in noholdout.iloc[i].tracks]
    candtracks = rectracks.iloc[i].to_list()

    rprec = get_r_precision(gttracks, candtracks)
    ndcg = get_ndcg(gttracks, candtracks)

    rprec_sum += rprec
    ndcg_sum += ndcg
    # The *_match counters tally playlists with a non-zero score.
    rprec_match += int(rprec > 0)
    ndcg_match += int(ndcg > 0)

playlist_count = len(noholdout)
print("rprec = {}".format(rprec_sum/playlist_count))
print("ndcg = {}".format(ndcg_sum/playlist_count))
print("rprec_match = {}".format(rprec_match))
print("ndcg_match = {}".format(ndcg_match))

In [None]:
realtracks=pd.DataFrame()
rprec_sum = 0.0
successes = 0

# Debug cell: inspect only the first two playlists (fewer if the set is smaller).
numplaylists = min(2, len(noholdout))

for i in range(numplaylists):
    pid = startpid + i
    playlist = noholdout.iloc[i]
    gtlen = len(playlist.tracks)
    gttracks = [track["track_uri"] for track in playlist.tracks]
    gtset = set(gttracks)
    # Explicit positional slice: the first gtlen recommendations of row i.
    recset = set(rectracks.iloc[i].iloc[0:gtlen])
    overlap = gtset & recset
    # Guard against an empty playlist, which would otherwise divide by zero.
    rprec = len(overlap)/gtlen if gtlen else 0.0
    # NOTE(review): `>= 0` is always true, so every inspected playlist is printed
    # and counted; use `> 0` to report only playlists with at least one hit.
    if rprec >= 0:
        successes += 1
        print("task name:     {}".format(playlist["task_name"]))
        print("intersect len: {}".format(len(overlap)))
        print("playlist len:  {}".format(gtlen))
        print("playlist name: {}".format(playlist["name"]))
        print("{}: {}".format(i, rprec))
        print("tracks: {}".format(",".join([track["artist_name"] for track in playlist.tracks])))
    rprec_sum = rprec_sum + rprec

# Average over the playlists actually scored; dividing by len(noholdout) while
# summing only numplaylists entries under-reported the score.
print("rprec = {}".format(rprec_sum/numplaylists if numplaylists else 0.0))
print("succeses = {}".format(successes))
    

In [None]:
noholdout[noholdout["challenge_pid"]==2006675]

In [None]:
noholdout[noholdout["challenge_pid"]==2006675].tracks

In [None]:
type(noholdout.iloc[10].tracks)

In [None]:
type(noholdout[noholdout["challenge_pid"]==2000010].tracks.to_list())

In [None]:
type(noholdout[noholdout["challenge_pid"]==2000010].tracks.to_list()[0])

In [None]:
[track["track_uri"] for track in noholdout[noholdout["challenge_pid"]==2000010].tracks.to_list()[0]]

In [None]:
# Print a running counter and the Python type of every track entry of
# challenge playlist 2000010.
i = 0
pid_rows = noholdout.loc[noholdout["challenge_pid"] == 2000010]
for track in pid_rows.tracks.to_list()[0]:
    print("i = {}".format(i))
    print("{}".format(type(track)))
    i += 1

In [None]:
rectracks.head()

In [None]:
rectracks.loc[2000010].tolist()

In [None]:
# Not used by this cell; retained in case other cells rely on it.
realtracks=pd.DataFrame()
rprec_sum = 0.0
ndcg_sum = 0.0
rprec_match = 0
ndcg_match = 0

# Build the pid -> ground-truth track list lookup once, instead of scanning the
# whole frame with a boolean mask on every iteration.  setdefault keeps the
# first row for a pid, matching the previous `.to_list()[0]` selection.
gt_by_pid = {}
for challenge_pid, playlist_tracks in zip(noholdout["challenge_pid"], noholdout["tracks"]):
    gt_by_pid.setdefault(challenge_pid, playlist_tracks)

# Score each playlist against the submission row carrying the same pid.
for i in range(len(noholdout)):
    pid = startpid + i
    gttracks = [track["track_uri"] for track in gt_by_pid[pid]]
    candtracks = rectracks.loc[pid].to_list()
    rprec = get_r_precision(gttracks, candtracks)
    rprec_sum = rprec_sum + rprec
    if rprec > 0:
        rprec_match += 1
    ndcg = get_ndcg(gttracks, candtracks)
    ndcg_sum = ndcg_sum + ndcg
    if ndcg > 0:
        ndcg_match += 1

print("rprec = {}".format(rprec_sum/len(noholdout)))
print("ndcg = {}".format(ndcg_sum/len(noholdout)))
print("rprec_match = {}".format(rprec_match))
print("ndcg_match = {}".format(ndcg_match))

## Create Song DB

Load all the songs from the training and noholdout sets to analyze hits and misses in the recommendation set.

Specifically motivated by the complete failure of vl6 on mympd

In [None]:
# Directory name under data/ whose slice files process_mpd is asked to load.
trainset = "mpd-2nd-21k"

In [None]:
# When true, process_mpd prints each loaded file path and its slice length.
debug = True
# When true, process_mpd stops early instead of reading every slice file.
quick = True
# File-count threshold that process_mpd compares against in quick mode.
max_files_for_quick_processing = 20

# random state
# NOTE(review): seed is not referenced by any visible cell -- confirm it is still needed.
seed = 1

In [None]:
# Module-level accumulators; process_mpd declares both global and extends them.
playlists = pd.DataFrame()
tracks = pd.DataFrame()

In [None]:
def process_mpd(path):
    """Load MPD slice files from ``path`` into the global ``playlists`` and ``tracks``.

    Files in ``path`` named ``mpd.slice.*.json`` are read in sorted filename
    order.  Each slice contributes one row per playlist to ``playlists``, tagged
    with a ``slice`` column taken from the file's ``info.slice`` field.  It also
    contributes one row per playlist track to ``tracks``, tagged with the owning
    ``pid``.  The ``tracks`` column is kept on the playlist frame so a playlist
    can be reconstructed from it directly.

    When the module-level ``quick`` flag is set, at most
    ``max_files_for_quick_processing`` slice files are loaded.  Progress is
    printed when ``debug`` is set.

    Parameters
    ----------
    path : str
        Directory containing the slice files.

    Returns
    -------
    None
        Rows are added to the module-level ``playlists`` and ``tracks`` frames.
    """
    global playlists, tracks

    # Collect per-slice frames and concatenate once at the end: DataFrame.append
    # was removed in pandas 2.0, and growing a frame inside the loop is quadratic.
    playlist_frames = []
    track_frames = []

    for filename in sorted(os.listdir(path)):
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        # Enforce the quick-mode cap before reading, so no more than the
        # configured number of files is ever loaded.
        if quick and len(playlist_frames) >= max_files_for_quick_processing:
            break

        fullpath = os.path.join(path, filename)
        with open(fullpath, encoding="utf-8") as f:
            mpd_slice = json.load(f)
        if debug: print("loaded {}:".format(fullpath))

        # Flatten data; keep the slice info so rows stay associated with the
        # original training file.
        slice_playlists = pd.json_normalize(mpd_slice, record_path=['playlists'])
        slice_playlists["slice"] = mpd_slice['info']['slice']
        if debug: print("slice length {}:".format(len(slice_playlists)))
        slice_tracks = pd.json_normalize(mpd_slice['playlists'], record_path=['tracks'], meta=['pid'])

        playlist_frames.append(slice_playlists)
        track_frames.append(slice_tracks)

    if playlist_frames:
        # Skip an empty accumulator so it cannot influence the resulting dtypes.
        playlists = pd.concat(([] if playlists.empty else [playlists]) + playlist_frames)
        tracks = pd.concat(([] if tracks.empty else [tracks]) + track_frames)


In [None]:
%%time
process_mpd("data/"+ trainset)

In [None]:
tracks[tracks["track_uri"]=="spotify:track:1vvsD4wGSpDtL1hPR4aNKJ"]

In [None]:
# Track lookup table: keep only the track metadata columns and collapse to one row per track_uri.
tracksdb = tracks[["artist_name", "track_uri", "artist_uri", "track_name", "album_uri", "duration_ms", "album_name"]].drop_duplicates(subset=["track_uri"])

In [None]:
len(tracksdb)

In [None]:
tracksdb[tracksdb["track_uri"]=="spotify:track:3uvypVUsiIr1B0BccIcsEh"]

In [None]:
noholdout[noholdout["challenge_pid"]==2000001].tracks.to_list()

In [None]:
# Show the tracksdb entry, if any, for each of the first ten recommendations
# made for challenge playlist 2000001.
top_recs = rectracks.loc[2000001][0:10].to_list()
for track in top_recs:
    print("track: \"{}\"".format(track))
    matches = tracksdb[tracksdb["track_uri"] == track]
    print(matches)