## Score challenge submission

This notebook produces scores for a custom challenge set.  It scores each task and provides a summary score (the average across tasks).

This is designed to help compare the quality of a solution against a challenge set under our control and to demonstrate that the AIcrowd score is consistent.

In [None]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd
import numpy as np
import math

In [None]:
# Challenge set to score and the directory that holds the submission files.
challenge = "ex2-from-21k-train-with-pids"
submitdir = "/home/jpr/projects/mpd-challenge-aicrowd/"

# Submissions to compare; each one is identified by a short tag.
submissions = [
    {"tag": "vl6", "file": "method-01-mympd-2nd-21k-2021-11-14.csv.gz"},
    {"tag": "hw", "file": "method-02-mympd-2nd-21k-2021-11-07.csv.gz"},
]

# The single-file cells read `submissionfile`; it refers to the last entry above.
submissionfile = submissions[-1]["file"]

### Load the no holdouts challenge set

In [None]:
# Parse the ground-truth challenge set (no tracks held out) for the chosen challenge.
challenge_path = 'data/{}/challenge_set_noholdout.json'.format(challenge)
with open(challenge_path, 'r') as f:
    data = json.load(f)

In [None]:
# Flatten the "playlists" records of the challenge set into one row per playlist.
noholdout = pd.json_normalize(data, record_path="playlists")

In [None]:
# Preview the first rows of the normalized challenge set.
noholdout.head()

### Load challenge submission

In [None]:
# Read one submission: the first row of the file is skipped and the first
# column becomes the index (it is looked up by pid later in the notebook).
# NOTE: `submissionfile` holds whichever file name was assigned last in the
# configuration cell.
rectracks=pd.read_csv('{}/{}'.format(submitdir, submissionfile), header=None, skiprows=1, index_col=0, skipinitialspace=True)


Make sure the pids match those of the challenge set.  In the case of mympd, the pid range starts at 2000000 (the `startpid` value used below) and increases sequentially in the order of the noholdout data set.

In [None]:
# Preview the submission as loaded (before sorting).
rectracks.head()

In [None]:
# Order the rows by the label 0. With header=None and index_col=0 this
# presumably names the index built from the first CSV column (the pid),
# not a data column -- TODO confirm.
rectracks = rectracks.sort_values(by=0, axis='index')

In [None]:
# Preview the submission again after sorting.
rectracks.head()

## Score Challenge Submission

In [None]:
# First challenge pid; the whole-set scoring loop computes each pid as
# startpid plus the row position.
startpid=2000000

R-precision is the fraction of correctly recommended tracks in the ground truth playlist as described on [the challenge site](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge#evaluation)
    

Use the [R-precision calculation from the hello_world metrics](https://github.com/jprorama/spotify_recSys_challenge_2018/blob/f33d82715190a20fdbc998c9ff709bcabd62a55e/utils/metrics.py#L26)

In [None]:
def get_r_precision(answer, cand):
    """Return the R-precision of a ranked candidate list.

    R is the number of distinct ground-truth items. The score is the share of
    those items found among the first R candidates.

    Args:
        answer: Iterable of ground-truth items; duplicates are ignored.
        cand: Ranked sequence of recommended items, best first. It must
            support slicing.

    Returns:
        A float between 0 and 1. Returns 0.0 when ``answer`` is empty, because
        there is nothing to retrieve.
    """
    set_answer = set(answer)
    if not set_answer:
        # An empty ground truth would otherwise divide by zero.
        return 0.0
    top_r = cand[:len(set_answer)]
    return len(set_answer & set(top_r)) / len(set_answer)

def get_ndcg(answer, cand):
    """Return the normalized discounted cumulative gain of a ranked list.

    Every candidate that appears in the ground truth contributes
    ``1 / log2(rank + 1)`` with ranks starting at 1. The ideal DCG assumes
    that all distinct ground-truth items occupy the top ranks.

    Args:
        answer: Iterable of ground-truth items; duplicates are ignored.
        cand: Ranked sequence of recommended items, best first.

    Returns:
        ``dcg / idcg`` as a float. Returns 0.0 when ``answer`` is empty,
        because the ideal DCG would be zero.
    """
    relevant = set(answer)
    if not relevant:
        # An empty ground truth would otherwise divide by zero.
        return 0.0

    # Set membership keeps the scan linear in the number of candidates.
    dcg = 0
    for i, item in enumerate(cand):
        if item in relevant:
            dcg += (1 / math.log(i + 2, 2))

    idcg = 0
    for i in range(len(relevant)):
        idcg += (1 / math.log(i + 2, 2))

    return dcg / idcg

In [None]:
realtracks = pd.DataFrame()
rprec_sum, ndcg_sum = 0.0, 0.0
rprec_match, ndcg_match = 0, 0
playlist_count = len(noholdout)

# Score every playlist of the challenge set; each pid is derived from
# startpid plus the loop position.
for i in range(playlist_count):
    pid = startpid + i
    matching_rows = noholdout[noholdout["challenge_pid"] == pid]
    gttracks = [track["track_uri"] for track in matching_rows.tracks.to_list()[0]]
    candtracks = rectracks.loc[pid].to_list()

    rprec = get_r_precision(gttracks, candtracks)
    rprec_sum += rprec
    rprec_match += int(rprec > 0)

    ndcg = get_ndcg(gttracks, candtracks)
    ndcg_sum += ndcg
    ndcg_match += int(ndcg > 0)

# Mean scores over all playlists, plus how many playlists had any hit.
print("rprec = {}".format(rprec_sum / playlist_count))
print("ndcg = {}".format(ndcg_sum / playlist_count))
print("rprec_match = {}".format(rprec_match))
print("ndcg_match = {}".format(ndcg_match))

In [None]:
def score_set(answer, candidates):
    """Score a set of playlists and return the aggregate metrics.

    Args:
        answer: DataFrame with a ``challenge_pid`` column and a ``tracks``
            column. Each ``tracks`` entry is an iterable of dicts that carry
            a ``"track_uri"`` key (the ground truth).
        candidates: DataFrame indexed by challenge pid; each row holds the
            ranked recommended track URIs for that playlist.

    Returns:
        A dict with these keys:
            ``"rprec"``: mean R-precision over the playlists in ``answer``.
            ``"rprec_match"``: number of playlists with R-precision > 0.
            ``"ndcg"``: mean NDCG over the playlists in ``answer``.
            ``"ndcg_match"``: number of playlists with NDCG > 0.
        Both means are 0.0 when ``answer`` has no rows.
    """
    rprec_sum = 0.0
    ndcg_sum = 0.0
    rprec_match = 0
    ndcg_match = 0

    # Walk the rows once instead of re-filtering the whole frame for each pid.
    for pid, tracks in zip(answer["challenge_pid"], answer["tracks"]):
        gttracks = [track["track_uri"] for track in tracks]
        candtracks = candidates.loc[pid].to_list()

        rprec = get_r_precision(gttracks, candtracks)
        rprec_sum += rprec
        if rprec > 0:
            rprec_match += 1

        ndcg = get_ndcg(gttracks, candtracks)
        ndcg_sum += ndcg
        if ndcg > 0:
            ndcg_match += 1

    # Guard the means so an empty subset does not divide by zero.
    playlist_count = len(answer)
    return {
        "rprec": rprec_sum / playlist_count if playlist_count else 0.0,
        "rprec_match": rprec_match,
        "ndcg": ndcg_sum / playlist_count if playlist_count else 0.0,
        "ndcg_match": ndcg_match,
    }

In [None]:
rprec_sum = 0.0
ndcg_sum = 0.0
rprec_match = 0
ndcg_match = 0

scores = dict()

# Score each task on its own subset of the challenge set.
task_names = noholdout.task_name.drop_duplicates().to_list()

for task in task_names:

    scores[task] = score_set(noholdout[noholdout["task_name"]==task], rectracks)

    rprec_sum += scores[task]["rprec"]
    rprec_match += scores[task]["rprec_match"]
    ndcg_sum += scores[task]["ndcg"]
    ndcg_match += scores[task]["ndcg_match"]

    print("task: {}".format(task))
    print("rprec = {}".format(scores[task]["rprec"]))
    print("ndcg = {}".format(scores[task]["ndcg"]))
    print("rprec_match = {}".format(scores[task]["rprec_match"]))
    print("ndcg_match = {}".format(scores[task]["ndcg_match"]))
    print("\n")

# The summary is the unweighted mean of the per-task scores. Divide by the
# number of tasks actually present rather than assuming there are 10.
task_count = len(task_names)
scores["total"] = dict()
scores["total"]["rprec"] = rprec_sum / task_count if task_count else 0.0
scores["total"]["rprec_match"] = rprec_match
scores["total"]["ndcg"] = ndcg_sum / task_count if task_count else 0.0
scores["total"]["ndcg_match"] = ndcg_match

print("task: {}".format("total"))
print("rprec = {}".format(scores["total"]["rprec"]))
print("ndcg = {}".format(scores["total"]["ndcg"]))
print("rprec_match = {}".format(scores["total"]["rprec_match"]))
print("ndcg_match = {}".format(scores["total"]["ndcg_match"]))
print("\n")

In [None]:
def score_setdf(answer, candidate):
    """Return a copy of ``answer`` with per-playlist score columns added.

    Args:
        answer: DataFrame with a ``challenge_pid`` column and a ``tracks``
            column. Each ``tracks`` entry is an iterable of dicts that carry
            a ``"track_uri"`` key (the ground truth).
        candidate: DataFrame indexed by challenge pid; each row holds the
            ranked recommended track URIs for that playlist.

    Returns:
        A new DataFrame with the rows and columns of ``answer`` plus float
        columns ``rprec`` and ``ndcg``. ``answer`` itself is not modified.
    """
    results = answer.copy()

    rprec_values = []
    ndcg_values = []

    # Walk the rows once instead of re-filtering the whole frame for each pid.
    for pid, tracks in zip(results["challenge_pid"], results["tracks"]):
        gttracks = [track["track_uri"] for track in tracks]
        # Score against the frame passed in, not a notebook-level variable.
        candtracks = candidate.loc[pid].to_list()
        rprec_values.append(get_r_precision(gttracks, candtracks))
        ndcg_values.append(get_ndcg(gttracks, candtracks))

    # Assign whole float columns at once. This avoids per-row mask writes and
    # the int-to-float upcast of a column that was initialised with 0.
    results["rprec"] = np.asarray(rprec_values, dtype=float)
    results["ndcg"] = np.asarray(ndcg_values, dtype=float)

    return results


In [None]:
# Score every configured submission and stack the per-playlist results,
# labelling each block of rows with the submission's tag.
tmpdf = pd.DataFrame()
scored_frames = []

for submission in submissions:
    print("submission: {}".format(submission))
    print("file: {}".format(submission["file"]))
    rectracks=pd.read_csv('{}/{}'.format(submitdir, submission["file"]), header=None, skiprows=1, index_col=0, skipinitialspace=True)
    tmpdf = score_setdf(noholdout, rectracks)
    print("tmpdf len: {}".format(len(tmpdf)))
    tmpdf["tag"]=submission["tag"]
    scored_frames.append(tmpdf)

# DataFrame.append was removed in pandas 2.0. Concatenating once at the end
# also avoids re-copying the accumulated frame on every iteration.
results = pd.concat(scored_frames) if scored_frames else pd.DataFrame()

In [None]:
# Mean R-precision for each (submission tag, task) pair.
results.groupby(["tag", "task_name"]).rprec.mean()

In [None]:
# Keep the first row for each distinct `pid` value among the "vl6" results.
nodup = results[results["tag"]=="vl6"].drop_duplicates(subset="pid")

In [None]:
# Build the de-duplicated frame for both submissions directly from `results`.
# DataFrame.append was removed in pandas 2.0, and rebuilding from `results`
# keeps this cell idempotent when it is re-run.
nodup = pd.concat([
    results[results["tag"]=="vl6"].drop_duplicates(subset="pid"),
    results[results["tag"]=="hw"].drop_duplicates(subset="pid"),
])

In [None]:
# Mean R-precision for each (submission tag, task) pair after de-duplicating by `pid`.
nodup.groupby(["tag", "task_name"]).rprec.mean()

In [None]:
# Keep the per-(tag, task) mean R-precision as a Series for plotting.
means= nodup.groupby(["tag", "task_name"]).rprec.mean()

In [None]:

# Overlay the per-task mean R-precision of both submissions on one axis.
# Both series are named "rprec", so explicit labels are needed for the legend
# to tell the submissions apart.
ax = means["hw"].plot.bar(legend=True, label="hw", color="red", alpha=.5)
means["vl6"].plot.bar(ax=ax, legend=True, label="vl6", alpha=0.35)
ax.set_ylabel("mean R-precision")
ax.set_title("Mean R-precision per task by submission");
