## Score challenge submission

This notebook produces scores for a custom challenge set.  It scores each task and provides a summary score (the average across tasks).

This is designed to help compare the quality of a solution against a challenge set under our control and demonstrate that the aicrowd score is consistent.

In [None]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Challenge set to score against and the directory holding the submission files.
challenge = "ex2-from-21k-train-with-pids"
submitdir = "/home/jpr/projects/mpd-challenge-aicrowd/"

# One entry per method being compared: a short tag plus its submission file name.
submissions = [
    {"tag": "vl6", "file": "method-01-mympd-2nd-21k-2021-11-14.csv.gz"},
    {"tag": "hw", "file": "method-02-mympd-2nd-21k-2021-11-07.csv.gz"},
    {"tag": "u2uknn", "file": "method-u2uknn-mympd-mpd-2nd-21k-2021-12-12.csv.gz"},
    {"tag": "u2uknnfull", "file": "method-u2uknn-mympd-full-mympd-full-20k-2021-12-29.csv.gz"},
    {"tag": "i2iknn", "file": "method-i2iknn-mympd-mpd-2nd-21k-2021-12-18.csv.gz"},
    {"tag": "mfals", "file": "method-mfals-mympd-mpd-2nd-21k-2021-12-26.csv.gz"},
]

# The single-submission cells read `submissionfile`; it ends up naming the last entry's file.
submissionfile = submissions[-1]["file"]

In [None]:
# Tag -> submission file name for three of the methods.
result_files = dict(
    vl6="method-01-mympd-2nd-21k-2021-11-14.csv.gz",
    hw="method-02-mympd-2nd-21k-2021-11-07.csv.gz",
    u2uknn="method-u2uknn-mympd-mpd-2nd-21k-2021-12-12.csv.gz",
)

### Load the no holdouts challenge set

In [None]:
# Parse the no-holdout challenge set for the configured challenge straight from the file handle.
with open('data/{}/challenge_set_noholdout.json'.format(challenge), 'r') as challenge_file:
    data = json.load(challenge_file)

In [None]:
# Flatten the records under the "playlists" key into one row per playlist.
noholdout = pd.json_normalize(data, record_path="playlists")

In [None]:
# Preview the first rows of the normalized challenge set.
noholdout.head()

Reorder noholdouts to group sequential and random tasks.

In [None]:
# Start the reordered frame with the first 6000 rows (positional slice).
tmpdf = noholdout.iloc[0:6000]

In [None]:
# Rows 7000-7999 are placed ahead of rows 6000-6999.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
tmpdf = pd.concat([tmpdf, noholdout[7000:8000]])

In [None]:
# Rows 6000-6999 follow the block that originally came after them.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
tmpdf = pd.concat([tmpdf, noholdout[6000:7000]])

In [None]:
# The remaining rows (8000-9999) keep their original position.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
tmpdf = pd.concat([tmpdf, noholdout[8000:10000]])

In [None]:
# Task names in the order they now appear after reordering.
tmpdf["task_name"].unique()

In [None]:
# Rebind noholdout to the reordered frame; the scoring cells below read this name.
noholdout=tmpdf

In [None]:
# Load the "mympd-full" no-holdout challenge set (this rebinds `data`) and flatten its playlists.
with open('data/{}/challenge_set_noholdout.json'.format("mympd-full"), 'r') as challenge_file:
    data = json.load(challenge_file)

noholdout2 = pd.json_normalize(data, record_path="playlists")

In [None]:
def reorder_holdout(noholdout):
    """Return the challenge set with the 6000-6999 and 7000-7999 row blocks swapped.

    Rows are selected by position and keep their original index labels.
    Only the first 10000 rows are included in the result.

    Parameters
    ----------
    noholdout : pandas.DataFrame
        Challenge set, one row per playlist.

    Returns
    -------
    pandas.DataFrame
        Rows in the order 0-5999, 7000-7999, 6000-6999, 8000-9999.
    """
    # DataFrame.append was removed in pandas 2.0; a single concat does the same job
    # without building three intermediate frames.
    return pd.concat(
        [
            noholdout.iloc[0:6000],
            noholdout.iloc[7000:8000],
            noholdout.iloc[6000:7000],
            noholdout.iloc[8000:10000],
        ]
    )

In [None]:
# Apply the same block reordering to the second challenge set.
# The slices are positional, so re-running this cell reorders the already reordered frame.
noholdout2=reorder_holdout(noholdout2)

### Load challenge submission

In [None]:
# Read the submission named by `submissionfile`: the first line is skipped, no header row is
# assumed, and the first column becomes the index.
rectracks = pd.read_csv(
    '{}/{}'.format(submitdir, submissionfile),
    header=None,
    skiprows=1,
    index_col=0,
    skipinitialspace=True,
)


Make sure the pids match those of the challenge set.  In the case of mympd the pid range starts at 2000000 (the `startpid` used for scoring below) and increases sequentially in the order of the noholdout data set.

In [None]:
# Preview the submission as loaded, before sorting.
rectracks.head()

In [None]:
# Sort the submission rows. The file was read with index_col=0, so 0 is not a data column;
# `by=0` presumably resolves to the index level named 0 (the pid) — TODO confirm.
rectracks = rectracks.sort_values(by=0, axis='index')

In [None]:
# Preview the submission again after sorting.
rectracks.head()

## Score Challenge Submission

In [None]:
# First challenge pid; the scoring loop below computes each pid as startpid + row offset.
startpid = 2_000_000

R-precision is the fraction of correctly recommended tracks in the ground truth playlist as described on [the challenge site](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge#evaluation)
    

Use the [r-precision calculation from the hello_world metrics](https://github.com/jprorama/spotify_recSys_challenge_2018/blob/f33d82715190a20fdbc998c9ff709bcabd62a55e/utils/metrics.py#L26)

In [None]:
def get_r_precision(answer, cand):
    """Return the R-precision of a ranked candidate list against the ground truth.

    With R the number of unique ground-truth items, the score is the fraction of
    those items found among the first R candidates.

    Parameters
    ----------
    answer : iterable
        Ground-truth items; duplicates are collapsed.
    cand : sequence
        Ranked candidates, best first.

    Returns
    -------
    float
        Score in [0, 1]. An empty ground truth scores 0.0 instead of raising
        ZeroDivisionError.
    """
    set_answer = set(answer)
    if not set_answer:
        # Nothing to retrieve: report a zero score rather than dividing by zero.
        return 0.0
    hits = set_answer & set(cand[:len(set_answer)])
    return len(hits) / len(set_answer)

def get_ndcg(answer, cand):
    """Return the normalized discounted cumulative gain of a ranked candidate list.

    Each candidate found in the ground truth contributes 1 / log2(rank + 1), with
    ranks starting at 1. The ideal DCG assumes every unique ground-truth item
    occupies the top ranks.

    Parameters
    ----------
    answer : iterable
        Ground-truth items; duplicates are collapsed for the ideal DCG.
    cand : sequence
        Ranked candidates, best first.

    Returns
    -------
    float
        DCG divided by ideal DCG. An empty ground truth scores 0.0 instead of
        raising ZeroDivisionError.
    """
    # A set gives constant-time membership tests instead of scanning the list per candidate.
    relevant = set(answer)
    if not relevant:
        return 0.0

    dcg = 0
    for position, item in enumerate(cand):
        if item in relevant:
            # position is 0-based, so rank + 1 == position + 2.
            dcg += 1 / math.log(position + 2, 2)

    idcg = 0
    for position in range(len(relevant)):
        idcg += 1 / math.log(position + 2, 2)

    return dcg / idcg

In [None]:
# Score every playlist of the challenge set against the loaded submission and print the
# mean R-precision / NDCG plus how many playlists scored above zero on each metric.
realtracks = pd.DataFrame()
rprec_sum = 0.0
ndcg_sum = 0.0
rprec_match = 0
ndcg_match = 0

for i in range(len(noholdout)):
    # Challenge pids are taken to be consecutive, starting at startpid.
    pid = startpid + i
    playlist_tracks = noholdout.loc[noholdout["challenge_pid"] == pid, "tracks"].to_list()[0]
    gttracks = [track["track_uri"] for track in playlist_tracks]
    candtracks = rectracks.loc[pid].to_list()

    rprec = get_r_precision(gttracks, candtracks)
    rprec_sum += rprec
    if rprec > 0:
        rprec_match += 1

    ndcg = get_ndcg(gttracks, candtracks)
    ndcg_sum += ndcg
    if ndcg > 0:
        ndcg_match += 1

print("rprec = {}".format(rprec_sum / len(noholdout)))
print("ndcg = {}".format(ndcg_sum / len(noholdout)))
print("rprec_match = {}".format(rprec_match))
print("ndcg_match = {}".format(ndcg_match))

In [None]:
def score_set(answer, candidates):
    """Score a set of playlists and return the aggregate metrics.

    Parameters
    ----------
    answer : pandas.DataFrame
        Ground truth with a ``challenge_pid`` column and a ``tracks`` column whose
        entries are lists of dicts carrying a ``track_uri`` key.
    candidates : pandas.DataFrame
        Recommendations indexed by pid, one ranked row per playlist.

    Returns
    -------
    dict
        ``rprec`` and ``ndcg`` (means over the playlists in ``answer``) and
        ``rprec_match`` / ``ndcg_match`` (number of playlists scoring above zero).
        The means are 0.0 when ``answer`` is empty.
    """
    rprec_sum = 0.0
    ndcg_sum = 0.0
    rprec_match = 0
    ndcg_match = 0

    # Walk pid and ground-truth tracks together; filtering the frame by pid on every
    # iteration made the loop quadratic in the number of playlists.
    for pid, playlist_tracks in zip(answer["challenge_pid"], answer["tracks"]):
        gttracks = [track["track_uri"] for track in playlist_tracks]
        candtracks = candidates.loc[pid].to_list()

        rprec = get_r_precision(gttracks, candtracks)
        rprec_sum += rprec
        if rprec > 0:
            rprec_match += 1

        ndcg = get_ndcg(gttracks, candtracks)
        ndcg_sum += ndcg
        if ndcg > 0:
            ndcg_match += 1

    playlist_count = len(answer)
    scores = dict()
    # Guard the means so an empty set reports 0.0 instead of raising ZeroDivisionError.
    scores["rprec"] = rprec_sum / playlist_count if playlist_count else 0.0
    scores["rprec_match"] = rprec_match
    scores["ndcg"] = ndcg_sum / playlist_count if playlist_count else 0.0
    scores["ndcg_match"] = ndcg_match

    return scores

In [None]:
# Score each task separately, then report totals across tasks.
rprec_sum = 0.0
ndcg_sum = 0.0
rprec_match = 0
ndcg_match = 0

scores = dict()


def _print_scores(label, task_scores):
    """Print one block of metrics under the given task label."""
    print("task: {}".format(label))
    print("rprec = {}".format(task_scores["rprec"]))
    print("ndcg = {}".format(task_scores["ndcg"]))
    print("rprec_match = {}".format(task_scores["rprec_match"]))
    print("ndcg_match = {}".format(task_scores["ndcg_match"]))
    print("\n")


for task in noholdout.task_name.drop_duplicates():

    scores[task] = score_set(noholdout[noholdout["task_name"]==task], rectracks)

    rprec_sum += scores[task]["rprec"]
    rprec_match += scores[task]["rprec_match"]
    ndcg_sum += scores[task]["ndcg"]
    ndcg_match += scores[task]["ndcg_match"]

    _print_scores(task, scores[task])

# Average the per-task means over the number of tasks actually scored; the divisor used to be
# a hard-coded 10, which is only right when the set has exactly ten tasks.
# max(..., 1) keeps an empty challenge set from dividing by zero.
task_count = max(len(scores), 1)

scores["total"] = dict()
scores["total"]["rprec"] = rprec_sum/task_count
scores["total"]["rprec_match"] = rprec_match
scores["total"]["ndcg"] = ndcg_sum/task_count
scores["total"]["ndcg_match"] = ndcg_match

_print_scores("total", scores["total"])

In [None]:
def score_setdf(answer, candidate):
    """Return a copy of ``answer`` with per-playlist ``rprec`` and ``ndcg`` columns.

    Parameters
    ----------
    answer : pandas.DataFrame
        Ground truth with a ``challenge_pid`` column and a ``tracks`` column whose
        entries are lists of dicts carrying a ``track_uri`` key.
    candidate : pandas.DataFrame
        Recommendations indexed by pid, one ranked row per playlist.

    Returns
    -------
    pandas.DataFrame
        Copy of ``answer`` (the input is not modified) with float ``rprec`` and
        ``ndcg`` columns added.
    """
    results = answer.copy()

    rprec_scores = []
    ndcg_scores = []

    # Walk pid and ground-truth tracks together rather than re-filtering the frame per pid.
    for pid, playlist_tracks in zip(results["challenge_pid"], results["tracks"]):
        gttracks = [track["track_uri"] for track in playlist_tracks]
        # Read from the `candidate` argument; the previous body ignored it and used the
        # notebook-level `rectracks` instead.
        candtracks = candidate.loc[pid].to_list()
        rprec_scores.append(get_r_precision(gttracks, candtracks))
        ndcg_scores.append(get_ndcg(gttracks, candtracks))

    # Assign whole float columns at once; writing through .at with a boolean mask is not a
    # supported use of the scalar accessor, and int-initialised columns force an upcast.
    results["rprec"] = pd.Series(rprec_scores, index=results.index, dtype=float)
    results["ndcg"] = pd.Series(ndcg_scores, index=results.index, dtype=float)

    return results


In [None]:
%%time

# Score every configured submission and stack the per-playlist results, labelled by tag.
rlist = []
results = pd.DataFrame()
tmpdf = pd.DataFrame()

for submission in submissions:
    print("submission: {}".format(submission))
    print("file: {}".format(submission["file"]))
    rectracks = pd.read_csv(
        '{}/{}'.format(submitdir, submission["file"]),
        header=None,
        skiprows=1,
        index_col=0,
        skipinitialspace=True,
    )
    # The "u2uknnfull" submission is scored against noholdout2; every other tag uses noholdout.
    tmpdf = score_setdf(
        noholdout2 if submission["tag"] == "u2uknnfull" else noholdout,
        rectracks,
    )
    print("tmpdf len: {}".format(len(tmpdf)))
    tmpdf["tag"] = submission["tag"]
    rlist.append(tmpdf)

# ignore_index renumbers the stacked rows 0..n-1, the same as resetting the index afterwards.
results = pd.concat(rlist, ignore_index=True)

In [None]:
# Mean R-precision for every (method tag, task) pair.
results.groupby(["tag", "task_name"])["rprec"].mean()

### Summarize total performance

In [None]:
# Mean R-precision per method across all scored playlists.
results.groupby(["tag"])["rprec"].mean()

In [None]:
# Mean NDCG per method across all scored playlists.
results.groupby(["tag"])["ndcg"].mean()

Exclude the rand_100_title task, which is the task that most significantly differentiates method performance.

In [None]:
# Mean R-precision per method with the rand_100_title task left out.
results.loc[results["task_name"] != "rand_100_title"].groupby(["tag"])["rprec"].mean()

In [None]:
# Mean NDCG per method with the rand_100_title task left out.
results.loc[results["task_name"] != "rand_100_title"].groupby(["tag"])["ndcg"].mean()

In [None]:
# Start the de-duplicated result set with the vl6 rows, keeping the first row per pid.
nodup = results.loc[results["tag"] == "vl6"].drop_duplicates(subset="pid")

In [None]:
# Add the de-duplicated hw rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
nodup = pd.concat([nodup, results[results["tag"]=="hw"].drop_duplicates(subset="pid")])

In [None]:
# Add the de-duplicated u2uknn rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
nodup = pd.concat([nodup, results[results["tag"]=="u2uknn"].drop_duplicates(subset="pid")])

In [None]:
# Add the de-duplicated u2uknnfull rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
nodup = pd.concat([nodup, results[results["tag"]=="u2uknnfull"].drop_duplicates(subset="pid")])

In [None]:
# Add the de-duplicated i2iknn rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
nodup = pd.concat([nodup, results[results["tag"]=="i2iknn"].drop_duplicates(subset="pid")])

In [None]:
# Add the de-duplicated mfals rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
nodup = pd.concat([nodup, results[results["tag"]=="mfals"].drop_duplicates(subset="pid")])

In [None]:
# Mean R-precision for every (method tag, task) pair, now on the de-duplicated rows.
nodup.groupby(["tag", "task_name"])["rprec"].mean()

In [None]:
# Keep the per-(tag, task) mean R-precision as a Series for plotting.
means = nodup.groupby(["tag", "task_name"])["rprec"].mean()

In [None]:

# Overlay semi-transparent bars of the per-task means for three methods on one axes.
ax = means["vl6"].plot.bar(legend=True, color="blue", alpha=.35)
means["hw"].plot.bar(ax=ax, legend=True, color="red", alpha=0.35)
means["u2uknn"].plot.bar(ax=ax, legend=True, color="purple", alpha=0.35)

Create means as dataframe rather than groupby object by using the aggregator on the groupby object rather than the series of rprec column alone.  This keeps the column names and should simplify plotting.

https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

In [None]:
# Per-(tag, task) mean R-precision as a flat DataFrame, groups kept in first-seen order.
means = (
    nodup[["tag", "task_name", "rprec"]]
    .groupby(["tag", "task_name"], sort=False)
    .mean()
    .reset_index()
)

In [None]:
# Grouped bars: one cluster per task, one bar per method.
sns.catplot(
    data=means,
    x="task_name",
    y="rprec",
    hue="tag",
    kind="bar",
    aspect=2,
)
# Assigning the return values keeps them from being echoed as cell output.
ignore = plt.xticks(rotation=45)
ignore = plt.title("R-Precision Compared")

In [None]:
# Per-(tag, task) mean NDCG as a flat DataFrame, groups kept in first-seen order.
means = (
    nodup[["tag", "task_name", "ndcg"]]
    .groupby(["tag", "task_name"], sort=False)
    .mean()
    .reset_index()
)

In [None]:
# Grouped bars: one cluster per task, one bar per method.
sns.catplot(
    data=means,
    x="task_name",
    y="ndcg",
    hue="tag",
    kind="bar",
    aspect=2,
)
# Assigning the return values keeps them from being echoed as cell output.
ignore = plt.xticks(rotation=45)
ignore = plt.title("NDCG Compared")

## Explore Violin plot

Start with the routine used during the recsys18 analysis.

In [None]:
def plot_violin(df, title="Violin Plot"):
    """Draw a vertical, width-scaled violin plot of ``df`` and return its axes.

    Sets the seaborn figure size to 8x6, labels the y axis "Score", applies
    ``title`` and rotates the x tick labels by 90 degrees.
    """
    sns.set(rc={'figure.figsize': (8, 6)})
    violin_ax = sns.violinplot(data=df, cut=0, orient='v', scale='width')

    violin_ax.set_title(title)
    violin_ax.set_ylabel("Score")

    # Re-apply the current tick labels so they can be drawn rotated.
    tick_labels = violin_ax.get_xticklabels()
    violin_ax.set_xticklabels(tick_labels, rotation=90)

    return violin_ax

In [None]:
# Summary statistics of R-precision per (tag, task), transposed so each group is a column.
nodup.groupby(["tag", "task_name"])["rprec"].describe().T

Don't need the describe data because that will come from the violin plot.  Just need to use all the raw data points.

In [None]:
# Raw per-playlist R-precision for the vl6 method, with the task each row belongs to.
vl6desc = nodup.loc[nodup["tag"] == "vl6", ["task_name", "rprec"]]

In [None]:
# Raw per-playlist R-precision for the hw method, with the task each row belongs to.
hwdesc = nodup.loc[nodup["tag"] == "hw", ["task_name", "rprec"]]

In [None]:
# Violin plot of the vl6 rows restricted to the rand_100_title task.
plot_violin(vl6desc[vl6desc["task_name"]=="rand_100_title"], "rand_100_title")

In [None]:
# Violin plot of the hw rows restricted to the rand_100_title task.
plot_violin(hwdesc[hwdesc["task_name"]=="rand_100_title"], "rand_100_title")

## Explore side-by-side plots

We want to take all the tasks and see the teams side by side.

Create violin plots for each task. https://stackoverflow.com/a/47487445/8928529

Basically loop through the tasks and plot on each subplot axis.

In [None]:
# One row per task: vl6 in the left column, hw in the right, sharing the y axis within a row.
fig, axes = plt.subplots(10, 2, figsize=(20, 32), sharey='row')
axes_cols = (axes.flatten()[::2], axes.flatten()[1::2])

for i, task in enumerate(vl6desc.task_name.drop_duplicates()):
    for frame, ax in ((vl6desc, axes_cols[0][i]), (hwdesc, axes_cols[1][i])):
        sns.violinplot(data=frame[frame["task_name"]==task], cut=0, orient='v', scale='width', ax=ax)
        ax.set_title('task = {}'.format(task), y=0.95)

## Explore split plot

This lets me see the data paired directly and allows easier visual comparison of differences.

The test rand_100_title task shows clear differences.

In [None]:
# R-precision rows for the two methods being compared (vl6 and hw).
data = nodup[nodup["tag"].isin(["vl6", "hw"])][["tag", "task_name", "rprec"]]

In [None]:
# Split violin: the two methods share one violin for the rand_100_title task, one half each.
sns.violinplot(
    data=data[data["task_name"] == "rand_100_title"],
    x="task_name",
    y="rprec",
    hue="tag",
    split=True,
    inner="quart",
    cut=0,
    orient='v',
    scale='width',
)

In [None]:
# Per-metric frames restricted to the vl6 and hw methods.
rprec_data = nodup[nodup["tag"].isin(["vl6", "hw"])][["tag", "task_name", "rprec"]]
ndcg_data = nodup[nodup["tag"].isin(["vl6", "hw"])][["tag", "task_name", "ndcg"]]

In [None]:
# One row per task: split violins of R-precision on the left and NDCG on the right.
fig, axes = plt.subplots(10, 2, figsize=(20, 32), sharey='row')
axes_cols = (axes.flatten()[::2], axes.flatten()[1::2])

for i, task in enumerate(vl6desc.task_name.drop_duplicates()):
    panels = (
        ("rprec", rprec_data, axes_cols[0][i]),
        ("ndcg", ndcg_data, axes_cols[1][i]),
    )
    for metric, frame, ax in panels:
        sns.violinplot(
            data=frame[frame["task_name"]==task],
            cut=0,
            orient='v',
            scale='width',
            x="task_name",
            y=metric,
            hue="tag",
            split=True,
            inner="quart",
            ax=ax,
        )
        ax.set_title('task = {}'.format(task), y=0.95)

With the rprec and ndcg data plotted side by side it is clear that the rand_100_title was much more effectively solved by vl6.

The second most effective was first_1_title where vl6 had higher mean.

However, the title_only solution of hw is clearly better.

All the rest of the tasks had nearly identical means and distributions with the hw solution having slightly higher ndcg in those tasks.


In [None]:
# `task` is the loop variable left over from the plotting loop above, i.e. the last task plotted.
task

In [None]:
# Per-metric frames covering every method this time.
rprec_data = nodup.loc[:, ["tag", "task_name", "rprec"]]
ndcg_data = nodup.loc[:, ["tag", "task_name", "ndcg"]]

In [None]:
# Expose the row index as a column named "pid" so it can be used as the x axis below.
# NOTE(review): this is the row index carried over from the scored results, presumably not the
# playlist's own pid — confirm. Re-running this cell resets the index again.
rprec_data=rprec_data.reset_index().rename(columns={"index": "pid"})

In [None]:
# vl6 rows for the first_1_title task.
rprec_data.loc[(rprec_data["tag"] == "vl6") & (rprec_data["task_name"] == "first_1_title")]

In [None]:
# Per-playlist R-precision of vl6 on the first_1_title task.
sns.scatterplot(
    data=rprec_data.loc[(rprec_data["tag"] == "vl6") & (rprec_data["task_name"] == "first_1_title")],
    x='pid',
    y="rprec",
)

In [None]:
# Per-playlist R-precision of hw on the first_1_title task.
sns.scatterplot(
    data=rprec_data.loc[(rprec_data["tag"] == "hw") & (rprec_data["task_name"] == "first_1_title")],
    x='pid',
    y="rprec",
)

In [None]:
# Per-playlist R-precision of vl6 across all tasks.
sns.scatterplot(
    data=rprec_data.loc[rprec_data["tag"] == "vl6"],
    x="pid",
    y="rprec",
    hue="tag",
    alpha=.5,
)

In [None]:
# Per-playlist R-precision of every method on one axes, coloured by method.
sns.scatterplot(
    data=rprec_data,
    x="pid",
    y="rprec",
    hue="tag",
    alpha=0.4,
)

It's much easier to get fast insight from seaborn using facet grids since they are built right from the data.

https://seaborn.pydata.org/tutorial/axis_grids.html

In [None]:
# Grid of scatter plots: one row per task, one column per method.
g = sns.FacetGrid(
    rprec_data,
    row="task_name",
    col="tag",
    hue="tag",
    margin_titles=True,
    height=4,
    aspect=2,
)
g.map(sns.scatterplot, "pid", "rprec", linewidth=0)

In [None]:
# Fold the pid column into the range 0-999 so every facet shares the same x range.
rprec_data["rpid"] = rprec_data["pid"].mod(1000)

In [None]:
# Same grid as before, plotted against the folded pid (rpid).
g = sns.FacetGrid(
    rprec_data,
    row="task_name",
    col="tag",
    hue="tag",
    margin_titles=True,
    height=3,
    aspect=2,
)
g.map(sns.scatterplot, "rpid", "rprec", linewidth=0)

Revisit violin distribution plots now with all data.

In [None]:
# One row per task; within a row, one count-scaled violin per method in a fixed order.
g = sns.FacetGrid(
    rprec_data,
    row="task_name",
    hue="tag",
    margin_titles=True,
    height=3,
    aspect=4,
)
g.map(
    sns.violinplot,
    "tag",
    "rprec",
    order=['vl6', 'hw', 'u2uknn', 'u2uknnfull', 'i2iknn', 'mfals'],
    palette="muted",
    inner="quart",
    cut=0,
    orient='v',
    scale='count',
)

##  Inspect the highest scoring recommendations

Understand which tasks and methods perform the best.

In [None]:
# Playlists where a method reached an R-precision above 0.8, with identifying columns.
nodup.loc[nodup["rprec"] > 0.8, ["rprec", "ndcg", "tag", "task_name", "name", "num_tracks"]]