## Score challenge submission

This notebook produces scores for a custom challenge set.  It scores each task and provides a summary score (average).

This is designed to help compare the quality of a solution against a challenge set under our control and demonstrate that the aicrowd score is consistent.

In [None]:
import sys
import json
import re
import collections
import os
import datetime
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle

In [None]:
# Root directory under which each experiment keeps its own results folder.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
experimentsdir="/home/jpr/projects/mpd-test-sets/results/"

In [None]:
# Result directories to score.  Commented-out entries are kept for reference
# and can be re-enabled to include them in the comparison.
experiments = [
    #"/home/jpr/projects/mpd-test-sets/results/u2u-optimized-scaled-complete",
    #"/home/jpr/projects/mpd-test-sets/results/u2u-optimized-scaled-complete-redux",
    #"/home/jpr/projects/mpd-test-sets/results/knn-u2u-unified-sklearn-sim-train-only",
    "/home/jpr/projects/mpd-test-sets/results/knn-u2u-unified-cos-similarities2_1",
    #"/home/jpr/projects/mpd-test-sets/results/knn-u2u-unified-cos-similarities2_2",
    "/home/jpr/projects/mpd-test-sets/results/knn-u2u-unified-cos-similarities3",
    "/home/jpr/projects/mpd-test-sets/results/knn-u2u-unified-tfidf-scaling-sim",
    #"/home/jpr/projects/mpd-test-sets/results/i2i-optimized-scaled-complete",
    #"/home/jpr/projects/mpd-test-sets/results/knn-i2i-unified",
    #"/home/jpr/projects/mpd-test-sets/results/knn-i2i-original",
    #"/home/jpr/projects/mpd-test-sets/results/knn-i2i-original-train-only",
    "/home/jpr/projects/mpd-test-sets/results/knn-i2i-unified-sklearn-sim-train-only",
    #"/home/jpr/projects/mpd-test-sets/results/mfals-optimized-scaled",
    "/home/jpr/projects/mpd-test-sets/results/mfals-allmpb-wo-transpose-mpd3",
    "/home/jpr/projects/mpd-test-sets/results/vl6-mympd-full",
    #"/home/jpr/projects/mpd-test-sets/results/mfals-manual-results-2022-01-02",
]

In [None]:
def _parse_submission(filename, submitdir):
    """Build the submission record for one result file.

    The file name is split on "_" into four parts; the first carries
    "<prefix>-<method>", the second the challenge name, the third
    "<trainset>-<size>" where only the text before the first "k" of the size
    is kept.  The tag is the last path component of `submitdir`.
    """
    prefixed_method, challenge, trainset_and_size, _rest = filename.split("_", 3)
    _prefix, method = prefixed_method.split("-", 1)
    trainset, size_token = trainset_and_size.rsplit("-", 1)
    _parent, tag = submitdir.rsplit("/", 1)
    return {
        "method": method,
        "challenge": challenge,
        "trainset": trainset,
        "trainsize": size_token.split('k')[0],
        "tag": tag,
        "dir": submitdir + "/",
        "file": filename,
    }


# Collect one record per ".gz" result file found in each experiment directory.
submissions = []

for submitdir in experiments:
    with os.scandir(submitdir) as entries:
        gz_names = (entry.name for entry in entries if entry.name.endswith(".gz"))
        for filename in gz_names:
            print(filename)
            submissions.append(_parse_submission(filename, submitdir))
            

In [None]:
# Show the parsed submission records as a sanity check.
submissions

### Load the no holdouts challenge set

In [None]:
def get_challenges(submissions):
    """Return the distinct challenge names referenced by the submissions.

    Args:
        submissions: iterable of dicts, each with a "challenge" key.

    Returns:
        A list with each challenge name once; the order is not defined.
    """
    return list({submission["challenge"] for submission in submissions})

In [None]:
def reorder_holdout(noholdout):
    """Return the first 10000 rows with rows 6000-6999 and 7000-7999 swapped.

    The blocks are emitted in the order 0-5999, 7000-7999, 6000-6999,
    8000-9999; every row keeps its original index label.
    NOTE(review): presumably this aligns the task-block order with the
    submission files — confirm against the challenge-set generator.

    Args:
        noholdout: DataFrame of challenge playlists, sliced by position.

    Returns:
        A new DataFrame with the row blocks in the order above.
    """
    # DataFrame.append was removed in pandas 2.0; one concat also avoids
    # copying the growing frame three times.
    return pd.concat([
        noholdout[0:6000],
        noholdout[7000:8000],
        noholdout[6000:7000],
        noholdout[8000:10000],
    ])

In [None]:
def load_answers(answers_list):
    """Load the ground-truth playlists for each named challenge.

    For every name, reads 'data/<name>/challenge_set_noholdout.json',
    flattens its "playlists" records into a DataFrame and passes it through
    reorder_holdout.

    Args:
        answers_list: iterable of challenge names.

    Returns:
        dict mapping each challenge name to its reordered DataFrame.
    """
    answers = {}

    for challenge_name in answers_list:
        path = 'data/{}/challenge_set_noholdout.json'.format(challenge_name)
        with open(path, 'r') as handle:
            payload = json.load(handle)

        playlists = pd.json_normalize(payload, "playlists")
        answers[challenge_name] = reorder_holdout(playlists)

    return answers

In [None]:
# Distinct challenge sets referenced by the collected submissions.
challenges = get_challenges(submissions)

In [None]:
# Show which challenge sets will be loaded.
challenges

In [None]:
# Ground-truth playlists per challenge name, used as the scoring reference.
answers = load_answers(challenges)

### Load challenge submission

## Score Challenge Submission

R-precision is the fraction of correctly recommended tracks in the ground truth playlist as described on [the challenge site](https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge#evaluation)
    

Use the [r-precision calculation from the hello_world metrics](https://github.com/jprorama/spotify_recSys_challenge_2018/blob/f33d82715190a20fdbc998c9ff709bcabd62a55e/utils/metrics.py#L26)

In [None]:
def get_r_precision(answer, cand):
    """Return the R-precision of a candidate list against the ground truth.

    With G the set of unique ground-truth items, this is the fraction of G
    found among the first |G| candidates.

    Args:
        answer: iterable of ground-truth items (duplicates are ignored).
        cand: ordered sequence of recommended items, best first.

    Returns:
        A float in [0, 1]; 0.0 when the ground truth is empty.
    """
    set_answer = set(answer)
    if not set_answer:
        # Nothing to retrieve: report no hit rather than dividing by zero.
        return 0.0
    return len(set_answer & set(cand[:len(set_answer)])) / len(set_answer)

def get_ndcg(answer, cand):
    """Return the normalized discounted cumulative gain of a candidate list.

    Each candidate at 0-based position i that is in the ground truth adds
    1 / log2(i + 2) to the DCG.  The ideal DCG sums the same discount over
    the first |G| positions, where G is the set of unique ground-truth items.

    Args:
        answer: iterable of ground-truth items.
        cand: ordered sequence of recommended items, best first.

    Returns:
        dcg / idcg as a float; 0.0 when the ground truth is empty.
    """
    # A set gives constant-time membership tests instead of a list scan per candidate.
    relevant = set(answer)
    if not relevant:
        # No relevant items: the ideal DCG would be zero, so report 0.0.
        return 0.0

    dcg = 0
    for position, track in enumerate(cand):
        if track in relevant:
            dcg += 1 / math.log(position + 2, 2)

    idcg = 0
    for position in range(len(relevant)):
        idcg += 1 / math.log(position + 2, 2)

    return dcg / idcg

In [None]:
def score_set(answer, candidates):
    """Score every challenge playlist and return the aggregate metrics.

    Args:
        answer: DataFrame with a "challenge_pid" column and a "tracks" column
            holding, per playlist, a list of dicts with a "track_uri" key.
        candidates: DataFrame indexed by pid whose row values are the
            recommended tracks in rank order.

    Returns:
        dict with "rprec" and "ndcg" (sums divided by len(answer)) and
        "rprec_match" / "ndcg_match" (number of playlists scoring above zero).
    """
    totals = {"rprec": 0.0, "ndcg": 0.0}
    matches = {"rprec": 0, "ndcg": 0}

    for pid in answer.challenge_pid:
        # Ground truth comes from the first row carrying this pid.
        playlist_tracks = answer[answer["challenge_pid"] == pid].tracks.to_list()[0]
        gttracks = [track["track_uri"] for track in playlist_tracks]
        candtracks = candidates.loc[pid].to_list()

        rprec = get_r_precision(gttracks, candtracks)
        totals["rprec"] = totals["rprec"] + rprec
        if rprec > 0:
            matches["rprec"] += 1

        ndcg = get_ndcg(gttracks, candtracks)
        totals["ndcg"] = totals["ndcg"] + ndcg
        if ndcg > 0:
            matches["ndcg"] += 1

    playlist_count = len(answer)
    return {
        "rprec": totals["rprec"] / playlist_count,
        "rprec_match": matches["rprec"],
        "ndcg": totals["ndcg"] / playlist_count,
        "ndcg_match": matches["ndcg"],
    }

In [None]:
def score_setdf(answer, candidate):
    """Score each challenge playlist and return the scores as new columns.

    Args:
        answer: DataFrame with a "challenge_pid" column and a "tracks" column
            holding, per playlist, a list of dicts with a "track_uri" key.
        candidate: DataFrame indexed by pid whose row values are the
            recommended tracks in rank order.

    Returns:
        A copy of `answer` with float columns "rprec" and "ndcg" added, one
        score per row.  `answer` itself is not modified.
    """
    results = answer.copy()

    rprec_scores = []
    ndcg_scores = []

    # Walk the rows once instead of re-filtering the whole frame per pid.
    # Assumes challenge_pid is unique per row, so each row is scored from its own tracks.
    for pid, tracks in zip(results["challenge_pid"], results["tracks"]):
        gttracks = [track["track_uri"] for track in tracks]
        # Read from the `candidate` argument; the previous code silently used
        # a global named `rectracks` and ignored the parameter.
        candtracks = candidate.loc[pid].to_list()
        rprec_scores.append(get_r_precision(gttracks, candtracks))
        ndcg_scores.append(get_ndcg(gttracks, candtracks))

    # Positional assignment of whole columns; avoids `.at` with a boolean mask,
    # which is a scalar accessor and not meant for mask-based writes.
    results["rprec"] = np.asarray(rprec_scores, dtype="float64")
    results["ndcg"] = np.asarray(ndcg_scores, dtype="float64")

    return results


In [None]:
%%time

rlist=[]
results = pd.DataFrame()
tmpdf = pd.DataFrame()

for submission in submissions:
    print("submission: {}".format(submission))
    print("file: {}".format(submission["file"]))
    
    cachefile=submission["dir"]+"cache/scored-pickle-"+submission["file"]

    start_time = time.time()    
    if (os.path.isfile(cachefile)):
        tmpdf = pd.read_pickle(cachefile)
        print("score read cache: {} sec".format(time.time()-start_time))
        
        tmpdf["trainsize"]=pd.to_numeric(submission["trainsize"])
        tmpdf["tag"]=submission["tag"]
        tmpdf["method"]=submission["method"]
 
    else:
        start_time = time.time()
        rectracks=pd.read_csv('{}/{}'.format(submission["dir"], submission["file"]), header=None, skiprows=1, index_col=0, skipinitialspace=True)
        print("load time: {} sec".format(time.time()-start_time))

        #if ("full" in submission["tag"]):
        #if (submission["challenge"]=="mympd-full"):
        #    print("scored with noholdout2")
        #    tmpdf = score_setdf(noholdout2, rectracks)
        #else:    
        #    print("scored with noholdout")
        #    tmpdf = score_setdf(noholdout, rectracks)
            
        print("scored with: {}".format(submission["challenge"]))
        tmpdf = score_setdf(answers[submission["challenge"]], rectracks)
        print("score time: {} sec".format(time.time()-start_time))

        tmpdf["trainsize"]=pd.to_numeric(submission["trainsize"])
        tmpdf["tag"]=submission["tag"]
        tmpdf["method"]=submission["method"]
    
        tmpdf.to_pickle(cachefile)
        

    print("tmpdf len: {}".format(len(tmpdf)))
    #tmpdf["trainsizek"]=submission["trainsizek"]
    #tmpdf["tag"]=submission["tag"]
    #tmpdf["method"]=submission["method"]
    rlist.append(tmpdf.drop(columns=["tracks"]))
    
results=pd.concat(rlist)
results.reset_index(drop=True, inplace=True)
del rlist

### Summarize total performance

In [None]:
# Per-playlist scores for all scored submissions.
results

In [None]:
# Distinct experiment tags present in the scored results.
results["tag"].drop_duplicates()

In [None]:
# Distinct (tag, trainsize) combinations that were scored.
results[["tag", "trainsize"]].drop_duplicates()

In [None]:
# Make sure trainsize is numeric before it is filtered and plotted as a number.
results["trainsize"]=pd.to_numeric(results["trainsize"])

In [None]:
# Mean R-precision per experiment tag and training-set size.
rprec_scores=results.groupby(["tag", "trainsize"]).rprec.mean().reset_index()

In [None]:
# The same table ordered by training-set size.
rprec_scores.sort_values("trainsize")

In [None]:
# Experiments with trainsize 700, ordered from lowest to highest mean R-precision.
print(rprec_scores[rprec_scores["trainsize"]==700].sort_values("rprec"))

In [None]:
# trainsize is numeric here, so compare against the number 100; comparing with
# the string "100" never matches and left this selection empty.
r100=rprec_scores[rprec_scores["trainsize"]==100].sort_values("rprec")

In [None]:
# (rows, columns) of the trainsize-100 selection.
r100.shape

In [None]:
# Column dtypes and non-null counts of the summary table.
rprec_scores.info()

In [None]:
# Column dtypes and non-null counts of the per-playlist results.
results.info()

In [None]:
# Mean NDCG per experiment tag and training-set size.
results.groupby(["tag", "trainsize"]).ndcg.mean()

In [None]:
# Average every numeric column per (tag, method, task_name, trainsize) group,
# keeping the groups in order of first appearance (sort=False).
# numeric_only=True leaves non-numeric columns out of the mean; without it
# pandas 2.x raises instead of silently dropping them.
means = (
    results
    .groupby(["tag", "method", "task_name", "trainsize"], sort=False)
    .mean(numeric_only=True)
    .reset_index()
)

# Has no effect unless the frame carries a "trainsizek" column
# (presumably from older cached results — confirm before removing).
means.rename(columns={"trainsizek": "trainsize"}, inplace=True)

In [None]:
# Make sure trainsize is numeric in the per-task means as well.
means["trainsize"]=pd.to_numeric(means["trainsize"])

In [None]:
# Average the per-task means once more so each (tag, trainsize) pair has a
# single score per metric.
by_tag_and_size = means.groupby(["tag", "trainsize"])
rprecs = by_tag_and_size["rprec"].mean().reset_index()
ndcgs = by_tag_and_size["ndcg"].mean().reset_index()


In [None]:
# Mean R-precision per (tag, trainsize).
rprecs

means.groupby(["tag", "trainsize"]).mean(["rprec", "ndcg"]).reset_index()

In [None]:
# Join the R-precision and NDCG means into one table keyed by (tag, trainsize).
compete=pd.merge(rprecs, ndcgs, how="outer", on=["tag", "trainsize"])

In [None]:
# Combined score table for the locally scored experiments.
compete

Performance of vl6 on aicrowd mpd challenge with f=200 and a=0.001.
Manually entered from web results.

In [None]:
# Manually entered scores, one row per training-set size, labelled "vl6-mpd".
vl6_mpd = pd.DataFrame(
    {
        "trainsize": [700, 500, 400, 300, 200, 100, 80, 40, 20],
        "rprec": [
            0.21442484143065002, 0.21405716708527278, 0.2125500586026767,
            0.2110791021070401, 0.2086606922904522, 0.19836474965963136,
            0.19577806512432588, 0.1860676190999628, 0.16644686115093993,
        ],
        "ndcg": [
            0.38145766548782994, 0.37883582620961503, 0.37649177888485413,
            0.3740181704034522, 0.3689999024579872, 0.3503513287427269,
            0.3452777465178121, 0.3270989490096962, 0.2939440975370557,
        ],
    }
).assign(tag="vl6-mpd")

In [None]:
# Check the manually entered vl6 table.
vl6_mpd

Performance of mfals on aicrowd mpd challenge with f=200 and a=0.001.
Manually entered from web results.

In [None]:
# Manually entered scores, one row per training-set size, labelled "mfals-mpd".
mfals_mpd = pd.DataFrame(
    {
        "trainsize": [600, 500, 400, 300, 200, 100, 80, 60, 40, 20],
        "rprec": [
            0.1882107724508686, 0.18809184200786833, 0.18835980226790114,
            0.1880982511047418, 0.1878392459596271, 0.18598005010706134,
            0.18558435129023018, 0.18431149868643149, 0.18272959865796876,
            0.17787711395877562,
        ],
        "ndcg": [
            0.3351378652838204, 0.3346447814399467, 0.3346665743142349,
            0.33456525706787454, 0.33340858421290287, 0.33022997245582436,
            0.3287635775126507, 0.32575433842270896, 0.3218402305031761,
            0.3118166024554931,
        ],
    }
).assign(tag="mfals-mpd")

In [None]:
# Check the manually entered mfals table.
mfals_mpd

In [None]:
# Manually entered scores, one row per training-set size, labelled "u2u-unified-sim2_1-mpd".
u2u_mpd_sim21 = pd.DataFrame(
    {
        "trainsize": [700, 600, 300, 200, 40, 20],
        "rprec": [
            0.1842170921417326, 0.18396194763810977, 0.1832626778689676,
            0.18256715453129624, 0.17621346517051084, 0.17094090669426182,
        ],
        "ndcg": [
            0.3310907819918829, 0.33058729397194647, 0.32885352469852763,
            0.32727258664454334, 0.31282312022279773, 0.30092662995758007,
        ],
    }
).assign(tag="u2u-unified-sim2_1-mpd")

In [None]:
# Check the manually entered u2u table.
u2u_mpd_sim21

In [None]:
# Manually entered scores, one row per training-set size, labelled "i2i-unified-mpd".
i2i_mpd = pd.DataFrame(
    {
        "trainsize": [20, 10],
        "rprec": [0.1530146983297283, 0.14086699336844719],
        "ndcg": [0.2735167051481905, 0.25238909933343595],
    }
).assign(tag="i2i-unified-mpd")

In [None]:
# Append the manually entered vl6 rows.
# NOTE(review): re-running this cell appends the same rows again.
compete=pd.concat([compete, vl6_mpd]).reset_index(drop=True)

In [None]:
# Score table after adding the vl6 rows.
compete

In [None]:
# Append the manually entered mfals rows.
# NOTE(review): re-running this cell appends the same rows again.
compete=pd.concat([compete, mfals_mpd]).reset_index(drop=True)

In [None]:
# Score table after adding the mfals rows.
compete

In [None]:
# Append the manually entered u2u rows.
# NOTE(review): re-running this cell appends the same rows again.
compete=pd.concat([compete, u2u_mpd_sim21]).reset_index(drop=True)

In [None]:
# Score table after adding the u2u rows.
compete

In [None]:
# Append the manually entered i2i rows.
# NOTE(review): re-running this cell appends the same rows again.
compete=pd.concat([compete, i2i_mpd]).reset_index(drop=True)

Process the aicrowd-results_ prefixed files to get the results from the autosubmission pipeline.

After parsing the results merge them with the manually entered values above so they can contribute to the plots.

In [None]:
from pathlib import Path

In [None]:
# Results root as a Path so the experiment folders can be globbed.
exp = Path(experimentsdir)

In [None]:
# Collect one record per "aicrowd-result_*" file found in every "*-mpd"
# experiment folder.  Each record holds the experiment tag, the training-set
# size taken from the file name, and every "key: value" line of the file.
aicrowd_results = []

for mpdexp in exp.glob("*-mpd"):
    # The folder name is the experiment tag.
    expname = mpdexp.name
    for result in mpdexp.glob("aicrowd-result_*"):
        parts = result.name.split("_")
        curtest = {
            "tag": expname,
            # Fourth "_"-separated field ends in "-<size>k"; keep the size digits.
            "trainsize": parts[3].rsplit("-", 1)[1].rstrip("k"),
        }
        with open(result) as r:
            for line in r:
                # Skip blank or free-text lines instead of failing on the unpack.
                if ":" not in line:
                    continue
                # Split on the first colon only so values containing ":" survive.
                test, score = line.split(":", 1)
                curtest[test.strip()] = score.strip()
        aicrowd_results.append(curtest)

In [None]:
# Turn the list of records into a DataFrame (the name is rebound from list to frame).
aicrowd_results=pd.DataFrame(aicrowd_results)

In [None]:
# The parsed values are strings; cast them so they merge and plot as numbers.
aicrowd_results = aicrowd_results.astype(
    {"trainsize": "int32", "rprec": "float64", "ndcg": "float64"}
)

In [None]:
# The columns of the parsed aicrowd results that feed the comparison.
aicrowd_results[["tag", "trainsize","rprec", "ndcg"]]

In [None]:
# Score table before merging in the parsed aicrowd results.
compete

In [None]:
# Outer-join on (tag, trainsize); rprec/ndcg appear twice with _x (compete) and _y (aicrowd) suffixes.
compete2=pd.merge(compete, aicrowd_results[["tag", "trainsize","rprec", "ndcg"]], how="outer", on=["tag", "trainsize"])

In [None]:
# Collapse the suffixed columns: keep the value from `compete` (_x) and fall
# back to the aicrowd value (_y) where the former is missing.
for metric in ("rprec", "ndcg"):
    compete2[metric] = compete2[metric + "_x"].combine_first(compete2[metric + "_y"])

compete2 = compete2.drop(columns=["rprec_x", "rprec_y", "ndcg_x", "ndcg_y"])

In [None]:
# Merged table with a single rprec and ndcg column.
compete2

In [None]:
# From here on `compete` refers to the merged table (same object as compete2, not a copy).
compete=compete2

In [None]:
# R-precision against training-set size, one line per experiment tag.
ax = sns.lineplot(data=compete, x="trainsize", y="rprec", hue="tag")
# Place the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# R-precision against training-set size as individual points per experiment tag.
ax = sns.scatterplot(data=compete, x="trainsize", y="rprec", hue="tag")
# Place the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# NDCG against training-set size, one line per experiment tag.
ax = sns.lineplot(data=compete, x="trainsize", y="ndcg", hue="tag")
# Place the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# Mean R-precision per task as grouped bars, one bar colour per experiment tag.
sns.catplot(data=means, kind="bar", x="task_name", y="rprec", hue="tag", aspect=2)
plt.xticks(rotation=45)
plt.title("R-Precision Compared");

In [None]:
# Mean NDCG per task as grouped bars, one bar colour per experiment tag.
sns.catplot(data=means, kind="bar", x="task_name", y="ndcg", hue="tag", aspect=2)
plt.xticks(rotation=45)
plt.title("NDCG Compared");

In [None]:
# R-precision of the per-task means against training-set size on a single
# axes, one line per experiment tag.
g = sns.lineplot(
    data=means,
    x="trainsize",
    y="rprec",
    hue="tag",
    marker="o",
    legend="full",
    linewidth=4,
)
# Place the legend outside the axes, to the right.
g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

### Plot Panel of Methods with Spread

Separating out the range plots in a facetgrid lets me see the overall performance of each of the measured methods so far.
Vl6 definitely does a better job of lifting more playlists out of the "poverty" of poor recommendations.
You can look at the bottom and top boundaries of the curves easily.
There are several that have an interesting decrease in the lower boundary at a larger training set.

This is a very helpful plot.
This makes it clear that vl6 does produce a tighter bound on the recommendations (higher min and max).
Their average perf on the 700k is almost better than the best possible on all the other methods.
The i2i and u2u_sim21 and mfals are all competitive but have more lower performing recommendations.
This weighs down their average.


In [None]:
# One panel per experiment tag: R-precision against training-set size.
g = sns.FacetGrid(
    data=means,
    col="tag",
    hue="tag",
    col_wrap=3,
    margin_titles=False,
    height=6,
    aspect=.75,
)
g.map(sns.lineplot, "trainsize", "rprec", marker="o", linewidth=4)

In [None]:
# Same panels as above, leaving out the rows of the "title_only" task.
without_title_only = means[means["task_name"] != "title_only"]
g = sns.FacetGrid(
    data=without_title_only,
    col="tag",
    hue="tag",
    col_wrap=3,
    margin_titles=False,
    height=6,
    aspect=.75,
)
g.map(sns.lineplot, "trainsize", "rprec", marker="o", linewidth=4)

In [None]:
# Same panels, restricted to the rows where with_title equals 0.
with_title_zero = means[means["with_title"] == 0]
g = sns.FacetGrid(
    data=with_title_zero,
    col="tag",
    hue="tag",
    col_wrap=3,
    margin_titles=False,
    height=6,
    aspect=.75,
)
g.map(sns.lineplot, "trainsize", "rprec", marker="o", linewidth=4)

means.

In [None]:
# Set the font scale before the grid is created: styling applied after the
# figure has been built does not change that figure.
sns.set(font_scale = 1.2)

# One panel per task in a fixed order: R-precision against training-set size,
# one line per experiment tag.
g = sns.FacetGrid(means, col="task_name", hue="tag", col_wrap=4, sharex=False, margin_titles=True,
                  height=6, aspect=.75,
                  col_order=["first_5_title", "first_5_wo_title", "first_10_title", "first_10_wo_title",
                            "first_25_title", "first_100_title", "rand_25_title", "rand_100_title",
                            "first_1_title", "title_only"])
g.map(sns.lineplot, "trainsize", "rprec", marker="o", legend="full", linewidth = 4)
g.add_legend()
g.set_titles(col_template="{col_name}");

In [None]:
# One panel per task in a fixed order: R-precision against training-set size,
# one line per experiment tag.
task_order = [
    "first_5_title", "first_5_wo_title", "first_10_title", "first_10_wo_title",
    "first_25_title", "first_100_title", "rand_25_title", "rand_100_title",
    "first_1_title", "title_only",
]
g = sns.FacetGrid(
    data=means,
    col="task_name",
    col_order=task_order,
    hue="tag",
    col_wrap=4,
    sharex=False,
    margin_titles=True,
    height=6,
    aspect=.75,
)
g.map(sns.lineplot, "trainsize", "rprec", marker="o", legend="full", linewidth=4)
g.add_legend()
g.set_titles(col_template="{col_name}")

In [None]:
# Mean R-precision per task as grouped bars, one bar colour per experiment tag.
sns.catplot(data=means, kind="bar", x="task_name", y="rprec", hue="tag", aspect=2)
plt.xticks(rotation=45)
plt.title("R-Precision Compared");

In [None]:
# One panel per task in a fixed order: NDCG against training-set size,
# one line per experiment tag.
task_order = [
    "first_5_title", "first_5_wo_title", "first_10_title", "first_10_wo_title",
    "first_25_title", "first_100_title", "rand_25_title", "rand_100_title",
    "first_1_title", "title_only",
]
g = sns.FacetGrid(
    data=means,
    col="task_name",
    col_order=task_order,
    hue="tag",
    col_wrap=4,
    sharex=False,
    margin_titles=True,
    height=6,
    aspect=.75,
)
g.map(sns.lineplot, "trainsize", "ndcg", marker="o", legend="full", linewidth=4)
g.add_legend()
g.set_titles(col_template="{col_name}")

In [None]:
# Per-playlist rows of the runs with trainsize 20.
results[results["trainsize"]==20]#.groupby(["tag"]).rprec.mean()

In [None]:
# Mean NDCG per experiment tag over all scored playlists.
results.groupby(["tag"]).ndcg.mean()

Remove rand100 which is the task which most significantly differentiates method performance.

In [None]:
# Mean R-precision per tag with the "rand_100_title" task left out.
results[results.task_name!="rand_100_title"].groupby(["tag"]).rprec.mean()

In [None]:
# Mean NDCG per tag with the "rand_100_title" task left out.
results[results.task_name!="rand_100_title"].groupby(["tag"]).ndcg.mean()

In [None]:
# Per-(tag, method, task, trainsize) means used by the plots.
means

In [None]:
# Grid of violins: one row per task, one column per experiment tag, showing
# the distribution of per-playlist R-precision.
g = sns.FacetGrid(
    data=results,
    row="task_name",
    col="tag",
    hue="tag",
    margin_titles=True,
    height=3,
    aspect=4,
)
g.map(
    sns.violinplot,
    "tag",
    "rprec",
    palette="muted",
    inner="quart",
    cut=0,
    orient='v',
    scale='count',
)

Create means as dataframe rather than groupby object by using the aggregator on the groupby object rather than the series of rprec column alone.  This keeps the column names and should simplify plotting.

https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

In [None]:
# Mean R-precision per task as grouped bars, one bar colour per experiment tag.
sns.catplot(data=means, kind="bar", x="task_name", y="rprec", hue="tag", aspect=2)
plt.xticks(rotation=45)
plt.title("R-Precision Compared");

In [None]:
# Mean NDCG per task as grouped bars, one bar colour per experiment tag.
sns.catplot(data=means, kind="bar", x="task_name", y="ndcg", hue="tag", aspect=2)
plt.xticks(rotation=45)
plt.title("NDCG Compared");

In [None]:
# R-precision of the per-task means across tasks, one line per experiment tag.
ax = sns.lineplot(data=means, x="task_name", y="rprec", hue="tag")
plt.xticks(rotation=45)
plt.title("R-Precision Compared")
# Place the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

In [None]:
# R-precision across tasks from the per-playlist results, one line per experiment tag.
ax = sns.lineplot(data=results, x="task_name", y="rprec", hue="tag")
plt.xticks(rotation=45)
plt.title("R-Precision Compared")
# Place the legend outside the axes, to the right.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

## Explore Violin plot

start with the routine used during the recsys18 analysis

In [None]:
def plot_violin(df, title="Violin Plot"):
    """Draw a vertical violin plot of `df` and return its axes.

    Also sets seaborn's figure size to 8x6 through sns.set, which remains in
    effect after the call.

    Args:
        df: data passed straight to sns.violinplot.
        title: axes title.

    Returns:
        The axes object returned by sns.violinplot, with title, y label
        "Score" and x tick labels rotated by 90 degrees.
    """
    sns.set(rc={'figure.figsize':(8,6)})
    ax = sns.violinplot(data=df, cut=0, orient='v', scale='width')
    ax.set(title=title, ylabel="Score")
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    return ax

Don't need the describe data because that will come from the violin plot.  Just need to use all the raw data points.

## Explore side-by-side plots

Want to take all the tasks and see the teams side by side

Create violin plots for each task. https://stackoverflow.com/a/47487445/8928529

Basically loop through the tasks and plot on each subplot axis.

fig, axes = plt.subplots(10, 2, figsize=(20, 32), sharey='row')
axes_cols = (axes.flatten()[::2], axes.flatten()[1::2])

i=0
for task in vl6desc.task_name.drop_duplicates():
    
    ax=axes_cols[0][i]
    sns.violinplot(data=vl6desc[vl6desc["task_name"]==task], cut=0, orient='v', scale='width', ax=ax)
    ax.set_title('task = {}'.format(task), y=0.95)
    ax=axes_cols[1][i]
    sns.violinplot(data=hwdesc[hwdesc["task_name"]==task], cut=0, orient='v', scale='width', ax=ax)
    ax.set_title('task = {}'.format(task), y=0.95)
    
    
    i += 1

## Explore split plot

This lets me see the data paired directly and allows easier visual comparison of differences.

The test rand_100_title task shows clear differences.

In [None]:
# `data` is another name for the `means` frame, not a copy; changes made via either name affect both.
data=means

With the rprec and ndcg data plotted side by side it is clear that the rand_100_title was much more effectively solved by vl6.

The second most effective was first_1_title where vl6 had higher mean.

However, the title_only solution of hw is clearly better.

All the rest of the tasks had nearly identical means and distributions with the hw solution having slightly higher ndcg in those tasks.


It's much easier to get fast insight from seaborn using facet grids since they are built right from the data.

https://seaborn.pydata.org/tutorial/axis_grids.html

In [None]:
# Per-playlist R-precision against pid: one row per task, one column per tag.
g = sns.FacetGrid(
    data=results,
    row="task_name",
    col="tag",
    hue="tag",
    margin_titles=True,
    height=4,
    aspect=2,
)
g.map(sns.scatterplot, "pid", "rprec", linewidth=0)

In [None]:
# pid modulo 1000, stored as a new "rpid" column.
# NOTE(review): `data` is the same object as `means`, so this adds the column to `means` too.
data["rpid"]= data.pid % 1000

In [None]:
# pid modulo 1000, stored as a new "rpid" column on the per-playlist results.
results["rpid"] = results.pid % 1000

In [None]:
# Per-playlist R-precision against rpid (pid modulo 1000): one row per task,
# one column per tag.
g = sns.FacetGrid(
    data=results,
    row="task_name",
    col="tag",
    hue="tag",
    margin_titles=True,
    height=4,
    aspect=2,
)
g.map(sns.scatterplot, "rpid", "rprec", linewidth=0)

Revisit violin distribution plots now with all data.

In [None]:
# One panel per task (fixed order), one violin per experiment tag, showing the
# distribution of per-playlist R-precision.
order = ["first_5_title", "first_5_wo_title", "first_10_title", "first_10_wo_title", "first_25_title", "first_100_title", "rand_25_title", "rand_100_title", "first_1_title", "title_only"]

# Fix the x-axis category order once, in order of first appearance.  Without an
# explicit `order`, every panel derives its own categories and a tag can end
# up at a different position from panel to panel.
tag_order = results["tag"].drop_duplicates().tolist()

g = sns.FacetGrid(results, col="task_name",  col_order=order, margin_titles=True, height=5, col_wrap=2, aspect=1)
g.map(sns.violinplot, "tag", "rprec", order=tag_order, palette="muted", inner="quart", cut=0, orient='v', scale='count')
g.add_legend()
# Rotate the tick labels on every panel; plt.xticks only reaches the last axes.
for ax in g.axes.flat:
    ax.tick_params(axis="x", labelrotation=45)

In [None]:
# Density of R-precision per experiment tag, one panel per task in a fixed order.
order = ["first_5_title", "first_5_wo_title", "first_10_title", "first_10_wo_title", "first_25_title", "first_100_title", "rand_25_title", "rand_100_title", "first_1_title", "title_only"]

g = sns.FacetGrid(data, col="task_name",  hue = "tag", col_order=order, margin_titles=True, height=5, col_wrap=2, aspect=1)
# NOTE(review): no positional column names are given and `data=data` passes the
# whole frame to every kdeplot call, so presumably each panel draws the full
# data rather than its own task subset — confirm, and consider
# g.map_dataframe(sns.kdeplot, x="rprec", ...) instead.
g.map(sns.kdeplot, data=data, x="rprec", hue="tag", palette="muted", cut=0, fill=False)
g.add_legend()

##  Inspect the highest scoring recommendations

Understand which tasks and methods perform the best.

In [None]:
# Playlists whose R-precision exceeds 0.8, with the columns that identify the run and the playlist.
results[results.rprec > 0.8][["rprec", "ndcg", "tag","task_name", "name", "num_tracks"]]