## EGGS Performance Results

In [20]:
import os
from itertools import product

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cufflinks as cf
import plotly.offline

from IPython.display import display, HTML
from ipywidgets import interactive_output, HBox, VBox, Layout 
from ipywidgets import Dropdown, Checkbox, ToggleButtons, SelectionRangeSlider, SelectionSlider
from ipywidgets import SelectMultiple
import ipywidgets as widgets

# Notebook-wide setup: show up to 100 columns of wide frames and put
# cufflinks into offline mode.
pd.set_option('display.max_columns', 100)
cf.go_offline()

# Read the experiment results table; `data_dir` is relative to this notebook.
data_dir = '../../'
df = pd.read_csv(os.path.join(data_dir, 'results.csv'))

# Trailing bare expression so the notebook renders the frame.
df

Unnamed: 0,rs,base_estimator,feature_type,test_type,sgl_method,sgl_stacks,pgm,auc_mean,ap_mean,auc_std,ap_std,auc_diff_mean,ap_diff_mean,auc_diff_std,ap_diff_std,dataset
0,1,lr,full,full,,0,,0.844965,0.622869,0.040697,0.077320,0.000000,0.000000,0.000000,0.000000,youtube
1,1,lr,full,full,,0,mrf,0.856262,0.630755,0.040494,0.074351,0.011296,0.007886,0.002688,0.002406,youtube
2,1,lr,full,full,cv,1,,0.847217,0.625678,0.040048,0.077457,0.002252,0.002809,0.001092,0.001862,youtube
3,1,lr,full,full,cv,1,mrf,0.859977,0.633294,0.040948,0.076542,0.015011,0.010426,0.003476,0.002685,youtube
4,1,lr,full,full,cv,2,,0.847744,0.625210,0.038970,0.076685,0.002779,0.002341,0.000961,0.001531,youtube
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,1,lr,limited,inductive,cv,2,mrf,0.779445,0.480772,0.092423,0.193990,0.005959,0.020423,0.010057,0.012409,twitter
76,1,lr,limited,inductive,holdout,1,,0.773486,0.460349,0.093131,0.190217,0.000000,0.000000,0.000000,0.000000,twitter
77,1,lr,limited,inductive,holdout,1,mrf,0.777415,0.470777,0.089610,0.195430,0.003929,0.010428,0.002109,0.002291,twitter
78,1,lr,limited,inductive,holdout,2,,0.773486,0.460349,0.093131,0.190217,0.000000,0.000000,0.000000,0.000000,twitter


In [21]:
# ---- plot options ----------------------------------------------------------
metric = ToggleButtons(
    options=['auc', 'ap'],
    value='auc',
    description='Metric',
)
errorbar = Checkbox(value=True, description='Error bars')

# ---- experiment options (each one selects a single experiment setting) -----
dataset = Dropdown(
    options=['youtube', 'twitter', 'soundcloud'],
    value='youtube',
    description='Dataset',
)
feature_type = ToggleButtons(
    options=['full', 'limited'],
    value='full',
    description='Feature set',
)
test_type = ToggleButtons(
    options=['full', 'inductive'],
    value='full',
    description='Test',
)
base_estimator = ToggleButtons(
    options=['lr', 'lgb'],
    value='lr',
    description='Estimator',
)
rs = SelectionSlider(options=[1], value=1, description='rs')

# ---- hyperparameter options (multi-valued; they pick which bars are drawn) --
sgl_method = SelectMultiple(
    options=['None', 'holdout', 'cv'],
    value=('None', 'holdout', 'cv'),
    description='SGL method',
)
sgl_stacks = SelectionRangeSlider(
    options=[0, 1, 2],
    index=(0, 2),
    description='SGL stacks',
)
pgm = SelectMultiple(
    options=['None', 'psl', 'mrf'],
    value=('None', 'mrf'),
    description='PGM',
)

# ---- layout: three columns, ordered experiment | hyperparameters | plot ----
box_1 = VBox([metric, rs, errorbar])
box_2 = VBox([dataset, feature_type, test_type, base_estimator])
box_3 = VBox([sgl_method, sgl_stacks, pgm])
ui = HBox([box_2, box_3, box_1])

# plot graphs
def f(metric, errorbar,
      dataset, feature_type, test_type, base_estimator, rs,
      sgl_method, sgl_stacks, pgm):
    """Draw a bar chart of each selected configuration's gain over the baseline.

    Reads the module-level ``df`` results table, narrows it to one experiment
    setting, and plots ``<metric>_diff_mean`` for every
    (pgm, sgl_method, sgl_stacks) combination that is both selected and
    present in the data. Bars are coloured by PGM.

    Parameters
    ----------
    metric : str
        Column prefix to plot; the columns ``<metric>_diff_mean`` and
        ``<metric>_diff_std`` are read (the widget offers 'auc' and 'ap').
    errorbar : bool
        When true, ``<metric>_diff_std`` is drawn as error bars.
    dataset, feature_type, test_type, base_estimator, rs
        Values matched by equality against the same-named columns of ``df``.
    sgl_method : iterable of str
        SGL methods to include; the string 'None' selects rows without one.
    sgl_stacks : tuple
        Inclusive ``(low, high)`` range for the ``sgl_stacks`` column.
    pgm : iterable of str
        PGMs to include; the string 'None' selects rows without one.

    Returns
    -------
    None
        The function only produces a matplotlib figure as a side effect.
    """
    # Narrow the table to the single experiment setting chosen in the UI.
    in_experiment = (
        (df['dataset'] == dataset)
        & (df['feature_type'] == feature_type)
        & (df['test_type'] == test_type)
        & (df['base_estimator'] == base_estimator)
        & (df['rs'] == rs)
    )
    temp = df[in_experiment].copy()

    # Missing sgl_method / pgm entries come out of read_csv as NaN, while the
    # widgets and the comparisons below use the literal string 'None'.
    # Normalise them so "no SGL" / "no PGM" rows are not silently dropped.
    for column in ('sgl_method', 'pgm'):
        temp[column] = temp[column].astype(object).fillna('None')

    # Apply the hyperparameter selections.
    temp = temp[temp['sgl_method'].isin(list(sgl_method))]
    temp = temp[(temp['sgl_stacks'] >= sgl_stacks[0]) & (temp['sgl_stacks'] <= sgl_stacks[1])]
    temp = temp[temp['pgm'].isin(list(pgm))]

    sgl_method_list = ['None', 'holdout', 'cv']
    sgl_stacks_list = [0, 1, 2]
    pgm_list = ['None', 'psl', 'mrf']
    pgm_color_dict = {'None': 'blue', 'psl': 'orange', 'mrf': 'purple'}

    mean_col = '{}_diff_mean'.format(metric)
    std_col = '{}_diff_std'.format(metric)

    res = []
    colors = []

    # One bar per configuration; the iteration order fixes the bar order.
    for pgm_i, method_i, stacks_i in product(pgm_list, sgl_method_list, sgl_stacks_list):

        # The all-default configuration is the baseline itself; it has no bar.
        if method_i == 'None' and stacks_i == 0 and pgm_i == 'None':
            continue

        eggs_df = temp[(temp['sgl_method'] == method_i)
                       & (temp['sgl_stacks'] == stacks_i)
                       & (temp['pgm'] == pgm_i)]

        if len(eggs_df) > 0:
            colors.append(pgm_color_dict[pgm_i])
            res.append({
                'key': '{}\n{}\n{}'.format(method_i, stacks_i, pgm_i),
                mean_col: eggs_df[mean_col].values[0],
                std_col: eggs_df[std_col].values[0],
            })

    # Nothing to draw: either no rows matched, or only the baseline row did.
    # Guarding on `res` (not `temp`) avoids a KeyError on an empty plot frame.
    if not res:
        plt.clf()
        return

    plot_df = pd.DataFrame(res)
    yerr = plot_df[std_col] if errorbar else None

    fig, ax0 = plt.subplots(figsize=(15, 5))
    ax0.bar(plot_df['key'], plot_df[mean_col], yerr=yerr, color=colors)
    ax0.set_ylabel(metric)
    ax0.set_title('Performance Difference against Baseline (Higher is better)')

    # Bars are differences from the baseline, so the baseline sits at y = 0.
    ax0.axhline(0, linestyle='--', color='black', label='baseline')
    ax0.legend()

# Bind each control to the same-named argument of `f`, then show the controls
# together with the output area that `f` draws into.
out = interactive_output(f, dict(
    metric=metric,
    errorbar=errorbar,
    dataset=dataset,
    feature_type=feature_type,
    test_type=test_type,
    base_estimator=base_estimator,
    rs=rs,
    sgl_method=sgl_method,
    sgl_stacks=sgl_stacks,
    pgm=pgm,
))
display(ui, out)

HBox(children=(VBox(children=(Dropdown(description='Dataset', options=('youtube', 'twitter', 'soundcloud'), va…

Output()