In [1]:
import warnings
warnings.filterwarnings('ignore')
import inspect
import logging
import os

import pandas as pd

from csrank.util import setup_logging
from experiments.util import lp_metric_dict
import numpy as np
from experiments.dbconnection import DBConnector

Using TensorFlow backend.


In [2]:
# --- Paths and logging ---------------------------------------------------
# Base directory used below to locate the logs/ folder (and, in later cells,
# the config/ and result folders).
DIR_PATH = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
log_path = os.path.join(DIR_PATH, 'logs', 'results.log')
setup_logging(log_path=log_path)
logger = logging.getLogger('ResultParsing')

# --- Experiment selection ------------------------------------------------
learning_problem = "choice_function"
schema = "choice_functions"
datasets = ['synthetic_choice', 'mnist_choice']

# Metric names for the chosen learning problem; the last name is passed
# through str.format(6) before the names are lower-cased and comma-joined
# into the column list used by the SQL query.
keys = list(lp_metric_dict[learning_problem].keys())
keys[-1] = keys[-1].format(6)
metrics = ', '.join(name.lower() for name in keys)
print(metrics)

f1score, precision, recall, subset01loss, hammingloss, informedness, aucscore, averageprecisionscore


In [3]:
def get_results_for_dataset(DATASET, del_jid = True):
    """Load all stored results for one dataset into a DataFrame.

    Joins the results table of the current ``learning_problem`` with the
    ``avail_jobs`` table and keeps, per job, the job id, a dataset label,
    a learner label and the metric columns listed in the module-level
    ``metrics`` string.

    Parameters
    ----------
    DATASET : str
        Value matched against ``avail_jobs.dataset`` in the query.
    del_jid : bool, default True
        If True the ``job_id`` column is removed from the returned frame.

    Returns
    -------
    (pandas.DataFrame, list)
        The frame sorted by ``dataset`` and the list of its column names.
        When the query returns no rows the frame is empty but still carries
        the expected columns.
    """
    config_file_path = os.path.join(DIR_PATH, 'config', 'clusterdb.json')
    results_table = 'results.{}'.format(learning_problem)
    schema = 'choice_functions'
    # The first three selected columns (learner_params, dataset_params,
    # hp_ranges) are only used to derive labels and are cut from each row.
    start = 3
    select_jobs = "SELECT learner_params, dataset_params, hp_ranges, {0}.job_id, dataset, learner, {3} from {0} INNER JOIN {1} ON {0}.job_id = {1}.job_id where {1}.dataset=\'{2}\'"
    # NOTE(review): the connection opened here is never closed in this
    # function - confirm whether DBConnector offers a close method.
    connector = DBConnector(config_file_path=config_file_path, is_gpu=False, schema=schema)
    connector.init_connection()
    avail_jobs = "{}.avail_jobs".format(schema)
    # NOTE(review): DATASET is interpolated into the SQL text; this is only
    # safe while callers pass trusted, hard-coded dataset names.
    select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
    connector.cursor_db.execute(select_st)

    # Header used when the query yields no rows; otherwise it is replaced by
    # the keys of the fetched rows (previously `columns` stayed unbound and
    # the function crashed with UnboundLocalError on an empty result).
    columns = ['job_id', 'dataset', 'learner'] + [m.strip() for m in metrics.split(',')]
    data = []
    for job in connector.cursor_db.fetchall():
        job = dict(job)
        if job['learner'] in job['hp_ranges'].keys():
            learner_ranges = job['hp_ranges'][job['learner']]
            n_hidden = learner_ranges.get("n_hidden", [])
            # Specific hyper-parameter ranges identify the "shallow" variant.
            is_shallow = (
                learner_ranges.get("n_hidden_set_layers", None) == [1, 8]
                or n_hidden == [1, 4]
                or n_hidden == [1, 5]
            )
            if is_shallow:
                job['learner'] = job['learner'] + '_shallow'

        if job['learner_params'].get("add_zeroth_order_model", False):
            job['learner'] = job['learner'] + '_zero'
        if "letor" in job['dataset']:
            job['dataset'] = get_letor_string(job['dataset_params'])
        elif "sushi" not in job['dataset']:
            # Every other dataset is labelled by its dataset_type parameter;
            # "sushi" datasets keep their name unchanged.
            job['dataset'] = job['dataset_params']['dataset_type']
        job['learner'] = job['learner'].upper()
        job['dataset'] = job['dataset'].upper()
        columns = list(job.keys())[start:]
        data.append(list(job.values())[start:])
    df_full = pd.DataFrame(data, columns=columns)
    df_full = df_full.sort_values('dataset')
    if del_jid:
        del df_full['job_id']
    columns = list(df_full.columns)
    return df_full, columns
# Preview the raw per-job results of the first configured dataset.
df, cols = get_results_for_dataset(DATASET=datasets[0])
df

Unnamed: 0,dataset,learner,f1score,precision,recall,subset01loss,hammingloss,informedness,aucscore,averageprecisionscore
0,PARETO,FATE_CHOICE,0.925,0.9438,0.9241,0.4397,0.0212,0.9135,0.9972,0.9884
21,PARETO,RANDOM_CHOICE,0.2315,0.1332,1.0,1.0,0.8668,0.0,0.5,0.1332
20,PARETO,GLM_CHOICE,0.5876,0.606,0.7402,0.9544,0.1302,0.6377,0.956,0.8649
19,PARETO,RANDOM_CHOICE,0.2318,0.1334,1.0,1.0,0.8666,0.0,0.5,0.1334
18,PARETO,GLM_CHOICE,0.4924,0.5052,0.6432,0.983,0.1696,0.5095,0.8673,0.7367
17,PARETO,RANDOM_CHOICE,0.2311,0.1329,1.0,1.0,0.8671,0.0,0.5,0.1329
16,PARETO,RANKSVM_CHOICE,0.59,0.5957,0.7587,0.9572,0.1337,0.6487,0.9563,0.8652
15,PARETO,RANKSVM_CHOICE,0.588,0.5981,0.7525,0.9559,0.1339,0.6436,0.9561,0.8649
14,PARETO,GLM_CHOICE,0.5719,0.5919,0.7211,0.9601,0.1358,0.615,0.9406,0.8429
13,PARETO,RANKSVM_CHOICE,0.5885,0.6046,0.7436,0.9529,0.1303,0.6405,0.9562,0.8655


In [4]:
def create_combined_dfs(DATASET, latex_row=False):
    """Aggregate the per-job results of one dataset into a mean/std table.

    Results are grouped by dataset label and learner. Every metric is
    reduced to its mean and standard deviation (rounded to 3 decimals) and
    rendered as text. Learners whose name contains "FETA", "FATE" or
    "RANKNET" may occur in several variants; per dataset only the variant
    with the highest ``mean - std`` of the first metric is kept and it is
    renamed via ``models_dict``. All other learners are kept as they are
    (renamed via ``models_dict``).

    Parameters
    ----------
    DATASET : str
        Dataset name forwarded to ``get_results_for_dataset``.
    latex_row : bool, default False
        If True cells look like ``0.766(18)`` (std scaled by 1e3),
        otherwise like ``0.766±0.018``. Groups whose std is NaN for every
        metric are rendered as the plain mean with 4 decimals.

    Returns
    -------
    pandas.DataFrame
        One row per kept learner, with title-cased column names
        (``Learner`` is renamed to ``ChoiceModel``), sorted by ``Dataset``.
    """
    df_full, columns = get_results_for_dataset(DATASET)
    # (substring looked up in the learner name, key into models_dict), in
    # the order in which the families are tested.
    families = (("FETA", "FETA_CHOICE"), ("FATE", "FATE_CHOICE"), ("RANKNET", "RANKNET_CHOICE"))
    # Order in which the selected family rows are appended per dataset.
    emit_order = ("FETA", "RANKNET", "FATE")
    data = []
    # Grouping by a plain column name yields scalar group keys.
    for dataset, dgroup in df_full.groupby('dataset'):
        best_score = {tag: -np.inf for tag, _ in families}
        best_row = {}
        for learner, group in dgroup.groupby('learner'):
            one_row = [get_name(dataset), learner]
            # Reduce the metric columns only, instead of relying on the
            # reduction silently skipping the two label columns.
            metric_values = group.drop(columns=['dataset', 'learner'])
            std = np.around(metric_values.std(axis=0).values, 3)
            mean = np.around(metric_values.mean(axis=0).values, 3)
            if np.all(np.isnan(std)):
                one_row.extend(["{:.4f}".format(m) for m in mean])
            elif latex_row:
                one_row.extend(["{:.3f}({:.0f})".format(m, s*1e3) for m, s in zip(mean, std)])
            else:
                one_row.extend(["{:.3f}±{:.3f}".format(m, s) for m, s in zip(mean, std)])

            family = next((fam for fam in families if fam[0] in str(learner)), None)
            if family is None:
                one_row[1] = models_dict[one_row[1]]
                data.append(one_row)
                continue

            # A NaN std (e.g. a single run) would make every comparison
            # False and drop the variant, so it counts as zero spread.
            spread = 0.0 if np.isnan(std[0]) else std[0]
            score = mean[0] - spread
            tag, model_key = family
            # Each family is compared against its own best score (the FATE
            # branch used to compare against the FETA score).
            if best_score[tag] < score:
                best_score[tag] = score
                one_row[1] = models_dict[model_key]
                best_row[tag] = one_row
        # Only families that actually occurred contribute a row; empty
        # placeholder rows are no longer appended.
        data.extend(best_row[tag] for tag in emit_order if tag in best_row)

    header = []
    for column in columns:
        title = column.title()
        header.append("ChoiceModel" if title == 'Learner' else title)
    df = pd.DataFrame(data, columns=header)
    # Stable sort keeps the learner order inside each dataset; the result
    # was previously computed and thrown away.
    df = df.sort_values(by='Dataset', kind='mergesort').reset_index(drop=True)
    return df

In [9]:
# Write one aggregated CSV per configured dataset into detailedresults/;
# the file name is the dataset name up to '_choice', title-cased.
for dataset in datasets:
    df = create_combined_dfs(dataset)
    df_path = os.path.join(
        DIR_PATH,
        'detailedresults',
        '{}.csv'.format(dataset.split('_choice')[0].title()),
    )
    df.to_csv(df_path, index=False, encoding='utf-8')
df

Unnamed: 0,Dataset,DCM,CategoricalAccuracy,Top-2,Top-3,Top-4,Top-5,Top-6
0,SUSHI,GNL,0.218±0.062,0.366±0.071,0.502±0.018,0.608±0.023,0.685±0.022,0.754±0.034
1,SUSHI,MLM,0.262±0.007,0.387±0.008,0.465±0.014,0.566±0.013,0.624±0.010,0.724±0.014
2,SUSHI,MNL,0.270±0.006,0.387±0.005,0.502±0.002,0.581±0.018,0.676±0.011,0.786±0.007
3,SUSHI,NLM,0.263±0.012,0.375±0.005,0.492±0.014,0.601±0.011,0.671±0.013,0.736±0.024
4,SUSHI,PCL,0.269±0.006,0.387±0.006,0.500±0.012,0.595±0.018,0.676±0.010,0.785±0.006
5,SUSHI,PairwiseSVM,0.258±0.004,0.372±0.007,0.480±0.022,0.594±0.017,0.679±0.013,0.779±0.006
6,SUSHI,FETA-Net-DC,0.292±0.007,0.401±0.023,0.507±0.026,0.602±0.040,0.687±0.032,0.769±0.035
7,SUSHI,RankNetDC,0.269±0.013,0.436±0.020,0.555±0.025,0.661±0.014,0.758±0.021,0.831±0.020
8,SUSHI,FATE-Net-DC,0.299±0.015,0.412±0.016,0.528±0.016,0.639±0.037,0.726±0.030,0.805±0.008


In [21]:
# Build the aggregated table with LaTeX-style cells ("mean(std*1e3)").
# NOTE(review): `DATASET` is not assigned at top level in any visible cell
# (it only exists as a function parameter) - presumably left over from an
# earlier kernel session; confirm which dataset is intended.
df = create_combined_dfs(DATASET, latex_row=True)
# NOTE(review): sort_values returns a new frame, which is discarded here, so
# this line does not reorder `df`. It also sorts by 'dataset' while
# create_combined_dfs as defined in this notebook title-cases its column
# names - verify which column naming the following cells expect.
df.sort_values(by='dataset')
df

Unnamed: 0,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
0,HYPERVOLUME,GENERALIZED_EXTREME_VALUE,0.293(18),0.369(20),0.472(21),0.567(18),0.663(14),0.756(9)
1,HYPERVOLUME,MIXED_LOGIT_MODEL,0.189(14),0.338(17),0.451(19),0.542(20),0.621(14),0.692(10)
2,HYPERVOLUME,MULTINOMIAL_LOGIT_MODEL,0.201(8),0.267(10),0.360(10),0.456(8),0.559(4),0.664(4)
3,HYPERVOLUME,NESTED_LOGIT_MODEL,0.291(3),0.416(5),0.511(7),0.582(6),0.651(6),0.722(4)
4,HYPERVOLUME,PAIRED_COMBINATORIAL_LOGIT,0.185(1),0.248(1),0.340(2),0.440(2),0.550(2),0.668(2)
5,HYPERVOLUME,RANKSVM_DC,0.186(1),0.248(1),0.340(2),0.439(2),0.550(2),0.667(2)
6,HYPERVOLUME,FETA_DC,0.766(18),0.874(15),0.932(5),0.960(2),0.978(1),0.990(2)
7,HYPERVOLUME,RANKNET_DC,0.203(4),0.276(6),0.369(6),0.462(5),0.562(4),0.665(7)
8,HYPERVOLUME,FATE_DC,0.730(18),0.855(19),0.920(13),0.949(9),0.968(6),0.980(3)
9,MEDOID,GENERALIZED_EXTREME_VALUE,0.020(1),0.085(2),0.195(4),0.338(3),0.500(1),0.661(5)


In [22]:
import re
def remove_ranker(sub_df):
    """Pick the weaker of two learner variants.

    ``sub_df`` is a 2-D array of table rows whose column 1 holds the learner
    name and column 2 the first metric as text (e.g. ``"0.766(18)"``).
    When exactly two rows are given, each metric is reduced to
    ``first_number - second_number * 1e-3`` (or just the first number when
    only one is present) and the name of the row with the smaller value is
    returned; on a tie the second row's name is returned. For any other
    number of rows the result is ``None``.
    """
    if len(sub_df) != 2:
        return None

    number_pattern = r"[-+]?\d*\.\d+|\d+"
    name_and_metric = sub_df[:, 1:3]
    parsed = [
        [float(token) for token in re.findall(number_pattern, row[1])]
        for row in name_and_metric
    ]
    lower_bounds = [
        numbers[0] if len(numbers) == 1 else numbers[0] - numbers[1] * 1e-3
        for numbers in parsed
    ]
    loser = 0 if lower_bounds[0] < lower_bounds[1] else 1
    return name_and_metric[loser][0]

In [23]:
def get_val(val):
    """Parse a formatted metric cell into ``[mean, lower_bound]``.

    The numbers found in ``val`` are read in order: the first is the mean,
    the second (if present) is the spread scaled by 1e3, as in
    ``"0.766(18)"``. ``lower_bound`` is ``mean - spread * 1e-3``. A cell
    holding a single number (the format used when no spread is available)
    yields ``[mean, mean]`` instead of raising ``IndexError``.
    """
    vals = [float(x) for x in re.findall(r"[-+]?\d*\.\d+|\d+", val)]
    if len(vals) == 1:
        # No spread in the cell: the lower bound is the mean itself.
        return [vals[0], vals[0]]
    return [vals[0], vals[0] - vals[1]*1e-3]
def mark_best(df):
    """Prefix the best cell(s) of every metric column with ``"bfseries "``.

    Every column after the first is treated as a metric column of formatted
    text cells. The mean of each cell is parsed with ``get_val`` and all
    cells that share the column maximum are marked. ``df`` must contain a
    ``learner`` column; it is modified in place and also returned.
    """
    if len(df) == 0:
        # Nothing to mark; argmax on an empty column would raise.
        return df
    for col in list(df.columns)[1:]:
        # `.values` replaces DataFrame.as_matrix(), which newer pandas no
        # longer provides; the copy guarantees a writable array.
        values_str = df[['learner', col]].values.copy()
        values = np.array([get_val(val[1]) for val in values_str])
        means = values[:, 0]
        maxi = np.where(means == means[np.argmax(means)])[0]
        for ind in maxi:
            values_str[ind] = [values_str[ind][0], "bfseries {}".format(values_str[ind][1])]
        df['learner'] = values_str[:, 0]
        df[col] = values_str[:, 1]
    return df

In [24]:
#from tabulate import tabulate
import string
# Render one LaTeX table per dataset.
# NOTE(review): this cell expects `df` to have lower-case 'dataset' and
# 'learner' columns plus 'categoricaltopk*' metric columns - confirm that
# the cell producing `df` really yields that layout.

# Row order of the learners inside each table.
custom_dict = {"RANKSVM_DC":0, "RANKNET_DC":1, 'MULTINOMIAL_LOGIT_MODEL':2, 'NESTED_LOGIT_MODEL':3, 'GENERALIZED_EXTREME_VALUE':4,
               'PAIRED_COMBINATORIAL_LOGIT':5, "MIXED_LOGIT_MODEL":6, "FATE_DC":7, "FETA_DC":8, "FETA_DC_ZERO":9}

# Grouping by a plain column name keeps `name` a string.
grouped = df.groupby('dataset')
for name, group in grouped:
    # For every network family that occurs in exactly two variants, drop
    # the weaker one (remove_ranker returns None otherwise).
    remove_rankers = []
    for family in ("FETA", "FATE", "RANKNET"):
        # `.values` replaces DataFrame.as_matrix(), which newer pandas no
        # longer provides.
        sub_df = group[group['learner'].str.contains(family)].values
        remove_rankers.append(remove_ranker(sub_df))
    group = group[~group['learner'].isin(remove_rankers)]
    group = group.replace({'FETA_DC_SHALLOW_ZERO': "FETA_DC"})
    group = group.replace({'FATE_DC_SHALLOW': "FATE_DC"})
    group = group.replace({'RANKNET_DC_SHALLOW': "RANKNET_DC"})

    # Sort by the fixed learner order, then drop the helper and the
    # dataset column (new frames instead of mutating the filtered slice).
    group = group.assign(rank=group['learner'].map(custom_dict))
    group = group.sort_values(by='rank').drop(columns=['dataset', 'rank'])
    group = mark_best(group)
    if len(group)==9:
        # Short names that become LaTeX macros (\pairwisesvm, ...) below.
        group['learner'] = ["pairwisesvm", "ranknetdc", "mnl", "nlm", "gnl", "pcl", "mlm", "fatedc", "fetadc"]
    print("name {}".format(name))
    group = group.drop(columns='categoricaltopk6')
    if "N_5" in name:
        group = group.drop(columns='categoricaltopk5')
    latex_code = group.to_latex(index = False)
    # Strip the alignment padding, then restore spacing around separators.
    latex_code = latex_code.replace(' ',"")
    latex_code = latex_code.replace('&'," & ")
    latex_code = str(latex_code)
    for learner in group['learner']:
        latex_code = latex_code.replace(learner, "\\{}".format(learner))
    latex_code = latex_code.replace("bfseries", "\\{} ".format("bfseries"))
    #latex_code = latex_code.replace("0.", ".")

    print(latex_code)
#df.T.to_latex()

name HYPERVOLUME
\begin{tabular}{llllll}
\toprule
learner & categoricalaccuracy & categoricaltopk2 & categoricaltopk3 & categoricaltopk4 & categoricaltopk5\\
\midrule
\pairwisesvm & 0.186(1) & 0.248(1) & 0.340(2) & 0.439(2) & 0.550(2)\\
\ranknetdc & 0.203(4) & 0.276(6) & 0.369(6) & 0.462(5) & 0.562(4)\\
\mnl & 0.201(8) & 0.267(10) & 0.360(10) & 0.456(8) & 0.559(4)\\
\nlm & 0.291(3) & 0.416(5) & 0.511(7) & 0.582(6) & 0.651(6)\\
\gnl & 0.293(18) & 0.369(20) & 0.472(21) & 0.567(18) & 0.663(14)\\
\pcl & 0.185(1) & 0.248(1) & 0.340(2) & 0.440(2) & 0.550(2)\\
\mlm & 0.189(14) & 0.338(17) & 0.451(19) & 0.542(20) & 0.621(14)\\
\fatedc & 0.730(18) & 0.855(19) & 0.920(13) & 0.949(9) & 0.968(6)\\
\fetadc & \bfseries 0.766(18) & \bfseries 0.874(15) & \bfseries 0.932(5) & \bfseries 0.960(2) & \bfseries 0.978(1)\\
\bottomrule
\end{tabular}

name MEDOID
\begin{tabular}{llllll}
\toprule
learner & categoricalaccuracy & categoricaltopk2 & categoricaltopk3 & categoricaltopk4 & categoricaltopk5\\
\midrule

In [None]:
# Accumulate the current table into results/discrete_choice.csv: rows are
# appended to the file's existing content, or the file is created from `df`.
# Re-running this cell appends the same rows again.
df_path = os.path.join(DIR_PATH, 'results' , "discrete_choice.csv")

if not os.path.isfile(df_path):
    dataFrame = df
else:
    dataFrame = pd.read_csv(df_path, index_col=0)
    # pd.concat replaces DataFrame.append, which newer pandas no longer
    # provides; the resulting rows and index are the same.
    dataFrame = pd.concat([dataFrame, df], ignore_index=True)
dataFrame.to_csv(df_path)
# Last expression of the cell so that the combined table is displayed.
dataFrame

In [None]:
# Write one CSV per dataset into results/, named after the lower-cased
# dataset label. Grouping by a plain column name (not a one-element list)
# keeps `name` a string, so `name.lower()` works on every pandas version.
grouped = df.groupby('dataset')
for name, group in grouped:
    df_path = os.path.join(DIR_PATH, 'results' , name.lower()+'.csv')
    group.to_csv(df_path)

In [None]:
import numpy as np
# Scratch: the value of this expression is not stored or displayed.
np.arange(48, 87)

# Toy tensor of shape (4, 5, 2); its two trailing dimensions are recorded
# as the object count and the per-object feature count.
X_train = np.arange(40).reshape((4, 5, 2))

learner_params = dict(
    zip(('n_objects', 'n_object_features'), X_train.shape[1:])
)

In [None]:
# Scratch expression: evaluates to the lower-cased string 'unique_max_occurring'.
"UNIQUE_MAX_OCCURRING".lower()