In [1]:
import warnings
warnings.filterwarnings('ignore')
import inspect
import logging
import os

import pandas as pd

from csrank.util import setup_logging
from experiments.util import lp_metric_dict
import numpy as np
from experiments.dbconnection import DBConnector

Using TensorFlow backend.


In [2]:
# Resolve the directory that holds the ``logs/``, ``config/`` and ``results/`` folders.
# A notebook kernel defines no ``__file__`` and the "file" of the current frame is a
# kernel-generated name (on recent kernels a path inside a temp directory), so the
# notebook case falls back to the current working directory explicitly.
try:
    DIR_PATH = os.path.dirname(os.path.abspath(__file__))
except NameError:
    DIR_PATH = os.getcwd()

log_path = os.path.join(DIR_PATH, 'logs', 'results.log')
setup_logging(log_path=log_path)
logger = logging.getLogger('ResultParsing')
config_file_path = os.path.join(DIR_PATH, 'config', 'clusterdb.json')

# Dataset whose results are parsed by the cells below; change the index to switch.
datasets = ['synthetic_dc', 'mnist_dc', 'tag_genome_dc', "letor_dc", "sushi_dc"]
DATASET = datasets[4]
learning_problem = "discrete_choice"
results_table = 'results.{}'.format(learning_problem)
schema = 'masterthesis'

# Number of leading SELECT columns (learner_params, dataset_params, hp_ranges) that are
# only needed for relabelling and are sliced off before the rows go into the DataFrame.
start=3

# Placeholders: {0} results table, {1} avail_jobs table, {2} dataset name, {3} metric columns.
select_jobs = "SELECT learner_params, dataset_params, hp_ranges, {0}.job_id, dataset, learner, {3} from {0} INNER JOIN {1} ON {0}.job_id = {1}.job_id where {1}.dataset=\'{2}\'"

In [3]:
# Database handle used by the cells below through init_connection(), cursor_db and
# close_connection(); it reads its settings from config_file_path.
# NOTE(review): the connector is bound to the module-level name `self`, which reads like
# a method receiver — consider a descriptive name if the notebook is reworked.
self = DBConnector(config_file_path=config_file_path, is_gpu=False, schema=schema)

In [4]:
# One-off manual correction: overwrite the stored metrics of a single job (job_id is the
# last value of the tuple, matching the trailing "where job_id= %s" placeholder).
update_result = "UPDATE results.discrete_choice set cluster_id = %s, CategoricalAccuracy = %s, CategoricalTopK2 = %s, CategoricalTopK3 = %s, CategoricalTopK4 = %s, CategoricalTopK5 = %s, CategoricalTopK6 = %s  where job_id= %s"
values = (6636228, 0.4343, 0.6603, 0.8295, 0.9504, 1.0000, 1.0000,479)
self.init_connection()
try:
    self.cursor_db.execute(update_result, values)
finally:
    # Release the connection even when the UPDATE raises.
    self.close_connection()

In [5]:
def get_letor_string(dp):
    """Build the LETOR dataset label ``y_<year>_n_<n_objects>``.

    Parameters
    ----------
    dp : mapping
        Dataset parameters; must provide the keys ``'year'`` and ``'n_objects'``.

    Returns
    -------
    str
        The label, with both values rendered through ``str``.

    Raises
    ------
    KeyError
        If either key is missing from ``dp``.
    """
    year, n_objects = (str(dp[key]) for key in ('year', 'n_objects'))
    return "y_{}_n_{}".format(year, n_objects)

In [6]:
# Assemble the comma-separated metric column list that is spliced into the SELECT.
# The last metric name is a template; its placeholder is filled with 6.
keys = list(lp_metric_dict[learning_problem])
keys = keys[:-1] + [keys[-1].format(6)]
metrics = ', '.join(keys)
metrics

'CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6'

In [7]:
# Load the results for DATASET joined against the connector's own schema and normalise
# the learner / dataset labels of every row.
self.init_connection()
avail_jobs = "{}.avail_jobs".format(self.schema)
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
try:
    self.cursor_db.execute(select_st)
    fetched_jobs = [dict(job) for job in self.cursor_db.fetchall()]
finally:
    # Release the connection as soon as the rows are in memory.
    self.close_connection()

data = []
for job in fetched_jobs:
    # hp_ranges is keyed by the learner name as stored in the database, so it has to be
    # read before the learner label is modified below.
    learner_ranges = job['hp_ranges'][job['learner']]
    n_hidden = learner_ranges.get("n_hidden", [])
    if learner_ranges.get("n_hidden_set_layers", None) == [1, 8]:
        job['learner'] = job['learner'] + '_shallow'
    elif n_hidden == [1, 4] or n_hidden == [1, 5]:
        job['learner'] = job['learner'] + '_shallow'

    if job['learner_params'].get("add_zeroth_order_model", False):
        job['learner'] = job['learner'] + '_zero'

    # LETOR rows are labelled by year / n_objects, sushi rows keep their dataset name,
    # every other dataset is labelled by its dataset_type parameter.
    if "letor" in job['dataset']:
        job['dataset'] = get_letor_string(job['dataset_params'])
    elif "sushi" not in job['dataset']:
        job['dataset'] = job['dataset_params']['dataset_type']
    job['learner'] = job['learner'].upper()
    job['dataset'] = job['dataset'].upper()

    # Drop the first `start` columns, which were only needed for the relabelling above.
    # `columns` is read again by later cells when the DataFrames are built.
    columns = list(job.keys())[start:]
    data.append(list(job.values())[start:])

SELECT learner_params, dataset_params, hp_ranges, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN masterthesis.avail_jobs ON results.discrete_choice.job_id = masterthesis.avail_jobs.job_id where masterthesis.avail_jobs.dataset='sushi_dc'


In [8]:
# Load the results for DATASET whose jobs are registered under the "pymc3" schema and
# combine them with the rows already collected in `data`.
self.init_connection()
avail_jobs = "{}.avail_jobs".format("pymc3")
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
try:
    self.cursor_db.execute(select_st)
    fetched_jobs = [dict(job) for job in self.cursor_db.fetchall()]
finally:
    # Release the connection as soon as the rows are in memory.
    self.close_connection()

# Collected separately instead of appending to `data`, so re-running this cell does not
# duplicate the pymc3 rows.
pymc3_rows = []
for job in fetched_jobs:
    # LETOR rows are labelled by year / n_objects, sushi rows keep their dataset name,
    # every other dataset is labelled by its dataset_type parameter.
    if "letor" in job['dataset']:
        job['dataset'] = get_letor_string(job['dataset_params'])
    elif "sushi" not in job['dataset']:
        job['dataset'] = job['dataset_params']['dataset_type']
    job['learner'] = job['learner'].upper()
    job['dataset'] = job['dataset'].upper()

    # Drop the first `start` columns, which were only needed for the relabelling above.
    columns = list(job.keys())[start:]
    pymc3_rows.append(list(job.values())[start:])
df_full = pd.DataFrame(data + pymc3_rows, columns=columns)

SELECT learner_params, dataset_params, hp_ranges, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN pymc3.avail_jobs ON results.discrete_choice.job_id = pymc3.avail_jobs.job_id where pymc3.avail_jobs.dataset='sushi_dc'


In [9]:
# Order the rows by dataset label, then show the individual runs of one learner.
df_full = df_full.sort_values(by='dataset')
df_full[df_full['learner'].eq('FETA_DC_SHALLOW_ZERO')]

Unnamed: 0,job_id,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
24,413,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.232,0.413,0.493,0.5715,0.6905,0.757
6,541,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.2445,0.338,0.469,0.584,0.676,0.7315
7,538,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.1925,0.3695,0.557,0.6225,0.729,0.7745
14,540,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.257,0.3405,0.432,0.5125,0.621,0.714
12,539,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.2565,0.335,0.4715,0.593,0.688,0.7325


In [10]:
# Aggregate the runs of every (dataset, learner) pair into one row of formatted strings.
# drop(..., errors='ignore') instead of `del` keeps the cell re-runnable.
df_full = df_full.drop(columns=['job_id'], errors='ignore')
grouped = df_full.groupby(['dataset', 'learner'])
data = []
for name, group in grouped:
    one_row = [name[0], str(name[1]).upper()]
    # Only the metric columns are aggregated; the string key columns would make
    # std()/mean() fail on current pandas versions.
    metric_values = group.drop(columns=['dataset', 'learner'])
    std = metric_values.std(axis=0).values
    mean = metric_values.mean(axis=0).values
    if np.all(np.isnan(std)):
        # No spread available (std is NaN for every metric): report the mean only.
        one_row.extend(["{:.4f}".format(m) for m in mean])
    else:
        # The std is reported in units of 1e-3, e.g. 0.292(21) stands for 0.292 +- 0.021.
        std = [s*1e3 for s in std]
        one_row.extend(["{:.3f}({:.0f})".format(m, s) for m, s in zip(mean, std)])
    data.append(one_row)

In [11]:
# Build the summary table (columns[0] is job_id, which is not part of the aggregated
# rows) and store it as <DATASET>.csv in the results folder.
df = pd.DataFrame(data, columns=columns[1:])
# sort_values returns a new frame, so the result has to be kept; the stable mergesort
# preserves the learner order inside each dataset.
df = df.sort_values(by='dataset', kind='mergesort')
df_path = os.path.join(DIR_PATH, 'results' , DATASET+'.csv')
df.to_csv(df_path)
df

Unnamed: 0,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
0,SUSHI_DC,FATE_DC,0.292(21),0.414(18),0.559(78),0.647(45),0.730(35),0.808(11)
1,SUSHI_DC,FETA_DC_SHALLOW,0.292(7),0.401(23),0.507(26),0.602(40),0.687(32),0.769(35)
2,SUSHI_DC,FETA_DC_SHALLOW_ZERO,0.237(27),0.359(33),0.484(46),0.577(41),0.681(39),0.742(24)
3,SUSHI_DC,GENERALIZED_EXTREME_VALUE,0.218(62),0.366(71),0.502(18),0.608(23),0.685(22),0.754(34)
4,SUSHI_DC,MIXED_LOGIT_MODEL,0.262(7),0.387(8),0.465(14),0.566(13),0.624(10),0.724(14)
5,SUSHI_DC,MULTINOMIAL_LOGIT_MODEL,0.271(6),0.387(5),0.502(2),0.581(18),0.676(11),0.786(7)
6,SUSHI_DC,NESTED_LOGIT_MODEL,0.263(12),0.375(5),0.492(14),0.601(11),0.671(13),0.736(24)
7,SUSHI_DC,PAIRED_COMBINATORIAL_LOGIT,0.269(6),0.387(6),0.500(12),0.595(18),0.676(10),0.785(6)
8,SUSHI_DC,RANKNET_DC_SHALLOW,0.269(13),0.436(20),0.555(25),0.661(14),0.758(21),0.831(20)
9,SUSHI_DC,RANKSVM_DC,0.258(4),0.372(7),0.480(22),0.594(17),0.679(13),0.779(6)


In [12]:
import re
def remove_ranker(sub_df):
    """Pick the weaker of two competing rows.

    Parameters
    ----------
    sub_df : 2-D array
        Must support ``sub_df[:, 1:3]``. Column 1 holds the identifier that is
        returned, column 2 a score string such as ``"0.292"`` or ``"0.292(7)"``.

    Returns
    -------
    The column-1 entry of the row with the lower pessimistic score, or ``None`` when
    ``sub_df`` does not contain exactly two rows. On a tie the second row is returned.

    Notes
    -----
    The pessimistic score is the first number found in the score string minus ``1e-3``
    times the second number; with a single number it is that number itself.
    """
    if len(sub_df) != 2:
        return None

    def pessimistic_score(cell):
        # Extract every numeric token of the score string.
        numbers = [float(token) for token in re.findall(r"[-+]?\d*\.\d+|\d+", cell)]
        if len(numbers) == 1:
            return numbers[0]
        return numbers[0] - numbers[1]*1e-3

    pair = sub_df[:, 1:3]
    first_score, second_score = (pessimistic_score(pair[row][1]) for row in (0, 1))
    weaker_row = 0 if first_score < second_score else 1
    return pair[weaker_row][0]

In [52]:
def mark_best(df):
    """Prefix the highest score of every score column with ``bfseries``.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column is skipped (callers put the learner name there). Every other
        column must hold strings of the form ``"<mean>"`` or ``"<mean>(<std>)"``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: in every score column the entry with
        the largest mean becomes ``"bfseries <entry>"`` (first one wins on ties).

    Notes
    -----
    The means are compared as floats. Putting names and scores into one numpy array
    would coerce the scores to strings and compare them lexicographically.
    """
    for col in list(df.columns)[1:]:
        cells = list(df[col])
        if not cells:
            # Nothing to mark in an empty frame.
            continue
        means = np.array([float(cell.split('(')[0]) for cell in cells], dtype=float)
        best = int(np.argmax(means))
        cells[best] = "bfseries {}".format(cells[best])
        df[col] = cells
    return df

In [54]:
#from tabulate import tabulate
import string
# Display order of the learners in the LaTeX table. Learners that are not listed get a
# NaN rank and are therefore sorted to the end.
custom_dict = {"RANKSVM_DC":0, "RANKNET_DC":1, 'MULTINOMIAL_LOGIT_MODEL':2, 'NESTED_LOGIT_MODEL':3, 'GENERALIZED_EXTREME_VALUE':4, 
               'PAIRED_COMBINATORIAL_LOGIT':5, "MIXED_LOGIT_MODEL":6, "FATE_DC":7, "FETA_DC":8, "FETA_DC_ZERO":9}

# Grouping by the plain column name keeps `name` a string on every pandas version.
grouped = df.groupby('dataset')
for name, group in grouped:
    # When a family has exactly two variants, drop the weaker one (remove_ranker returns
    # None otherwise, which matches no learner).
    remove_rankers = [
        remove_ranker(group[group['learner'].str.contains(family)].values)
        for family in ("FETA", "FATE", "RANKNET")
    ]
    group = group[~group['learner'].isin(remove_rankers)]
    group = group.replace({'FETA_DC_SHALLOW_ZERO': "FETA_DC"})
    group = group.replace({'FATE_DC_SHALLOW': "FATE_DC"})
    group = group.replace({'RANKNET_DC_SHALLOW': "RANKNET_DC"})

    # Order the rows for the table; the helper columns are dropped afterwards.
    group = (
        group.assign(rank=group['learner'].map(custom_dict))
        .sort_values(by='rank')
        .drop(columns=['dataset', 'rank'])
    )
    group = mark_best(group)
    if len(group)==9:
        # Positional relabelling with the names that become LaTeX macros below.
        # NOTE(review): "pariwisesvm" looks like a typo of "pairwisesvm" — confirm against
        # the macro definitions of the target document before changing it.
        group['learner'] = ["pariwisesvm", "ranknetdc", "mnl", "nlm", "gev", "pcl", "mlm", "fate", "feta"]
    print("name {}".format(name))
    print(list(group.columns))
    latex_code = group.to_latex(index = False)
    # Strip the alignment padding, then restore a single space around column separators.
    latex_code = latex_code.replace(' ',"")
    latex_code = latex_code.replace('&'," & ")
    # Turn every learner label and the best-score marker into a LaTeX command.
    for learner in group['learner']:
        latex_code = latex_code.replace(learner, "\\{}".format(learner))
    latex_code = latex_code.replace("bfseries", "\\{} ".format("bfseries"))
    print(latex_code)
#df.T.to_latex()

name SUSHI_DC
['learner', 'categoricalaccuracy', 'categoricaltopk2', 'categoricaltopk3', 'categoricaltopk4', 'categoricaltopk5', 'categoricaltopk6']
\begin{tabular}{lllllll}
\toprule
learner & categoricalaccuracy & categoricaltopk2 & categoricaltopk3 & categoricaltopk4 & categoricaltopk5 & categoricaltopk6\\
\midrule
\pariwisesvm & 0.258(4) & 0.372(7) & 0.480(22) & 0.594(17) & 0.679(13) & 0.779(6)\\
\ranknetdc & 0.269(13) & \bfseries 0.436(20) & 0.555(25) & \bfseries 0.661(14) & \bfseries 0.758(21) & \bfseries 0.831(20)\\
\mnl & 0.271(6) & 0.387(5) & 0.502(2) & 0.581(18) & 0.676(11) & 0.786(7)\\
\nlm & 0.263(12) & 0.375(5) & 0.492(14) & 0.601(11) & 0.671(13) & 0.736(24)\\
\gev & 0.218(62) & 0.366(71) & 0.502(18) & 0.608(23) & 0.685(22) & 0.754(34)\\
\pcl & 0.269(6) & 0.387(6) & 0.500(12) & 0.595(18) & 0.676(10) & 0.785(6)\\
\mlm & 0.262(7) & 0.387(8) & 0.465(14) & 0.566(13) & 0.624(10) & 0.724(14)\\
\fate & \bfseries 0.292(21) & 0.414(18) & \bfseries 0.559(78) & 0.647(45) & 0.730(35) &

In [29]:
# Add the summary of the current dataset to the combined discrete_choice.csv, creating
# the file on first use.
df_path = os.path.join(DIR_PATH, 'results' , "discrete_choice.csv")

if os.path.isfile(df_path):
    previous = pd.read_csv(df_path, index_col=0)
    # pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
    dataFrame = pd.concat([previous, df], ignore_index=True)
else:
    dataFrame = df
dataFrame.to_csv(df_path)

In [30]:
# Write one CSV per dataset label. Grouping by the plain column name (not a one-element
# list) keeps `name` a string, so name.lower() works on every pandas version.
grouped = df.groupby('dataset')
for name, group in grouped:
    df_path = os.path.join(DIR_PATH, 'results' , name.lower()+'.csv')
    group.to_csv(df_path)

In [31]:
import numpy as np

# Scratch check: read n_objects / n_object_features off the trailing two axes of a
# dummy array of shape (4, 5, 2).
np.arange(48, 87)

X_train = np.arange(4 * 5 * 2).reshape((4, 5, 2))

learner_params = dict(zip(('n_objects', 'n_object_features'), X_train.shape[1:]))

In [57]:
"UNIQUE_MAX_OCCURRING".lower()

'unique_max_occurring'