In [1]:
import inspect
import logging
import os

import pandas as pd

from csrank.util import setup_logging
from experiments.util import lp_metric_dict
import numpy as np
from experiments.dbconnection import DBConnector

Using TensorFlow backend.


In [12]:
DIR_PATH = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
log_path = os.path.join(DIR_PATH, 'logs', 'results.log')
setup_logging(log_path=log_path)
logger = logging.getLogger('ResultParsing')
config_file_path = os.path.join(DIR_PATH, 'config', 'clusterdb.json')
datasets = ['synthetic_dc', 'mnist_dc', 'tag_genome_dc']
DATASET = datasets[0]
learning_problem = "discrete_choice"
results_table = 'results.{}'.format(learning_problem)
schema = 'masterthesis'
select_jobs = "SELECT learner_params, dataset_params, {0}.job_id, dataset, learner, {3} from {0} INNER JOIN {1} ON {0}.job_id = {1}.job_id where {1}.dataset=\'{2}\'"

In [13]:
self = DBConnector(config_file_path=config_file_path, is_gpu=False, schema=schema)

In [14]:
self.init_connection()
sel = "SELECT job_id FROM {} WHERE learner='feta_dc' and learner_params ->> 'add_zeroth_order_model' = 'TRUE' and fold_id=0"
self.cursor_db.execute(sel.format("masterthesis.avail_jobs"))
jobs = np.array(self.cursor_db.fetchall()).T[0]
jobs

array([118, 122, 126, 130, 138, 114, 103, 105, 104, 106, 110, 102, 134])

In [15]:
keys = list(lp_metric_dict[learning_problem].keys())
keys[-1] = keys[-1].format(6)
metrics = ', '.join([x for x in keys])
metrics

'CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6'

In [16]:
self.init_connection()
avail_jobs = "{}.avail_jobs".format(self.schema)
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
self.cursor_db.execute(select_st)
data = []
for job in self.cursor_db.fetchall():
    job = dict(job)
    if job['learner_params'].get("add_zeroth_order_model", False):
        job['learner'] = job['learner']+'_zero'
    job['dataset'] = job['dataset_params']['dataset_type'].upper()
    job['learner'] = job['learner'].upper()
    values = list(job.values())
    keys = list(job.keys())
    columns = keys[2:]
    vals = values[2:]
    data.append(vals)

SELECT learner_params, dataset_params, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN masterthesis.avail_jobs ON results.discrete_choice.job_id = masterthesis.avail_jobs.job_id where masterthesis.avail_jobs.dataset='synthetic_dc'


In [17]:
self.init_connection()
avail_jobs = "{}.avail_jobs".format("pymc3")
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
self.cursor_db.execute(select_st)
for job in self.cursor_db.fetchall():
    job = dict(job)
    job['dataset'] = job['dataset_params']['dataset_type'].upper()
    job['learner'] = job['learner'].upper()
    values = list(job.values())
    keys = list(job.keys())
    columns = keys[2:]
    vals = values[2:]
    data.append(vals)
df_full = pd.DataFrame(data, columns=columns)

SELECT learner_params, dataset_params, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN pymc3.avail_jobs ON results.discrete_choice.job_id = pymc3.avail_jobs.job_id where pymc3.avail_jobs.dataset='synthetic_dc'


In [18]:
df_full = df_full.sort_values('dataset')
#df_full['zeroonerankaccuracy'] = 1 - df_full['zeroonerankloss']
df_full.loc[df_full['learner'] == 'FETA_DC']
#df_full.head()

Unnamed: 0,job_id,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
36,30,HYPERVOLUME,FETA_DC,0.7778,0.8847,0.9363,0.9627,0.9792,0.9895
38,32,HYPERVOLUME,FETA_DC,0.7759,0.8843,0.9341,0.9625,0.9787,0.9891
39,6,HYPERVOLUME,FETA_DC,0.7817,0.8833,0.9355,0.9624,0.9795,0.9906
53,29,HYPERVOLUME,FETA_DC,0.7303,0.8394,0.9203,0.954,0.9786,0.9917
26,31,HYPERVOLUME,FETA_DC,0.779,0.884,0.9389,0.9651,0.9814,0.9922
30,25,MEDOID,FETA_DC,0.8282,0.9641,0.9921,0.9983,0.9996,1.0
29,28,MEDOID,FETA_DC,0.8482,0.9722,0.9945,0.9992,0.9999,1.0
8,27,MEDOID,FETA_DC,0.848,0.9726,0.9949,0.9989,0.9999,1.0
7,26,MEDOID,FETA_DC,0.8485,0.9719,0.9946,0.9989,0.9998,0.9999
69,5,MEDOID,FETA_DC,0.8555,0.9745,0.9951,,0.9998,1.0


In [19]:
del df_full["job_id"]
grouped = df_full.groupby(['dataset', 'learner'])
data = []
for name, group in grouped:
    one_row = [name[0], str(name[1]).upper()]
    #latex_row = ["$ {}".format(name[0]), "$ {}".format(str(name[1]).upper())]
    std = group.std(axis=0).values
    mean = group.mean(axis=0).values
    if np.all(np.isnan(std)):
        one_row.extend(["{:.4f}".format(m) for m in mean])
        #latex_row.extend(["${:.3f}$".format(m) for m in mean]) 
    else:
        std = [s*1e3 for s in std]
        one_row.extend(["{:.3f}({:.0f})".format(m, s) for m, s in zip(mean, std)])
        #latex_row.extend(["$ {:.3f} \pm {:.3f} ".format(m, s) for m, s in zip(mean, std)])
    data.append(one_row)

In [20]:
df = pd.DataFrame(data, columns=columns[1:])
df.sort_values(by='dataset')
df_path = os.path.join(DIR_PATH, 'results' , DATASET+'.csv')
df.to_csv(df_path)
df

Unnamed: 0,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
0,HYPERVOLUME,FATE_DC,0.730(18),0.855(19),0.920(13),0.949(9),0.968(6),0.980(3)
1,HYPERVOLUME,FETA_DC,0.769(22),0.875(20),0.933(7),0.961(4),0.979(1),0.991(1)
2,HYPERVOLUME,FETA_DC_ZERO,0.766(18),0.874(15),0.932(5),0.960(2),0.978(1),0.990(2)
3,HYPERVOLUME,GENERALIZED_EXTREME_VALUE,0.366(245),0.460(287),0.552(287),0.630(256),0.708(211),0.785(156)
4,HYPERVOLUME,MIXED_LOGIT_MODEL,0.189(14),0.338(17),0.451(19),0.542(20),0.621(14),0.692(10)
5,HYPERVOLUME,MULTINOMIAL_LOGIT_MODEL,0.201(8),0.267(10),0.360(10),0.456(8),0.559(4),0.664(4)
6,HYPERVOLUME,NESTED_LOGIT_MODEL,0.291(3),0.416(5),0.511(7),0.582(6),0.651(6),0.722(4)
7,HYPERVOLUME,PAIRED_COMBINATORIAL_LOGIT,0.185(1),0.248(1),0.340(2),0.440(2),0.550(2),0.668(2)
8,HYPERVOLUME,RANKNET_DC,0.203(4),0.276(6),0.369(6),0.462(5),0.562(4),0.665(7)
9,HYPERVOLUME,RANKSVM_DC,0.186(1),0.248(1),0.340(2),0.439(2),0.550(2),0.667(2)


In [21]:
from tabulate import tabulate
grouped = df.groupby(['dataset'])
for name, group in grouped:
    custom_dict = {"RANKSVM_DC":0, "RANKNET_DC":1, 'MULTINOMIAL_LOGIT_MODEL':2, 'NESTED_LOGIT_MODEL':3, 'GENERALIZED_EXTREME_VALUE':4, 
                   'PAIRED_COMBINATORIAL_LOGIT':5, "MIXED_LOGIT_MODEL":6, "FATE_DC":7, "FETA_DC":8, "FETA_DC_ZERO":9}
    group['rank'] = group['learner'].map(custom_dict)
    group.sort_values(by='rank', inplace=True)
    del group["dataset"]
    del group['rank']
    
    sub_df = np.array(group.loc[group['learner'].isin(["FETA_DC", "FETA_DC_ZERO"])])[:,0:2]
    val1 = float(sub_df[0][1].split("(")[0]) - float(sub_df[0][1][sub_df[1][1].find("(")+1:sub_df[0][1].find(")")])*1e-3
    val2 = float(sub_df[1][1].split("(")[0]) - float(sub_df[1][1][sub_df[1][1].find("(")+1:sub_df[1][1].find(")")])*1e-3
    print(val1, val2)
    if val1 < val2 :
        remove_ranker = sub_df[0][0]
    else:
        remove_ranker = sub_df[1][0]
    print(remove_ranker)
    print(sub_df)
    group = group[group['learner'] != remove_ranker]
    group['learner'] = ["pariwisesvm", "ranknetdc", "mnl", "nlm", "gev", "pcl", "mlm", "fate", "feta"]
    print("name {}".format(name))
    print(list(group.columns))
    latex_code = group.to_latex(index = False)
    latex_code = latex_code.replace(' ',"")
    latex_code = latex_code.replace('&'," & ")
    print(latex_code)
#df.T.to_latex()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.747 0.748
FETA_DC
[['FETA_DC' '0.769(22)']
 ['FETA_DC_ZERO' '0.766(18)']]
name HYPERVOLUME
['learner', 'categoricalaccuracy', 'categoricaltopk2', 'categoricaltopk3', 'categoricaltopk4', 'categoricaltopk5', 'categoricaltopk6']
\begin{tabular}{lllllll}
\toprule
learner & categoricalaccuracy & categoricaltopk2 & categoricaltopk3 & categoricaltopk4 & categoricaltopk5 & categoricaltopk6\\
\midrule
pariwisesvm & 0.186(1) & 0.248(1) & 0.340(2) & 0.439(2) & 0.550(2) & 0.667(2)\\
ranknetdc & 0.203(4) & 0.276(6) & 0.369(6) & 0.462(5) & 0.562(4) & 0.665(7)\\
mnl & 0.201(8) & 0.267(10) & 0.360(10) & 0.456(8) & 0.559(4) & 0.664(4)\\
nlm & 0.291(3) & 0.416(5) & 0.511(7) & 0.582(6) & 0.651(6) & 0.722(4)\\
gev & 0.366(245) & 0.460(287) & 0.552(287) & 0.630(256) & 0.708(211) & 0.785(156)\\
pcl & 0.185(1) & 0.248(1) & 0.340(2) & 0.440(2) & 0.550(2) & 0.668(2)\\
mlm & 0.189(14) & 0.338(17) & 0.451(19) & 0.542(20) & 0.621(14) & 0.692(10)\\
fate & 0.730(18) & 0.855(19) & 0.920(13) & 0.949(9) & 0.968(6) &

In [17]:
df_path = os.path.join(DIR_PATH, 'results' , "discrete_choice.csv")

if not os.path.isfile(df_path):
    dataFrame = df
else:
    dataFrame = pd.read_csv(df_path, index_col=0)
    dataFrame = dataFrame.append(df, ignore_index=True)
dataFrame
dataFrame.to_csv(df_path)

In [30]:
grouped = df.groupby(['dataset'])
for name, group in grouped:
    df_path = os.path.join(DIR_PATH, 'results' , name.lower()+'.csv')
    group.to_csv(df_path)

In [31]:
import numpy as np
np.arange(48,87)

X_train = np.arange(40).reshape(4,5,2)

learner_params = {}
learner_params['n_objects'], learner_params['n_object_features'] = X_train.shape[1:]

In [57]:
"UNIQUE_MAX_OCCURRING".lower()

'unique_max_occurring'