In [30]:
import inspect
import logging
import os

import pandas as pd

from csrank.util import setup_logging
from experiments.util import lp_metric_dict
import numpy as np
from experiments.dbconnection import DBConnector

In [31]:
# --- Paths and logging -------------------------------------------------------
# NOTE(review): inside a notebook there is no real source file behind the
# current frame; presumably this resolves to the kernel's working directory.
# Confirm before relying on it from another launch location.
_current_file = inspect.getfile(inspect.currentframe())
DIR_PATH = os.path.dirname(os.path.abspath(_current_file))
log_path = os.path.join(DIR_PATH, 'logs', 'results.log')
setup_logging(log_path=log_path)
logger = logging.getLogger('Result Parsing')
config_file_path = os.path.join(DIR_PATH, 'config', 'clusterdb.json')

# --- Experiment selection ----------------------------------------------------
datasets = ['synthetic_dc', 'mnist_dc', 'tag_genome_dc']
DATASET = datasets[2]
learning_problem = "discrete_choice"
results_table = 'results.{}'.format(learning_problem)
schema = 'masterthesis'

# Query template. Placeholders: {0} results table, {1} avail_jobs table,
# {2} dataset name, {3} comma-separated metric columns.
select_jobs = (
    "SELECT learner_params, dataset_params, {0}.job_id, dataset, learner, {3} "
    "from {0} INNER JOIN {1} ON {0}.job_id = {1}.job_id "
    "where {1}.dataset='{2}'"
)

In [32]:
# The connector is bound to the module-level name `self`; the following cells
# use it through `self.init_connection()`, `self.cursor_db` and `self.schema`.
self = DBConnector(
    config_file_path=config_file_path,
    is_gpu=False,
    schema=schema,
)

In [33]:
# Look up the job ids of the fold-0 `feta_dc` runs whose learner parameters
# have `add_zeroth_order_model` set to 'TRUE'.
# NOTE(review): init_connection() presumably (re)opens the connection and
# cursor used below - confirm against DBConnector.
self.init_connection()
sel = "SELECT job_id FROM {} WHERE learner='feta_dc' and learner_params ->> 'add_zeroth_order_model' = 'TRUE' and fold_id=0"
zeroth_order_query = sel.format("masterthesis.avail_jobs")
self.cursor_db.execute(zeroth_order_query)
fetched_rows = self.cursor_db.fetchall()
# Transpose so that the first (and only selected) column becomes row 0.
jobs = np.array(fetched_rows).T[0]
jobs

array([118, 122, 126, 130, 138, 114, 103, 105, 104, 106, 110, 102, 134])

In [34]:
# Build the comma-separated list of metric columns to select for this
# learning problem.
metric_lookup = lp_metric_dict[learning_problem]
keys = list(metric_lookup.keys())
# The last metric name is a format template; fill its placeholder with 6.
top_k_template = keys.pop()
keys.append(top_k_template.format(6))
metrics = ', '.join(keys)
metrics

'CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6'

In [35]:
# Fetch the result rows for DATASET joined against <schema>.avail_jobs and
# collect them as plain value lists in `data`.
self.init_connection()
avail_jobs = "{}.avail_jobs".format(self.schema)
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
self.cursor_db.execute(select_st)
data = []
for row in self.cursor_db.fetchall():
    # NOTE(review): dict(row) assumes the cursor yields mapping-like rows
    # keyed by column name - confirm the cursor type in DBConnector.
    record = dict(row)
    # Runs with the zeroth-order model enabled are reported as a separate learner.
    if record['learner_params'].get("add_zeroth_order_model", False):
        record['learner'] += '_zero'
    # Report the dataset variant (dataset_type) instead of the dataset name.
    record['dataset'] = record['dataset_params']['dataset_type'].upper()
    record['learner'] = record['learner'].upper()
    # Skip the first two entries; per the SELECT order these are
    # learner_params and dataset_params.
    columns = list(record.keys())[2:]
    data.append(list(record.values())[2:])

SELECT learner_params, dataset_params, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN masterthesis.avail_jobs ON results.discrete_choice.job_id = masterthesis.avail_jobs.job_id where masterthesis.avail_jobs.dataset='tag_genome_dc'


In [36]:
# Same query as for the main schema, but joined against pymc3.avail_jobs;
# the rows are appended to the `data` list created by the previous cell.
self.init_connection()
avail_jobs = "{}.avail_jobs".format("pymc3")
select_st = select_jobs.format(results_table, avail_jobs, DATASET, metrics)
print(select_st)
self.cursor_db.execute(select_st)
for row in self.cursor_db.fetchall():
    record = dict(row)
    # Report the dataset variant (dataset_type) instead of the dataset name.
    record['dataset'] = record['dataset_params']['dataset_type'].upper()
    record['learner'] = record['learner'].upper()
    # Skip the first two entries; per the SELECT order these are
    # learner_params and dataset_params.
    columns = list(record.keys())[2:]
    data.append(list(record.values())[2:])
# `columns` is only (re)bound inside the loop, so if this query returns no
# rows it still holds whatever an earlier cell left in it.
df_full = pd.DataFrame(data, columns=columns)

SELECT learner_params, dataset_params, results.discrete_choice.job_id, dataset, learner, CategoricalAccuracy, CategoricalTopK2, CategoricalTopK3, CategoricalTopK4, CategoricalTopK5, CategoricalTopK6 from results.discrete_choice INNER JOIN pymc3.avail_jobs ON results.discrete_choice.job_id = pymc3.avail_jobs.job_id where pymc3.avail_jobs.dataset='tag_genome_dc'


In [37]:
# Order the rows by dataset variant, then display only the plain FETA_DC runs.
df_full = df_full.sort_values(by='dataset')
is_feta_dc = df_full['learner'] == 'FETA_DC'
df_full.loc[is_feta_dc]

Unnamed: 0,job_id,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
120,225,CRITIQUE_FIT_LESS,FETA_DC,0.1735,0.2895,0.3927,0.4914,0.5864,0.6769
122,298,CRITIQUE_FIT_LESS,FETA_DC,0.1845,0.2972,0.4,0.4948,0.588,0.6799
124,302,CRITIQUE_FIT_LESS,FETA_DC,0.1661,0.2785,0.3811,0.479,0.5744,0.667
125,303,CRITIQUE_FIT_LESS,FETA_DC,0.1618,0.2739,0.3799,0.4797,0.5764,0.6714
126,304,CRITIQUE_FIT_LESS,FETA_DC,0.1643,0.2744,0.377,0.4763,0.5707,0.6631
142,226,CRITIQUE_FIT_MORE,FETA_DC,0.2215,0.3512,0.4638,0.5662,0.6592,0.7443
107,286,CRITIQUE_FIT_MORE,FETA_DC,0.2804,0.4184,0.5285,0.6207,0.7038,0.7784
100,274,CRITIQUE_FIT_MORE,FETA_DC,0.2405,0.3698,0.4807,0.5774,0.6689,0.75
101,279,CRITIQUE_FIT_MORE,FETA_DC,0.2656,0.4043,0.5152,0.6093,0.6949,0.7718
105,283,CRITIQUE_FIT_MORE,FETA_DC,0.2483,0.3782,0.4889,0.5888,0.6756,0.7559


In [38]:
# Aggregate the per-job metrics into one "mean+-std" row per (dataset, learner).
#
# `job_id` is dropped on a copy instead of with `del df_full["job_id"]`, so the
# cell can be re-run without raising KeyError on the second execution.
metrics_per_job = df_full.drop(columns=['job_id'])
grouped = metrics_per_job.groupby(['dataset', 'learner'])
data = []
for name, group in grouped:
    one_row = [name[0], str(name[1]).upper()]
    # Reduce over the metric columns only. Each group still carries the string
    # columns `dataset` and `learner`; leaving them in relies on pandas
    # silently skipping non-numeric columns, which newer pandas versions
    # reject with a TypeError.
    metric_values = group.drop(columns=['dataset', 'learner'])
    std = metric_values.std(axis=0).values
    mean = metric_values.mean(axis=0).values
    if np.all(np.isnan(std)):
        # Single-run groups have no defined standard deviation: report the mean only.
        one_row.extend(["{:.4f}".format(m) for m in mean])
    else:
        one_row.extend(["{:.3f}+-{:.3f}".format(m, s) for m, s in zip(mean, std)])
    data.append(one_row)

In [39]:
# Build the summary table; columns[1:] skips the leading `job_id` entry of
# `columns`, which the aggregated rows in `data` no longer contain.
df = pd.DataFrame(data, columns=columns[1:])
# sort_values returns a new frame, so the result has to be assigned; a stable
# sort keeps the existing row order within each dataset.
df = df.sort_values(by='dataset', kind='mergesort')
df_path = os.path.join(DIR_PATH, 'results', DATASET+'.csv')
df.to_csv(df_path)
df

Unnamed: 0,dataset,learner,categoricalaccuracy,categoricaltopk2,categoricaltopk3,categoricaltopk4,categoricaltopk5,categoricaltopk6
0,CRITIQUE_FIT_LESS,FATE_DC,0.226+-0.002,0.353+-0.003,0.462+-0.004,0.560+-0.004,0.652+-0.004,0.734+-0.004
1,CRITIQUE_FIT_LESS,FETA_DC,0.170+-0.009,0.283+-0.010,0.386+-0.010,0.484+-0.008,0.579+-0.008,0.672+-0.007
2,CRITIQUE_FIT_LESS,FETA_DC_ZERO,0.175+-0.020,0.294+-0.013,0.404+-0.013,0.508+-0.010,0.600+-0.006,0.694+-0.007
3,CRITIQUE_FIT_LESS,GENERALIZED_EXTREME_VALUE,0.153+-0.010,0.264+-0.012,0.369+-0.013,0.472+-0.013,0.572+-0.011,0.668+-0.008
4,CRITIQUE_FIT_LESS,MULTINOMIAL_LOGIT_MODEL,0.161+-0.001,0.272+-0.002,0.377+-0.002,0.478+-0.002,0.576+-0.002,0.671+-0.001
5,CRITIQUE_FIT_LESS,NESTED_LOGIT_MODEL,0.158+-0.003,0.271+-0.007,0.377+-0.006,0.480+-0.007,0.578+-0.007,0.674+-0.006
6,CRITIQUE_FIT_LESS,PAIRED_COMBINATORIAL_LOGIT,0.147+-0.004,0.257+-0.003,0.362+-0.002,0.464+-0.002,0.565+-0.001,0.664+-0.003
7,CRITIQUE_FIT_LESS,RANKNET_DC,0.173+-0.002,0.306+-0.002,0.420+-0.002,0.522+-0.002,0.616+-0.003,0.704+-0.002
8,CRITIQUE_FIT_LESS,RANKSVM_DC,0.120+-0.018,0.237+-0.020,0.350+-0.021,0.458+-0.020,0.561+-0.018,0.659+-0.015
9,CRITIQUE_FIT_MORE,FATE_DC,0.355+-0.005,0.511+-0.005,0.619+-0.004,0.705+-0.004,0.778+-0.003,0.839+-0.003


In [20]:
# Add the current summary to the combined discrete-choice results file.
# Note: every run of this cell against an existing file appends the rows of
# `df` again, so re-running it duplicates them.
df_path = os.path.join(DIR_PATH, 'results', "discrete_choice.csv")

if not os.path.isfile(df_path):
    dataFrame = df
else:
    dataFrame = pd.read_csv(df_path, index_col=0)
    # DataFrame.append was removed from pandas; pd.concat is the equivalent.
    dataFrame = pd.concat([dataFrame, df], ignore_index=True)
dataFrame.to_csv(df_path)
# Last expression of the cell, so the combined table is actually displayed.
dataFrame

In [30]:
# Write one CSV per dataset variant, named after the lower-cased variant.
# Group by the scalar key rather than a one-element list: with a list, newer
# pandas versions yield 1-tuples as group names and `name.lower()` fails.
grouped = df.groupby('dataset')
for name, group in grouped:
    df_path = os.path.join(DIR_PATH, 'results', name.lower()+'.csv')
    group.to_csv(df_path)

In [31]:
# Scratch check: derive the learner's shape parameters from a dummy input of
# shape (4, 5, 2); the two trailing axes are read as (n_objects, n_object_features).
# numpy is already imported in the first cell, so it is not re-imported here.
X_train = np.arange(40).reshape(4, 5, 2)

n_objects, n_object_features = X_train.shape[1:]
learner_params = {
    'n_objects': n_objects,
    'n_object_features': n_object_features,
}

In [57]:
# Scratch check: display the lower-cased form of this upper-case identifier.
"UNIQUE_MAX_OCCURRING".lower()

'unique_max_occurring'