## TabDPT Evaluation Notebook 

This notebook computes confidence intervals with `rliable`, as in the paper appendix.

In [None]:
import numpy as np
import pandas as pd
from rliable import library as rly
from rliable import metrics

In [None]:
# Load the TabDPT result CSVs, one per fold. At least 2 folds are required:
# rliable needs several runs to compute confidence intervals.
# Each frame is tagged with the index of the fold it came from.
tabdpt_df0 = pd.read_csv('../results_fold0.csv').assign(fold=0)
tabdpt_df1 = pd.read_csv('../results_fold1.csv').assign(fold=1)

# Stack the folds row-wise and label every row with the algorithm name.
tabdpt_df = pd.concat([tabdpt_df0, tabdpt_df1])
tabdpt_df['alg_name'] = 'TabDPT'

# Several tables with the same structure but a different alg_name can be
# joined here; with a single algorithm the table is used as-is.
df = tabdpt_df

# Suite indicator columns:
#   cc18 -> 1 when auc is not NaN, 0 otherwise
#   ctr  -> 1 when r2 is not NaN, 0 otherwise
df['cc18'] = (~df['auc'].isna()).astype(int)
df['ctr'] = (~df['r2'].isna()).astype(int)

In [None]:
# Pivot table construction
def get_scores_ci(metric, suite, data, reps=20000):
    """Aggregate one metric per algorithm and bootstrap its confidence interval.

    Parameters
    ----------
    metric : str
        Name of the column of ``data`` whose values are aggregated.
    suite : str
        ``'cc18'`` keeps the rows whose ``cc18`` column equals 1; any other
        value keeps the rows whose ``ctr`` column equals 1.
    data : pandas.DataFrame
        Results table. The columns ``alg_name``, ``fold``, ``name``, ``cc18``,
        ``ctr`` and the one named by ``metric`` are read.
    reps : int, optional
        Number of bootstrap replications forwarded to
        ``rly.get_interval_estimates``. Defaults to 20000.

    Returns
    -------
    tuple of (dict, dict)
        ``(aggregate_scores, aggregate_score_cis)``, both keyed by algorithm
        name, holding the point estimates and interval estimates produced by
        ``rly.get_interval_estimates`` for the selected aggregate function.
    """
    # Anything other than 'cc18' selects the 'ctr' rows.
    suite_column = 'cc18' if suite == 'cc18' else 'ctr'
    data_suite = data[data[suite_column] == 1]

    algorithm_metric_dict = {}
    for alg_name, group in data_suite.groupby('alg_name'):
        # Pivot table: rows are folds, columns are datasets (`name`),
        # values are the scores of the requested `metric`.
        pivot_table = group.pivot(index='fold', columns='name', values=metric)
        scores = pivot_table.values
        # Missing entries are replaced with the mean of their own row (fold),
        # computed over the non-NaN entries of that row.
        row_means = np.nanmean(scores, axis=1, keepdims=True)
        scores = np.where(np.isnan(scores), row_means, scores)
        algorithm_metric_dict[alg_name] = scores
    algorithms = list(algorithm_metric_dict.keys())

    def aggregate_func(x):
        # choose one of the following aggregate functions
        return np.array([
            # metrics.aggregate_median(x),
            metrics.aggregate_iqm(x),
            # metrics.aggregate_mean(x),
            # metrics.aggregate_optimality_gap(x)
        ])

    aggregate_scores, aggregate_score_cis = rly.get_interval_estimates(
        algorithm_metric_dict, aggregate_func, reps=reps
    )

    # Re-key both results in the order the algorithms were collected above.
    aggregate_scores = {alg: aggregate_scores[alg] for alg in algorithms}
    aggregate_score_cis = {alg: aggregate_score_cis[alg] for alg in algorithms}
    return aggregate_scores, aggregate_score_cis

In [None]:
# Direction of improvement per metric: True when a larger value is better,
# False when a smaller value is better (the *_rank variants).
best_is_higher = {
    **dict.fromkeys(('auc', 'acc', 'corr', 'r2'), True),
    **dict.fromkeys(('auc_rank', 'acc_rank', 'corr_rank', 'r2_rank'), False),
}

# Each metric is evaluated on the suite it belongs to.
metric_suite_pairs = [
    ('auc', 'cc18'),
    ('acc', 'cc18'),
    ('corr', 'ctr'),
    ('r2', 'ctr'),
]
# add _rank to the metrics
# you can uncomment this to compute ranks instead of raw scores but you need more than one alg_name
# metric_suite_pairs = [(m + "_rank", s) for m, s in metric_suite_pairs]

# Results are keyed by (metric, suite), then by algorithm name.
all_scores = {}
all_cis = {}
algorithms = set()

for metric, suite in metric_suite_pairs:
    scores, ci = get_scores_ci(metric, suite, df)
    # Collapse the array results into a plain float / tuple per algorithm.
    scores = {alg: float(estimate.squeeze()) for alg, estimate in scores.items()}
    ci = {alg: tuple(bounds.squeeze()) for alg, bounds in ci.items()}
    all_scores[(metric, suite)] = scores
    all_cis[(metric, suite)] = ci
    # Iterating a dict yields its keys, i.e. the algorithm names.
    algorithms.update(scores)

# Deterministic, alphabetically ordered list of every algorithm seen.
algorithms = sorted(algorithms)

In [None]:
# Display the aggregate point estimates: {(metric, suite): {alg_name: score}}
all_scores

In [None]:
# Display the confidence intervals: {(metric, suite): {alg_name: tuple of interval bounds}}
all_cis