In [7]:
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append('../')
from data.utils import setup_dataloaders, collect_pairs_choices, get_dataloader
from data.dataloaders import ConflictingDataset
from eval.utils import load_model
import torch
import pandas as pd

# Use the first GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on a machine without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

root = '../saves/reward-models-ultrafeedback'

# Which family of saved reward models to load: 'hht' or 'hh'.
# This replaces the previous approach of commenting one save_dict in and out.
DATASET = 'hht'

# Run names available for each dataset family. The 'hh' family has no
# 'btl-truthfulness' run.
RUN_NAMES = {
    'hht': ['btl-mixed', 'btl-honesty', 'btl-helpfulness', 'btl-truthfulness', 'nppl-mixed'],
    'hh': ['btl-mixed', 'btl-honesty', 'btl-helpfulness', 'nppl-mixed'],
}

# Maps a short run name to its save directory,
# e.g. 'btl-mixed' -> '<root>/hht-btl-mixed_0' when DATASET == 'hht'.
save_dict = {name: f'{root}/{DATASET}-{name}_0' for name in RUN_NAMES[DATASET]}

# Load every run (with load_it='best') together with its config, keyed by run name.
models = {}
cfgs = {}
for k, v in save_dict.items():
    model, cfg = load_model(v, load_it='best', device=device)
    models[k] = model
    cfgs[k] = cfg

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
../saves/reward-models-ultrafeedback/hht-btl-mixed_0
../saves/reward-models-ultrafeedback/hht-btl-honesty_0
../saves/reward-models-ultrafeedback/hht-btl-helpfulness_0
../saves/reward-models-ultrafeedback/hht-btl-truthfulness_0
../saves/reward-models-ultrafeedback/hht-nppl-mixed_0


  state_dict = torch.load(f'{save_dir}/model_{load_it}.pt', map_location=device)


In [2]:
from argparse import Namespace
from tqdm import tqdm

# Every model is evaluated on the test split built from the 'nppl-mixed' config.
cfg = cfgs['nppl-mixed']

test_data_loader = setup_dataloaders(cfg.data, splits=['test'])['test']

# Values passed as `num_context` to collect_pairs_choices, one evaluation pass each.
num_context_ls = [0, 1, 3, 5, 10]
num_targets = cfg.data.num_targets

# eval_dict[model_name][metric][num_context] is a list with one entry per test batch.
eval_dict = {
    model_name: {
        metric: {num_context: [] for num_context in num_context_ls}
        for metric in ['accuracy', 'unseen_accuracy', 'label']
    }
    for model_name in models
}

for model_name, model in models.items():
    # NOTE(review): load_model is not visible here — confirm it returns the model
    # in eval mode, since this loop does not switch modes itself.
    print('Evaluting model:', model_name)
    for batch in tqdm(test_data_loader):
        for num_context in num_context_ls:

            pairs_C, choices_C, pairs_T, choices_T = collect_pairs_choices(
                batch,
                num_context=num_context,
                min_num_context=cfg.data.min_num_context,
                max_num_context=cfg.data.max_num_context,
                num_targets=num_targets,
                context_datatype=cfg.data.context_datatype
            )

            pairs_T = pairs_T.to(device)
            choices_T = choices_T.to(device)
            pairs_C = pairs_C.to(device)
            choices_C = choices_C.to(device)

            with torch.no_grad():
                outputs = model(pairs_T, choices_T, pairs_C, choices_C)

            # Index of the highest-scoring choice, with a trailing axis so it
            # lines up with choices_T.
            predictions = outputs['logp_choices'].argmax(dim=-1).unsqueeze(-1)
            # Repeat the ground-truth choices along the leading axis of the predictions.
            choices = choices_T.unsqueeze(0).expand(predictions.shape[0], -1, -1, -1)
            # Average correctness over the leading axis only; one value per
            # (sample, target) remains.
            acc = (predictions == choices).float().mean(axis=0)

            # "Unseen" accuracy keeps only target positions num_context..num_targets-1
            # (presumably the first num_context targets coincide with the context
            # pairs — confirm against collect_pairs_choices).
            # Slicing `acc` gives the same values as slicing predictions and choices
            # before comparing, because the mean above is elementwise over axis 0.
            # It also follows the real batch size, so a short final batch no longer
            # indexes past the end as the cfg.data.batch_size-sized loop would.
            unseen_acc = acc[:, num_context:num_targets]

            # Keep results on the CPU so GPU memory is not held by accumulated
            # metrics and both accuracy lists live on the same device.
            eval_dict[model_name]['accuracy'][num_context].append(acc.cpu())
            eval_dict[model_name]['unseen_accuracy'][num_context].append(unseen_acc.cpu())
            eval_dict[model_name]['label'][num_context].append(batch['labels_T'])

Loading dataset from disk:   0%|          | 0/25 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/25 [00:00<?, ?it/s]

Evaluting model: btl-mixed


100%|██████████| 53/53 [00:08<00:00,  6.59it/s]


Evaluting model: btl-honesty


100%|██████████| 53/53 [00:08<00:00,  6.37it/s]


Evaluting model: btl-helpfulness


100%|██████████| 53/53 [00:08<00:00,  6.38it/s]


Evaluting model: btl-truthfulness


100%|██████████| 53/53 [00:08<00:00,  6.30it/s]


Evaluting model: nppl-mixed


100%|██████████| 53/53 [00:08<00:00,  6.06it/s]


In [3]:
# NOTE(review): this module-level value is not read by any visible cell
# (get_res_df below uses its own loop variable of the same name) — presumably a
# leftover; confirm nothing later in the notebook relies on it before removing.
num_context = 0

def get_res_df(eval_dict, acc_type="accuracy", context_sizes=None):
    """Flatten one model's per-batch accuracies into a long-format DataFrame.

    Args:
        eval_dict: Mapping with at least the keys ``acc_type`` and ``'label'``.
            ``eval_dict[acc_type][num_context]`` is a list of per-batch accuracy
            tensors whose first axis indexes samples and whose last axis has
            size 1; ``eval_dict['label'][num_context]`` is a list of per-batch
            label sequences.
        acc_type: Which accuracy entry of ``eval_dict`` to read.
        context_sizes: Context sizes to include, in output order. Defaults to
            the keys of ``eval_dict[acc_type]`` in insertion order, so the
            function does not depend on a notebook-level global.

    Returns:
        A DataFrame with columns ``acc`` (per-sample mean accuracy in percent),
        ``labels`` and ``num_context``. Empty when nothing could be assembled.
    """
    acc_dict = eval_dict[acc_type]
    labels_dict = eval_dict['label']
    if context_sizes is None:
        context_sizes = list(acc_dict)

    frames = []
    for num_context in context_sizes:
        batch_accs = acc_dict[num_context]
        if not batch_accs:
            # Nothing was recorded for this context size.
            continue

        # Concatenate along the sample axis instead of stacking, so a smaller
        # final batch is accepted; for equal-sized batches the sample order and
        # values match stacking followed by flattening.
        per_sample = torch.cat([a.detach().cpu() for a in batch_accs], dim=0)
        # Drop the trailing singleton axis, then average over targets.
        mean_acc = per_sample.squeeze(-1).mean(dim=-1).numpy()
        labels = [z for zs in labels_dict[num_context] for z in zs]

        if len(labels) != len(mean_acc):
            # Same best-effort behaviour as before: report the mismatching
            # combination and leave it out of the result.
            print(acc_type, num_context)
            continue

        frames.append(pd.DataFrame({
            'acc': mean_acc.flatten() * 100,
            'labels': labels,
            'num_context': num_context
        }))

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end rather than growing a frame inside the loop.
    return pd.concat(frames)

# Collect one long-format frame per model and concatenate once at the end.
model_frames = []
for model_name in models.keys():
    # Models whose name contains 'nppl' are reported on 'unseen_accuracy';
    # all other models are reported on 'accuracy'.
    acc_type = "unseen_accuracy" if 'nppl' in model_name else "accuracy"
    res_df_model = get_res_df(eval_dict[model_name], acc_type=acc_type).assign(model=model_name)
    model_frames.append(res_df_model)
res_df = pd.concat(model_frames)

# Mean and standard error of the per-sample accuracy for each
# (model, num_context, label) group.
summary_df = res_df.groupby(['model', 'num_context', 'labels'])['acc'].agg(['mean', 'sem'])
# Double-quoted outer f-string: reusing the same quote character inside the
# replacement fields is only valid from Python 3.12 onwards.
summary_df['acc'] = summary_df.apply(lambda x: f"{x['mean']:.1f} ± {x['sem']:.1f}", axis=1)
# Last expression of the cell: the wide table displayed as the cell output.
summary_df.reset_index().pivot(index='num_context', columns=['model', 'labels'], values='acc')

model,btl-helpfulness,btl-helpfulness,btl-helpfulness,btl-honesty,btl-honesty,btl-honesty,btl-mixed,btl-mixed,btl-mixed,btl-truthfulness,btl-truthfulness,btl-truthfulness,nppl-mixed,nppl-mixed,nppl-mixed
labels,helpfulness,honesty,truthfulness,helpfulness,honesty,truthfulness,helpfulness,honesty,truthfulness,helpfulness,honesty,truthfulness,helpfulness,honesty,truthfulness
num_context,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
0,77.6 ± 0.3,43.1 ± 0.3,30.3 ± 0.3,41.9 ± 0.3,61.1 ± 0.3,51.2 ± 0.3,50.6 ± 0.3,52.6 ± 0.3,54.2 ± 0.3,29.8 ± 0.3,53.0 ± 0.3,70.5 ± 0.3,70.4 ± 0.3,46.2 ± 0.3,33.8 ± 0.3
1,77.6 ± 0.3,43.1 ± 0.3,30.3 ± 0.3,41.9 ± 0.3,61.1 ± 0.3,51.2 ± 0.3,50.6 ± 0.3,52.6 ± 0.3,54.2 ± 0.3,29.8 ± 0.3,53.0 ± 0.3,70.5 ± 0.3,71.9 ± 0.3,47.5 ± 0.4,35.6 ± 0.3
3,77.6 ± 0.3,43.1 ± 0.3,30.3 ± 0.3,41.9 ± 0.3,61.1 ± 0.3,51.2 ± 0.3,50.6 ± 0.3,52.6 ± 0.3,54.2 ± 0.3,29.8 ± 0.3,53.0 ± 0.3,70.5 ± 0.3,73.2 ± 0.4,53.8 ± 0.4,53.2 ± 0.4
5,77.6 ± 0.3,43.1 ± 0.3,30.3 ± 0.3,41.9 ± 0.3,61.1 ± 0.3,51.2 ± 0.3,50.6 ± 0.3,52.6 ± 0.3,54.2 ± 0.3,29.8 ± 0.3,53.0 ± 0.3,70.5 ± 0.3,73.5 ± 0.4,57.5 ± 0.4,67.5 ± 0.4
10,77.6 ± 0.3,43.1 ± 0.3,30.3 ± 0.3,41.9 ± 0.3,61.1 ± 0.3,51.2 ± 0.3,50.6 ± 0.3,52.6 ± 0.3,54.2 ± 0.3,29.8 ± 0.3,53.0 ± 0.3,70.5 ± 0.3,71.5 ± 0.5,60.4 ± 0.5,72.3 ± 0.4
