In [1]:
import pandas as pd

In [2]:
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from torch.utils.data import DataLoader

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# exp004
import dataclasses
import torch
import pandas as pd
from sklearn.model_selection import KFold
from transformers import AutoModel, AutoTokenizer, AutoModelForMultipleChoice
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
from datasets import Dataset
from typing import Optional, Union

import transformers
import wandb
from datetime import datetime as dt
import os
import numpy as np
import tqdm

import logging 
from logging import Logger



@dataclasses.dataclass
class BertConfig:
    
    experiment_name: str
    dataset_dir: str
    
    debug: bool = False

    lr: float = 1e-5
    model_name: str = "OpenAssistant/reward-model-deberta-v3-large-v2"
    num_context: int = 3
    max_length: int = 512
    batch_size: int = 2
    epochs: int = 10
    iters_to_accumlate: int = 8
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    
    freeze_embeddings: bool = True
    freeze_layers: int = 18
    reinitialize_layers: int = 0
    
    assume_completely_retrieved: bool = False
    n_samples: int = None
    steps: int = 100
    
    lora_r: float = 2
    lora_alpha: float = 4
    lora_dropout: float = 0.1
    use_peft: bool = False

def get_logger(
    output_dir: str,
):
    """
    logger を作成する. formatter は "%Y-%m-%d %H:%M:%S" で作成する.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    
    # formatter
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
    
    # handler
    handler = logging.StreamHandler()
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    
    handler = logging.FileHandler(f"{output_dir}/log.txt", "w")
    handler.setLevel(logging.DEBUG)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    
    return logger


@dataclasses.dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


        
def preprocess_df(df, config):
    df["context"] = ""
    for i in range(config.num_context):
        df["context"] += df[f"searched_wiki_id_{i}"].astype(str) + "\n\n"
    
    for col in ["A", "B", "C", "D", "E"]:
        df[col] = df[col].fillna("")
    return df[["prompt", "context", "A", "B", "C", "D", "E", "answer"]]


def map_at_3(predictions, labels):
    map_sum = 0
    pred = np.argsort(-1*np.array(predictions),axis=1)[:,:3]
    for x,y in zip(pred,labels):
        z = [1/i if y==j else 0 for i,j in zip([1,2,3],x)]
        map_sum += np.sum(z)
    return map_sum / len(predictions)

def compute_metrics(p):
    predictions = p.predictions.tolist()
    labels = p.label_ids.tolist()
    return {"map@3": map_at_3(predictions, labels)}


In [4]:
def calc_map3(
    df_test,
    model_name,
    max_length,
    num_content,
):
    option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
    label = df_test["answer"].map(option_to_index).values
    logger = get_logger(output_dir="")
    logger.info("load data")
    
    config = BertConfig(
        debug=False,
        batch_size=1,
        experiment_name=f"",
        dataset_dir="",
        model_name=model_name,
        max_length=max_length,
        num_context=num_content,
    )
    df_test = preprocess_df(df_test, config)
    df_test = df_test[df_test["answer"].isin(["A", "B", "C", "D", "E"])]
    
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
    index_to_option = {v: k for k,v in option_to_index.items()}
    def preprocess(example):
        first_sentence = [ "[CLS] " + example['context'] ] * 5
        second_sentences = [" #### " + example['prompt'] + " [SEP] " + example[option] + " [SEP]" for option in 'ABCDE']
        tokenized_example = tokenizer(first_sentence, second_sentences, truncation="only_first", 
                                      max_length=config.max_length, add_special_tokens=False)
        tokenized_example['label'] = option_to_index[example['answer']]
        return tokenized_example

    test_dataset = Dataset.from_pandas(df_test)
    tokenized_test_dataset = test_dataset.map(preprocess, remove_columns=["prompt", "context", "A", "B", "C", "D", "E", "answer", "__index_level_0__"])

    data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
    test_dataloader = DataLoader(tokenized_test_dataset, batch_size=1, shuffle=False, collate_fn=data_collator)
    
    model = AutoModelForMultipleChoice.from_pretrained(config.model_name).cuda()
    
    test_predictions = []
    for batch in tqdm.tqdm(test_dataloader):
        with torch.autocast("cuda"):
            for k in batch.keys():
                batch[k] = batch[k].cuda()
            with torch.no_grad():
                outputs = model(**batch)
            test_predictions.append(outputs.logits.cpu().detach())

    test_predictions = torch.cat(test_predictions)
    test_predictions = test_predictions.numpy()
    
    print(map_at_3(test_predictions, label))
    return {
        "model_name": model_name,
        "max_length": max_length,
        "num_content": num_content,
        "map3": map_at_3(test_predictions, label),
        "predictions": test_predictions
    }

In [5]:
def make_df(indices, tta_id):
    df_ret = []
    for j, index in enumerate(indices):
        df = df_test.copy()
        df["tta_id"] = tta_id
        df["tta_no"] = j
        for i, idx in enumerate(index):
            df[f"searched_wiki_id_{i}"] = df[f"searched_wiki_id_{idx}"]
        df_ret.append(df)
    return pd.concat(df_ret).reset_index(drop=True)

In [6]:
logger = get_logger(output_dir="")
logger.info("load data")

2023-10-08 06:25:07,715 INFO load data


In [7]:
ret = []

In [8]:
# l90-s3 base
fname = "../output/context_pipeline/stage1/exp009.py/20231007080623_gte-base_wikiall_without_sep_targetprompt_and_choice_without_sep_token_length90_stride_sentence3_drop_categoryTrue_all/valid.feather"
model_name = "../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0"

df_test = pd.concat([
    pd.read_feather(fname),
])
df_test["data_id"] = np.arange(len(df_test))
df_tests = []

indices = np.concatenate([[[0]] * 6, [[1], [2], [3], [3], [2], [1]], np.arange(2, 32).reshape(6, 5)], axis=1)
# indices = np.concatenate([[[0]] * 4, np.arange(1, 21).reshape(4, 5)], axis=1)
indices = indices[:4]
print(indices)
df_test = make_df(indices=indices, tta_id="first_order")

ret.append(calc_map3(
    df_test,
    model_name=model_name,
    max_length=768,
    num_content=7,
))

2023-10-08 06:25:07,998 INFO load data
2023-10-08 06:25:07,998 INFO load data


[[ 0  1  2  3  4  5  6]
 [ 0  2  7  8  9 10 11]
 [ 0  3 12 13 14 15 16]
 [ 0  3 17 18 19 20 21]]


  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|                                                                                                                                                                                                 | 0/2000 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [09:33<00:00,  3.48it/s]

0.8840833333333324





In [9]:
# l90-s3 base
fname = "../output/context_pipeline/stage1/exp009.py/20231007080623_gte-base_wikiall_without_sep_targetprompt_and_choice_without_sep_token_length90_stride_sentence3_drop_categoryTrue_all/valid.feather"
model_name = "../output/stage2/exp005.py/20230926162839_new_data_all300val_maxlen256_bs2_microsoft/deberta-v3-large/fold0"

df_test = pd.concat([
    pd.read_feather(fname),
])
df_test["data_id"] = np.arange(len(df_test))
df_tests = []

indices = np.concatenate([[[0]] * 6, [[1], [2], [3], [3], [2], [1]], np.arange(2, 32).reshape(6, 5)], axis=1)
# indices = np.concatenate([[[0]] * 4, np.arange(1, 21).reshape(4, 5)], axis=1)
indices = indices[:4]
print(indices)
df_test = make_df(indices=indices, tta_id="first_order")

ret.append(calc_map3(
    df_test,
    model_name=model_name,
    max_length=768,
    num_content=7,
))

2023-10-08 06:35:06,249 INFO load data
2023-10-08 06:35:06,249 INFO load data
2023-10-08 06:35:06,249 INFO load data


[[ 0  1  2  3  4  5  6]
 [ 0  2  7  8  9 10 11]
 [ 0  3 12 13 14 15 16]
 [ 0  3 17 18 19 20 21]]


  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|                                                                                                                                                                                                 | 0/2000 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2000/2000 [09:15<00:00,  3.60it/s]

0.8748333333333325





In [10]:
np.corrcoef([
    ret[0]["predictions"].flatten(),
    ret[1]["predictions"].flatten(),
])

array([[1.        , 0.93817732],
       [0.93817732, 1.        ]])

In [11]:
df_tests2 = df_test.copy()

In [12]:
for i, col in enumerate("ABCDE"):
    # df_tests2[f"pred_{col}"] = ret[0]["predictions"][:, i][:2000] + ret[1]["predictions"][:, i][:2000] + ret[2]["predictions"][:, i] [:2000]
    df_tests2[f"pred_{col}"] = ret[0]["predictions"][:, i] + ret[1]["predictions"][:, i]
    df_tests2[f"pred_0_{col}"] = ret[0]["predictions"][:, i]
    df_tests2[f"pred_1_{col}"] = ret[1]["predictions"][:, i]

In [13]:
def f(x):
    if x < 300: 
        return "val" 
    else:
        return "test"
df_tests2["tta_id"] = df_tests2["data_id"].apply(f)

In [14]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2.groupby(["tta_id", "tta_no"]):
    ret_scores.append({
        "tta_id": group[0],
        "tta_no": group[1],
        "score": map_at_3(
            w_df[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]], 
            w_df["answer"].map(option_to_index)
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,tta_no,score
0,test,0,0.969167
1,test,1,0.974167
2,test,2,0.961667
3,test,3,0.9775
4,val,0,0.843333
5,val,1,0.82
6,val,2,0.823333
7,val,3,0.831111


In [15]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2.groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() + 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.981667
1,val,0.846111


In [16]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 1].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() + 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.975
1,val,0.839444


In [17]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
def agg_func(w_df, model_id):
    cols = [f"pred_{model_id}_{x}" for x in "ABCDE"]
    return (
        w_df.groupby("data_id")[cols].max() + 
        w_df.groupby("data_id")[cols].mean()
    ).values

for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    pred_0 = agg_func(w_df, model_id=0) 
    pred_1 = agg_func(w_df, model_id=1) * 0.1
    pred = np.stack([pred_0, pred_1])
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            pred.mean(axis=0) + pred.max(axis=0),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.98
1,val,0.857778


In [18]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
def agg_func(w_df, model_id):
    cols = [f"pred_{model_id}_{x}" for x in "ABCDE"]
    return (
        w_df.groupby("data_id")[cols].max() + 
        w_df.groupby("data_id")[cols].mean()
    ).values

for group, w_df in df_tests2[df_tests2["tta_no"] <= 2].groupby("tta_id"):
    pred_0 = agg_func(w_df, model_id=0) * 0
    pred_1 = agg_func(w_df, model_id=1) * 0
    pred_2 = agg_func(w_df, model_id=2) 
    pred = np.stack([pred_0, pred_1, pred_2])
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            pred.mean(axis=0) + pred.max(axis=0),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

In [22]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
def agg_func(w_df, model_id):
    cols = [f"pred_{model_id}_{x}" for x in "ABCDE"]
    return (
        w_df.groupby("data_id")[cols].max() + 
        w_df.groupby("data_id")[cols].mean()
    ).values

for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    pred_0 = agg_func(w_df, model_id=0) * 0.75
    pred_1 = agg_func(w_df, model_id=1) * 0.25
    
    pred = np.stack([pred_0, pred_1])
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            pred.mean(axis=0) + pred.max(axis=0),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.989167
1,val,0.825556


In [81]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df[w_df["tta_no"] == 0].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() + 
            w_df[w_df["tta_no"] == 1].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 0.25+ 
            w_df[w_df["tta_no"] == 2].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 0.25+ 
            w_df[w_df["tta_no"] == 3].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 0.25+
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2,
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.986667
1,val,0.837222


In [28]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 2].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() + 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.979167
1,val,0.865556


In [29]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2.groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2+ 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.984167
1,val,0.852778


In [30]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 4].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2 + 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.984167
1,val,0.852778


In [27]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2+ 
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,test,0.985
1,val,0.844444


In [29]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2.groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2/3 +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 1/3
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.889333


In [133]:
w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].nth(0)

Unnamed: 0_level_0,pred_A,pred_B,pred_C,pred_D,pred_E
data_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-8.921875,-8.734375,-2.863281,-8.453125,-8.718750
1,-7.160156,-4.304688,7.804688,-0.665039,-5.101562
2,-2.060547,-3.628906,-3.695312,6.296875,-4.785156
3,-7.398438,-7.671875,-4.269531,-8.875000,-0.409668
4,-3.525391,-3.458984,4.417969,-2.359375,2.728516
...,...,...,...,...,...
3895,-1.964844,-2.603516,4.457031,-5.929688,-4.792969
3896,-1.558594,-8.468750,-7.570312,-8.257812,-3.177734
3897,1.481445,0.339111,4.433594,-8.210938,-3.841797
3898,-3.007812,-7.242188,-5.953125,-5.281250,-2.789062


In [182]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 5].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() + w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean(), 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.884188
1,order,0.872607
2,skip_index,0.876154


In [193]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 5].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() 
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.884188
1,order,0.872607
2,skip_index,0.876154


In [192]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 5].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].min()
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.881624
1,order,0.865726
2,skip_index,0.869274


In [184]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 5].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max()*2/3 +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean()*1/3
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.883675
1,order,0.873419
2,skip_index,0.878248


In [188]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 5].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2/3 +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 1/10 + 
                w_df[w_df["tta_no"] <= 1].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 1/3
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.883974
1,order,0.877137
2,skip_index,0.878248


In [194]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() 
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.882393
1,order,0.873547
2,skip_index,0.876068


In [197]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2[df_tests2["tta_no"] <= 3].groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean()
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.882393
1,order,0.873547
2,skip_index,0.876068


In [134]:
df_tests2

Unnamed: 0.1,Unnamed: 0,prompt,A,B,C,D,E,answer,source,dataset,...,searched_wiki_id_48,searched_wiki_id_49,data_id,tta_id,tta_no,pred_A,pred_B,pred_C,pred_D,pred_E
0,72133,What is the primary occupation of Javon Mascel...,Javon Mascellus Bullard is a renowned scientis...,Javon Mascellus Bullard is an acclaimed chef k...,Javon Mascellus Bullard is a professional Amer...,Javon Mascellus Bullard is a highly successful...,Javon Mascellus Bullard is an accomplished pia...,C,1,valid,...,#J. Marshall Shepherd\nJames Marshall Shepherd...,#Jabari Smith Jr.\n Smith began playing basket...,0,order,0,-8.921875,-8.734375,-2.863281,-8.453125,-8.718750
1,72134,How was Karnavati University established?,Karnavati University was established by the In...,Karnavati University was established through a...,Karnavati University was established in 2017 b...,Karnavati University was established through a...,Karnavati University was established through a...,C,1,valid,...,#Krantiguru Shyamji Krishna Verma Kachchh Univ...,#Krantiguru Shyamji Krishna Verma Kachchh Univ...,1,order,0,-7.160156,-4.304688,7.804688,-0.665039,-5.101562
2,72135,When was VA-144 established as VA-116?,23 February 1959,29 January 1959,1 December 1959,1 December 1955,29 January 1971,D,1,valid,...,#VFA-115\n The squadron received their first K...,#VA-122 (U.S. Navy)\n December 1969: The squad...,2,order,0,-2.060547,-3.628906,-3.695312,6.296875,-4.785156
3,72136,What was Lev Andreevich Navrozov primarily kno...,Lev Andreevich Navrozov was primarily known fo...,Lev Andreevich Navrozov was primarily known fo...,Lev Andreevich Navrozov was primarily known fo...,Lev Andreevich Navrozov was primarily known fo...,Lev Andreevich Navrozov was primarily known fo...,E,1,valid,...,#Russia\n Élie Metchnikoff is known for his gr...,#Lev Danilkin\n By the time he departed from A...,3,order,0,-7.398438,-7.671875,-4.269531,-8.875000,-0.409668
4,72137,What were the titles and positions held by Mic...,Michel Le Tellier held the titles of seigneur ...,Michel Le Tellier held the positions of seigne...,Michel Le Tellier held the title of marquis de...,Michel Le Tellier held the title of marquis de...,Michel Le Tellier held the title of marquis de...,C,1,valid,...,"#Michel Le Tellier\n He died in Versailles, 15...",#Jean de Forcade de Biaix\n and president of t...,4,order,0,-3.525391,-3.458984,4.417969,-2.359375,2.728516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,76028,Which logical fallacy is used in the text?\nMs...,circular reasoning: an argument that supports ...,guilt by association: a negative association i...,straw man: a misrepresentation of an opponent'...,a bicycle that moved 55miles north in 10hours,Do toy cars go faster down the ramp made of wo...,C,additional_data/ScienceQA/test.parquet,valid,...,#Invincible ignorance fallacy\nThe invincible ...,"#Argument\n In logic, an argument is usually e...",3895,skip_index,7,0.246338,-0.680176,5.425781,-6.253906,-3.292969
3896,76029,How long is a potato?,16 centimeters,16 kilometers,16 millimeters,16 meters,interrogative,A,additional_data/ScienceQA/test.parquet,valid,...,#Potato (disambiguation)\nLook up potato in Wi...,"#Marah (plant)\n They are large, and spherical...",3896,skip_index,7,-0.039001,-8.171875,-6.531250,-7.667969,-1.585938
3897,76030,Complete the statement. Assume that the motorc...,stayed the same,decreased,increased,Both are caused by heating.,a motorboat that moved 115kilometers east in 5...,C,additional_data/ScienceQA/test.parquet,valid,...,#Mass–energy equivalence\n If its temperature ...,#Einstein's thought experiments\n {\displaysty...,3897,skip_index,7,2.757812,-0.155029,5.238281,-7.210938,-2.224609
3898,76031,Which type of sentence is this?\nBecause most ...,simple,compound-complex,complex,compound,the ways the organism behaves,C,additional_data/ScienceQA/test.parquet,valid,...,"#Woods Cree\n For example, the sentence ""the c...",#English compound\n The term compound verb was...,3898,skip_index,7,-1.751953,-6.832031,-6.144531,-5.039062,-1.845703


In [181]:
ret_scores = []
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
for group, w_df in df_tests2.groupby("tta_id"):
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            (
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max() * 2/3 +
                w_df.groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 1/10 + 
                w_df[w_df["tta_no"] <= 1].groupby("data_id")[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].mean() * 1/3
            ),
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,score
0,first_order,0.885513
1,order,0.876709
2,skip_index,0.878419


In [106]:

ret_scores = []
for th in [0, 1, 2, 3, 4, 5, 6]:
    predictions = []
    w_df = df_tests2[df_tests2["tta_id"] == "first_order"]
    for data_id, ww_df in tqdm.tqdm(w_df.groupby("data_id")):
        pred_final = ww_df.iloc[0][["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].values

        for i in range(len(ww_df)):
            pred = ww_df.iloc[i][["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].values
            if pred.max() > th:
                pred_final = pred
                continue
        predictions.append(pred_final)
    ret_scores.append({
        "tta_id": group,
        "th": th,
        "score": map_at_3(
            predictions, 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:14<00:00, 260.62it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:14<00:00, 264.87it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:14<00:00, 264.08it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:15<00:00, 259.76it/s]
100%|███████████████████████████████████████████████████████

Unnamed: 0,tta_id,th,score
0,skip_index,0,0.864444
1,skip_index,1,0.868077
2,skip_index,2,0.871838
3,skip_index,3,0.875043
4,skip_index,4,0.875342
5,skip_index,5,0.876538
6,skip_index,6,0.875726


In [108]:

ret_scores = []
for th in [0, 1, 2, 3, 4, 5, 6]:
    predictions = []
    w_df = df_tests2[df_tests2["tta_id"] == "first_order"]
    for data_id, ww_df in tqdm.tqdm(w_df.groupby("data_id")):
        pred_final = ww_df[["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].max().values

        for i in range(len(ww_df)):
            pred = ww_df.iloc[i][["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].values
            if pred.max() > th:
                pred_final = pred
                continue
        predictions.append(pred_final)
    ret_scores.append({
        "tta_id": group,
        "th": th,
        "score": map_at_3(
            predictions, 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:15<00:00, 249.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:15<00:00, 250.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:15<00:00, 249.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3900/3900 [00:15<00:00, 252.97it/s]
100%|███████████████████████████████████████████████████████

Unnamed: 0,tta_id,th,score
0,skip_index,0,0.863162
1,skip_index,1,0.867393
2,skip_index,2,0.871026
3,skip_index,3,0.875256
4,skip_index,4,0.877308
5,skip_index,5,0.879274
6,skip_index,6,0.880043


In [93]:
ww_df

Unnamed: 0.1,Unnamed: 0,prompt,A,B,C,D,E,answer,source,dataset,...,searched_wiki_id_48,searched_wiki_id_49,data_id,tta_id,tta_no,pred_A,pred_B,pred_C,pred_D,pred_E
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,0,-2.082031,-3.876953,3.318359,-2.525391,-2.984375
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,1,-0.141357,-1.822266,3.072266,-2.068359,-3.417969
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,2,-0.835449,-2.949219,3.462891,-1.401367,-2.455078
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,3,-0.525879,-5.128906,-0.990723,1.044922,-5.324219
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,4,-0.431885,-2.419922,3.884766,-1.599609,-2.185547
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,5,-1.055664,-1.759766,3.638672,-1.146484,-1.402344
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,6,-1.682617,-3.505859,3.482422,-2.96875,-3.251953
4199,76332,Which of the following statements is the most ...,The Toba supereruption caused a volcanic winte...,The Toba supereruption had no impact on the cl...,Ash from the Toba supereruption in Lake Malawi...,The Toba supereruption caused a significant co...,The Toba supereruption had no effect on any cl...,C,20230924083304_gpt5,valid,...,#Toba catastrophe theory\n Computational ash d...,#Impact winter\n If the asteroid hit an ocean ...,4199,skip_index,7,1.264648,-2.806641,-0.122498,-2.507812,-2.384766


In [None]:

ret_scores = []
predictions = []
th = 1
for group, w_df in df_tests2.groupby("tta_id"):
    for data_id, ww_df in tqdm.tqdm(w_df.groupby("data_id")):
        pred_final = ww_df.iloc[0][["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].values

        for i in range(len(ww_df)):
            pred = ww_df.iloc[i][["pred_A", "pred_B", "pred_C", "pred_D", "pred_E"]].values
            if pred.max() > th:
                pred_final = pred
                break
        predictions.append(pred_final)
    ret_scores.append({
        "tta_id": group,
        "score": map_at_3(
            predictions, 
            w_df.groupby("data_id")["answer"].head(1).map(option_to_index).values
        )
    })
pd.DataFrame(ret_scores)

Unnamed: 0,tta_id,tta_no,score
0,first_order,0,0.867738
1,first_order,1,0.855159
2,first_order,2,0.854643
3,first_order,3,0.855595
4,first_order,4,0.85
5,first_order,5,0.850476
6,first_order,6,0.848254
7,first_order,7,0.845198
8,order,0,0.867738
9,order,1,0.796032


In [11]:
ret.append(calc_map3(
    model_name="../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/",
    max_length=384,
    num_content=3,
))

2023-09-26 06:42:17,023 INFO load data
2023-09-26 06:42:17,023 INFO load data
2023-09-26 06:42:17,023 INFO load data


  0%|          | 0/4200 [00:00<?, ?ex/s]

  0%|                                                  | 0/4200 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|███████████████████████████████████████| 4200/4200 [03:39<00:00, 19.14it/s]


0.8781349206349229


In [12]:
ret.append(calc_map3(
    model_name="../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/",
    max_length=512,
    num_content=4,
))

2023-09-26 06:46:27,481 INFO load data
2023-09-26 06:46:27,481 INFO load data
2023-09-26 06:46:27,481 INFO load data
2023-09-26 06:46:27,481 INFO load data


  0%|          | 0/4200 [00:00<?, ?ex/s]

  0%|                                                  | 0/4200 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|███████████████████████████████████████| 4200/4200 [04:36<00:00, 15.21it/s]


0.8803174603174629


In [13]:
ret.append(calc_map3(
    model_name="../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/",
    max_length=768,
    num_content=7,
))

2023-09-26 06:51:37,665 INFO load data
2023-09-26 06:51:37,665 INFO load data
2023-09-26 06:51:37,665 INFO load data
2023-09-26 06:51:37,665 INFO load data
2023-09-26 06:51:37,665 INFO load data


  0%|          | 0/4200 [00:00<?, ?ex/s]

  0%|                                                  | 0/4200 [00:00<?, ?it/s]You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|███████████████████████████████████████| 4200/4200 [09:32<00:00,  7.33it/s]


0.8750793650793676


In [15]:
df = pd.DataFrame(ret)

In [16]:
df

Unnamed: 0,model_name,max_length,num_content,map3,predictions
0,../output/stage2/exp005.py/20230925235801_new_...,256,2,0.871944,"[[-11.45, -9.97, -6.613, -8.914, -10.195], [-8..."
1,../output/stage2/exp005.py/20230925235801_new_...,384,3,0.878135,"[[-11.26, -10.39, -5.52, -8.484, -9.77], [-10...."
2,../output/stage2/exp005.py/20230925235801_new_...,512,4,0.880317,"[[-10.94, -10.12, -4.906, -7.586, -9.38], [-10..."
3,../output/stage2/exp005.py/20230925235801_new_...,768,7,0.875079,"[[-10.52, -9.93, -3.178, -6.89, -8.91], [-9.05..."


In [19]:
df_test = pd.read_parquet("../output/context_pipeline/stage1/exp009.py/20230925104920_gte-base_wikiall_without_sep_targetprompt_and_choice_without_sep_token_length120_stride_sentence4_drop_categoryTrue_all/valid.parquet")
option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
label = df_test["answer"].map(option_to_index).values


In [20]:
ret2 = []
for i in range(len(df)):
    series = df.iloc[i]
    df_test_ = df_test.copy()
    df_test_["pred"] = [x for x in series["predictions"]]
    
    for source, w_df in df_test_.groupby("source"):
        print(source)
        option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
        label = w_df["answer"].map(option_to_index).values
        test_predictions = np.stack(w_df["pred"].values)
        ret2.append({
            "model_name": series["model_name"],
            "max_length": series["max_length"],
            "num_content": series["num_content"],
            "source": source,
            "map3": map_at_3(test_predictions, label)
        })

1
11
2
20230924083304_gpt5
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
20230924083304_gpt5
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
20230924083304_gpt5
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
20230924083304_gpt5
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet


In [21]:
df_ret2 = pd.DataFrame(ret2)

In [22]:
df_ret2.sort_values(["model_name", "source", "max_length"])

Unnamed: 0,model_name,max_length,num_content,source,map3
0,../output/stage2/exp005.py/20230925235801_new_...,256,2,1,0.868889
14,../output/stage2/exp005.py/20230925235801_new_...,384,3,1,0.875556
28,../output/stage2/exp005.py/20230925235801_new_...,512,4,1,0.873889
42,../output/stage2/exp005.py/20230925235801_new_...,768,7,1,0.872222
1,../output/stage2/exp005.py/20230925235801_new_...,256,2,11,0.941667
15,../output/stage2/exp005.py/20230925235801_new_...,384,3,11,0.942778
29,../output/stage2/exp005.py/20230925235801_new_...,512,4,11,0.951111
43,../output/stage2/exp005.py/20230925235801_new_...,768,7,11,0.952778
2,../output/stage2/exp005.py/20230925235801_new_...,256,2,2,0.892778
16,../output/stage2/exp005.py/20230925235801_new_...,384,3,2,0.903889


In [23]:
df.to_pickle("20230926_ret.pickle")
df_ret2.to_pickle("20230926_ret2.pickle")

In [24]:
df_ret2.pivot_table(columns=["model_name", "source"], index="max_length", values="map3")

model_name,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/
source,1,11,2,20230924083304_gpt5,3,4,5,6,7,8,9,additional_data/MMLU/test.csv,additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv,additional_data/ScienceQA/test.parquet
max_length,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2
256,0.868889,0.941667,0.892778,0.789444,0.86,0.880556,0.936667,0.883333,0.88,0.836667,0.795556,0.741667,0.954444,0.945556
384,0.875556,0.942778,0.903889,0.798333,0.861111,0.887222,0.932222,0.895556,0.884444,0.846667,0.815,0.745556,0.961111,0.944444
512,0.873889,0.951111,0.906111,0.787778,0.852778,0.901667,0.938333,0.889444,0.897222,0.848333,0.828889,0.742778,0.96,0.946111
768,0.872222,0.952778,0.904444,0.789444,0.840556,0.882778,0.918889,0.876111,0.885,0.848889,0.827222,0.755,0.951111,0.946667


In [25]:
df_ret2.pivot_table(index=["model_name", "max_length"], columns="source", values="map3")

Unnamed: 0_level_0,source,1,11,2,20230924083304_gpt5,3,4,5,6,7,8,9,additional_data/MMLU/test.csv,additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv,additional_data/ScienceQA/test.parquet
model_name,max_length,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,256,0.868889,0.941667,0.892778,0.789444,0.86,0.880556,0.936667,0.883333,0.88,0.836667,0.795556,0.741667,0.954444,0.945556
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,384,0.875556,0.942778,0.903889,0.798333,0.861111,0.887222,0.932222,0.895556,0.884444,0.846667,0.815,0.745556,0.961111,0.944444
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,512,0.873889,0.951111,0.906111,0.787778,0.852778,0.901667,0.938333,0.889444,0.897222,0.848333,0.828889,0.742778,0.96,0.946111
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,768,0.872222,0.952778,0.904444,0.789444,0.840556,0.882778,0.918889,0.876111,0.885,0.848889,0.827222,0.755,0.951111,0.946667


In [26]:
df_ret2.pivot_table(index=["model_name", "max_length"], columns="source", values="map3")

Unnamed: 0_level_0,source,1,11,2,20230924083304_gpt5,3,4,5,6,7,8,9,additional_data/MMLU/test.csv,additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv,additional_data/ScienceQA/test.parquet
model_name,max_length,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,256,0.868889,0.941667,0.892778,0.789444,0.86,0.880556,0.936667,0.883333,0.88,0.836667,0.795556,0.741667,0.954444,0.945556
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,384,0.875556,0.942778,0.903889,0.798333,0.861111,0.887222,0.932222,0.895556,0.884444,0.846667,0.815,0.745556,0.961111,0.944444
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,512,0.873889,0.951111,0.906111,0.787778,0.852778,0.901667,0.938333,0.889444,0.897222,0.848333,0.828889,0.742778,0.96,0.946111
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/,768,0.872222,0.952778,0.904444,0.789444,0.840556,0.882778,0.918889,0.876111,0.885,0.848889,0.827222,0.755,0.951111,0.946667


In [27]:
df_ret2[df_ret2["source"].isin(["1", "2", "3", "4", "5", "6", "7", "8"])].groupby(["model_name", "max_length"])["map3"].mean()

model_name                                                                                        max_length
../output/stage2/exp005.py/20230925235801_new_data_all300val_maxlen256_4epochs/checkpoint-13000/  256           0.879861
                                                                                                  384           0.885833
                                                                                                  512           0.888472
                                                                                                  768           0.878611
Name: map3, dtype: float64

In [55]:
df_ret2[df_ret2["source"].isin(["1", "2", "3", "4", "5", "6", "7", "8"])].groupby(["model_name", "max_length"])["map3"].mean()

model_name                                                                     max_length
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/  256           0.859722
                                                                               384           0.864931
                                                                               512           0.862153
                                                                               768           0.863750
../output/stage2/exp005.py/20230924021914_new_data_all300val_maxlen384/fold0/  384           0.867014
                                                                               512           0.865417
                                                                               768           0.863264
../output/stage2/exp005.py/20230924084819_new_data_all300val_maxlen512/fold0/  512           0.867222
                                                                               768           0

In [56]:
df_ret2.groupby(["model_name", "max_length"])["map3"].mean()

model_name                                                                     max_length
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/  256           0.867009
                                                                               384           0.869444
                                                                               512           0.867863
                                                                               768           0.869103
../output/stage2/exp005.py/20230924021914_new_data_all300val_maxlen384/fold0/  384           0.871111
                                                                               512           0.869573
                                                                               768           0.869060
../output/stage2/exp005.py/20230924084819_new_data_all300val_maxlen512/fold0/  512           0.872692
                                                                               768           0

In [72]:
df_test[df_test["A"].apply(len) < 30]["source"].value_counts()

11                                                                                 273
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv    261
9                                                                                  215
additional_data/ScienceQA/test.parquet                                             184
additional_data/MMLU/test.csv                                                      180
8                                                                                  143
7                                                                                  132
4                                                                                  100
3                                                                                   91
2                                                                                   69
5                                                                                   67
1                                          

In [69]:
ret3 = []
for i in range(len(df)):
    series = df.iloc[i]
    df_test_ = df_test.copy()
    df_test_["pred"] = [x for x in series["predictions"]]
    df_test_ = df_test_[df_test_["A"].apply(len) < 20]
    
    for source, w_df in df_test_.groupby("source"):
        print(source)
        option_to_index = {option: idx for idx, option in enumerate('ABCDE')}
        label = w_df["answer"].map(option_to_index).values
        test_predictions = np.stack(w_df["pred"].values)
        ret3.append({
            "model_name": series["model_name"],
            "max_length": series["max_length"],
            "num_content": series["num_content"],
            "source": source,
            "map3": map_at_3(test_predictions, label)
        })

1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/ScienceQA/test.parquet
1
11
2
3
4
5
6
7
8
9
additional_data/MMLU/test.csv
additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv
additional_data/Sci

In [70]:
df_ret3 = pd.DataFrame(ret3)

In [71]:
df_ret3.pivot_table(index=["model_name", "max_length"], columns="source", values="map3")

Unnamed: 0_level_0,source,1,11,2,3,4,5,6,7,8,9,additional_data/MMLU/test.csv,additional_data/OpenBookQA-V1-Sep2018/OpenBookQA-V1-Sep2018/Data/Main/train.tsv,additional_data/ScienceQA/test.parquet
model_name,max_length,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/,256,0.913194,0.959302,0.833333,0.813853,0.897287,0.859649,0.825758,0.792517,0.831349,0.853682,0.681306,0.938272,0.936559
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/,384,0.916667,0.95801,0.884615,0.831169,0.918605,0.856725,0.82197,0.82483,0.821429,0.843992,0.689189,0.932099,0.930108
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/,512,0.90625,0.958656,0.852564,0.82684,0.916667,0.812865,0.80303,0.836735,0.819444,0.854651,0.693694,0.93287,0.932258
../output/stage2/exp005.py/20230923195407_new_data_all300val_maxlen256/fold0/,768,0.909722,0.965116,0.86859,0.84632,0.922481,0.809942,0.814394,0.857143,0.839286,0.835271,0.685811,0.940586,0.935484
../output/stage2/exp005.py/20230924021914_new_data_all300val_maxlen384/fold0/,384,0.909722,0.955426,0.846154,0.813853,0.918605,0.865497,0.814394,0.82483,0.84127,0.84593,0.706081,0.928241,0.933333
../output/stage2/exp005.py/20230924021914_new_data_all300val_maxlen384/fold0/,512,0.899306,0.955426,0.865385,0.820346,0.926357,0.845029,0.814394,0.831633,0.84127,0.832364,0.706081,0.933642,0.933333
../output/stage2/exp005.py/20230924021914_new_data_all300val_maxlen384/fold0/,768,0.920139,0.965116,0.875,0.820346,0.926357,0.833333,0.787879,0.853741,0.849206,0.827519,0.688063,0.9375,0.937634
../output/stage2/exp005.py/20230924084819_new_data_all300val_maxlen512/fold0/,512,0.913194,0.966408,0.858974,0.82684,0.905039,0.839181,0.810606,0.846939,0.849206,0.843023,0.726351,0.934414,0.937634
../output/stage2/exp005.py/20230924084819_new_data_all300val_maxlen512/fold0/,768,0.909722,0.967054,0.865385,0.865801,0.906977,0.812865,0.799242,0.816327,0.847222,0.828488,0.712838,0.936728,0.941935
