Add circular eval (open-compass#610)
* refactor default, add circular summarizer

* add circular

* update impl

* update doc

* minor update

* no more to be added
Leymore authored and BunnyRunnerX committed Nov 23, 2023
1 parent 2e5c912 commit ddf1d8c
Showing 12 changed files with 919 additions and 144 deletions.
91 changes: 91 additions & 0 deletions configs/eval_circular.py
@@ -0,0 +1,91 @@
from mmengine.config import read_base
from opencompass.datasets.circular import (
    CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset,
    CircularCSQADataset, CircularARCDataset, CircularHSWAGDataset,
    CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

for ds, t in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for d in ds:
        d['type'] = t
        d['abbr'] = d['abbr'] + '-circular-4'
        d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        d['circular_patterns'] = 'circular'


# gather every *_datasets / *_model list imported above
datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c',
                 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])
new_summary_groups = []
for item in origin_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )
summarizer = dict(
    type=CircularSummarizer,
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)
113 changes: 113 additions & 0 deletions docs/en/advanced_guides/circular_eval.md
@@ -0,0 +1,113 @@
# CircularEval

## Background

For multiple-choice questions, an LLM that picks the correct option has not necessarily understood the question or reasoned its way to the answer; it may simply have guessed. To distinguish these cases, and to reduce an LLM's bias toward particular option positions, CircularEval can be used: a multiple-choice question is augmented by shuffling its options, and it is counted as correct under CircularEval only if the LLM answers every augmented variant correctly.

## Adding Your Own CircularEval Dataset

In general, to evaluate a dataset with CircularEval, both its loading and its evaluation logic need to be rewritten, which means changes to the OpenCompass main library as well as to the configuration files. We use C-Eval as an example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class being overloaded
    dataset_class = CEvalDataset

    # Splits of the loaded DatasetDict that need CircularEval. CEvalDataset
    # loads [dev, val, test]; only 'val' and 'test' need CircularEval, 'dev'
    # does not
    default_circular_splits = ['val', 'test']

    # The list of option keys to shuffle
    default_option_keys = ['A', 'B', 'C', 'D']

    # Use this when the value under 'answer_key' is one of ['A', 'B', 'C', 'D']
    # and denotes the correct answer; it tells the meta class how to update the
    # answer after the options are shuffled. Mutually exclusive with
    # default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the value under 'answer_key' is not one of ['A', 'B', 'C', 'D'], a
    # function can specify the correct answer after shuffling. Mutually
    # exclusive with default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # 'item' is the original data item
    #     # 'circular_pattern' is a tuple giving the option order after
    #     # shuffling, e.g. ('D', 'A', 'B', 'C') means the original option A
    #     # is now D, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```

`CircularCEvalDataset` accepts a `circular_pattern` parameter with two possible values (both expansions are illustrated in the sketch below):

- `circular`: a single cycle, the default. ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
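
To make the two expansion modes concrete, here is a minimal standalone sketch (an illustration of the behavior described above, not the actual OpenCompass implementation; `make_patterns` and `switch_answer` are names invented for this example) that generates both pattern sets and remaps an answer key the same way the `default_answer_key_switch_method` example does:

```python
from itertools import permutations

def make_patterns(option_keys=('A', 'B', 'C', 'D'), circular_pattern='circular'):
    """Generate the shuffled label orders for one question."""
    base = tuple(option_keys)
    if circular_pattern == 'circular':
        # rotations only: ABCD, BCDA, CDAB, DABC
        return [base[i:] + base[:i] for i in range(len(base))]
    # all_possible: every permutation of the labels
    return list(permutations(base))

def switch_answer(answer, pattern, option_keys='ABCD'):
    # pattern[i] is the label that the original option i carries after
    # shuffling, so the new answer is looked up by the old answer's position
    return pattern[option_keys.index(answer)]

print(len(make_patterns()))                                  # 4
print(len(make_patterns(circular_pattern='all_possible')))   # 24
print(switch_answer('A', ('D', 'A', 'B', 'C')))              # D
```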

Additionally, we provide a `CircularEvaluator` as a drop-in replacement for `AccEvaluator`. This evaluator also accepts `circular_pattern`, which should match the value used by the dataset. It produces the following metrics (a worked sketch follows the list):

- `acc_{origin|circular|all_possible}`: accuracy with each shuffled variant counted as an independent question.
- `perf_{origin|circular|all_possible}`: accuracy under the circular criterion: a question counts as correct only if every shuffled variant is answered correctly.
- `more_{num}_{origin|circular|all_possible}`: accuracy under a relaxed circular criterion: a question counts as correct if at least `num` of its shuffled variants are answered correctly.
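
As a worked illustration of how these metric families relate (a standalone sketch under the definitions above, not OpenCompass code; `circular_metrics`, `results`, and `thresholds` are invented names), the three differ only in how per-variant correctness is aggregated per original question:

```python
from collections import defaultdict

def circular_metrics(results, thresholds=(3,)):
    """results: (question_id, is_correct) pairs, one per shuffled variant."""
    by_question = defaultdict(list)
    for qid, correct in results:
        by_question[qid].append(correct)
    groups = list(by_question.values())
    n_variants = sum(len(g) for g in groups)
    metrics = {
        # each shuffled variant counted as an independent question
        'acc_circular': sum(map(sum, groups)) / n_variants,
        # every variant of a question must be correct
        'perf_circular': sum(map(all, groups)) / len(groups),
    }
    for num in thresholds:
        # at least `num` variants of a question must be correct
        metrics[f'more_{num}_circular'] = sum(sum(g) >= num for g in groups) / len(groups)
    return metrics

# two questions, 4 variants each: q1 gets 4/4 right, q2 gets 3/4 right
print(circular_metrics([('q1', True)] * 4 + [('q2', True)] * 3 + [('q2', False)]))
# {'acc_circular': 0.875, 'perf_circular': 0.5, 'more_3_circular': 1.0}
```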

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overload the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overload the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the above operations, each dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```

Additionally, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select specific metrics to view
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```
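
With these pieces in place, the run is launched like any other OpenCompass evaluation, e.g. `python run.py configs/eval_circular.py` from the repository root (assuming the standard `run.py` entry point).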

For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
1 change: 1 addition & 0 deletions docs/en/index.rst
@@ -67,6 +67,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md

.. _Tools:
.. toctree::
111 changes: 111 additions & 0 deletions docs/zh_cn/advanced_guides/circular_eval.md
@@ -0,0 +1,111 @@
# CircularEval

## Background

For multiple-choice questions, an LLM that gives the correct option has not necessarily understood the question and reasoned out the answer; it may simply have guessed. To separate these two cases, and to reduce the LLM's bias toward particular option positions, we can use CircularEval: a multiple-choice question is augmented by shuffling its options, and only if the LLM answers every augmented variant correctly do we count the question as correct under CircularEval.

## Adding Your Own CircularEval Dataset

In general, to evaluate a dataset with CircularEval, both its loading and its evaluation logic need to be rewritten; both the OpenCompass main library and the configuration files must be modified. We use C-Eval as an example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class being overloaded
    dataset_class = CEvalDataset

    # If the original load method returns a DatasetDict, these are the splits
    # that need CircularEval. CEvalDataset loads [dev, val, test]; only val
    # and test need CircularEval, dev does not
    default_circular_splits = ['val', 'test']

    # The list of option keys to shuffle
    default_option_keys = ['A', 'B', 'C', 'D']

    # Use this when the value under answer_key is one of ['A', 'B', 'C', 'D']
    # and denotes the correct answer; it tells the meta class how to update
    # the answer after the options are shuffled. Mutually exclusive with
    # default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the value under answer_key is not one of ['A', 'B', 'C', 'D'], a
    # function can specify the correct answer after shuffling. Mutually
    # exclusive with default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # item is the original data item
    #     # circular_pattern is a tuple giving the option order after
    #     # shuffling, e.g. ('D', 'A', 'B', 'C') means the original option A
    #     # becomes D, the original B becomes A, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```

`CircularCEvalDataset` accepts a `circular_pattern` parameter with two possible values:

- `circular`: a single cycle, the default. ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total

We also provide a `CircularEvaluator` to replace `AccEvaluator`. This evaluator likewise accepts `circular_pattern`, which should match the value used above. It produces the following metrics:

- `acc_{origin|circular|all_possible}`: accuracy with each shuffled variant counted as an independent question
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if every shuffled variant is answered correctly
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least num of its shuffled variants are answered correctly

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Overload the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Overload the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the above operations, each dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```

Additionally, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select which metrics to display
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```

For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
1 change: 1 addition & 0 deletions docs/zh_cn/index.rst
@@ -67,6 +67,7 @@ OpenCompass Getting Started Roadmap
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md

.. _工具:
.. toctree::
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -13,6 +13,7 @@
from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .clozeTest_maxmin import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403
37 changes: 12 additions & 25 deletions opencompass/datasets/arc.py
@@ -14,32 +14,19 @@ class ARCDataset(BaseDataset):
     def load(path: str):
         with open(path, 'r', errors='ignore') as in_f:
             rows = []
-            for i, line in enumerate(in_f):
-                sample = json.loads(line.strip())
-                answerKey = sample['answerKey']
-                sample = sample['question']
-                question = sample['stem']
-                choices = sample['choices']
-                if len(choices) != 4:
+            for line in in_f:
+                item = json.loads(line.strip())
+                question = item['question']
+                if len(question['choices']) != 4:
                     continue
-                textA = choices[0]['text']
-                textB = choices[1]['text']
-                textC = choices[2]['text']
-                textD = choices[3]['text']
+                labels = [c['label'] for c in question['choices']]
+                answerKey = 'ABCD'[labels.index(item['answerKey'])]
                 rows.append({
-                    'question': question,
+                    'question': question['stem'],
                     'answerKey': answerKey,
-                    'textA': textA,
-                    'textB': textB,
-                    'textC': textC,
-                    'textD': textD
+                    'textA': question['choices'][0]['text'],
+                    'textB': question['choices'][1]['text'],
+                    'textC': question['choices'][2]['text'],
+                    'textD': question['choices'][3]['text'],
                 })
-            dataset = Dataset.from_dict({
-                'question': [row['question'] for row in rows],
-                'answerKey': [row['answerKey'] for row in rows],
-                'textA': [row['textA'] for row in rows],
-                'textB': [row['textB'] for row in rows],
-                'textC': [row['textC'] for row in rows],
-                'textD': [row['textD'] for row in rows]
-            })
-            return dataset
+            return Dataset.from_list(rows)