forked from open-compass/opencompass
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add circular eval (open-compass#610)
* refactor default, add circular summarizer * add circular * update impl * update doc * minor update * no more to be added
- Loading branch information
1 parent
2e5c912
commit ddf1d8c
Showing
12 changed files
with
919 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# OpenCompass config: run CircularEval over several multiple-choice benchmarks
# and summarize both origin accuracy and circular (all-permutations-correct) accuracy.
from mmengine.config import read_base
from opencompass.datasets.circular import (CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset, CircularCSQADataset,
                                           CircularARCDataset, CircularHSWAGDataset, CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

# Convert every dataset config to its circular variant: swap in the circular
# dataset loader, tag the abbr so results don't collide with the plain run,
# and replace the evaluator with CircularEvaluator in 'circular' mode
# (4 rotations of the ABCD options).
for dataset_group, circular_cls in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for cfg in dataset_group:
        cfg['type'] = circular_cls
        cfg['abbr'] = cfg['abbr'] + '-circular-4'
        cfg['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        cfg['circular_patterns'] = 'circular'

# Gather everything the read_base() imports brought into module scope.
datasets = sum([group for name, group in locals().items() if name.endswith("_datasets") or name == 'datasets'], [])
models = sum([cfgs for name, cfgs in locals().items() if name.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([groups for name, groups in locals().items() if name.endswith("_summary_groups")], [])
# Rewrite each summary group to point at the renamed (-circular-4) dataset abbrs.
new_summary_groups = [
    {
        'name': group['name'] + '-circular-4',
        'subsets': [subset + '-circular-4' for subset in group['subsets']],
    }
    for group in origin_summary_groups
]
summarizer = dict(
    type=CircularSummarizer,
    # Show per-question accuracy plus the strict all-rotations-correct metric.
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs = [
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
# CircularEval | ||
|
||
## Background | ||
|
||
For multiple-choice questions, when a Language Model (LLM) provides the correct option, it does not necessarily imply a true understanding and reasoning of the question. It could be a guess. To differentiate these scenarios and reduce LLM bias towards options, CircularEval can be utilized. A multiple-choice question is augmented by shuffling its options, and if the LLM correctly answers all variations of the augmented question, it is considered correct under CircularEval.
|
||
## Adding Your Own CircularEval Dataset | ||
|
||
Generally, to evaluate a dataset using CircularEval, both its loading and evaluation methods need to be rewritten. Modifications are required in both the OpenCompass main library and configuration files. We will use C-Eval as an example for explanation. | ||
|
||
OpenCompass main library: | ||
|
||
```python | ||
from opencompass.datasets.ceval import CEvalDataset | ||
from opencompass.datasets.circular import CircularDatasetMeta | ||
|
||
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta): | ||
# The overloaded dataset class | ||
dataset_class = CEvalDataset | ||
|
||
# Splits of the DatasetDict that need CircularEval. For CEvalDataset, which loads [dev, val, test], we only need 'val' and 'test' for CircularEval, not 'dev' | ||
default_circular_splits = ['val', 'test'] | ||
|
||
# List of keys to be shuffled | ||
default_option_keys = ['A', 'B', 'C', 'D'] | ||
|
||
# If the content of 'answer_key' is one of ['A', 'B', 'C', 'D'], representing the correct answer. This field indicates how to update the correct answer after shuffling options. Choose either this or default_answer_key_switch_method | ||
default_answer_key = 'answer' | ||
|
||
# If 'answer_key' content is not one of ['A', 'B', 'C', 'D'], a function can be used to specify the correct answer after shuffling options. Choose either this or default_answer_key | ||
# def default_answer_key_switch_method(item, circular_pattern): | ||
# # 'item' is the original data item | ||
# # 'circular_pattern' is a tuple indicating the order after shuffling options, e.g., ('D', 'A', 'B', 'C') means the original option A is now D, and so on | ||
# item['answer'] = circular_pattern['ABCD'.index(item['answer'])] | ||
# return item | ||
``` | ||
|
||
`CircularCEvalDataset` accepts the `circular_pattern` parameter with two values: | ||
|
||
- `circular`: Indicates a single cycle. It is the default value. ABCD is expanded to ABCD, BCDA, CDAB, DABC, a total of 4 variations. | ||
- `all_possible`: Indicates all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., a total of 24 variations. | ||
|
||
Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. This Evaluator also accepts `circular_pattern`, and it should be consistent with the above. It produces the following metrics: | ||
|
||
- `acc_{origin|circular|all_possible}`: Treating each question with shuffled options as separate, calculating accuracy. | ||
- `perf_{origin|circular|all_possible}`: Following Circular logic, a question is considered correct only if all its variations with shuffled options are answered correctly, calculating accuracy. | ||
- `more_{num}_{origin|circular|all_possible}`: According to Circular logic, a question is deemed correct if the number of its variations answered correctly is greater than or equal to num, calculating accuracy. | ||
|
||
OpenCompass configuration file: | ||
|
||
```python | ||
from mmengine.config import read_base | ||
from opencompass.datasets.circular import CircularCEvalDataset | ||
|
||
with read_base(): | ||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets | ||
|
||
for d in ceval_datasets: | ||
# Overloading the load method | ||
d['type'] = CircularCEvalDataset | ||
# Renaming for differentiation from non-circular evaluation versions | ||
d['abbr'] = d['abbr'] + '-circular-4' | ||
# Overloading the evaluation method | ||
d['eval_cfg']['evaluator'] = {'type': CircularEvaluator} | ||
|
||
# The dataset after the above operations looks like this: | ||
# dict( | ||
# type=CircularCEvalDataset, | ||
# path='./data/ceval/formal_ceval', # Unchanged | ||
# name='computer_network', # Unchanged | ||
# abbr='ceval-computer_network-circular-4', | ||
# reader_cfg=dict(...), # Unchanged | ||
# infer_cfg=dict(...), # Unchanged | ||
# eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...), | ||
# ) | ||
``` | ||
|
||
Additionally, for better presentation of results in CircularEval, consider using the following summarizer: | ||
|
||
```python | ||
|
||
|
||
from mmengine.config import read_base | ||
from opencompass.summarizers import CircularSummarizer | ||
|
||
with read_base(): | ||
    from ...summarizers.groups.ceval import ceval_summary_groups
|
||
new_summary_groups = [] | ||
for item in ceval_summary_groups: | ||
new_summary_groups.append( | ||
{ | ||
'name': item['name'] + '-circular-4', | ||
'subsets': [i + '-circular-4' for i in item['subsets']], | ||
} | ||
) | ||
|
||
summarizer = dict( | ||
type=CircularSummarizer, | ||
# Select specific metrics to view | ||
metric_types=['acc_origin', 'perf_circular'], | ||
dataset_abbrs = [ | ||
'ceval-circular-4', | ||
'ceval-humanities-circular-4', | ||
'ceval-stem-circular-4', | ||
'ceval-social-science-circular-4', | ||
'ceval-other-circular-4', | ||
], | ||
summary_groups=new_summary_groups, | ||
) | ||
``` | ||
|
||
For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# 循环评测 | ||
|
||
## 背景 | ||
|
||
对于选择题而言,当 LLM 给出正确的选项,并不一定代表着它能真正地理解题意并经过推理得出答案,它也有可能是蒙对的。为了将这两种情形区分开,同时也为了降低 LLM 对选项的偏见,我们可以尝试使用循环评测 (CircularEval)。我们会将一道选择题按照打乱选项的方式进行增广,若 LLM 可以在增广后的每道题上均得到正确的答案,那么我们认为在循环评测的意义下,这道题被做对了。 | ||
|
||
## 新增自己的循环评测数据集 | ||
|
||
一般来说,为了将一个数据集使用循环评测的方式进行评测,它的加载方式和评测方式是需要被重写的,OpenCompass 主库和配置文件均需要进行修改。后续我们以 C-Eval 为例进行讲解。 | ||
|
||
OpenCompass 主库: | ||
|
||
```python | ||
from opencompass.datasets.ceval import CEvalDataset | ||
from opencompass.datasets.circular import CircularDatasetMeta | ||
|
||
class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta): | ||
# 被重载的数据集类 | ||
dataset_class = CEvalDataset | ||
|
||
# 若原 load 方法得到一 DatasetDict,其哪些 split 需要被循环评测。CEvalDataset load 得到 [dev, val, test],我们只需要对 val 和 test 进行循环评测,dev 不需要 | ||
default_circular_splits = ['val', 'test'] | ||
|
||
# 需要被打乱的 key 列表 | ||
default_option_keys = ['A', 'B', 'C', 'D'] | ||
|
||
# 若 answer_key 的内容属于是 ['A', 'B', 'C', 'D'] 之一,并表示正确答案。该字段表示打乱选项后,需要如何更新正确答案。与 default_answer_key_switch_method 二选一 | ||
default_answer_key = 'answer' | ||
|
||
# 如果 answer_key 的内容不属于 ['A', 'B', 'C', 'D'] 之一,那么可以使用函数的方式来指定打乱选项后的正确答案。与 default_answer_key 二选一 | ||
# def default_answer_key_switch_method(item, circular_pattern): | ||
# # item 是原本的数据项 | ||
# # circular_pattern 是一个 tuple,表示打乱选项后的顺序,例如 ('D', 'A', 'B', 'C') 表示原来的 A 选项变成了 D,原来的 B 选项变成了 A,以此类推 | ||
# item['answer'] = circular_pattern['ABCD'.index(item['answer'])] | ||
# return item | ||
``` | ||
|
||
`CircularCEvalDataset` 会接受 `circular_pattern` 参数,它有两个取值: | ||
|
||
- `circular`: 表示单项循环。默认为该值。ABCD 会被扩充为 ABCD, BCDA, CDAB, DABC, 共 4 种 | ||
- `all_possible`: 表示全排列。ABCD 会被扩充为 ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 共 24 种 | ||
|
||
另外我们提供了一个 `CircularEvaluator` 用于替换 `AccEvaluator`,该 Evaluator 同样接受 `circular_pattern`,该参数应与上述保持一致。它会产出以下指标: | ||
|
||
- `acc_{origin|circular|all_possible}`: 将打乱后选项顺序后的题目视作多道单独的题目,计算准确率 | ||
- `perf_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目都回答正确,才会视为这道题正确,计算准确率 | ||
- `more_{num}_{origin|circular|all_possible}`: 按照 circular 的逻辑,若选项打乱后的题目回答正确的数量大于等于 num,就会视为这道题正确,计算准确率 | ||
|
||
OpenCompass 配置文件: | ||
|
||
```python | ||
from mmengine.config import read_base | ||
from opencompass.datasets.circular import CircularCEvalDataset | ||
|
||
with read_base(): | ||
from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets | ||
|
||
for d in ceval_datasets: | ||
# 重载 load 方法 | ||
d['type'] = CircularCEvalDataset | ||
# 为了与非循环评测版本做区分而进行改名 | ||
d['abbr'] = d['abbr'] + '-circular-4' | ||
# 重载评测方法 | ||
d['eval_cfg']['evaluator'] = {'type': CircularEvaluator} | ||
|
||
# 上述操作后的 dataset 形如下: | ||
# dict( | ||
# type=CircularCEvalDataset, | ||
# path='./data/ceval/formal_ceval', # 未改变 | ||
# name='computer_network', # 未改变 | ||
# abbr='ceval-computer_network-circular-4', | ||
# reader_cfg=dict(...), # 未改变 | ||
# infer_cfg=dict(...), # 未改变 | ||
# eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...), | ||
# ) | ||
``` | ||
|
||
另外评测时为了针对循环评测有更良好的结果呈现,建议考虑使用以下 summarizer | ||
|
||
```python | ||
from mmengine.config import read_base | ||
from opencompass.summarizers import CircularSummarizer | ||
|
||
with read_base(): | ||
from ...summarizers.groups.ceval import ceval_summary_groups | ||
|
||
new_summary_groups = [] | ||
for item in ceval_summary_groups: | ||
new_summary_groups.append( | ||
{ | ||
'name': item['name'] + '-circular-4', | ||
'subsets': [i + '-circular-4' for i in item['subsets']], | ||
} | ||
) | ||
|
||
summarizer = dict( | ||
type=CircularSummarizer, | ||
# 选择具体看哪些指标 | ||
metric_types=['acc_origin', 'perf_circular'], | ||
dataset_abbrs = [ | ||
'ceval-circular-4', | ||
'ceval-humanities-circular-4', | ||
'ceval-stem-circular-4', | ||
'ceval-social-science-circular-4', | ||
'ceval-other-circular-4', | ||
], | ||
summary_groups=new_summary_groups, | ||
) | ||
``` | ||
|
||
更多复杂的评测案例可以参考这个样例代码: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.