In [1]:
!pip install evaluate --quiet

In [2]:
# List the available evaluation modules
import evaluate

evaluate.list_evaluation_modules(
    module_type="metric",     # kind of module to list
    include_community=False,  # whether to include community-contributed modules
    with_details=True,        # whether to include details for each module
)

[{'name': 'precision', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'code_eval', 'type': 'metric', 'community': False, 'likes': 6},
 {'name': 'roc_auc', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'cuad', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'xnli', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'rouge', 'type': 'metric', 'community': False, 'likes': 10},
 {'name': 'pearsonr', 'type': 'metric', 'community': False, 'likes': 1},
 {'name': 'mse', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'super_glue', 'type': 'metric', 'community': False, 'likes': 4},
 {'name': 'comet', 'type': 'metric', 'community': False, 'likes': 2},
 {'name': 'cer', 'type': 'metric', 'community': False, 'likes': 5},
 {'name': 'sacrebleu', 'type': 'metric', 'community': False, 'likes': 6},
 {'name': 'mahalanobis', 'type': 'metric', 'community': False, 'likes': 0},
 {'name': 'wer', 'type': 'metric', 'community': False, 'likes': 9},

In [3]:
# Inspect the attributes of an evaluation module:
# load the "accuracy" module and print its human-readable description.
accuracy = evaluate.load("accuracy")
print(accuracy.description)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative



In [4]:
# Display the `features` attribute of the loaded accuracy module.
accuracy.features

{'predictions': Value(dtype='int32', id=None),
 'references': Value(dtype='int32', id=None)}

In [5]:
# Compute the score in one call by passing all references and predictions at once.
accuracy.compute(
    references=[0, 1, 0, 1],
    predictions=[1, 0, 0, 1],
)

{'accuracy': 0.5}

In [6]:
# Feed (reference, prediction) pairs one at a time with add(),
# then call compute() with no arguments to score what was added.
references = [0, 1, 0, 1]
predictions = [1, 0, 0, 1]
for ref, pred in zip(references, predictions):
    accuracy.add(references=ref, predictions=pred)
accuracy.compute()

{'accuracy': 0.5}

In [7]:
# Feed whole batches with add_batch(), then call compute() with no arguments
# to score everything that was added.
batched_references = [[0, 1], [0, 1]]
batched_predictions = [[1, 0], [0, 1]]
for refs, preds in zip(batched_references, batched_predictions):
    accuracy.add_batch(references=refs, predictions=preds)
accuracy.compute()

{'accuracy': 0.5}

In [8]:
# Combine several evaluation modules into one object
clf_metrics = evaluate.combine(
    ["accuracy", "f1", "precision", "recall"]
)
# A single compute() call returns the scores of all combined metrics.
clf_metrics.compute(
    references=[0, 1, 1],
    predictions=[0, 1, 0],
)

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

{'accuracy': 0.6666666666666666,
 'f1': 0.6666666666666666,
 'precision': 1.0,
 'recall': 0.5}

In [11]:
# パッケージのインストール
!pip install transformers datasets --quiet
!pip install evaluate[evaluator] --quiet

In [12]:
from transformers import pipeline
from datasets import load_dataset
from evaluate import evaluator
import evaluate

# Prepare the model, dataset, and metric
# NOTE(review): device=0 presumably selects the first CUDA GPU, so this cell
# would need a GPU machine — confirm against the transformers pipeline docs.
pipe = pipeline("text-classification", model="lvwerra/distilbert-imdb", device=0)
# A fixed seed makes the 1,000-example subset (and therefore the reported
# score) reproducible across Restart & Run All.
data = load_dataset("imdb", split="test").shuffle(seed=42).select(range(1000))
metric = evaluate.load("accuracy")

# Prepare the Evaluator
# Named `task_evaluator` instead of `eval` so the built-in eval() is not shadowed
# for the rest of the kernel session.
task_evaluator = evaluator("text-classification")

# Compute the score
results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=data,
    metric=metric,
    # Map the pipeline's string labels onto the dataset's integer labels.
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
)
print(results)

Downloading (…)lve/main/config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /home/ec2-user/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.
{'accuracy': 0.939, 'total_time_in_seconds': 10.316356958006509, 'samples_per_second': 96.93344308175587, 'latency_in_seconds': 0.010316356958006508}
