In [None]:
!pip install transformers==4.57.6

In [None]:
# Prerequisites: the dataset, one sample article, and a dict collecting each model's summary
from datasets import load_dataset
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Take the article at index 1 of the train split and keep only its first 2000 characters;
# every summarizer below is fed this same truncated text.
sample_text = dataset["train"][1]["article"][:2000]
# Maps a model name (e.g. "baseline", "gpt2") to its newline-separated summary.
summaries = {}

## NLTK package
import nltk
from nltk.tokenize import sent_tokenize
# Download the "punkt" / "punkt_tab" NLTK resources (presumably the data sent_tokenize relies on).
nltk.download("punkt")
nltk.download("punkt_tab")
## Baseline
def three_sentence_summary(text):
  """Return the first three sentences of `text`, one sentence per line.

  The text is split with `sent_tokenize`; if it contains fewer than three
  sentences, all of them are returned.
  """
  leading_sentences = sent_tokenize(text)[:3]
  return "\n".join(leading_sentences)
# Baseline summary: the first three sentences of the sample text.
summaries["baseline"] = three_sentence_summary(sample_text)

## GPT-2 results
from transformers import pipeline, set_seed
# Fix the random seed before running text generation.
set_seed(42)
pipe = pipeline("text-generation", model="gpt2-xl")
# Append a "TL;DR:" prompt after the article; the text generated after it is used as the summary.
gpt2_query = sample_text + "\nTL;DR:\n"
pipe_out = pipe(gpt2_query, max_length=512, clean_up_tokenization_spaces=True)
# Strip the prompt prefix from the generated text, then put one sentence per line.
summaries["gpt2"] = "\n".join(
    sent_tokenize(pipe_out[0]["generated_text"][len(gpt2_query) :]))

## T5 results
# `pipe` / `pipe_out` are rebound for each model from here on.
pipe = pipeline("summarization", model="t5-large")
pipe_out = pipe(sample_text)
# One sentence per line, matching the format of the other summaries.
summaries["t5"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

## BART results
pipe = pipeline("summarization", model="facebook/bart-large-cnn")
pipe_out = pipe(sample_text)
summaries["bart"] = "\n".join(sent_tokenize(pipe_out[0]["summary_text"]))

## PEGASUS results
pipe = pipeline("summarization", model="google/pegasus-cnn_dailymail")
pipe_out = pipe(sample_text)
# Here sentences are split by replacing each " ." with ".\n" rather than with sent_tokenize.
# NOTE(review): PEGASUS output may separate sentences with a "<n>" marker — confirm that
# replacing " ." alone yields clean one-sentence-per-line text.
summaries["pegasus"] = pipe_out[0]["summary_text"].replace(" .", ".\n")

In [None]:
# パッケージインスト―ル
!pip install evaluate sacrebleu rouge_score

In [None]:
# BLEU score metric
from sacrebleu.metrics import BLEU

# Build the metric object once; later cells call bleu_metric.sentence_score on it.
# It is configured with the "floor" smoothing method and a smoothing value of 0.
bleu_metric = BLEU(smooth_method="floor", smooth_value=0)

In [None]:
# Inspect the BLEU score
import pandas as pd
import numpy as np

def get_bleu_results(bleu_score):
  """Collect the fields of a BLEU score object into a plain dict.

  Args:
    bleu_score: object exposing `score`, `counts`, `totals`, `precisions`,
      `bp`, `sys_len`, `ref_len` and `ratio` attributes.

  Returns:
    A dict with those eight keys, in that order. Each entry of `precisions`
    is rounded to two decimals; all other values are passed through unchanged.
  """
  return {
      "score": bleu_score.score,
      "counts": bleu_score.counts,
      "totals": bleu_score.totals,
      # Rounded element by element, so the result stays a list.
      "precisions": [np.round(precision, 2) for precision in bleu_score.precisions],
      "bp": bleu_score.bp,
      "sys_len": bleu_score.sys_len,
      "ref_len": bleu_score.ref_len,
      "ratio": bleu_score.ratio,
  }

# Score a hypothesis that only repeats "the" against the reference sentence.
bleu_score = bleu_metric.sentence_score(
    hypothesis="the the the the the the",
    references=["the cat is on the mat"],
)
# Show the score fields as a one-column table (rendered as the cell output).
pd.DataFrame.from_dict(
    get_bleu_results(bleu_score), orient="index", columns=["Value"]
)

In [None]:
# Inspect the score when the prediction is good:
# the hypothesis differs from the reference only by one missing "the".
bleu_score = bleu_metric.sentence_score(
    hypothesis="the cat is on mat",
    references=["the cat is on the mat"],
)
pd.DataFrame.from_dict(
    get_bleu_results(bleu_score), orient="index", columns=["Value"]
)

In [None]:
# ROUGE score metric
import evaluate
# Load the "rouge" metric through evaluate; its compute() is called in the scoring cell.
rouge_metric = evaluate.load('rouge')

In [None]:
# Compute ROUGE scores
# Each stored summary is scored against the highlights of the same article
# (index 1 of the train split) that the sample text was taken from.
reference = dataset["train"][1]["highlights"]
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
records = []

for model_name in summaries:
  score = rouge_metric.compute(
      predictions=[summaries[model_name]],
      references=[reference],
  )
  # Keep only the ROUGE variants of interest, in a fixed column order.
  rouge_dict = {rn: score[rn] for rn in rouge_names}
  records.append(rouge_dict)

# One row per summarizer, one column per ROUGE variant.
pd.DataFrame.from_records(records, index=summaries.keys())