# ROUGE-n 评估


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction, sentence_bleu
import numpy as np

# Local checkpoint directory of the pretrained model and its tokenizer.
model_name = "C:/Users/Admin/Desktop/model/qwen/Qwen2___5-0___5B-Instruct"

# Load the tokenizer and reuse its end-of-sequence token as the padding token,
# so that a padding token is always defined.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal language model with device_map="auto" and torch_dtype="auto",
# then mirror the padding choice in the model config.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
model.config.pad_token_id = model.config.eos_token_id

# Load the evaluation dataset from a local JSON file.
# NOTE(review): the split is labelled "test" but the file is named
# huatuo_train.json — confirm this is the intended evaluation data.
data_files = {"test": "C:/Users/Admin/Desktop/Data/huatuo_train.json"}
dataset = load_dataset("json", split="test", data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
# Extract aligned questions and reference answers from the conversations.
def extract_conversations(dataset):
    """Collect aligned (question, reference) pairs from a conversation dataset.

    Each item must expose ``item["conversations"]``, an iterable of turns
    shaped like ``{"from": <speaker>, "value": <text>}``. A "human" turn is
    treated as a question and the next "gpt" turn in the same item as its
    reference answer.

    Only complete pairs are emitted, so the two returned lists always have
    the same length and ``questions[i]`` corresponds to ``references[i]``:

    * a "human" turn that is never followed by a "gpt" turn is dropped;
    * a "gpt" turn with no pending "human" turn is dropped;
    * if several "human" turns occur in a row, the most recent one is paired
      with the following "gpt" turn;
    * turns from any other speaker are ignored.

    Args:
        dataset: Iterable of items as described above.

    Returns:
        A ``(questions, references)`` tuple of two equally long lists of
        turn values.
    """
    questions = []
    references = []
    for item in dataset:
        # Pairing state is per item so a dangling question can never be
        # matched with an answer from a different conversation.
        pending_question = None
        for conversation in item["conversations"]:
            speaker = conversation["from"]
            if speaker == "human":
                pending_question = conversation["value"]
            elif speaker == "gpt" and pending_question is not None:
                questions.append(pending_question)
                references.append(conversation["value"])
                pending_question = None
    return questions, references


# Build the list of questions and the list of reference answers from the
# dataset loaded above; both lists are consumed by the evaluation cell.
questions, references = extract_conversations(dataset)

In [None]:
import evaluate
from tqdm import tqdm
import torch
import jieba

# Initialise the ROUGE metric.
rouge = evaluate.load("rouge")

# Upper bound on the number of newly generated tokens per answer. Unlike
# `max_length`, this does not count the prompt, so long questions do not eat
# into (or exceed) the generation budget.
MAX_NEW_TOKENS = 512

# Fail fast: ROUGE needs exactly one reference per prediction, and generation
# below is expensive, so detect a mismatch before running the model.
if len(questions) != len(references):
    raise ValueError(
        f"Number of questions ({len(questions)}) does not match number of "
        f"references ({len(references)})."
    )

# Generate one model prediction per question.
# NOTE(review): the raw question is fed to an "Instruct" checkpoint without a
# chat template — confirm whether `tokenizer.apply_chat_template` is wanted.
predictions = []
for question in tqdm(questions):
    inputs = tokenizer(
        question,
        return_tensors="pt",
        truncation=True,
        padding=True,
    ).to(model.device)
    prompt_length = inputs.input_ids.shape[1]
    with torch.no_grad():
        # Pass the whole encoding so the attention mask reaches `generate`.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            num_beams=4,
            early_stopping=True,
        )
    # A decoder-only model returns prompt + continuation; keep only the
    # continuation so the question text is not scored as part of the answer.
    generated_text = tokenizer.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=True,
    )
    predictions.append(generated_text)

# Segment predictions and references into space-separated words with jieba.
tokenized_predictions = [" ".join(jieba.cut(pred)) for pred in predictions]
tokenized_references = [" ".join(jieba.cut(ref)) for ref in references]


# Compute ROUGE scores. The metric's default tokenizer keeps only [a-z0-9]
# characters, which would discard all Chinese text; split on whitespace
# instead so the jieba segmentation above is what gets compared.
results = rouge.compute(
    predictions=tokenized_predictions,
    references=tokenized_references,
    tokenizer=lambda text: text.split(),
)

print("ROUGE 评估结果：")
for key in results:
    print(f"{key}: {results[key]:.4f}")

AttributeError: module 'evaluate' has no attribute 'load_metric'