In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch
from datasets import load_dataset
# 加载 WMT19 中英文翻译数据集
data_path = './data/wmt19-zh-en'
dataset = load_dataset(data_path)


# 1. 加载预训练模型和分词器
# 确认CUDA是否可用
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)





  table = cls._concat_blocks(blocks, axis=0)


In [2]:
# 1. 定义自定义数据集
class TranslationDataset(Dataset):
    def __init__(self, dataset, tokenizer, type, max_length=128, size=None):
        self.dataset = dataset[type][:size]['translation'] if size else dataset[type]['translation']
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        sample = self.dataset[idx]
        source_text = sample['zh']
        target_text = sample['en']

        # Tokenize source and target texts
        source_encoding = self.tokenizer(source_text, max_length=self.max_length, padding="max_length", truncation=True,
                                         return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding="max_length", truncation=True,
                                         return_tensors="pt")

        labels = target_encoding["input_ids"].squeeze()
        labels[labels == tokenizer.pad_token_id] = -100  # 忽略填充部分

        return {
            "input_ids": source_encoding["input_ids"].squeeze(),
            "attention_mask": source_encoding["attention_mask"].squeeze(),
            "labels": labels,
        }

In [3]:
# 2. 创建数据集对象
train_dataset = TranslationDataset(dataset, tokenizer, type='train', size=40000)
eval_dataset = TranslationDataset(dataset, tokenizer, type='validation', size=100)

# 3. 定义数据加载器
# 数据加载器
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, pin_memory=False)
eval_loader = DataLoader(eval_dataset, batch_size=8, pin_memory=False)

# 4. 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    fp16=True,  # 使用混合精度训练以节省显存
    fp16_opt_level="O1",  # 混合精度优化级别
)


# 5. 使用Trainer API进行训练
trainer = Trainer(
    model=pretrained_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)



In [4]:
# 开始训练
trainer.train()

# 保存微调后的模型
trainer.save_model("./model/fine-tuned-model")

Epoch,Training Loss,Validation Loss
1,0.9062,2.152803
2,0.6801,1.985045
3,0.5696,1.926454
4,0.5187,1.900353
5,0.476,1.89853




In [5]:
# 实践BLEU评估
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def translate(text, tokenizer, model, max_length=128):
    # 进行分词
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length).to(device)

    # 模型推理
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=max_length)

    # 解码输出
    translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_text
    
# 计算 BLEU 评分的函数
def compute_bleu(references, hypothesis):
    """
    计算 BLEU 评分
    :param references: 参考翻译（list of lists）
    :param hypothesis: 机器翻译结果（string）
    :return: BLEU score
    """
    reference_tokens = [nltk.word_tokenize(ref) for ref in references]  # 参考翻译分词
    hypothesis_tokens = nltk.word_tokenize(hypothesis)  # 模型翻译分词
    smooth = SmoothingFunction().method1  # 进行平滑处理，防止极端情况
    return sentence_bleu(reference_tokens, hypothesis_tokens, smoothing_function=smooth)


def Bleu_score(references, translations):
    bleu_scores = []
    for reference, translation in zip(references, translations):
        bleu_score = compute_bleu([reference], translation)
        bleu_scores.append(bleu_score)

    avg_bleu = sum(bleu_scores) / len(bleu_scores)  # 计算 BLEU 平均分
    return avg_bleu

In [6]:
# 1. 指定已保存的模型路径
model_path = "./model/fine-tuned-model"

# 2. 加载保存的模型和分词器
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

In [8]:
# 开始验证测试集
import torch
test_sentences = dataset['validation']['translation'][:5]
references = []
translations = []
for sample in test_sentences:
    references.append(sample['en'])
    translated = translate(sample['zh'], tokenizer, model)  # 通过模型翻译
    translations.append(translated)
    print(f"源文本:{sample['zh']}")
    print(f"参考翻译:{sample['en']}")
    print(f"模板翻译:{translated}")

avg_bleu = Bleu_score(references, translations)  # 计算 BLEU 平均分
print(f"BLEU 平均分: {avg_bleu}")

源文本:上周，古装剧《美人私房菜》临时停播，意外引发了关于国产剧收视率造假的热烈讨论。
参考翻译:Last week, the broadcast of period drama “Beauty Private Kitchen” was temporarily halted, and accidentally triggered heated debate about faked ratings of locally produced dramas.
模板翻译:The present cease of the present presence of the present drafting, United States private level, rapidly briefed up a heavy discussion on the fallowing rates of national products.
源文本:民权团体针对密苏里州发出旅行警告
源文本:由于密苏里州的歧视性政策和种族主义袭击，美国有色人种促进协会 (NAACP) 向准备前往密苏里州出游的有色人群发出旅行警告。
参考翻译:The National Association for the Advancement of Colored People has put out an alert for people of color traveling to Missouri because of the state's discriminatory policies and racist attacks.
源文本:“2017 年 8 月 28 日生效的 NAACP 密苏里州旅行咨询中呼吁，因近期密苏里州发生了一系列可疑的种族性事件，所有非裔美籍旅行者、游客以及密苏里州人在密苏里州旅行时应特别注意并采取极其谨慎的态度，特此告知，”该团体的声明宣称。
参考翻译:"The NAACP Travel Advisory for the state of Missouri, effective through August 28th, 2017, calls for African American travelers, visitors and Missourians to pay special atten