In [None]:
!pip install -U transformers

In [None]:
!pip install torch torchdata datasets evaluate rouge_score loralib peft awscli llmx sacremoses

In [None]:
# Authenticate against the Hugging Face Hub via the CLI. The repository ids used
# below live under the "yonyou-sg" namespace and the final cells push to the Hub.
# NOTE(review): this command prompts for a token; confirm the notebook frontend
# in use can answer an interactive shell prompt.
!huggingface-cli login

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

# Load the base mBART-50 one-to-many checkpoint and its tokenizer.
# Both names are re-assigned further down (the tokenizer when the vocabulary is
# extended, the model when its embeddings are resized).
model = MBartForConditionalGeneration.from_pretrained("yonyou-sg/mbart-large-50-one-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("yonyou-sg/mbart-large-50-one-to-many-mmt")

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [None]:
# Hub id of the zh/en parallel corpus; load_dataset returns its splits
# (the "train" and "validation" splits are used later in this notebook).
huggingface_dataset_name = "yonyou-sg/datacenter-initial-zh-en"
dataset = load_dataset(huggingface_dataset_name)

In [None]:
# Display the loaded dataset object (a bare last expression renders its repr).
dataset

In [None]:
# Inspect how the tokenizer splits the first source sentence of the train split.
# The row is read straight from `dataset` (no `df_train` exists in this notebook);
# str() matches the coercion applied to this column everywhere else.
tokenizer.tokenize(str(dataset['train'][0]['简体中文(源)']))

In [None]:
# Empty placeholder frame; `smpl` is rebuilt from dataset['train'] in the
# preprocessing cell below.
smpl = pd.DataFrame()

In [None]:
import re
def word_tokenize(text):
    """
    Split a text into words, numbers, and punctuation marks
    (for languages where words are separated by spaces).

    Args:
        text: value to split; it is converted with ``str()`` first, so
            non-string inputs are accepted.

    Returns:
        list[str]: runs of word characters and single non-space,
        non-word characters, in order of appearance.
    """
    # Raw string: "\w" and "\s" are regex escapes, not Python string escapes.
    return re.findall(r'(\w+|[^\w\s])', str(text))

In [None]:
# Apply preprocessing and collect tokenization statistics.
from tqdm.auto import tqdm, trange

src_lang_index = '简体中文(源)'
target_lang_index = '参考语言(英文)'
smpl = pd.DataFrame(dataset['train'])

# Sub-word tokens from the model tokenizer vs. "words" from the regex splitter.
# str() guards against non-string cells.
smpl['src_lang_tokens'] = smpl[src_lang_index].apply(lambda x: tokenizer.tokenize(str(x)))
smpl['target_lang_tokens'] = smpl[target_lang_index].apply(lambda x: tokenizer.tokenize(str(x)))
smpl['src_lang_words'] = smpl[src_lang_index].apply(word_tokenize)
smpl['target_lang_words'] = smpl[target_lang_index].apply(word_tokenize)

# Statistics: mean tokens-per-word ratio for each language.
try:
    length_columns = ['src_lang_tokens', 'target_lang_tokens', 'src_lang_words', 'target_lang_words']
    # Per-column Series.map(len) replaces the deprecated DataFrame.applymap and
    # works on both old and new pandas releases.
    stats = smpl[length_columns].apply(lambda column: column.map(len)).describe()
    print("原始语言token转化比：",stats['src_lang_tokens']['mean'] / stats['src_lang_words']['mean'])
    print("目标语言token转化比：",stats['target_lang_tokens']['mean'] / stats['target_lang_words']['mean'])
except TypeError as e:
    print(f"Caught an error: {e}")

In [None]:
from sacremoses import MosesPunctNormalizer
import sys
import unicodedata

# Moses punctuation normalizer; its substitution patterns are pre-compiled once.
mpn = MosesPunctNormalizer(lang="en")
mpn.substitutions = [(re.compile(pattern), replacement) for pattern, replacement in mpn.substitutions]

# Collect the source texts whose encoding contains the unknown-token id.
texts_with_unk = [
    candidate
    for candidate in tqdm(smpl[src_lang_index].to_list())
    if tokenizer.unk_token_id in tokenizer(str(candidate)).input_ids
]
print("未知符号数量：", len(texts_with_unk))

def get_non_printing_char_replacer(replace_by: str = " "):
    """Build a function that substitutes every non-printing character.

    Args:
        replace_by: replacement written in place of each non-printing
            character.

    Returns:
        A callable taking a string and returning it with every code point
        whose Unicode general category is one of the "C*" categories
        replaced by ``replace_by``.
    """
    # same as \p{C} in perl
    # see https://www.unicode.org/reports/tr44/#General_Category_Values
    other_categories = {"C", "Cc", "Cf", "Cs", "Co", "Cn"}

    # Translation table over the whole Unicode range, built once per replacer.
    non_printable_map = {}
    for code_point in range(sys.maxunicode + 1):
        if unicodedata.category(chr(code_point)) in other_categories:
            non_printable_map[code_point] = replace_by

    def replace_non_printing_char(line) -> str:
        return line.translate(non_printable_map)

    return replace_non_printing_char


# Shared replacer used by preproc(): non-printing characters become a space.
replace_nonprint = get_non_printing_char_replacer(" ")

def preproc(text):
    """Clean a string in three steps.

    The text is run through the Moses punctuation normalizer, then through
    the non-printing-character replacer, then NFKC-normalized.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    without_nonprint = replace_nonprint(mpn.normalize(text))
    # NFKC folds compatibility characters, e.g. 𝓕𝔯𝔞𝔫𝔠𝔢𝔰𝔠𝔞 becomes Francesca
    return unicodedata.normalize("NFKC", without_nonprint)
# Re-check the problematic texts after cleaning: keep those that still contain
# the unknown-token id once preproc() has been applied.
texts_with_unk_normed = [
    candidate
    for candidate in tqdm(texts_with_unk)
    if tokenizer.unk_token_id in tokenizer(preproc(str(candidate))).input_ids
]
print("处理掉非标准标点符号后，未知符号数量：", len(texts_with_unk_normed))

# Print the length statistics computed in the earlier statistics cell.
print(stats)

In [None]:
from collections import Counter
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM

model_name = "yonyou-sg/mbart-large-50-one-to-many-mmt"

# Pool the source and target sentences (rows with missing values dropped).
all_texts = smpl[src_lang_index].dropna().tolist() + smpl[target_lang_index].dropna().tolist()
# str() mirrors the coercion used in the other cells, so a non-string cell
# cannot crash the normalizer.
all_text_normalized = [preproc(str(t)) for t in tqdm(all_texts)]
chars_cnt = Counter(c for t in all_text_normalized for c in t)
# Characters seen at least 4 times (space excluded), most frequent first; they
# are passed to the SentencePiece trainer as required characters.
required_chars = ''.join(
    k for k, v in chars_cnt.most_common()
    if v >= 4 and k not in ' '
)
# Dump the texts to a plain-text file and train a new SentencePiece model on it,
# so that its tokens can later be added to the existing mBART tokenizer.
# SentencePiece is one of the popular algorithms for training tokenizers.
all_texts_file = 'all_texts_plain.txt'
SPM_PREFIX = 'spm_new_text_16k'
# Explicit UTF-8: the corpus contains Chinese text and the platform default
# encoding is not guaranteed to be able to represent it.
with open(all_texts_file, 'w', encoding='utf-8') as f:
    for text in all_texts:
        print(text, file=f)

spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix=SPM_PREFIX,
    vocab_size=2**14,  # 16K
    character_coverage = 1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192*4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=required_chars,
)
# Read the mBART tokenizer and the newly trained SentencePiece model.
# The slow (SentencePiece-backed) tokenizer class is required here: it exposes
# `sp_model`, which MBart50TokenizerFast does not have.
from transformers import MBart50Tokenizer

tokenizer = MBart50Tokenizer.from_pretrained(model_name)
sp_trained = spm.SentencePieceProcessor(model_file=f'{SPM_PREFIX}.model')
added_spm = sp_pb2_model.ModelProto()
added_spm.ParseFromString(sp_trained.serialized_model_proto())
old_spm = sp_pb2_model.ModelProto()
old_spm.ParseFromString(tokenizer.sp_model.serialized_model_proto())

# Append the pieces that are missing from the mBART SentencePiece model.
mbart_tokens_set = {p.piece for p in old_spm.pieces}
prev_min_score = old_spm.pieces[-1].score
for p in added_spm.pieces:
    piece = p.piece
    if piece not in mbart_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        # New pieces get a lower score (priority) than the last existing
        # piece, by offsetting their trained score with that piece's score.
        new_p.score = p.score + prev_min_score
        old_spm.pieces.append(new_p)

# Save the merged SentencePiece model.
NEW_SPM_NAME = 'spm_mbart_new.model'
with open(NEW_SPM_NAME, 'wb') as f:
    f.write(old_spm.SerializeToString())

# Load the tokenizers: the pristine one and one built on the merged vocabulary.
# The slow tokenizer class builds its vocabulary from `vocab_file`.
# NOTE(review): the fast class may instead load a serialized tokenizer.json and
# ignore `vocab_file` — confirm against the installed transformers version.
from transformers import MBart50Tokenizer

tokenizer_old = MBart50Tokenizer.from_pretrained(model_name)
tokenizer = MBart50Tokenizer.from_pretrained(model_name, vocab_file=NEW_SPM_NAME)
print('原始版本分词器词汇量：',len(tokenizer_old),'新版本分词器词汇量：',len(tokenizer))
added_vocab = set(tokenizer.get_vocab()).difference(set(tokenizer_old.get_vocab()))
print('新增的词汇量：',len(added_vocab))

# Load the model and resize its embedding matrix to the new vocabulary size.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

embeddings = model.model.shared.weight.data

# Language codes and <mask> are placed after the SentencePiece pieces, so their
# ids move when the vocabulary grows. Copy their trained embeddings to the new
# positions BEFORE the old positions are overwritten by the loop below.
# (If the ids did not move, this is a no-op.)
moved_tokens = list(tokenizer_old.lang_code_to_id) + ["<mask>"]
old_ids = tokenizer_old.convert_tokens_to_ids(moved_tokens)
new_ids = tokenizer.convert_tokens_to_ids(moved_tokens)
embeddings[new_ids] = embeddings[old_ids]

# Initialise each new token with the mean embedding of the pieces the old
# tokenizer would have split it into (falling back to <unk>).
for t in tqdm(added_vocab):
    tt = tokenizer_old(t, add_special_tokens=False).input_ids
    if len(tt) == 0:
        tt = [tokenizer_old.unk_token_id]
    idx = tokenizer.convert_tokens_to_ids(t)
    embeddings[idx] = embeddings[tt].mean(0)

In [None]:
def print_number_of_trainable_model_parameters(model):
    """Summarise how many of a model's parameters are trainable.

    Args:
        model: object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and a ``requires_grad`` flag.

    Returns:
        str: three lines giving the trainable count, the total count and the
        trainable percentage (two decimals).
    """
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_model_params = sum(count for count, _ in sizes)
    trainable_model_params = sum(count for count, trainable in sizes if trainable)
    percentage = 100 * trainable_model_params / all_model_params
    return (
        f"trainable model parameters: {trainable_model_params}\n"
        f"all model parameters: {all_model_params}\n"
        f"percentage of trainable model parameters: {percentage:.2f}%"
    )

# Report the parameter counts of the current `model`.
print(print_number_of_trainable_model_parameters(model))

In [None]:
# Training configuration for the fine-tuning run.
args = Seq2SeqTrainingArguments(
    output_dir="./mbart_yonyou/",
    do_train=True,
    # Keep the raw text columns: the custom data_collator reads them per batch.
    remove_unused_columns=False,
    do_eval=True,
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    # Relative path, like output_dir; "/logs" pointed at the filesystem root,
    # which is typically not writable.
    logging_dir="./logs",
)

In [None]:
# Display the dataset object again before selecting the splits.
dataset

In [None]:
# Pick the splits handed to the trainer.
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

In [None]:
def data_collator(features):
    """Turn a list of raw dataset rows into a tokenized, padded seq2seq batch.

    The English reference column is the model input and the Simplified
    Chinese column is the target (en_XX -> zh_CN). Both sides are truncated
    to 32 tokens.

    Args:
        features: list of dict-like rows containing both text columns.

    Returns:
        The tokenizer's batch with tensor values. Padding positions in
        ``labels`` are set to -100 so that they are ignored by the loss.

    Raises:
        ValueError: if ``features`` is empty.
    """
    labels_key = '简体中文(源)'
    inputs_key = '参考语言(英文)'

    # Extract the texts; str() guards against non-string cells.
    labels = [str(f[labels_key]) for f in features]
    inputs = [str(f[inputs_key]) for f in features]

    # Refuse to build an empty batch.
    if len(labels) == 0 or len(inputs) == 0:
        raise ValueError("训练数据为空。")

    # Tokenize both sides; return_tensors="pt" yields tensors directly.
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=inputs,
        src_lang="en_XX",
        tgt_lang="zh_CN",
        tgt_texts=labels,
        max_length=32,
        max_target_length=32,
        return_tensors="pt",
    )

    # Mask label padding with -100 (the index ignored by the cross-entropy
    # loss); otherwise the model is also trained to predict <pad> tokens.
    label_ids = batch["labels"]
    batch["labels"] = label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)

    return batch

In [None]:
# Build the trainer from the model, the training arguments, the custom collator
# and the train/validation splits.
trainer = Seq2SeqTrainer(model=model,
                args=args,
                data_collator=data_collator,
                # Alternative kept for reference: train on a slice of the data only.
                # train_dataset=train_dataset.select(range(25600,64000)),
                train_dataset=train_dataset,
                eval_dataset=eval_dataset)

In [None]:
import warnings

# Ignore all warnings from here on. This also hides deprecation notices, so
# remove this line when debugging.
warnings.filterwarnings("ignore")

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

这里会显示我们训练的模型运行结果

In [None]:
# Reload the fine-tuned weights from a checkpoint saved under output_dir.
# NOTE(review): the step number 4000 is hard-coded; whether this directory
# exists depends on dataset size and save settings — confirm before running.
model =  MBartForConditionalGeneration.from_pretrained('./mbart_yonyou/checkpoint-4000')

In [None]:
# English sample sentence used to compare the fine-tuned and original models.
text = "The effective unit price precision [%s] in Org currency is greater than the currency unit price precision [%s]"

In [None]:
# Translate the sample with the fine-tuned model.
encoded_batch = tokenizer(text, return_tensors="pt", padding=True)
generated_tokens = model.generate(
    **encoded_batch,
    # Force the first generated token to be the zh_CN language code.
    forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"],
)
# Decode the generated ids back to text, dropping special tokens.
translated_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

In [None]:
# Show the fine-tuned model's translation.
translated_batch

这里会显示我们原来的模型运行结果

In [None]:
# Load the untouched base checkpoint as the comparison baseline.
original_model = MBartForConditionalGeneration.from_pretrained("yonyou-sg/mbart-large-50-one-to-many-mmt")

In [None]:
# Baseline: translate the same sample with the original checkpoint.
# The original model must be paired with the original tokenizer
# (`tokenizer_old`): ids produced by the extended tokenizer, including the
# zh_CN language-code id, are not guaranteed to match its embedding matrix.
encoded_batch = tokenizer_old(text, return_tensors="pt", padding=True)
generated_tokens = original_model.generate(
    **encoded_batch,
    # Force the first generated token to be the zh_CN language code.
    forced_bos_token_id=tokenizer_old.lang_code_to_id["zh_CN"],
)
translated_batch = tokenizer_old.batch_decode(generated_tokens, skip_special_tokens=True)

In [None]:
# Show the original model's translation for comparison.
translated_batch

可以看到效果好了很多，最后，我们把模型保存到我们的模型仓库中，名字自己定义，tokenizer也要保存

In [None]:
# Upload the fine-tuned model to the Hub repository with this id.
model.push_to_hub("yonyou-sg/mbart-large-50-one-to-many-mmt-finetuned")

In [None]:
# Upload the tokenizer to the same repository so the model can be loaded
# together with its vocabulary.
tokenizer.push_to_hub("yonyou-sg/mbart-large-50-one-to-many-mmt-finetuned")