In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

In [6]:
#from google.colab import drive
#drive.mount('/content/drive')

In [7]:
#!pip install rouge_score -U
#!pip install evaluate
#!pip install sentencepiece
#!pip install accelerate -U

In [8]:
import json
import torch
import evaluate
import transformers
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from datasets import load_dataset
from transformers import MBartForConditionalGeneration, MBartTokenizer
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

device = "cuda" if torch.cuda.is_available() else "cpu"

# "mbart-large-cc25" is a pre-trained (not fine-tuned) multilingual mbart model.
# https://huggingface.co/facebook/mbart-large-cc25
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25", device_map="auto")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25", src_lang="zh_CN", tgt_lang="en_XX")
rouge = evaluate.load("rouge")

tokenizer(["1519年600名西班牙人在墨西哥登陆，去征服几百万人口的阿兹特克帝国，初次交锋他们损兵三分之二。"])

{'input_ids': [[423, 2947, 470, 10715, 1795, 54222, 62302, 133152, 174567, 4, 1677, 25786, 10871, 211971, 3895, 24008, 43, 6128, 112538, 2657, 3987, 147415, 4, 9224, 4465, 6582, 70374, 2963, 87239, 19752, 133608, 1420, 3195, 30, 2, 250025]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [9]:
import random

def set_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

## Inferencing MBart off-the-shelf

In [10]:
article_cn = "1519年600名西班牙人在墨西哥登陆，去征服几百万人口的阿兹特克帝国，初次交锋他们损兵三分之二。"

chinese_input = tokenizer(article_cn, return_tensors="pt")
#model.to(device)

# translate Chinese to English
translated_ids = model.generate(**chinese_input.to(device), forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
print (translated_text)

ref_en = ("In 1519, six hundred Spaniards landed in Mexico to conquer the Aztec Empire with a population of a few million. "
"They lost two thirds of their soldiers in the first clash.")

rouge_score = rouge.compute(predictions = [translated_text], references = [ref_en])

print (rouge_score)

1898年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600年600名西班牙人在西班牙人在西班牙人在墨西哥登陆,西班牙人在墨西哥登陆,去征服几百万人口的阿兹特克维也纳的,他们损兵三分之二。
{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


## Finetuning MBart on WMT18 Zh-En

In [11]:
# load dataset
import pandas as pd

df_train = pd.read_json("WMT18_Zh-En/train.json")[:3000].rename(columns={0: "en", 1: "zh"})
df_dev = pd.read_json("WMT18_Zh-En/dev.json")[:500].rename(columns={0: "en", 1: "zh"})
df_test = pd.read_json("WMT18_Zh-En/test.json")[:500].rename(columns={0: "en", 1: "zh"})

df_train.head()

Unnamed: 0,en,zh
0,"Military leaders know this, and the threat tha...",在担任总理取得实权后，她最终可能会重新审视她与穆沙拉夫的协议。
1,The researchers concluded that self-reported o...,研究者的结论是，在参加了医疗补助的人中间，自我报告的健康和抑郁情况有所好转，并且这一群体的糖...
2,There is a vast number of important Buddhist s...,在斯瓦特河谷和巴基斯坦西北部有着大量重要的佛教文物。
3,But Howard Hughes’s success as a film producer...,但霍华德·休斯作为电影制片人和航空公司老板的成功使得他跻身20世纪前半叶最富有的美国人行列。
4,The concluding sentence of his review is widel...,他在文章结尾的被他的崇拜者所广泛引用：“我们之所以在构建我们的模型的过程中排除掉所有这些故事...


In [12]:
from datasets import Dataset

dataset_train = Dataset.from_pandas(df_train)
dataset_dev = Dataset.from_pandas(df_dev)
dataset_test = Dataset.from_pandas(df_test)

dataset_train

Dataset({
    features: ['en', 'zh'],
    num_rows: 3000
})

In [13]:
dataset_train[0]

{'en': 'Military leaders know this, and the threat that they will eventually push him aside will plague his presidency well into next year.',
 'zh': '在担任总理取得实权后，她最终可能会重新审视她与穆沙拉夫的协议。'}

In [14]:
def convert_examples_to_features(example_batch):
    input_encodings = tokenizer(example_batch["zh"], max_length=256, truncation=True)

    with tokenizer.as_target_tokenizer():
        target_encodings = tokenizer(example_batch['en'], max_length=128, truncation=True)

    return {"input_ids": input_encodings["input_ids"],
            "attention_mask": input_encodings["attention_mask"],
            "labels": target_encodings["input_ids"]}

In [15]:
def chunks(list_of_elements, batch_size): # dataloader
    """ Yield successive batch-sized chunks from list_of_elements. """
    for i in range(0, len(list_of_elements), batch_size):
        yield list_of_elements[i: i + batch_size]

In [16]:
def evaluate_summaries(dataset, metric, model, tokenizer,
                       src_lang="zh", tgt_lang="en",
                       batch_size=16, device=device):

    src_batches = list(chunks(dataset[src_lang], batch_size))
    tgt_batches = list(chunks(dataset[tgt_lang], batch_size))

    for source_batch, target_batch in tqdm(
            zip(src_batches, tgt_batches), total=len(src_batches)):

        inputs = tokenizer(source_batch, truncation=True,
                           padding=True, return_tensors="pt")

        translations = model.generate(input_ids=inputs["input_ids"].to(device),
                                   attention_mask=inputs["attention_mask"].to(device),
                                   length_penalty=0.8, num_beams=4, max_length=128)

        decoded_translations = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in translations]
        decoded_translations = [d.replace("<n>", " ") for d in decoded_translations]

        metric.add_batch(predictions=decoded_translations, references=target_batch)

    rslt = metric.compute()
    return rslt

In [17]:
torch.cuda.empty_cache()

seed = 42
set_seed(seed)

dataset_train = dataset_train.shuffle(seed=seed)

train = dataset_train.map(convert_examples_to_features, batched=True)
dev = dataset_dev.map(convert_examples_to_features, batched=True)

columns = ["input_ids", "labels", "attention_mask"]
train.set_format(type="torch", columns=columns)
dev.set_format(type="torch", columns=columns)

seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = TrainingArguments(
    output_dir='bart', num_train_epochs=1,
    per_device_train_batch_size=16, per_device_eval_batch_size=16,
    weight_decay=0.01, logging_steps=2, push_to_hub=False,
    evaluation_strategy='steps', eval_steps=2, save_steps=1e6,
    gradient_accumulation_steps=16
)

# Loss fn: cross entropy on vocab size one hot vector
trainer = Trainer(
    model=model, args=training_args,
    tokenizer=tokenizer, data_collator=seq2seq_data_collator,
    train_dataset=train, eval_dataset=dev
)

trainer.train()

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map: 100%|██████████| 3000/3000 [00:00<00:00, 5593.64 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 5559.39 examples/s]
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
2,12.4144,3.269489
4,3.3472,2.761258
6,3.0039,2.625229
8,2.849,2.548951
10,2.7157,2.514463


TrainOutput(global_step=11, training_loss=4.673707550222224, metrics={'train_runtime': 64.4121, 'train_samples_per_second': 46.575, 'train_steps_per_second': 0.171, 'total_flos': 343286899802112.0, 'train_loss': 4.673707550222224, 'epoch': 0.94})

In [18]:
rouge_sc = evaluate_summaries(dataset_test, rouge, model, tokenizer, batch_size=16)
print (rouge_sc)

100%|██████████| 32/32 [05:45<00:00, 10.79s/it]


{'rouge1': 0.04647767974430111, 'rouge2': 0.008888541328267692, 'rougeL': 0.04023329271680775, 'rougeLsum': 0.040438789949491016}


In [19]:
set_seed(1)

# Evaluate on our example again
article_cn = "1519年600名西班牙人在墨西哥登陆，去征服几百万人口的阿兹特克帝国，初次交锋他们损兵三分之二。"

chinese_input = tokenizer(article_cn, return_tensors="pt")

# translate Chinese to English
translated_ids = model.generate(**chinese_input.to(device), forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
print (translated_text)

ref_en = "In 1519, six hundred Spaniards landed in Mexico to conquer the Aztec Empire with a population of a few million. They lost two thirds of their soldiers in the first clash."

rouge_score = rouge.compute(predictions = [translated_text], references = [ref_en])
print (rouge_score)

coax of Spanish-American Indians in Mexico’s Mexico, and the Albuquerquerquerquerquer the Allied Empire’s army of the Spanish-occuarian empire, the last time it was to be the last battle between them, and their troops were damaged by three quarters of a century ago.
{'rouge1': 0.2597402597402597, 'rouge2': 0.053333333333333344, 'rougeL': 0.15584415584415587, 'rougeLsum': 0.15584415584415587}
