# Cantonese Translation Project

## Installing Dependencies Required

In [1]:
!pip3 install transformers
!pip3 install transformers[sentencepiece]
!pip3 install sacremoses
!pip3 install datasets
!pip3 install transformers[torch]
!pip3 install sacrebleu
!pip3 install evaluate
!pip install chinese-converter

Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses
Successfully installed sacremoses-0.1.1
Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfull

## Imports

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import Seq2SeqTrainer
from transformers import BertTokenizer
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import pandas as pd
import random
from tqdm import tqdm
import gc
import torch
import evaluate
import numpy as np
import chinese_converter

## Import Data

In [3]:
# Folder and File Name
base_path = "/content/drive/MyDrive/Cantonese-NLP/Processed-Data"
folders = ["/train-short","/train","/dev","/test"]
file_name = ["/yue.txt","/en.txt"]

In [4]:
yue = []
en = []
for folder in folders:
  with open(base_path+folder+file_name[0],'r') as f_yue, open(base_path+folder+file_name[1],'r') as f_en:
    yue.append(f_yue.read().splitlines())
    en.append(f_en.read().splitlines())


In [5]:
# Concatenate Short and Long Training Example
yue[1] = yue[0] + yue[1]
del yue[0]
en[1] = en[0] + en[1]
del en[0]

In [6]:
import pandas as pd

train = zip(yue[0], en[0])
dev = zip(yue[1], en[1])
test = zip(yue[2], en[2])

train_df = pd.DataFrame(train, columns = ['zh','en'])
dev_df = pd.DataFrame(dev, columns = ['zh','en'])
test_df = pd.DataFrame(test, columns = ['zh','en'])

In [7]:
from datasets.dataset_dict import DatasetDict
from datasets import Dataset


dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})


In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

## OPUS-MT Model

Load Model

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments,DataCollatorForSeq2Seq, Seq2SeqTrainer

model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Testing Model without finetuning

In [None]:
import random
n = random.randint(0, len(dataset["test"]) - 1)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)
tokens = tokenizer(test_string, return_tensors="pt", padding=False).to("cuda")
translated = model.generate(**tokens)
print("Model Translation: " + tokenizer.batch_decode(translated, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 三叔份人好疏爽㗎！如果你經濟上真係出咗問題嘅，你可以試下揾佢幫幫手㗎噃。
Model Translation: Uncle San is so lazy! If there's an economic problem, you can try to help him.
Dataset Translation: Uncle Three is very generous, you should seek his help if you are hit with a financial downturn.


In [None]:
import evaluate
from tqdm import tqdm

metric = evaluate.load("sacrebleu")

source_lang = "zh"
target_lang = "en"


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [02:43<00:00,  1.74s/it]


Bleu Score Before Finetunign with Helsinki-NLP/opus-mt-zh-en: 10.48731370511051


### Preprocessing Data

In [None]:
max_length = 128
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs = [ex[source_lang] for ex in examples]
    targets = [ex[target_lang] for ex in examples]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 38044
    })
    test: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
    dev: Dataset({
        features: ['translation', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
})

### Fine-Tuning Datasets

In [None]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
)

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.3762,2.290523,14.753,16.8147
2,1.0711,2.265406,15.5791,16.4282
3,1.0367,2.3523,15.633,16.6425
4,0.7689,2.423655,15.5669,16.7977
5,0.5697,2.512335,16.284,16.4239
6,0.4389,2.602198,16.1996,16.2303
7,0.3295,2.674185,16.4155,16.4262
8,0.2581,2.744746,16.6506,16.4195
9,0.2007,2.798014,16.4332,16.3822
10,0.1644,2.816994,16.6048,16.4092


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.encoder.embed_positions.weight', 'model.decoder.embed_tokens.weight', 'model.decoder.embed_positions.weight', 'lm_head.weight'].


TrainOutput(global_step=11890, training_loss=0.6239168252134845, metrics={'train_runtime': 2277.365, 'train_samples_per_second': 167.053, 'train_steps_per_second': 5.221, 'total_flos': 2698951798554624.0, 'train_loss': 0.6239168252134845, 'epoch': 10.0})

In [None]:
trainer.evaluate(tokenized_dataset["test"])

In [None]:
trainer.save_model("/content/drive/MyDrive/Cantonese-NLP/model/opus-mt-zh-en-finetuned")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Cantonese-NLP/model/opus-mt-zh-en-finetuned", local_files_only=True).to("cuda")

In [None]:
bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score After Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [02:28<00:00,  1.58s/it]


Bleu Score Before Finetuning with Helsinki-NLP/opus-mt-zh-en: 15.57886606811519


In [None]:
test_string = test["zh"]
print("Original Data: " + test_string)
tokens = tokenizer(test_string, return_tensors="pt", padding=False).to("cuda")
translated = model.generate(**tokens)
print("Model Translation: " + str([tokenizer.decode(t, skip_special_tokens=True) for t in translated]))
print("Original Translation: " + test["en"])

Original Data: 三叔份人好疏爽㗎！如果你經濟上真係出咗問題嘅，你可以試下揾佢幫幫手㗎噃。
Model Translation: ['Uncle Sam is very absent-minded! If you have serious problems with your economy, you can ask him for help.']
Original Translation: Uncle Three is very generous, you should seek his help if you are hit with a financial downturn.


## NLLB-200

### Preprocessing Data

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments,DataCollatorForSeq2Seq, Seq2SeqTrainer

model_name = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

In [None]:
# import random
# n = random.randint(0, len(dataset["test"]))
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = tokenizer(test["zh"], return_tensors="pt", padding=True, truncation=True).to("cuda")

translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=128
)

print("Model Translation: " + tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 三叔份人好疏爽㗎！如果你經濟上真係出咗問題嘅，你可以試下揾佢幫幫手㗎噃。
Model Translation: You're so distracted! If you're really in financial trouble, you can try to get him to help you.
Dataset Translation: Uncle Three is very generous, you should seek his help if you are hit with a financial downturn.


In [None]:
from tqdm import tqdm
import evaluate

metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:34<00:00,  1.00s/it]


Bleu Score Before Finetuning with facebook/nllb-200-distilled-600M: 11.13838860020088


In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import numpy as np
import evaluate

metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
from transformers import EarlyStoppingCallback

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5474,0.600572,15.8422,19.038
2,0.414,0.597614,16.6074,18.6597
3,0.3069,0.62815,16.8143,18.7653
4,0.2237,0.682231,16.4586,19.21
5,0.1644,0.742678,16.496,19.01
6,0.1151,0.810697,16.3032,18.98
7,0.0823,0.860636,16.3953,19.389
8,0.0571,0.903166,16.6701,19.03
9,0.0408,0.928514,16.885,19.1353
10,0.0321,0.942336,16.9497,19.0793


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=47560, training_loss=0.208083332838182, metrics={'train_runtime': 28677.0782, 'train_samples_per_second': 13.266, 'train_steps_per_second': 1.658, 'total_flos': 4.774283812995072e+16, 'train_loss': 0.208083332838182, 'epoch': 10.0})

In [None]:
trainer.save_model(f"/content/drive/MyDrive/Cantonese-NLP/model/{model_name}-finetuned")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(f"/content/drive/MyDrive/Cantonese-NLP/model/nllb-200-distilled-600M-finetuned", local_files_only=True).to("cuda")

In [None]:
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 三叔份人好疏爽㗎！如果你經濟上真係出咗問題嘅，你可以試下揾佢幫幫手㗎噃。
Model Translation: Uncle Sam is a sloppy person! If you really have a financial problem, you can try to ask him to help.
Dataset Translation: Uncle Three is very generous, you should seek his help if you are hit with a financial downturn.


In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning with nllb-200-distilled-600M: {bleu_score}")

100%|██████████| 94/94 [01:24<00:00,  1.11it/s]


Bleu Score after Finetuning with nllb-200-distilled-600M: 16.121964105036355


## mBART

In [9]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to("cuda")
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "zh_CN"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [10]:
import chinese_converter
from tqdm import tqdm
import evaluate

metric = evaluate.load("sacrebleu")

def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []
  source_lang = "zh_CN"
  target_lang = "en_XX"

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length = 512,
        return_tensors="pt"
    ).to("cuda")
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    translated_batch = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True
    )
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [11]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:55<00:00,  1.23s/it]


Bleu Score Before Finetuning with facebook/mbart-large-50-many-to-many-mmt: 8.315683152180513


In [12]:
max_input_length = 512
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(
        inputs,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "en_XX"
    labels = tokenizer(
        targets,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "zh_CN"
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [13]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [15]:
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [16]:
from transformers import Seq2SeqTrainingArguments
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    auto_find_batch_size = True,
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [17]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    for ind, sentence in enumerate(decoded_preds):
      decoded_preds[ind] = chinese_converter.to_traditional(sentence)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds,
        decoded_labels
    )
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [18]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5986,0.577818,14.882,19.4299
2,0.3342,0.560304,16.225,19.4345
3,0.1383,0.660802,16.2072,19.3859


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=14268, training_loss=0.3802789137882781, metrics={'train_runtime': 3624.255, 'train_samples_per_second': 31.491, 'train_steps_per_second': 3.937, 'total_flos': 1.4145236936097792e+16, 'train_loss': 0.3802789137882781, 'epoch': 3.0})

In [20]:
trainer.save_model(f"{base_path}model/mBart-baseline")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [21]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(
    f"{base_path}model/mBart-baseline",
    local_files_only = True,
).to("cuda")
tokenizer.src_lang = "zh_CN"

In [22]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:59<00:00,  1.27s/it]


Bleu Score Before Finetuning with facebook/mbart-large-50-many-to-many-mmt: 15.751320872414007


In [23]:
from google.colab import runtime
runtime.unassign()

## Model Evaluation

In [None]:
with open("/content/drive/MyDrive/Cantonese-NLP/SoTA-Translations/bing_translated.txt") as f:
  bing_translated = f.read().splitlines()

In [None]:
len(dataset["test"]["en"])

3000

In [None]:
import evaluate

sacrebleu = evaluate.load("sacrebleu")

res = sacrebleu.compute(predictions=bing_translated, references=dataset["test"]["en"])
print(res["score"])

17.109785638339876


In [None]:
def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
def get_translation_opus(test_string, model, tokenizer):
  tokens = tokenizer(test_string, return_tensors="pt", padding=False).to("cuda")
  translated = model.generate(**tokens)
  return str([tokenizer.decode(t, skip_special_tokens=True) for t in translated][0])


def get_translation_nllb(test_string, model, tokenizer):
  inputs = tokenizer(test_string, return_tensors="pt").to("cuda")
  translated_tokens = model.generate(
      **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
  )
  return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]



In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

opus_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
opus_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en").to("cuda")
opus_ft_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Cantonese-NLP/model/opus-mt-zh-en-finetuned", local_files_only=True).to("cuda")


nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")
nllb_ft_model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/MyDrive/Cantonese-NLP/model/nllb-200-distilled-600M-finetuned", local_files_only=True).to("cuda")

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
def get_result(test_string, gold_standard, opus_tokenizer, opus_model, opus_ft_model, nllb_tokenizer, nllb_model, nllb_ft_model):
  print(f"Original Text: {test_string}")
  print(f"Gold Standard Translation: {gold_standard}")
  print(f"--------------------------------------------")
  print(f"OPUS-MT-en-zh Translation: {get_translation_opus(test_string, opus_model, opus_tokenizer)}")
  print(f"OPUS-MT-en-zh-finetuned Translation: {get_translation_opus(test_string, opus_ft_model, opus_tokenizer)}")
  print(f"--------------------------------------------")
  print(f"NLLB-200 Translation: {get_translation_nllb(test_string, nllb_model, nllb_tokenizer)}")
  print(f"NLLB-200-finetuned Translation: {get_translation_nllb(test_string, nllb_ft_model, nllb_tokenizer)}")

In [None]:
import random
n = random.randint(0, len(dataset["test"]) - 1)
test = dataset["test"][n]

get_result(test_string=test["zh"],
           gold_standard=test["en"],
           opus_tokenizer=opus_tokenizer,
           opus_model=opus_model,
           opus_ft_model=opus_ft_model,
           nllb_tokenizer=nllb_tokenizer,
           nllb_model=nllb_model,
           nllb_ft_model=nllb_ft_model)

Original Text: 落咗髮泥之後，你要用手摷鬆個頭。
Gold Standard Translation: After applying hair wax, you have to rake your hand through your hair.
--------------------------------------------
OPUS-MT-en-zh Translation: After the hair's mud, you have to relax your head with your hands.
OPUS-MT-en-zh-finetuned Translation: You need to loosen your hair with your hands after you've got a muddled hair.
--------------------------------------------
NLLB-200 Translation: After you lose your hair, you'll have to use your hands to loosen your head.
NLLB-200-finetuned Translation: After you lose your hair, you have to apply a hair dressing hand.
