## Install Necessary Dependencies

In [1]:
!pip install -U accelerate
!pip install -U transformers
!pip install datasets
!pip install sacrebleu
!pip install evaluate
!pip install sacremoses
!pip install chinese-converter

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/280.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m276.5/280.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m68.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1


## Import and Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from transformers import Seq2SeqTrainer
from transformers import BertTokenizer
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import pandas as pd
import random
from tqdm import tqdm
import gc
import torch
import evaluate
import numpy as np
import chinese_converter

## Import Data

In [4]:
base_path = "/content/drive/MyDrive/Cantonese-NLP/"
folders = ["train-short","train","dev","test"]
file_name = ["yue.txt","en.txt"]

In [5]:
yue = []
en = []
for folder in folders:
  with open(f"{base_path}Processed-Data/{folder}/{file_name[0]}",'r') as f_yue, open(f"{base_path}Processed-Data/{folder}/{file_name[1]}",'r') as f_en:
    yue.append(f_yue.read().splitlines())
    en.append(f_en.read().splitlines())


In [6]:
# Concatenate Short and Long Training Example
yue[1] = yue[0] + yue[1]
del yue[0]
en[1] = en[0] + en[1]
del en[0]

In [7]:
train = zip(yue[0], en[0])
dev = zip(yue[1], en[1])
test = zip(yue[2], en[2])

train_df = pd.DataFrame(train, columns = ['zh','en'])
dev_df = pd.DataFrame(dev, columns = ['zh','en'])
test_df = pd.DataFrame(test, columns = ['zh','en'])

In [8]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [10]:
with open(f"{base_path}/Processed-Data/mono/mono_data.txt", "r", encoding='utf-8') as f:
  mono_data = f.readlines()

In [11]:
print(f"Number of Sentences: {len(mono_data)}")
print(f"Average Length: {sum([len(d) for d in mono_data])/ len(mono_data):.02f}")

Number of Sentences: 1135989
Average Length: 29.89


## Model Finetune

### mBART

In [None]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt").to("cuda")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_mbart_zh(src_sentences, tgt_sentences, model, tokenizer, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    generated_tokens = model.generate(
      **encoded_input,
      forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
    )
    translated_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences, tokenize="zh")
  return res["score"]

In [None]:
bleu_score = compute_bleu_mbart_zh(dataset["test"]["en"], dataset["test"]["zh"], model, tokenizer)
print(f"Bleu Score Before Finetuning with mBART-50: {bleu_score}")

100%|██████████| 94/94 [01:51<00:00,  1.18s/it]


Bleu Score Before Finetuning with mBART-50: 2.786519020794235


In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "zh"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, tokenize="zh")
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
args = Seq2SeqTrainingArguments(
    f"mBART-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    auto_find_batch_size=True,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9687,0.833423,19.5614,17.4495
2,0.6136,0.817782,20.3425,16.5968
3,0.3475,0.896749,20.6819,16.7448
4,0.1763,1.015107,20.5266,17.0613
5,0.0968,1.090085,20.1156,17.055
6,0.0562,1.129093,20.8094,16.9334
7,0.0296,1.15401,21.2803,17.0553
8,0.0179,1.164104,21.2782,17.058
9,0.0111,1.175725,21.6811,16.862
10,0.0052,1.181492,22.0575,16.9847


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameter

TrainOutput(global_step=47560, training_loss=0.24267727914317203, metrics={'train_runtime': 12996.1609, 'train_samples_per_second': 29.273, 'train_steps_per_second': 3.66, 'total_flos': 5.19538378679255e+16, 'train_loss': 0.24267727914317203, 'epoch': 10.0})

In [None]:
trainer.save_model(f"{base_path}model/mBART-back-finetuned")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [None]:
tokenizer = MBart50TokenizerFast.from_pretrained(
    "facebook/mbart-large-50-one-to-many-mmt",
    src_lang="en_XX"
)
model = MBartForConditionalGeneration.from_pretrained(
    f"{base_path}model/mBART-back-finetuned",
    local_files_only=True
).half().to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["en"]

print("Orginal Data: " + test_string)

model_inputs = tokenizer(test_string, return_tensors="pt").to("cuda")

# translate from English to Chinese
generated_tokens = model.generate(
    **model_inputs,
    forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
)

translation = chinese_converter.to_traditional(
    tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
)

print("Model Translation: " + translation)

print("Dataset Translation: " + test["zh"])

In [None]:
bleu_score = compute_bleu_mbart_zh(dataset["test"]["en"], dataset["test"]["zh"], model, tokenizer)
print(f"Bleu Score After Finetuning with mBart: {bleu_score}")

### Opus-MT

In [None]:
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
metric = evaluate.load("sacrebleu")

source_lang = "en"
target_lang = "zh"


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences, tokenize="zh")
  return res["score"]


bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

In [None]:
max_length = 128
source_lang = "en"
target_lang = "zh"

def preprocess_function(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(
        inputs, text_target=targets, max_length=max_length, truncation=True
    )
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [None]:
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    auto_find_batch_size = True,
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,

)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, tokenize="zh")
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,2.6693,2.592637,19.2738,15.7534
2,2.0785,2.534734,19.9802,15.2029
3,1.6136,2.492463,20.7449,15.8554
4,1.2467,2.581617,20.5071,15.5878
5,0.8976,2.647686,21.1605,15.6941
6,0.6372,2.735911,21.3927,15.3332
7,0.4542,2.813333,21.5392,15.3579
8,0.306,2.88742,21.589,15.4042
9,0.2049,2.948853,21.8606,15.3066
10,0.1437,2.975725,22.1852,15.3905


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=47560, training_loss=1.0142546685830798, metrics={'train_runtime': 5244.3011, 'train_samples_per_second': 72.544, 'train_steps_per_second': 9.069, 'total_flos': 2258926657339392.0, 'train_loss': 1.0142546685830798, 'epoch': 10.0})

In [None]:
trainer.save_model(f"{base_path}model/opus-mt-en-zh-finetuned")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


In [None]:
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
  f"{base_path}model/opus-mt-en-zh-finetuned",
  local_files_only=True
).to("cuda")

In [None]:
bleu_score = compute_bleu(dataset["test"]["en"], dataset["test"]["zh"])
print(f"Bleu Score After Finetuning with mBart: {bleu_score}")

100%|██████████| 94/94 [01:33<00:00,  1.01it/s]


Bleu Score After Finetuning with mBart: 17.78788939731462


In [None]:
from google.colab import runtime
runtime.unassign()

## Synthesize Data

### mBART

In [None]:
with open(f"{base_path}Processed-Data/mono/mono_data_en.txt", "r", encoding='utf-8') as f:
  mono_data_en = f.readlines()

In [None]:
print(f"Number of Sentences: {len(mono_data_en)}")
print(f"Average Length: {sum([len(d) for d in mono_data_en])/ len(mono_data_en):.02f}")

Number of Sentences: 434684
Average Length: 133.42


In [None]:
random.seed(42)
SYNTHETIC_DATA_SIZE = 200_000
TRANSLATION_BATCH_SIZE = 50

In [None]:
syn_data_en = random.sample(mono_data_en, SYNTHETIC_DATA_SIZE)
syn_data_yue = []

with open(f"{base_path}Synthetic-Data/mBART/yue.txt", "a") as yue, open(f"{base_path}Synthetic-Data/mBART/en.txt", "a") as en:
  for i in tqdm(range(3771 * TRANSLATION_BATCH_SIZE, SYNTHETIC_DATA_SIZE, TRANSLATION_BATCH_SIZE)):
    batch = syn_data_en[i:i + TRANSLATION_BATCH_SIZE]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True).to("cuda")
    generated_tokens = model.generate(
      **encoded_input,
      forced_bos_token_id=tokenizer.lang_code_to_id["zh_CN"]
    )
    translated_batch = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)

    syn_data_yue.extend(translated_batch)
    for j in range(len(batch)):
      yue.write(f"{translated_batch[j]}\n")
      en.write(f"{batch[j]}")

100%|██████████| 229/229 [18:27<00:00,  4.84s/it]


In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
train_sys_df

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/yue.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_yue:
    f.write(f"{line}\n")

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/en.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_en:
    f.write(f"{line}")

### Opus-MT

In [None]:
with open(f"{base_path}Processed-Data/mono/mono_data_en.txt", "r", encoding='utf-8') as f:
  mono_data_en = f.readlines()

In [None]:
print(f"Number of Sentences: {len(mono_data_en)}")
print(f"Average Length: {sum([len(d) for d in mono_data_en])/ len(mono_data_en):.02f}")

Number of Sentences: 434684
Average Length: 133.42


In [None]:
random.seed(42)
SYNTHETIC_DATA_SIZE = 200_000
TRANSLATION_BATCH_SIZE = 50

In [None]:
syn_data_en = random.sample(mono_data_en, SYNTHETIC_DATA_SIZE)
syn_data_yue = []

for i in tqdm(range(0, SYNTHETIC_DATA_SIZE, TRANSLATION_BATCH_SIZE)):
  batch = syn_data_en[i:i+TRANSLATION_BATCH_SIZE]
  encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
  translated_output = model.generate(**encoded_input)
  translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
  syn_data_yue.extend(translated_batch)


100%|██████████| 4000/4000 [4:23:32<00:00,  3.95s/it]


In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/yue.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_yue:
    f.write(f"{line}\n")

In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/en.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_en:
    f.write(f"{line}")

## Finetune NLLB Model

### 1:1 Ratio (w/ Opus-MT)

In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 1)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
76083,"「你睇,我講佢哋都識我㗎喇。」","""Look, when I spoke they did understand me,"" C..."
76084,個MG俱樂部可以令入場嘅球場好似吧噉運作好多。,The MUG Club allows the infield to function mu...
76085,"「幾好玩,估係我玩㗎啦。」","“It’s pretty good, but I guess that’s my style..."
76086,"今日世界四號轉咗六二七五個六二五個六二小時後,杜貝克奇第四屆連冠。",But the world number four turned the tables to...


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/76088 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.6487,0.520938,16.1337,18.3519
2,0.5235,0.508297,17.1222,18.7004


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.6487,0.520938,16.1337,18.3519
2,0.5235,0.508297,17.1222,18.7004
3,0.463,0.515043,17.143,18.7178


Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=28533, training_loss=0.5773001101533977, metrics={'train_runtime': 7758.6565, 'train_samples_per_second': 29.421, 'train_steps_per_second': 3.678, 'total_flos': 6.167515361692877e+16, 'train_loss': 0.5773001101533977, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-opus-1:1")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-opus-1:1",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: I forgot to bring my umbrella yesterday. I had to run home in the heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:1 Ratio Data and opus-mt-en-zh: {bleu_score}")

100%|██████████| 94/94 [01:15<00:00,  1.24it/s]


Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: 16.53045739745836


### 1:3 Ratio (w/ Opus-MT)

In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/opus-mt/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 3)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
152171,"歐盟嘅「強制」決策,同等重要。","Equally significant is the EU's decision to ""f..."
152172,"最後我想去畢業學校做社工,因為我同學生嘅致命債券隨時加強咗。","Eventually, I’d like to go to graduate school ..."
152173,"影帝大增廣,而家好易出手,係總統米高超領導嘅紀錄團員選擇重組取取獎嘅規則嘅其中一個原因。",The broadening field for documentaries - techn...
152174,"佢話「裝修費好慢,重點慢,一個好深嘅窿」。","""Construction spending appears to be slowly cl..."


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/152176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.641,0.534058,15.3974,18.9267
2,0.5666,0.521136,16.428,18.7331


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.641,0.534058,15.3974,18.9267
2,0.5666,0.521136,16.428,18.7331
3,0.4977,0.522922,17.1117,18.7641


Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=57066, training_loss=0.5910329153931062, metrics={'train_runtime': 13760.8457, 'train_samples_per_second': 33.176, 'train_steps_per_second': 4.147, 'total_flos': 1.236679912241234e+17, 'train_loss': 0.5910329153931062, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-opus-1:3")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-opus-1:3",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: I forgot to bring my umbrella yesterday, I had to run home in case of heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:3 Ratio Data and opus-mt-en-zh: {bleu_score}")

100%|██████████| 94/94 [01:03<00:00,  1.47it/s]


Bleu Score after Finetuning w/ 1:3 Ratio Data and opus-mt-en-zh: 16.104899368970084


In [None]:
from google.colab import runtime
runtime.unassign()

### 1:1 Ratio (w/ mBART)

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 1)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
76083,聽日佢話將會喺下年早推出策略更新。,"Yesterday, he said he would provide a strategi..."
76084,馬友拳王今日譴責一位初級警官同前屆世界重量級拳王鬥拳。,AMATEUR boxing bosses today condemned an “unli...
76085,"首領自首,投訴所羅列嘅罪行全部都係唔理基本權嘅文化所致。","""Leadership starts at the top, and all of the ..."
76086,"A Pew研究指出,25至29歲嘅成人總共有4111,最近就同父母住咗。",A Pew study reports that 41 percent of adults ...


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/76088 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.6488,0.519405,15.7858,18.4868


Non-default generation parameters: {'max_length': 200}


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.6488,0.519405,15.7858,18.4868
2,0.535,0.508741,16.8644,18.5232
3,0.4598,0.514917,16.9973,18.6295


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=28533, training_loss=0.5775919297814966, metrics={'train_runtime': 7327.7236, 'train_samples_per_second': 31.151, 'train_steps_per_second': 3.894, 'total_flos': 6.170749088091341e+16, 'train_loss': 0.5775919297814966, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-mbart-1:1")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-mbart-1:1",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 萬一佢間公司破產，佢爭你嗰筆數點追返？
Model Translation: If his company goes bankrupt, how will he recover your money?
Dataset Translation: If in case his company goes bankrupt, how would you recover your money from him?


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:1 Ratio Data and mBART: {bleu_score}")

100%|██████████| 94/94 [01:26<00:00,  1.08it/s]


Bleu Score after Finetuning w/ 1:1 Ratio Data and mBART: 16.75961944765883


In [None]:
from google.colab import runtime
runtime.unassign()

### 1:3 Ratio (w/mBART)

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/mBART/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 3)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
152171,"阿修而家學咗好多佢嘅老教,間公司最近裁員,想喺高原柏拉圖租樓。",Hsu is now learning the same hard lessons as m...
152172,其他同學已經被送返屋企當日。,Other students had already been sent home for ...
152173,卡梅倫提出嘅方案係一種可以解決卡卡塔利僵局同埋利比利亞僵局嘅試圖。,Mr Cameron's proposal is an attempt to bypass ...
152174,"佢為咗衝線先開始出賽,唔怪得球根球迷喺呢幾年睇到全場五線射線入球。",He decided to play after the pregame shootarou...


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/152176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.657,0.532878,15.7486,18.6624
2,0.5515,0.517394,16.5309,18.6711


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.657,0.532878,15.7486,18.6624
2,0.5515,0.517394,16.5309,18.6711
3,0.4994,0.521316,16.5388,18.7747


Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=57066, training_loss=0.5856729440666024, metrics={'train_runtime': 13886.0179, 'train_samples_per_second': 32.877, 'train_steps_per_second': 4.11, 'total_flos': 1.236679912241234e+17, 'train_loss': 0.5856729440666024, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-mbart-1:3")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-mbart-1:3",
    local_files_only=True
).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 對人卑躬屈膝好冇尊嚴。
Model Translation: It is dishonest to bow to others.
Dataset Translation: It is undignified to be subservient and groveling to others.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:3 Ratio Data and mBART: {bleu_score}")

100%|██████████| 94/94 [01:08<00:00,  1.38it/s]


Bleu Score after Finetuning w/ 1:1 Ratio Data and mBART: 15.854980519528175


In [None]:
from google.colab import runtime
runtime.unassign()

## Finetune Other Model

### mBART (1:1)

In [12]:
with open(f"{base_path}Synthetic-Data/NLLB/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [13]:
with open(f"{base_path}Synthetic-Data/NLLB/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [14]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [15]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 1)],ignore_index=True)

In [16]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
76083,我哋想見到你二月廿九嘅最佳飛躍!,We want to see your best February 29 leap!
76084,"廿八歲嘅阿媽,三個幼兒嘅阿叔普斯話,影相係學生人影嘅,真係死難者。","But Colleps, who is 28 and the mother of three..."
76085,"檔案圖 - 今次六月十三日,華盛頓紅皮隊嘅後格里芬三世喺美國華盛頓州阿實出場。","FILE - In this June 13, 2012, file photo, Wash..."
76086,"而家阿美可以專心健康,承擔職責,可以繼續喺1974上領國八冠嘅計劃度領導。",Now Summitt can focus on her health and taking...


In [17]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [25]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to("cuda")
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "zh_CN"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [19]:
metric = evaluate.load("sacrebleu")


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []
  source_lang = "zh_CN"
  target_lang = "en_XX"

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length = 512,
        return_tensors="pt"
    ).to("cuda")
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    translated_batch = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True
    )
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [21]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:53<00:00,  1.21s/it]


Bleu Score Before Finetuning with facebook/mbart-large-50-many-to-many-mmt: 8.315683152180513


In [20]:
max_input_length = 512
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(
        inputs,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "en_XX"
    labels = tokenizer(
        targets,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "zh_CN"
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [26]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/76088 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [27]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [28]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [30]:
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    auto_find_batch_size = True,
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [31]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    for ind, sentence in enumerate(decoded_preds):
      decoded_preds[ind] = chinese_converter.to_traditional(sentence)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds,
        decoded_labels
    )
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [32]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5571,0.569484,14.639,19.4082
2,0.3767,0.541259,16.2603,19.2639
3,0.209,0.596258,16.0981,19.4419


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=28533, training_loss=0.40694443582005296, metrics={'train_runtime': 11779.7248, 'train_samples_per_second': 19.378, 'train_steps_per_second': 2.422, 'total_flos': 4.902127109367398e+16, 'train_loss': 0.40694443582005296, 'epoch': 3.0})

In [34]:
trainer.save_model(f"{base_path}model/mBart-1:1-nllb")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [35]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(
    f"{base_path}model/mBart-1:1-nllb",
    local_files_only = True,
).to("cuda")
tokenizer.src_lang = "zh_CN"

In [36]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:58<00:00,  1.26s/it]


Bleu Score Before Finetuning with facebook/mbart-large-50-many-to-many-mmt: 16.035756358682907


In [37]:
from google.colab import runtime
runtime.unassign()

### mBART (1:3)

In [19]:
with open(f"{base_path}Synthetic-Data/NLLB/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [20]:
with open(f"{base_path}Synthetic-Data/NLLB/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [21]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [22]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 3)],ignore_index=True)

In [23]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
152171,"呢個變化咁唔公平,會令成千上萬嘅家庭一個星期損失七十三英,放工會好啲。",But this unfair and damaging change will mean ...
152172,我唔係佢。,I wasn’t taking a stand either for or against ...
152173,"魚寶係一個好好嘅度假村,有澳洲最長嘅跑步,易為教學同滑雪地盤。","Thredbo is a great resort, with the longest ru..."
152174,"荷格斯特話:「今日嘅公告......應該明顯你係邊個都一定會俾人,如果你犯咗噉嘅罪行就會畀人問。」","""Today's announcement ... should make clear th..."


In [24]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [26]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name).to("cuda")
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer.src_lang = "zh_CN"

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [27]:
metric = evaluate.load("sacrebleu")


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []
  source_lang = "zh_CN"
  target_lang = "en_XX"

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length = 512,
        return_tensors="pt"
    ).to("cuda")
    generated_tokens = model.generate(
        **encoded_input,
        forced_bos_token_id=tokenizer.lang_code_to_id[target_lang]
    )
    translated_batch = tokenizer.batch_decode(
        generated_tokens,
        skip_special_tokens=True
    )
    for ind, sentence in enumerate(translated_batch):
      translated_batch[ind] = chinese_converter.to_traditional(sentence)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:53<00:00,  1.21s/it]


Bleu Score Before Finetuning with facebook/mbart-large-50-many-to-many-mmt: 8.315683152180513


In [28]:
max_input_length = 512
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(
        inputs,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "en_XX"
    labels = tokenizer(
        targets,
        return_tensors="pt",
        max_length=max_input_length,
        padding=True,
        truncation=True
    )
    tokenizer.src_lang = "zh_CN"
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [29]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/152176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [30]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [31]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [32]:
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    auto_find_batch_size = True,
    predict_with_generate=True,
    load_best_model_at_end=True,
)

In [33]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    for ind, sentence in enumerate(decoded_preds):
      decoded_preds[ind] = chinese_converter.to_traditional(sentence)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(
        decoded_preds,
        decoded_labels
    )
    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels
    )
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [34]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5664,0.576824,14.3905,19.7534
2,0.4204,0.546646,15.6796,19.3302
3,0.274,0.574887,16.1576,19.4855


Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=57066, training_loss=0.4381862871070241, metrics={'train_runtime': 22031.7792, 'train_samples_per_second': 20.721, 'train_steps_per_second': 2.59, 'total_flos': 1.0272863862718464e+17, 'train_loss': 0.4381862871070241, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/mBart-1:3-nllb")

Non-default generation parameters: {'max_length': 200, 'early_stopping': True, 'num_beams': 5, 'forced_eos_token_id': 2}


In [None]:
model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(
    f"{base_path}model/mBart-1:1-nllb",
    local_files_only = True,
).to("cuda")
tokenizer.src_lang = "zh_CN"

In [None]:
bleu_score = compute_bleu(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

In [None]:
from google.colab import runtime
runtime.unassign()

### Opus-MT (1:1)

In [None]:
with open(f"{base_path}Synthetic-Data/NLLB/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/NLLB/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 1)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
76083,好多基督徒都話:唔得。,"""Many in the Christian faith have said, 'Well,..."
76084,近年啲快餐小朋友嘅食嘢受到嚴密嘅審查。,Fast-food kids meals have been subject to inte...
76085,"康普納:希臘可能係地心,但係呢個係歐洲危機。",John Kampfner: Greece may be the epicentre – b...
76086,"你唔想返部電腦去修理工廠,自己換返個死爛電腦鎖匙幾難貴呀?",How hard and expensive is it to replace stuck ...


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
metric = evaluate.load("sacrebleu")

source_lang = "zh"
target_lang = "en"


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [01:35<00:00,  1.01s/it]


Bleu Score Before Finetuning with Helsinki-NLP/opus-mt-zh-en: 10.403463499107957


In [None]:
max_input_length = 512
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/76088 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 76088
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [None]:
batch_size = 16
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
from transformers import EarlyStoppingCallback
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.3987,0.450381,11.5199,28.6038
2,0.3332,0.453063,12.1279,28.4155
3,0.3563,0.440233,12.1215,28.5591
4,0.3287,0.442217,12.4147,29.0516
5,0.3042,0.447568,12.509,29.2692
6,0.2833,0.456032,12.2967,28.963
7,0.2608,0.465638,12.237,29.0283
8,0.2454,0.47669,12.6126,29.2483
9,0.2254,0.483098,12.4114,29.018
10,0.2081,0.501238,12.4304,29.3522


Checkpoint destination directory opus-mt-zh-en-finetuned/checkpoint-4756 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Checkpoint destination directory opus-mt-zh-en-finetuned/checkpoint-9512 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.3987,0.450381,11.5199,28.6038
2,0.3332,0.453063,12.1279,28.4155
3,0.3563,0.440233,12.1215,28.5591
4,0.3287,0.442217,12.4147,29.0516
5,0.3042,0.447568,12.509,29.2692
6,0.2833,0.456032,12.2967,28.963
7,0.2608,0.465638,12.237,29.0283
8,0.2454,0.47669,12.6126,29.2483
9,0.2254,0.483098,12.4114,29.018
10,0.2081,0.501238,12.4304,29.3522


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=95120, training_loss=0.21442790556796165, metrics={'train_runtime': 16144.2396, 'train_samples_per_second': 94.26, 'train_steps_per_second': 5.892, 'total_flos': 4.254371493602918e+16, 'train_loss': 0.21442790556796165, 'epoch': 20.0})

In [None]:
trainer.save_model(f"{base_path}model/opus-mt-zh-en-1:1-20E-nllb")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


In [None]:
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
model = AutoModelForSeq2SeqLM.from_pretrained(
  f"{base_path}model/opus-mt-zh-en-1:1-20E-nllb",
  local_files_only=True
).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score After Finetuning with Helsinki-NLP/opus-mt-zh-en: {bleu_score}")

Bleu Score After Finetuning with Helsinki-NLP/opus-mt-zh-en: 13.062283169701946


In [None]:
from google.colab import runtime
runtime.unassign()

### Opus-MT (1:3)

In [None]:
with open(f"{base_path}Synthetic-Data/NLLB/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/NLLB/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
random.seed(42)
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 3)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
152171,"佢老婆喺一九五年死咗,個仔喺一九七就搬咗。",His wife died in 1995 and his son moved away i...
152172,"愛和平,星期四由蛇尾出版,九九九九九。","Peace, Love & Potatoes, is published by Serpen..."
152173,"投資額最低五百蚊,個户可以轉換上年津貼。","There is a minimum investment of £500, and the..."
152174,"呢隻易飲嘅白金酒,成百年嘅香水,返個軟果汁埋一條精細嘅根。","This easy-drinking, amber-tinted blonde ale tu..."


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [14]:
metric = evaluate.load("sacrebleu")

source_lang = "zh"
target_lang = "en"


def compute_bleu(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")
    translated_output = model.generate(**encoded_input)
    translated_batch = [tokenizer.decode(t, skip_special_tokens=True) for t in translated_output]
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

100%|██████████| 94/94 [01:44<00:00,  1.12s/it]


Bleu Score Before Finetuning with Helsinki-NLP/opus-mt-zh-en: 10.403463499107957


In [None]:
max_input_length = 512
source_lang = "zh"
target_lang = "en"

def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/152176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [None]:
batch_size = 16
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
    load_best_model_at_end=True,
)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.4668,0.47538,10.4487,28.977
2,0.4133,0.440898,12.1053,28.9247
3,0.3761,0.424051,12.3089,28.6435
4,0.3481,0.412891,13.103,28.6508
5,0.3258,0.408879,13.1745,28.7687
6,0.3072,0.405631,13.6459,29.2423
7,0.286,0.406872,13.5239,28.96
8,0.2666,0.407357,13.7627,28.9427
9,0.2546,0.409567,13.7374,28.9107
10,0.2416,0.412231,13.8416,28.9454


Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


TrainOutput(global_step=95110, training_loss=0.3353392251970991, metrics={'train_runtime': 15016.2341, 'train_samples_per_second': 101.341, 'train_steps_per_second': 6.334, 'total_flos': 4.387472152304026e+16, 'train_loss': 0.3353392251970991, 'epoch': 10.0})

In [None]:
trainer.save_model(f"{base_path}model/opus-mt-zh-en-1:3-10E-nllb")

Non-default generation parameters: {'max_length': 512, 'num_beams': 6, 'bad_words_ids': [[65000]], 'forced_eos_token_id': 0}


In [12]:
model_name = "Helsinki-NLP/opus-mt-zh-en"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
  f"{base_path}model/opus-mt-zh-en-1:3-10E-nllb",
  local_files_only=True
).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/807k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.62M [00:00<?, ?B/s]

In [15]:
bleu_score = compute_bleu(dataset["test"][source_lang], dataset["test"][target_lang])
print(f"Bleu Score After Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [02:22<00:00,  1.51s/it]


Bleu Score After Finetuning with Helsinki-NLP/opus-mt-zh-en: 13.366554317566488


In [None]:
from google.colab import runtime
runtime.unassign()