## Install Necessary Dependencies

In [1]:
!pip install -U accelerate
!pip install -U transformers
!pip install datasets
!pip install sacrebleu
!pip install evaluate

Collecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m70.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.38.1
    Uninstalling transformers-4.38.1:
      Successfully uninstalled transformers-4.38.1
Successfully installed transformers-4.38.2
Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  

## Import and Mount Google Drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from transformers import BertTokenizer
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import pandas as pd
import random
from tqdm import tqdm
import gc
import torch
import evaluate
import numpy as np

## Import Data

In [4]:
base_path = "/content/drive/MyDrive/Cantonese-NLP/"
folders = ["train-short/","train","dev","test"]
file_name = ["yue.txt","en.txt"]

In [5]:
yue = []
en = []
for folder in folders:
  with open(f"{base_path}Processed-Data/{folder}/{file_name[0]}",'r') as f_yue, open(f"{base_path}Processed-Data/{folder}/{file_name[1]}",'r') as f_en:
    yue.append(f_yue.read().splitlines())
    en.append(f_en.read().splitlines())


In [6]:
# Concatenate Short and Long Training Example
yue[1] = yue[0] + yue[1]
del yue[0]
en[1] = en[0] + en[1]
del en[0]

In [7]:
train = zip(yue[0], en[0])
dev = zip(yue[1], en[1])
test = zip(yue[2], en[2])

train_df = pd.DataFrame(train, columns = ['zh','en'])
dev_df = pd.DataFrame(dev, columns = ['zh','en'])
test_df = pd.DataFrame(test, columns = ['zh','en'])

In [8]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})


In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [10]:
with open(f"{base_path}/Processed-Data/mono/mono_data.txt", "r", encoding='utf-8') as f:
  mono_data = f.readlines()

In [11]:
print(f"Number of Sentences: {len(mono_data)}")
print(f"Average Length: {sum([len(d) for d in mono_data])/ len(mono_data):.02f}")

Number of Sentences: 1135989
Average Length: 29.89


## Import Baseline Model for Translation

In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn")
nllb_ft_forward_model = AutoModelForSeq2SeqLM.from_pretrained(f"{base_path}model/nllb-200-distilled-600M-finetuned",local_files_only=True).half().to("cuda")

In [None]:
def get_translation(original_sentences, model, tokenizer):
  inputs = tokenizer(original_sentences, return_tensors="pt", padding=True, truncation=True).to("cuda")
  translated_tokens = model.generate(
      **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
  )
  return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)


In [None]:
get_translation(mono_data[0:3], nllb_ft_forward_model, nllb_tokenizer)

['I might spend a few hundred dollars to get a message on YouTube.',
 "Apple's junk is a dog. You can only play closed system, but when it gets out of business, it becomes a tr",
 "I've grown up and it's hard to find a lover. Even if I see a pretty girl in big boobs, she"]

## Synthesize Data

In [None]:
random.seed(42)

In [None]:
SYNTHETIC_DATA_SIZE = 200_000
TRANSLATION_BATCH_SIZE = 100

In [None]:
syn_data_yue = random.sample(mono_data, SYNTHETIC_DATA_SIZE)
syn_data_en = []

for i in tqdm(range(0, SYNTHETIC_DATA_SIZE, TRANSLATION_BATCH_SIZE)):
  inputs = nllb_tokenizer(syn_data_yue[i:i+TRANSLATION_BATCH_SIZE], return_tensors="pt", padding=True, truncation=True).to("cuda")
  translated_tokens = nllb_ft_forward_model.generate(
      **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
  )
  syn_data_en.extend(nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))


100%|██████████| 2000/2000 [18:39<00:00,  1.79it/s]


In [None]:
with open(f"{base_path}Synthetic-Data/Baseline/yue.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_yue:
    f.write(f"{line}")

In [None]:
with open(f"{base_path}Synthetic-Data/Baseline/en.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_en:
    f.write(f"{line}\n")

## Load Synthetic Data

In [None]:
with open(f"{base_path}Synthetic-Data/Baseline/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/Baseline/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
train_sys_df

Unnamed: 0,zh,en
0,我建議師傅完job後報國安拉鳩樓主,I suggest the teacher call in the police after...
1,拍得長的，通常都係冇晒新鮮感之後,"When you take a long shot, you usually lose yo..."
2,女「用到就得啦，你有無諗過我地將來呀，仲fing啲無謂錢，點解你可以咁自私」,"The woman said, ""It's okay, have you ever thou..."
3,我覺得LNG易過KT T1打返LCK就鳩鳩哋,I think it's easier to play LNG than KT1 to pl...
4,唔洗收玩具咩，我個仔玩完收晒玩具，機械人咪出黎開工,I don't need to take the toys away. My son has...
...,...,...
199995,所以呢啲死亡既元素應該係for女方而唔係男方,Is it because you don't want to talk to her af...
199996,藍井 呀姐 Aimer TrySail ClariS 是但加多一個都打得爆個場啦,Anyone who doesn't have the skills knows how t...
199997,點知撞正肺炎 全部收晒皮咁款(有冇人知有邊D參加既國家係有好大收益嫁??),I am sure there is a public record in the elec...
199998,咁老實講拍片係佢既工作,"This time, it's obvious that the verdict has n..."


In [None]:
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 0)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
38039,呢單嘢好杰，你要小心啲處理！,"This is a very troublesome case, so you have t..."
38040,平板電腦與手提電腦優缺點大剖白,A comparison between the pros and cons of lapt...
38041,嚟嚟去去都係得呢幾味餸，好厭啊！,The dishes are the same every time. I'm tired ...
38042,供養父母係一件天經地義嘅事。,Supporting our parents is our moral obligation.


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

## Create Back-Translation Model

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_back_tokenizer = AutoTokenizer.from_pretrained(
    model_name, src_lang="eng_Latn", tgt_lang="yue_Hant"
)
nllb_model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["en"]
print("Orginal Data: " + test_string)


inputs = nllb_back_tokenizer(test["en"], return_tensors="pt", padding=True, truncation=True).to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_back_tokenizer.lang_code_to_id["yue_Hant"], max_length=128
)

print("Model Translation: " + nllb_back_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["zh"])

Orginal Data: He is generous and everyone respects him a lot.
Model Translation: 佢係慷慨嘅,人人都非常尊重佢.
Dataset Translation: 佢為人厚道，大家都好敬重佢。


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb_zh(src_sentences, tgt_sentences, model, tokenizer, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id["yue_Hant"], max_length=128)
    translated_batch = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences, tokenize="zh")
  return res["score"]


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
bleu_score = compute_bleu_nllb_zh(dataset["test"]["en"], dataset["test"]["zh"], nllb_model, nllb_back_tokenizer)
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

100%|██████████| 94/94 [00:54<00:00,  1.74it/s]


Bleu Score Before Finetuning with facebook/nllb-200-distilled-600M: 11.451673584586171


In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "en"
target_lang = "zh"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_back_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)

    labels = nllb_back_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_back_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_back_tokenizer.pad_token_id)
    decoded_labels = nllb_back_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, tokenize="zh")
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_back_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    auto_find_batch_size=True,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_back_tokenizer, model=nllb_model)

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_back_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.9147,0.763185,20.1599,16.5631
2,0.7148,0.735305,22.0989,17.0347
3,0.5583,0.749171,22.3735,16.9417
4,0.4192,0.765189,22.5476,16.7774
5,0.3166,0.79833,22.5628,16.9017
6,0.2389,0.824402,22.5819,16.8924
7,0.1679,0.850814,22.7414,17.042
8,0.128,0.872401,22.7024,16.9907
9,0.0976,0.888213,22.6335,16.8397
10,0.0815,0.894755,22.8023,16.9277


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=47560, training_loss=0.37852500469570305, metrics={'train_runtime': 12437.6329, 'train_samples_per_second': 30.588, 'train_steps_per_second': 3.824, 'total_flos': 5.138182155426202e+16, 'train_loss': 0.37852500469570305, 'epoch': 10.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-back-finetuned-it1-10E")

Non-default generation parameters: {'max_length': 200}


In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_back_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)
nllb_back_ft_model_it1 = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/{model_name}-back-finetuned-it1-10E",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["en"]

print("Orginal Data: " + test_string)


inputs = nllb_back_tokenizer(test_string, return_tensors="pt", padding=True, truncation=True).to("cuda")

print(inputs)

translated_tokens = nllb_back_ft_model_it1.generate(
    **inputs, forced_bos_token_id=nllb_back_tokenizer.lang_code_to_id["yue_Hant"], max_length=128
)

print(translated_tokens)
print("Model Translation: " + nllb_back_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])

# print("Dataset Translation: " + test["zh"])

Orginal Data: Yesterday I forgot my umbrella and had to run back home in the heavy rain.
{'input_ids': tensor([[256047,   5342, 113434,    117, 149275,   1537,    505, 207301,    540,
           2908,    202,   8331,  11535,  13003,    108,    349, 131172, 105686,
         248075,      2]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}


NameError: name 'nllb_back_ft_model_it1' is not defined

In [None]:
bleu_score = compute_bleu_nllb_zh(dataset["test"]["en"], dataset["test"]["zh"], nllb_back_ft_model_it1, nllb_back_tokenizer)
print(f"Bleu Score Before Finetuning with {model_name}: {bleu_score}")

NameError: name 'nllb_back_ft_model_it1' is not defined

In [None]:
from google.colab import runtime
runtime.unassign()

## Back Translation Model Synthetic Data

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_back_tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    src_lang="eng_Latn",
    tgt_lang="yue_Hant"
)

nllb_back_ft_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/{model_name}-back-finetuned-it1-1:0",
    local_files_only=True
).to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

In [None]:
with open(f"{base_path}Processed-Data/mono/mono_data_en.txt", "r", encoding='utf-8') as f:
  mono_data_en = f.readlines()

In [None]:
print(f"Number of Sentences: {len(mono_data_en)}")
print(f"Average Length: {sum([len(d) for d in mono_data_en])/ len(mono_data_en):.02f}")

Number of Sentences: 434684
Average Length: 133.42


In [None]:
random.seed(42)

In [None]:
SYNTHETIC_DATA_SIZE = 200_000
TRANSLATION_BATCH_SIZE = 50

In [None]:
syn_data_en = random.sample(mono_data_en, SYNTHETIC_DATA_SIZE)
syn_data_yue = []

for i in tqdm(range(0, SYNTHETIC_DATA_SIZE, TRANSLATION_BATCH_SIZE)):
  inputs = nllb_back_tokenizer(syn_data_en[i:i+TRANSLATION_BATCH_SIZE], return_tensors="pt", padding=True, truncation=True).to("cuda")
  translated_tokens = nllb_back_ft_model.generate(
      **inputs, forced_bos_token_id=nllb_back_tokenizer.lang_code_to_id["yue_Hant"], max_length=100
  )
  syn_data_yue.extend(nllb_back_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))


 30%|███       | 1210/4000 [29:59<1:02:45,  1.35s/it]

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
train_sys_df

Unnamed: 0,zh,en
0,保守黨最強力求各界嘅選民。,that the Conservative Party is at its stronges...
1,"上半場俾烏爾斯特嘅高手球王平納入波,入咗兩球,加碼第二球就大力60米,士冇咁力,上季決賽落咗...",In a first half bookended by Ulster's talisman...
2,快啲嚟 Denver Health Medical啦。,“Come to Denver Health Medical Center as soon ...
3,"博彩公司自從喺賽道俾草地會計主權,佢哋先至喺賽事開始之前賭,用奇怪嘅訊號系統,用嚟傳價俾博彩...",Bookmaking has come a long way since it was do...
4,"我都面對同一條決定,放棄權利定留低細路喺屋企。",I’ve faced the same choice — give up my rights...
...,...,...
199995,"段片顯示727撞地,俾追逐直升機跟蹤。",The video shows the 727 crashing to the ground...
199996,"羅話,阿嘅布德威iser好清淨。","Budvar has ""a full bodied taste"" while ""AB's B..."
199997,"沙巴真係好難自定義,「大西洋會」華盛頓米高爾安沙利非洲中心嘅總監J.Peter Pham話。",“The Shabab is really struggling to define its...
199998,"我哋為世以榮舉,唔會遮。",It will not dim the light of the values we pro...


In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/yue.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_yue:
    f.write(f"{line}\n")

In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/en.txt", "w", encoding="UTF-8") as f:
  for line in syn_data_en:
    f.write(f"{line}")

## First Iteration Forward Model（1:1）

In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 1)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
76083,"熱帶風暴襲侵印度南部,引致暴雨及暴風雨,令150,000人流離離所需。","A tropical storm slams into southern India, br..."
76084,"呢單決定令英國上市電訊集團,之前喺印度最高法院贏到呢單嘢,加上分析人士同貿易機構對印度對國際...",The decision prompted dismay from the UK-liste...
76085,"南部廣東嘅出口大廈,面對全球經濟衰退加劇社會緊張,應該會俾內蒙古黨總統胡春華掌握,聯繫領導嘅...",The export powerhouse of Guangdong in the sout...
76086,"實啦,而家啲監管可以到啲交易所嘅位。","Sure, regulators can now get access to dealers..."


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/76088 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5706,0.526841,15.4996,18.3349
2,0.4836,0.516193,16.4546,18.6544
3,0.3998,0.528258,16.1164,18.8071
4,0.3307,0.54482,16.5389,18.8784
5,0.2697,0.577442,16.2762,19.1783
6,0.2237,0.620672,16.1183,18.993
7,0.174,0.653028,15.8392,19.2736
8,0.1455,0.692812,15.7463,19.2029
9,0.118,0.71946,16.0155,19.037
10,0.102,0.734643,15.8388,19.2459


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=95110, training_loss=0.28419081715301137, metrics={'train_runtime': 24638.8348, 'train_samples_per_second': 30.881, 'train_steps_per_second': 3.86, 'total_flos': 1.6077151396252877e+17, 'train_loss': 0.28419081715301137, 'epoch': 10.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-it1-1:1")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:1",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: I forgot to bring my umbrella yesterday, so I had to run home in case of heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: {bleu_score}")

100%|██████████| 94/94 [01:09<00:00,  1.35it/s]


Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: 16.66532520572667


## First Iteration Forward Model (1:3)


In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 3)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
152171,父之喜樂變咗做「藍花故」等對幼兒嘅威脅詩。,The joy of fatherhood mutates into terrified p...
152172,伊朗軍司令阿達拉沙利星期二被半正式伊朗新聞社援引話「過荷爾摩斯海去阿曼灣嘅美軍艦唔返波斯灣。」,"On Tuesday, the chief of Iran’s military, Maj...."
152173,"佢話漫畫可以順咁回溯到時空,創造氣氛。","Comics can go “seamlessly” back in time, and b..."
152174,"佢最近似乎越嚟越關注女選人,加緊油價。","Over the past few days, he appears to be incre..."


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 152176
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/152176 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5139,0.533344,15.5386,19.04
2,0.443,0.518796,16.3757,18.7964
3,0.3712,0.519842,16.6398,18.9014


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=57066, training_loss=0.46463333457834743, metrics={'train_runtime': 14177.2063, 'train_samples_per_second': 32.202, 'train_steps_per_second': 4.025, 'total_flos': 9.661402667640422e+16, 'train_loss': 0.46463333457834743, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-it1-1:3")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:3",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: I forgot to bring an umbrella yesterday, so I had to run home in case of heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: {bleu_score}")

100%|██████████| 94/94 [01:12<00:00,  1.29it/s]


Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: 16.158810183259966


## First Iteration Forward Model (1:5)



In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/yue.txt", "r",
          encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [None]:
with open(f"{base_path}Synthetic-Data/Iteration1/en.txt", "r",
          encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [None]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [None]:
mixed_train_df = pd.concat([train_df, train_sys_df.sample(len(train_df) * 5)],ignore_index=True)

In [None]:
mixed_train_df

Unnamed: 0,zh,en
0,50自,50-m freestyle race
1,AV線,audio-visual cable
2,DT堂,DT lesson
3,DT室,DT room
4,OL衫,clothes for office ladies
...,...,...
228259,美國老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老老...,"""For the record, there was nothing remotely ra..."
228260,"聯合國支持嘅營地主管施特朗卡馬西話原本係裝置住四千人,但係而家收埋七千人。","Straton Kamanzi, the manager of the U.N.-suppo..."
228261,"超級市場話師應該攻擊啲唔夠錢買奶嘅,例如食品廠商。",Supermarkets said the chefs should be attackin...
228262,"佢仲喺巴塞克嘅比賽中,贏咗63%。",She also won 63 percent of the points off Pasz...


In [None]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 228264
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [None]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [None]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/228264 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [None]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [None]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [None]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5151,0.537383,15.0733,18.7074
2,0.4395,0.524023,15.9991,19.0017
3,0.3784,0.523501,16.4196,18.7491


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=85599, training_loss=0.45978623346535447, metrics={'train_runtime': 20353.7784, 'train_samples_per_second': 33.644, 'train_steps_per_second': 4.206, 'total_flos': 1.449234272157696e+17, 'train_loss': 0.45978623346535447, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-it1-1:5")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-it1-1:5",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: I forgot to bring an umbrella yesterday, so I had to run home in case of heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:5 Ratio Data and nllb-200-distilled-600M: {bleu_score}")

100%|██████████| 94/94 [01:10<00:00,  1.33it/s]


Bleu Score after Finetuning w/ 1:5 Ratio Data and nllb-200-distilled-600M: 16.065691321449496


In [None]:
from google.colab import runtime
runtime.unassign()

## First Iteration Forward Model（0.5:0.5）

In [15]:
with open(f"{base_path}Synthetic-Data/NLLB/yue.txt", "r", encoding="UTF-8") as f:
  syn_data_yue = f.read().splitlines()

In [14]:
with open(f"{base_path}Synthetic-Data/NLLB/en.txt", "r", encoding="UTF-8") as f:
  syn_data_en = f.read().splitlines()

In [16]:
train_sys = zip(syn_data_yue, syn_data_en)
train_sys_df = pd.DataFrame(train_sys, columns = ['zh','en'])

In [19]:
random.seed(42)
half = int(len(train_df) * 0.5)
print(f"Dataset: {half} Real - {half} Synthetic")
mixed_train_df = pd.concat([train_df.sample(half), train_sys_df.sample(half)],ignore_index=True)

Dataset: 19022 Real - 19022 Synthetic


In [20]:
mixed_train_df

Unnamed: 0,zh,en
0,如果你真係畀大家一人一票揀心儀代議士嘅，就唔使叫人含淚投票啦。,If you really suggest everyone could cast thei...
1,廣東話先有得咁講，英文冇囉。,"Only Cantonese has this saying, English does ..."
2,地下刊物,underground publications
3,呢啲公價貨嚟㗎，減唔到俾你。,Everyone is selling at this price. There's no ...
4,好心你痛風就唔好食咁多內臟啦。,Don't eat so much offal if you have gout.
...,...,...
38039,"布洛氏同同事唔肯定有咩生物縮碼,但係佢哋都幾有意,原來係伯格曼嘅法則。",Bloch and his collaborators could not be sure ...
38040,"我哋坐埋一齊食飯,傾咗六六年前嘅事。","""And we sat and had lunch together and discuss..."
38041,五名美軍喺哥倫布亞被指行失當,Five U.S. military accused of misconduct in Co...
38042,呢度就係阿沙巴嘅大戰士越嚟越頻派兵攻打肯亞捕食大象。,This is where al-Shabab warlords send raiding ...


In [21]:
dataset = DatasetDict({
    'train': Dataset.from_pandas(mixed_train_df),
    'dev': Dataset.from_pandas(dev_df),
    'test': Dataset.from_pandas(test_df)})

In [22]:
dataset

DatasetDict({
    train: Dataset({
        features: ['zh', 'en'],
        num_rows: 38044
    })
    dev: Dataset({
        features: ['zh', 'en'],
        num_rows: 3001
    })
    test: Dataset({
        features: ['zh', 'en'],
        num_rows: 3000
    })
})

In [23]:
model_name = "facebook/nllb-200-distilled-600M"

nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang="yue_Hant", tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M").to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [24]:
prefix = ""
max_input_length = 128
max_target_length = 128
source_lang = "zh"
target_lang = "en"
def preprocess_function(examples):
    inputs =  examples[source_lang]
    targets = examples[target_lang]
    model_inputs = nllb_tokenizer(inputs, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    labels = nllb_tokenizer(targets, return_tensors="pt", max_length=max_input_length, padding=True, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [25]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/38044 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [26]:
batch_size = 8
model_name = model_name.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16 = True,
    load_best_model_at_end=True,
)

In [27]:
data_collator = DataCollatorForSeq2Seq(nllb_tokenizer, model=nllb_model)

In [28]:
metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = nllb_tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, nllb_tokenizer.pad_token_id)
    decoded_labels = nllb_tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != nllb_tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [29]:
trainer = Seq2SeqTrainer(
    nllb_model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=nllb_tokenizer,
    compute_metrics=compute_metrics,
)

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,0.5367,0.534869,15.4173,18.6941
2,0.4373,0.529822,16.1946,18.7647
3,0.3496,0.538973,15.8415,18.7667


Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
Non-default generation parameters: {'max_length': 200}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=14268, training_loss=0.4865281019753557, metrics={'train_runtime': 3005.3291, 'train_samples_per_second': 37.977, 'train_steps_per_second': 4.748, 'total_flos': 2.412847491789619e+16, 'train_loss': 0.4865281019753557, 'epoch': 3.0})

In [None]:
trainer.save_model(f"{base_path}model/{model_name}-finetuned-half:half")

Non-default generation parameters: {'max_length': 200}


In [None]:
nllb_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="yue_Hant",
    tgt_lang="eng_Latn"
)

nllb_model = AutoModelForSeq2SeqLM.from_pretrained(
    f"{base_path}model/nllb-200-distilled-600M-finetuned-half:half",
    local_files_only=True
).to("cuda")

In [None]:
n = random.randint(0,3000)
test = dataset["test"][n]
test_string = test["zh"]
print("Orginal Data: " + test_string)


inputs = nllb_tokenizer(test["zh"], return_tensors="pt").to("cuda")

translated_tokens = nllb_model.generate(
    **inputs, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=30
)

print("Model Translation: " + nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
print("Dataset Translation: " + test["en"])

the `lang_code_to_id` attribute is deprecated. The logic is natively handled in the `tokenizer.adder_tokens_decoder` this attribute will be removed in `transformers` v4.38


Orginal Data: 噚日我唔記得帶遮，焗住要冒住大雨跑返屋企。
Model Translation: On the previous day, I forgot to bring my umbrella. I had to run home in case of heavy rain.
Dataset Translation: Yesterday I forgot my umbrella and had to run back home in the heavy rain.


In [None]:
metric = evaluate.load("sacrebleu")

def compute_bleu_nllb(src_sentences, tgt_sentences, batch_size=32):
  pred = []

  for i in tqdm(range(0, len(src_sentences), batch_size)):
    batch = src_sentences[i:i + batch_size]
    encoded_input = nllb_tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    translated_tokens = nllb_model.generate(**encoded_input, forced_bos_token_id=nllb_tokenizer.lang_code_to_id["eng_Latn"], max_length=128)
    translated_batch = nllb_tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
    pred.extend(translated_batch)

  res = metric.compute(predictions=pred, references=tgt_sentences)
  return res["score"]

In [None]:
bleu_score = compute_bleu_nllb(dataset["test"]["zh"], dataset["test"]["en"])
print(f"Bleu Score after Finetuning w/ 1:1 Ratio Data and nllb-200-distilled-600M: {bleu_score}")