In [None]:
! pip install transformers datasets evaluate sacrebleu sentencepiece protobuf

In [None]:
from huggingface_hub import notebook_login
# Call huggingface_hub's notebook login helper (presumably prompts for a Hub
# access token interactively — nothing is hardcoded in this notebook).
notebook_login()

In [1]:
!wget https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz

--2024-07-03 06:20:32--  https://github.com/stefan-it/nmt-en-vi/raw/master/data/train-en-vi.tgz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/stefan-it/nmt-en-vi/master/data/train-en-vi.tgz [following]
--2024-07-03 06:20:33--  https://raw.githubusercontent.com/stefan-it/nmt-en-vi/master/data/train-en-vi.tgz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9903559 (9.4M) [application/octet-stream]
Saving to: ‘train-en-vi.tgz.1’


2024-07-03 06:20:33 (48.8 MB/s) - ‘train-en-vi.tgz.1’ saved [9903559/9903559]



In [None]:
import tarfile

# Unpack the downloaded corpus archive into ./data.
file_path = 'train-en-vi.tgz'
with tarfile.open(file_path, 'r:gz') as tar:
  # The archive comes from the network: where the interpreter supports
  # extraction filters, use the 'data' filter so members cannot escape the
  # destination directory. Older interpreters fall back to the plain call.
  if hasattr(tarfile, 'data_filter'):
    tar.extractall('data', filter='data')
  else:
    tar.extractall('data')

In [2]:
import os
import html
from torch.utils.data import Dataset, DataLoader

# Set the TOKENIZERS_PARALLELISM switch to "false" for this process
# (presumably to avoid the tokenizers fork/parallelism warning when batches
# are produced through a DataLoader — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
class EnToViDataset(Dataset):
  """Parallel English/Vietnamese sentence pairs read from two aligned text files.

  The files `<root_dir>/<split>.en` and `<root_dir>/<split>.vi` are read line
  by line; line i of one file is paired with line i of the other. HTML
  entities are unescaped in every sentence.

  Args:
    root_dir: directory that contains the two files.
    split: file stem, e.g. 'train' for train.en / train.vi.

  Raises:
    AssertionError: if the two files do not have the same number of lines.
  """

  def __init__(self, root_dir, split):
    src_data = self._read_lines(os.path.join(root_dir, f'{split}.en'))
    tgt_data = self._read_lines(os.path.join(root_dir, f'{split}.vi'))
    assert len(src_data) == len(tgt_data), 'Source and target file have different number of lines'
    self.data = list(zip(src_data, tgt_data))

  @staticmethod
  def _read_lines(path):
    """Return the unescaped lines of `path` without their line terminators."""
    with open(path, 'r', encoding='utf-8') as f:
      # Iterate over lines instead of read().split('\n'): a final newline must
      # not produce a trailing empty sentence (which became a bogus ('', '')
      # sample). Unescape per line so an entity that decodes to a line break
      # cannot change the line count and misalign the pairs.
      return [html.unescape(line.rstrip('\n')) for line in f]

  def __len__(self):
    """Number of sentence pairs."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the (source, target) tuple stored at `idx`."""
    return self.data[idx]



In [3]:
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

def _read_parallel_lines(path):
  """Return the HTML-unescaped lines of `path` without line terminators."""
  with open(path, 'r', encoding='utf-8') as f:
    # Line iteration (rather than read().split('\n')) avoids a trailing empty
    # sentence caused by the file's final newline.
    return [html.unescape(line.rstrip('\n')) for line in f]

src_data = _read_parallel_lines(os.path.join('data', 'train.en'))
tgt_data = _read_parallel_lines(os.path.join('data', 'train.vi'))
# zip() below would silently truncate (and hide a misalignment) if the two
# files disagreed in length, so check up front.
assert len(src_data) == len(tgt_data), 'Source and target file have different number of lines'

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50", use_fast=True, src_lang="en_XX", tgt_lang="vi_VN")

def data_gen():
  """Yield one tokenized example per (English, Vietnamese) sentence pair.

  Both sides are truncated to at most 128 tokens; the target sentence is
  passed as `text_target`.
  """
  for src, tgt in zip(src_data, tgt_data):
    yield tokenizer(src, text_target=tgt, max_length=128, truncation=True)

train_data = Dataset.from_generator(data_gen)

In [4]:
from transformers import DataCollatorForSeq2Seq

checkpoint = "facebook/mbart-large-50"
# The collator's `model` argument expects a model instance, not a checkpoint
# name; the string that used to be passed here was silently ignored, and the
# model itself is only loaded in a later cell. Leaving the argument out gives
# the same batches without suggesting the collator knows about the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

# Return torch tensors from the dataset and batch with dynamic padding.
train_data.set_format('torch')
train_loader = DataLoader(train_data, shuffle=True, batch_size=8, collate_fn=data_collator, pin_memory=True)

In [5]:
import evaluate

# Load the "sacrebleu" metric through the evaluate library; compute_metrics
# below calls metric.compute(...) and reads the "score" entry of its result.
metric = evaluate.load("sacrebleu")

In [6]:
import numpy as np

def postprocess_text(preds, labels):
    """Trim surrounding whitespace from decoded predictions and references.

    Args:
        preds: iterable of predicted strings.
        labels: iterable of reference strings.

    Returns:
        A tuple (preds, labels): preds is a list of stripped strings and
        labels is a list of one-element lists, each holding one stripped
        reference.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for a batch of predictions.

    Args:
        eval_preds: a pair (preds, labels) of token-id arrays. If `preds` is a
            tuple, only its first element is used.

    Returns:
        dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded. The labels use it as
    # an ignore marker; predictions are masked the same way in case they were
    # padded with -100 as well (e.g. when batches of different generated
    # lengths are concatenated by the evaluation loop — TODO confirm for the
    # loop in use). Masking also keeps such padding out of gen_len.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": bleu["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [7]:
from transformers import MBartForConditionalGeneration

# Instantiate MBart from `checkpoint` (assigned in an earlier cell) and print
# the value returned by num_parameters().
model = MBartForConditionalGeneration.from_pretrained(checkpoint)
print(model.num_parameters())

610879488


In [8]:
from functools import partial
import torch
import math
from torch.optim.lr_scheduler import LambdaLR

def _get_cosine_schedule_with_warmup_lr_lambda(
    current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float
):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))
    progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
    return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

def get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1
):
    """Build a LambdaLR scheduler with linear warmup and cosine decay.

    Args:
        optimizer: optimizer whose learning rate is scheduled.
        num_warmup_steps: number of linear warmup steps.
        num_training_steps: total number of scheduler steps.
        num_cycles: cosine cycles after warmup (default 0.5).
        last_epoch: passed through to LambdaLR (default -1).

    Returns:
        A torch.optim.lr_scheduler.LambdaLR instance.
    """
    def lr_lambda(current_step: int) -> float:
        # Return a plain Python float. Wrapping the factor in torch.tensor
        # made every param group's `lr` a tensor once LambdaLR multiplied it
        # into the base learning rate.
        return _get_cosine_schedule_with_warmup_lr_lambda(
            current_step,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_training_steps,
            num_cycles=num_cycles,
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [9]:
import torch
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
optimizer = AdamW(model.parameters(), lr=2e-5)

# Mixed precision is only used on CUDA: bfloat16 where supported, otherwise
# float16 with loss scaling. On CPU train in float32, so that `dtype` (printed
# by the training cell) reports what is actually used instead of relying on
# autocast and GradScaler warning and disabling themselves.
use_cuda = device.type == "cuda"
if not use_cuda:
    dtype = 'float32'
elif torch.cuda.is_bf16_supported():
    dtype = 'bfloat16'
else:
    dtype = 'float16'
ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
ctx = torch.amp.autocast(device_type='cuda', dtype=ptdtype, enabled=use_cuda)
# Loss scaling is only needed for float16.
scaler = torch.cuda.amp.GradScaler(init_scale=2.0**14, enabled=(dtype == 'float16'))

num_epochs = 1
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer, num_warmup_steps=50, num_training_steps=num_training_steps
)


In [None]:
# English sample sentence, tokenized (truncated to 128 tokens) as a batch of
# one and moved to the training device.
text = 'Swift spent her early years on a Christmas tree farm in New York that her father had purchased from one of his clients, and she spent her summers at her family\'s vacation home in, New Jersey, where she occasionally performed acoustic songs at a local coffee shop.'
input_tokens = tokenizer(
    text,
    truncation=True,
    max_length=128,
    return_tensors="pt",
).to(device)

In [None]:
import torch._dynamo
import torch.nn.functional as F
torch._dynamo.config.suppress_errors = True

torch.cuda.empty_cache()
# torch.autograd.set_detect_anomaly(True)
print('compiling model...')
# NOTE: compilation is currently switched off; re-enable with
# model = torch.compile(model)
model.to(device)

print(f'training with {dtype}')
pbar = tqdm(range(num_training_steps))
model.train()

for epoch in range(num_epochs):
    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with ctx:
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # Unscale before clipping so the threshold applies to the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # A finite total norm means every gradient is finite, so the
        # per-parameter NaN scan (one device sync per parameter) is only
        # needed when the norm itself is inf/NaN.
        if not torch.isfinite(grad_norm):
            for param in model.parameters():
                if param.grad is not None and torch.isnan(param.grad).any():
                    print("nan gradient found")

        scaler.step(optimizer)
        scale = scaler.get_scale()

        scaler.update()
        # If the scaler lowered its scale, the optimizer step was skipped; do
        # not advance the LR schedule in that case.
        if scaler.get_scale() >= scale:
            lr_scheduler.step()
        optimizer.zero_grad()

        if i % 250 == 0:
            # Periodically translate a few sentences of the current batch as a
            # qualitative progress check.
            model.eval()

            sample_ids = batch['input_ids'][:3]
            sample_mask = batch['attention_mask'][:3]
            output_ids = model.generate(
                sample_ids,
                attention_mask=sample_mask,  # explicit mask for the padded rows
                forced_bos_token_id=tokenizer.lang_code_to_id["vi_VN"],
            )
            # Decode only the rows that were translated.
            src_sentences = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
            tgt_sentences = tokenizer.batch_decode(output_ids, skip_special_tokens=True)

            print(f'iter {epoch*len(train_loader) + i}:')
            for (src, tgt) in zip(src_sentences, tgt_sentences):
                print(f'Source: {src}; Translate: {tgt}')

            print()
            model.train()

        # Stop the inner loop at step index 3000.
        if i == 3000:
            break
        pbar.update(1)
        pbar.set_postfix({'loss': loss.item()})

pbar.close()

compiling model...
training with bfloat16


  0%|          | 0/16665 [00:00<?, ?it/s]

  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


iter 0:
Source: Well, maybe he was too trusting, because he stayed where he was long after the Nazi Anschluss in Austria and even after the arrests and deportations began in Budapest.; Translate: Well, maybe he was too trusting, because he stayed where he was long after the Nazi Anschluss in Austria and even after the arrests and deportations began in Budapest.
Source: Dum da ta da dum Dum da ta da dum Da ta da da That's a lot of power.; Translate: Dum da ta da dum Dum da ta da ta da dum Da ta da da da That's a lot of power.
Source: We're four percent of the world's population ; we use 25 percent of the world's oil production.; Translate: We're four percent of the world's population ; we use 25 percent of the world's oil production.

iter 250:
Source: When her mother fell sick she said, " Sunitha, you have so much of contacts.; Translate: Khi mẹ của cô ấy bị bệnh, cô ấy nói, " Sunitha, cô có rất nhiều liên lạc.
Source: So, who told? I told.; Translate: Vậy, ai đã nói? Tôi đã nói.
Sourc

In [None]:
# The training loop leaves the model in train mode; switch to eval so dropout
# is disabled while generating.
model.eval()
# `input_tokens` is the tokenized sample sentence from the earlier cell; the
# name `inputs` that was unpacked here is not defined anywhere in the notebook.
translated_tokens = model.generate(**input_tokens, forced_bos_token_id=tokenizer.lang_code_to_id["vi_VN"], output_scores=True)
print(translated_tokens)
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)