##### Authors: Nay Lin
This notebook wants to test MBart before and after adding the self-consistency layer.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Load the data
train_df = pd.read_csv('/content/drive/MyDrive/data/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/data/test.csv')
validation_df = pd.read_csv('/content/drive/MyDrive/data/validation.csv')

# Sanity check
print(validation_df['en'])
print(validation_df['zh'])

0      Last year I showed these two slides so that  d...
1      But this understates the seriousness of this p...
2      The arctic ice cap is, in a sense,  the beatin...
3          It expands in winter and contracts in summer.
4      The next slide I show you will be  a rapid fas...
                             ...                        
874    You increase paralysis, and you decrease satis...
875                          Everybody needs a fishbowl.
876    This one is almost certainly too limited --  p...
877    But the absence of some metaphorical fishbowl ...
878                                 Thank you very much.
Name: en, Length: 879, dtype: object
0      去年我给各位展示了两个 关于北极冰帽的演示 在过去三百万年中 其面积由相当于美国南方48州面...
1                        但这些没能完全说明这个问题的严重性 因为这没有表示出冰帽的厚度
2                               感觉上，北极冰帽 就好象全球气候系统中跳动的心脏
3                                          冬天心脏舒张，夏天心脏收缩
4                                  下面我要展示的是 在过去25年里的极剧变化
                             ...                   

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch

model_name = "facebook/mbart-large-50-many-to-many-mmt"
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

In [None]:
# Translation Test
src_lang = "en_XX"
tgt_lang = "zh_CN"

# Func. def. if using GPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)
model = model.to(device)

def translate(model, tokenizer, sentence, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    encoded_input = tokenizer(sentence, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translation

# To only use CPU, run the previous cell + this func. def. below
# def translate(model, tokenizer, sentence, tgt_lang):
#     encoded_input = tokenizer(sentence, return_tensors="pt")
#     generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
#     translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
#     return translation

english_sentence = "Hello, how are you?"
chinese_translation = translate(model, tokenizer, english_sentence, src_lang, tgt_lang)
back_translate = translate(model, tokenizer, chinese_translation, tgt_lang, src_lang)
print(f"English: {english_sentence}\nChinese: {chinese_translation}\nBack-translated: {back_translate}")

cuda
English: Hello, how are you?
Chinese: 你好,你好吗?
Back-translated: Hello, how are you?


In [None]:
!pip3 install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.1


In [None]:
# NOTE: If you already ran this before, load the translations from a text file and get the BLEU Score
with open('/content/drive/MyDrive/mbarttranslations.txt', 'r', encoding='utf-8') as f:
    hyp = [line.strip() for line in f.readlines()]
refs = [[row['zh']] for _, row in validation_df.iterrows()]
import sacrebleu
bleu_score = sacrebleu.corpus_bleu(hyp, refs, tokenize='zh')
print(f"BLEU Score: {bleu_score.score}")

BLEU Score: 28.317815064640417


In [None]:
# Method to perform SacreBleu test on pretrained model (en to zh)
import sacrebleu
def translate_and_evaluate(df, model, tokenizer, src_lang, tgt_lang):
    if src_lang == "en":
      tokenizer.src_lang, tokenizer.tgt_lang = "en_XX", "zh_CN"
    else:
      tokenizer.src_lang, tokenizer.tgt_lang = "zh_CN", "en_XX"

    hyp = []
    refs = [[] for _ in range(len(df))]
    for idx, row in df.iterrows():
        # Translate the sentence
        encoded_input = tokenizer(row[src_lang], return_tensors="pt").to(device)
        generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id[tokenizer.tgt_lang])
        translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
        hyp.append(translation)

        # Prepare references
        refs[idx].append(row[tgt_lang])

    # Compute BLEU score
    if tgt_lang == "en":
      sacrebleu_without_smoothing = sacrebleu.corpus_bleu(hyp, refs)
    else:
      sacrebleu_without_smoothing = sacrebleu.corpus_bleu(hyp, refs, tokenize='zh')
    return hyp, sacrebleu_without_smoothing

In [None]:
# Perform the forward translation and evaluation
translations, bleu_score = translate_and_evaluate(validation_df, model, tokenizer, 'en', 'zh')
print(f"BLEU Score: {bleu_score.score}")

# Save translations to a file
with open('forwardmbarttranslations.txt', 'w', encoding='utf-8') as f:
    for translation in translations:
        f.write(translation + '\n')

BLEU Score: 28.317815064640417


In [None]:
# Perform the backward translation and evaluation
translations, bleu_score = translate_and_evaluate(validation_df, model, tokenizer, 'zh', 'en')
print(f"BLEU Score: {bleu_score.score}")
# Save translations to a file
with open('backwardmbarttranslations.txt', 'w', encoding='utf-8') as f:
    for translation in translations:
        f.write(translation + '\n')

BLEU Score: 52.511563937151365


In [None]:
# Define self-consistency simply using cosine_similarity
# TODO: Test other metrics e.g. MSE, etc.
import torch
from torch.nn.functional import cosine_similarity

def self_consistency_loss(original_text, model, tokenizer):
    # Translate to Chinese
    translated_text = translate(model, tokenizer, original_text, "zh_CN")

    # Translate back to English
    back_translated_text = translate(model, tokenizer, translated_text, "en_XX")

    # Encoded texts
    encoded_input = tokenizer(original_text, return_tensors="pt").to(device)
    encoded_back_translated_input = tokenizer(back_translated_text, return_tensors="pt").to(device)

    # Calculate cosine similarity between original and back-translated text embeddings
    original_embedding = model.get_encoder()(**encoded_input).last_hidden_state.mean(dim=1)
    back_translated_embedding = model.get_encoder()(**encoded_back_translated_input).last_hidden_state.mean(dim=1)
    loss = 1 - cosine_similarity(original_embedding, back_translated_embedding)

    return loss

In [None]:
# Checking the maximum length of the tokens
en_token_len = [len(tokenizer.encode(text)) for text in train_df['en']]
avg_en_token_len = sum(en_token_len) / len(en_token_len)
print(f"Average English Token Length: {avg_en_token_len}")

zh_token_len = [len(tokenizer.encode(text)) for text in train_df['zh']]
avg_zh_token_len = sum(zh_token_len) / len(zh_token_len)
print(f"Average Chinese Token Length: {avg_zh_token_len}")

Average English Token Length: 25.429808099763907
Average Chinese Token Length: 22.599063416152827


In [None]:
from torch.utils.data import Dataset, DataLoader

# You can adjust
# 1. max_length
# 2. batch_size

class TranslationDataset(Dataset):
    # Here I chose max length 32 because it is bigger than 25.4 and 22.6
    def __init__(self, dataframe, tokenizer, max_length=32):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        english_text = self.dataframe.iloc[idx]['en']
        chinese_text = self.dataframe.iloc[idx]['zh']
        tokenized_input = self.tokenizer(english_text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        tokenized_target = self.tokenizer(chinese_text, return_tensors="pt", max_length=self.max_length, padding="max_length", truncation=True)
        input_ids = tokenized_input.input_ids.squeeze()
        attention_mask = tokenized_input.attention_mask.squeeze()
        labels = tokenized_target.input_ids.squeeze()
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Create the dataset
dataset = TranslationDataset(train_df, tokenizer)

# Create the DataLoader
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=2)


In [None]:
import time
import os
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

def save_checkpoint(model, optimizer, epoch, batch_index, checkpoint_path):
    torch.save({
        'epoch': epoch,
        'batch_index': batch_index,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, checkpoint_path)
    print(f"Checkpoint saved at epoch {epoch}, batch {batch_index}")

# Define the checkpoint path
checkpoint_dir = "/content/drive/MyDrive/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Check if a checkpoint exists
latest_checkpoint_path = os.path.join(checkpoint_dir, "latest_checkpoint.pt")
if os.path.exists(latest_checkpoint_path):
    # Load the checkpoint
    checkpoint = torch.load(latest_checkpoint_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch']
    start_batch_index = checkpoint['batch_index'] + 1
    print(f"Restarting from {start_epoch} and batch {start_batch_index}")
else:
    start_epoch = 0
    start_batch_index = 0

# Hyper-Parameters
num_epochs = 1
consistency_weight = 0.1 # Weight to self-consistency loss vs translation loss
num_batches = len(dataloader)
print(f"Total number of batches: {num_batches}")

# Training Loop
model.train()
start_time = time.time()
for epoch in range(num_epochs):
    start_time = time.time()
    for batch_index, batch in enumerate(dataloader, start=1):
        if batch_index < start_batch_index:
            continue

        # Reset the gradients after each mini batch
        optimizer.zero_grad()

        # Move data to GPU
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        # input_ids = batch["input_ids"]
        # attention_mask = batch["attention_mask"]
        # labels = batch["labels"]

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        translation_loss = outputs.loss

        # Self-consistency loss
        consistency_losses = []
        for input_id in input_ids:
            text = tokenizer.decode(input_id, skip_special_tokens=True)
            loss = self_consistency_loss(text, model, tokenizer).to(device)
            # loss = self_consistency_loss(text, model, tokenizer)
            consistency_losses.append(loss)
        consistency_loss = torch.mean(torch.stack(consistency_losses))

        # Total loss
        total_loss = translation_loss + consistency_weight * consistency_loss

        # Backward pass and optimization
        total_loss.backward()
        optimizer.step()

        if batch_index % 100 == 0:
            # checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_epoch_{epoch}_batch_{batch_index}.pt")
            # save_checkpoint(model, optimizer, epoch, batch_index, checkpoint_path)
            # Update the latest checkpoint path
            save_checkpoint(model, optimizer, epoch, batch_index, latest_checkpoint_path)

            # Time:
            curr_time = time.time()
            remaining_duration = ((curr_time - start_time) / (batch_index)) * (num_batches - batch_index)
            print(f"Batch number {batch_index}: Epoch {epoch}, Batch Loss: {total_loss.item()}, Estimated Time Left: {remaining_duration}")

    # Reset the start_batch_index after completing an epoch
    start_batch_index = 0

    print(f"Epoch {epoch + 1} completed")


Restarting from 0 and batch 401
Total number of batches: 14455
Checkpoint saved at epoch 0, batch 500
Batch number 500: Epoch 0, Batch Loss: 1.1399003267288208, Estimated Time Left: 64508.99039309978
Checkpoint saved at epoch 0, batch 600
Batch number 600: Epoch 0, Batch Loss: 1.7609258890151978, Estimated Time Left: 106936.80470516084
Checkpoint saved at epoch 0, batch 700
Batch number 700: Epoch 0, Batch Loss: 1.4337968826293945, Estimated Time Left: 135701.80808919668
Checkpoint saved at epoch 0, batch 800
Batch number 800: Epoch 0, Batch Loss: 1.4458568096160889, Estimated Time Left: 156712.67868054807
Checkpoint saved at epoch 0, batch 900
Batch number 900: Epoch 0, Batch Loss: 1.5686205625534058, Estimated Time Left: 172539.733075281
Checkpoint saved at epoch 0, batch 1000
Batch number 1000: Epoch 0, Batch Loss: 1.2339277267456055, Estimated Time Left: 185966.03737805845
Checkpoint saved at epoch 0, batch 1100
Batch number 1100: Epoch 0, Batch Loss: 1.3911930322647095, Estimated 

In [None]:
# Load the checkpoint
latest_checkpoint_path = "/content/drive/MyDrive/data/mbart_latest_4000_of_14455.pt"
checkpoint = torch.load(latest_checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
start_epoch = checkpoint['epoch']
start_batch_index = checkpoint['batch_index']
print(f"Loaded from epoch {start_epoch} and batch {start_batch_index}")

Loaded from epoch 0 and batch 4000


In [None]:
# Testing forward translation
# Perform the translation and evaluation
translations, bleu_score = translate_and_evaluate(validation_df, model, tokenizer, 'en', 'zh')
print(f"BLEU Score: {bleu_score.score}")

# Save translations to a file
with open('tunedforwardmbarttranslations.txt', 'w', encoding='utf-8') as f:
    for translation in translations:
        f.write(translation + '\n')

BLEU Score: 30.682659738699808


In [None]:
def translate(model, tokenizer, sentence, src_lang, tgt_lang):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    encoded_input = tokenizer(sentence, return_tensors="pt").to(device)
    generated_tokens = model.generate(**encoded_input, forced_bos_token_id=tokenizer.lang_code_to_id[tgt_lang])
    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translation

print(translate(model, tokenizer, "你好吗?", 'zh_CN', 'en_XX'))
print(translate(model, tokenizer, "Can you please work?", 'en_XX', 'zh_CN'))
print(translate(model, tokenizer, "आज मेरा जन्मदिन हे", 'hi_IN', 'en_XX'))

你好吗?
你能工作吗?
今天是我生日。


In [None]:
# Testing backward translation
# Perform the translation and evaluation
translations, bleu_score = translate_and_evaluate(validation_df, model, tokenizer, 'zh', 'en')
print(f"BLEU Score: {bleu_score.score}")

# Save translations to a file
with open('tunedbackwardmbarttranslations.txt', 'w', encoding='utf-8') as f:
    for translation in translations:
        f.write(translation + '\n')

BLEU Score: 0.0


In [None]:
# Save the Model
torch.save(model.state_dict(), "mbart_translation_model_with_self_consistency.pth")

In [None]:
# Load the Model
model_path = "mbart_translation_model_with_self_consistency.pth"
model.load_state_dict(torch.load(model_path))
model.eval()

In [None]:
# TODO: Test the Model
