## Notebook to test finetuned Helsinki-NLP/opus-mt-eng-hi

In [4]:
import torch
device = torch.device("mps")

import evaluate
import sacrebleu
from tqdm import tqdm

from datasets import Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

### Part 1) Loading the finetuned model, test and benchmark datasets


In [3]:
# function to prepare the training and any other dataset
def prepare_data(eng_file_path, mai_file_path, data_type):
    # load the english and maithili texts files
    with open(eng_file_path, "r", encoding="utf-8") as en_file:
        eng_texts = en_file.readlines()

    with open(mai_file_path, "r", encoding="utf-8") as maithili_file:
        mai_texts = maithili_file.readlines()

    assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

    # clean the text files
    eng_texts_cleaned = [text.strip() for text in eng_texts]
    mai_texts_cleaned = [text.strip() for text in mai_texts]

    # create the dataset
    data = {
        "source_text": eng_texts_cleaned,
        "target_text": mai_texts_cleaned, 
    }
    dataset = Dataset.from_dict(data)

    # split the dataset into train, validation and test sets
    if data_type == "train":
        train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
        val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Test set size: {len(test_dataset)}")

        return train_dataset, val_dataset, test_dataset
    elif data_type == "test":
        return dataset

# preprocessor function for tokenizer
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# function to tokenize the data
def tokenize_dataset(dataset, tokenizer):
    dataset_tokenized = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return dataset_tokenized

# function to move the dataset to device
def move_to_device(batch):
    # move each tensor in the batch to the MPS device
    for key in batch:
        batch[key] = torch.tensor(batch[key]).to(device)
    return batch

# function to batch predict the model
def trainer_evaluate(model, tokenizer, test_dataset):

    eval_trainer = Seq2SeqTrainer(
        model=model,
        args = Seq2SeqTrainingArguments(
            output_dir="./results/test/",
            per_device_eval_batch_size=32,
            predict_with_generate=True,
            disable_tqdm=False,
        ), 
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    return eval_trainer

# function to evaluate the model
def compute_chrf(predictions, references):
    chrf = evaluate.load("chrf")
    chrf_score = chrf.compute(predictions=predictions, references=references, word_order=2)
    return chrf_score 

In [5]:
# load the model and tokenizer
model_path = "./finetuned/epoch2"
finetuned_tokenizer = AutoTokenizer.from_pretrained(model_path)
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
print("Parameter size:", finetuned_model.num_parameters())

# load the merged model as peft model
merged_model = PeftModel.from_pretrained(finetuned_model, model_path)

# merge the LoRA weights into the model
merged_model.merge_and_unload()

# affter merging, get the parameter size
print("Parameter size after merging:", merged_model.num_parameters())

Parameter size: 77265920
Parameter size after merging: 76381184


In [None]:
# read test and benchmark data
test_dataset = prepare_data("./dataset/training_split/bpcc/test.eng_Latn", "./dataset/training_split/bpcc/test.mai_Deva", "test")

# choose random 1200 examples from test dataset for faster inference and evaluation
test_dataset = test_dataset.shuffle(seed=42).select(range(1200))
test_dataset_tokenized = tokenize_dataset(test_dataset, finetuned_tokenizer)

in22_mai_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset//test/IN22_test/gen/test.mai_Deva", "test")
in22_mai_test_tokenized = tokenize_dataset(in22_mai_test, finetuned_tokenizer)

print("Test dataset size:", len(test_dataset))
print("IN22 benchmark dataset size:", len(in22_mai_test))

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Test dataset size: 1200
IN22 benchmark dataset size: 1024


### Part 2) Testing on BPCC Eng-Mai test split


In [7]:
# move the model and in22 testdata to the MPS device
merged_model = merged_model.to(device)
test_dataset_tokenized = test_dataset_tokenized.with_transform(move_to_device)

# generate predictions for english to hindi
eval_trainer = trainer_evaluate(merged_model, finetuned_tokenizer, test_dataset_tokenized)
test_dataset_mai_pred, test_dataset_mai_lab, _ = eval_trainer.predict(test_dataset_tokenized)

# decode the predictions and references
test_dataset_mai_pred = finetuned_tokenizer.batch_decode(test_dataset_mai_pred, skip_special_tokens=True)
test_dataset_mai_ref = finetuned_tokenizer.batch_decode(test_dataset_mai_lab, skip_special_tokens=True)

  0%|          | 0/38 [00:00<?, ?it/s]

In [8]:
# print the predictions and references for comparison
print("Testing results for test dataset\n")
for i in range(2):
    print("English Text : ", test_dataset[i]['source_text'])
    print("Maithili Reference: ", test_dataset_mai_ref[i])
    print("Maithili Prediction: ",test_dataset_mai_pred[i])
    print("\n")

# calculate chrF++ score for hindi to maithili overlap
chrf_score_mai = compute_chrf(test_dataset_mai_pred, test_dataset_mai_ref)
print(f"chrF++ score for English-Maithili test data split: {chrf_score_mai['score']}")

Testing results for test dataset

English Text :  Authorities have asked for a DNA test to confirm the girl's identity.
Maithili Reference:  अधिकारी लड़कीक पानक प्टि करबाक लेल डीएनए जा करबाक लेल कहने  ।
Maithili Prediction:  एकर प्रार प्रक प्रारार समे प्रूप्रार समे प्राइत समे प्राइत प्रूप्र्त प्राइत प्राइत प्राइत प्र्र्त समेलेल छल छल।


English Text :  For example, you might know that a sweater’s original price is $69, and that it is on sale for $51.75.
Maithili Reference:  उदाहरण लेल, ा बल हो जे एक टा स्वेटरक मूल दाम  डॉलर  आ ई ५१.७५ डॉलरमे सेलमे बिका रहल
Maithili Prediction:  एकर समे एकर समे एकर प्रूर प्रार्त समे समे समे प्राइत प्राइत समे समे प्रूराइत प्राइत छल छलेल छल छल छलेल छल छल छल छल छल छल छल।


chrF++ score for English-Maithili test data split: 6.712408945695135


### Part 3) Testing on IN22 Eng-Mai benchmark dataset


In [9]:

# move the model and in22 testdata to the MPS device
finetuned_model = finetuned_model.to(device)
in22_mai_test_tokenized = in22_mai_test_tokenized.with_transform(move_to_device)

# generate predictions for english to hindi
eval_trainer = trainer_evaluate(finetuned_model, finetuned_tokenizer, in22_mai_test_tokenized)
in22_mai_test_pred, in22_mai_test_lab, _ = eval_trainer.predict(in22_mai_test_tokenized)

# decode the predictions and references
in22_mai_test_pred = finetuned_tokenizer.batch_decode(in22_mai_test_pred, skip_special_tokens=True)
in22_mai_test_ref = finetuned_tokenizer.batch_decode(in22_mai_test_lab, skip_special_tokens=True)

  0%|          | 0/32 [00:00<?, ?it/s]

In [10]:
# print the predictions and references for comparison
for i in range(2):
    print("English Text: ", in22_mai_test[i]["source_text"])
    print("Maithili Reference: ", in22_mai_test_ref[i])
    print("Maithili Prediction: ", in22_mai_test_pred[i])
    print("\n")

# calculate chrF++ score for hindi to maithili overlap
chrf_score_mai = compute_chrf(in22_mai_test_pred, in22_mai_test_ref)
print(f"chrF++ score for English-Maithili IN22 benchmark dataset: {chrf_score_mai['score']}")

English Text:  An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Maithili Reference:  रूप सर्ला व्यक् सम्बन् ब रास लक् सम होयत  जेना हुनक ा, कपड़ा, टाई, गहना, , श्रृंगार, ़ी, प्रसाधन सामग्री, सेंट इत्या
Maithili Prediction:  एकरा प्रार प्रार्तार प्रार प्रार्तार, समे प्रामे, प्र्रारार, समे, प्र्र्रारार, प्र्र्र्रारारारार, स्त्रारारारारारार, क, स्त्र्रारारारारारारारार्त, ल, ल, ल, स्त्त्मेल, स्त्त्त, लेलेलेलेलेलेलेल, छल, छलेलेलेल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छल, छलेल, छल, छल, छलेलेलेलेल, छलेल, छल, छल, छल, छल, छल, छल, छल, छल, छल।


English Text:  Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.
Maithili Reference:  महाराष्ट्रके औरंगाबादमे स्थित न्तामे पल ाब्दी ा पूर्व  पाम ाब्दी धरिक मूर्तिकला आ चित