## Notebook to test Helsinki-NLP/opus-mt-eng-hi tokenizer


In [3]:
import torch
device = torch.device("mps")

import evaluate
from tqdm import tqdm

from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [8]:
# function to prepare the training and any other dataset
def prepare_data(eng_file_path, mai_file_path, data_type):
    # load the english and maithili texts files
    with open(eng_file_path, "r", encoding="utf-8") as en_file:
        eng_texts = en_file.readlines()

    with open(mai_file_path, "r", encoding="utf-8") as maithili_file:
        mai_texts = maithili_file.readlines()

    assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

    # clean the text files
    eng_texts_cleaned = [text.strip() for text in eng_texts]
    mai_texts_cleaned = [text.strip() for text in mai_texts]

    # create the dataset
    data = {
        "source_text": eng_texts_cleaned,
        "target_text": mai_texts_cleaned, 
    }
    dataset = Dataset.from_dict(data)

    # split the dataset into train, validation and test sets
    if data_type == "train":
        train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
        val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Test set size: {len(test_dataset)}")

        return train_dataset, val_dataset, test_dataset
    elif data_type == "test":
        return dataset

# preprocessor function for tokenizer
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# function to tokenize the data
def tokenize_dataset(dataset, tokenizer):
    dataset_tokenized = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return dataset_tokenized

# function to evaluate the model
def compute_chrf(predictions, references):
    chrf = evaluate.load("chrf")
    chrf_score = chrf.compute(predictions=predictions, references=references, word_order=2)
    return chrf_score 

In [None]:
# load the model and tokenizer
model_path = "Helsinki-NLP/opus-mt-en-hi"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# prepare the benchmark dataset
in22_mai_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset//test/IN22_test/gen/test.mai_Deva", "test")

# tokenize the dataset
in22_mai_test_tokenized = tokenize_dataset(in22_mai_test, tokenizer)

# deconde the tokenized labels
in22_mai_test_detokenized = tokenizer.batch_decode(in22_mai_test_tokenized['labels'], skip_special_tokens=True)

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

In [13]:
for i in range(3):
    print(f"Source Englis Text: {in22_mai_test_tokenized['source_text'][i]}")
    print(f"Target Maithili Text: {in22_mai_test_tokenized['target_text'][i]}")
    print(f"Decoded Target Maithili Text: {in22_mai_test_detokenized[i]}")
    print("\n")

# calculate chrF++ score for hindi to maithili overlap
chrf_score_mai = compute_chrf(in22_mai_test_tokenized["target_text"], in22_mai_test_detokenized)
print(f"chrF++ score for Maithili-Maithili References texts: {chrf_score_mai['score']}")

Source Englis Text: An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Target Maithili Text: रूप सर्विसवला व्यक्तिसँ सम्बन्धित बहुत रास लक्षणक समूह होयत छै जेना हुनक जूता, कपड़ा, टाई, गहना, केश, श्रृंगार, घड़ी, प्रसाधन सामग्री, सेंट इत्यादि।
Decoded Target Maithili Text: रूप सर्ला व्यक् सम्बन् ब रास लक् सम होयत  जेना हुनक ा, कपड़ा, टाई, गहना, , श्रृंगार, ़ी, प्रसाधन सामग्री, सेंट इत्या


Source Englis Text: Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.
Target Maithili Text: महाराष्ट्रके औरंगाबादमे स्थित अजन्तामे पहिल शताब्दी ईसा पूर्व सँ पाँचम शताब्दी धरिक मूर्तिकला आ चित्रकला सँ सजाओल उन्नतीस टा चैत्य आ विहार अछि।
Decoded Target Maithili Text: महाराष्ट्रके औरंगाबादमे स्थित न्तामे पल ाब्दी ा पूर्व  पाम ाब्दी धरिक मूर्तिकला आ चित