In [6]:
!pip install evaluate sacremoses sacrebleu



In [7]:
import os
import torch
import evaluate
from tqdm import tqdm

from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [8]:
# input_path = "/kaggle/working"

# for root, dirs, files in os.walk(input_path):
#     print(f"Directory: {root}")
#     for file in files:
#         print(f"  File: {file}")


# file_path = "./test.mai_Deva"
# if os.path.exists(file_path):
#     os.remove(file_path)
# else: 
#     print("file not found")

# dir_path = ""
# # Remove the directory
# try:
#     os.rmdir(dir_path)
#     print(f"Directory {dir_path} has been removed.")
# except OSError as e:
#     print(f"Error: {e}")

# directory_path = "./logs/"
# os.makedirs(directory_path, exist_ok=True)

!ls /kaggle/working/

finetuned  state.db  test_split  wandb


### Part 1) Preparing and tokenizing the training datasets

In [9]:
# function to prepare the training and any other dataset
def prepare_data(eng_file_path, mai_file_path, data_type):
    # load the english and maithili texts files
    with open(eng_file_path, "r", encoding="utf-8") as en_file:
        eng_texts = en_file.readlines()

    with open(mai_file_path, "r", encoding="utf-8") as maithili_file:
        mai_texts = maithili_file.readlines()

    assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

    # clean the text files
    eng_texts_cleaned = [text.strip() for text in eng_texts]
    mai_texts_cleaned = [text.strip() for text in mai_texts]

    # create the dataset
    data = {
        "source_text": eng_texts_cleaned,
        "target_text": mai_texts_cleaned, 
    }
    dataset = Dataset.from_dict(data)

    # split the dataset into train, validation and test sets
    if data_type == "train":
        train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
        val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Test set size: {len(test_dataset)}")

        return train_dataset, val_dataset, test_dataset
    elif data_type == "test":
        return dataset

# preprocessor function for tokenizer
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# function to tokenize the data
def tokenize_dataset(dataset, tokenizer):
    dataset_tokenized = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return dataset_tokenized

def save_dataset(dataset, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for line in dataset: 
            f.write(line + "\n")


In [10]:
# prepare the data
train_dataset, val_dataset, test_dataset = prepare_data("/kaggle/input/bpcc-eng-mai-train/bpcc/train.eng_Latn", "/kaggle/input/bpcc-eng-mai-train/bpcc/train.mai_Deva", "train")

# tokenize the data
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi") # load the tokenizer
train_dataset_tokenized = tokenize_dataset(train_dataset, tokenizer)
val_dataset_tokenized = tokenize_dataset(val_dataset, tokenizer)
test_dataset_tokenized = tokenize_dataset(test_dataset, tokenizer)

# # saving dataset to csv (backup)
# save_dataset(train_dataset["source_text"], "./dataset/training_split/bpcc/train.eng_Latn")
# save_dataset(train_dataset["target_text"], "./dataset/training_split/bpcc/train.mai_Deva")
# save_dataset(val_dataset["source_text"], "./dataset/training_split/bpcc/val.eng_Latn")
# save_dataset(val_dataset["target_text"], "./dataset/training_split/bpcc/val.mai_Deva")
save_dataset(test_dataset["source_text"], "./test_split/test.eng_Latn")
save_dataset(test_dataset["target_text"], "./test_split/test.mai_Deva")

Training set size: 60892
Validation set size: 3383
Test set size: 3383


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

Map:   0%|          | 0/60892 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

In [11]:
# checking the tokenization and vocab subwords
print("Source text: ", train_dataset_tokenized[0]["source_text"])
print("Target text: ", train_dataset_tokenized[0]["target_text"])
print("Source tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["input_ids"]))
print("Target tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["labels"]))


Source text:  Rail transport was absent in the state until 2008–09 when the railway track was extended to the capital Agartala.
Target text:  २००८-०९ धरि राज्यमे रेल परिवहन अनुपस्थित छल, जखन रेल पटरीकेँ राजधानी अगरतला धरि बढ़ाओल गेल छल।
Source tokens:  ['▁Ra', 'il', '▁transport', '▁was', '▁absent', '▁in', '▁the', '▁state', '▁until', '▁2008', '–', '09', '▁when', '▁the', '▁railway', '▁track', '▁was', '▁extended', '▁to', '▁the', '▁capital', '▁A', 'gar', 't', 'ala', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

### Part 2) Fine tuning the pretrained model

In [12]:
# funtion to configure LoRA
def configure_lora(rank, lora_alpha, lora_dropout, task_type, bias, target_modules):

    lora_config = LoraConfig(
        r=rank,  # rank of the low-rank adaptation
        lora_alpha=lora_alpha,  # scaling factor for the LoRA layers
        lora_dropout=lora_dropout,  # dropout for the LoRA layers
        task_type=task_type, # task type
        bias=bias,  # set bias as 'none', 'all', or 'lora_only'
        target_modules=target_modules  # specify the target modules
    )

    return lora_config

# function to define training arguments
def train_args(output_dir, eval_stra, learning_r, batch_size, grad_step, num_train_epochs, 
            save_steps, logging_dir, logging_steps, save_total_limit):

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,  # directory to save results
        eval_strategy=eval_stra,  # strategy for evaluation
        learning_rate=learning_r,  # learnin rate for fine-tuning
        per_device_train_batch_size=batch_size,  # batch size
        gradient_accumulation_steps= grad_step, # step for backpropagation
        num_train_epochs=num_train_epochs,  # number of training epochs
        save_steps=save_steps,  # save checkpoints after this many steps
        logging_dir=logging_dir,  # directory for logs
        logging_steps=logging_steps, # log after this many steps
        save_total_limit=save_total_limit,  # limit number of saved checkpoints
    )

    return training_args

# function to define the training trainer
def trainer_train(model, training_args, train_dataset, val_dataset, tokenizer):

    trainer = Seq2SeqTrainer(
        model=model,  # model to be fine-tuned
        args=training_args,  # training arguments
        train_dataset=train_dataset,  # tokenized training dataset
        eval_dataset=val_dataset, # tokenized validation dataset
        tokenizer=tokenizer  # tokenizer for tokenization
    )

    return trainer

# function to print the trainable parameters
def get_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"After adding LoRA: Trainable Params: {trainable_params}, All Params: {all_param},  Trainable %: {100 * trainable_params / all_param:.2f}"
    )

In [13]:
# load the model
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

print("Parameter Size (pretrained): ", model.num_parameters())
print("Training Dataset length: ", len(train_dataset_tokenized))
print("Validation Dataset length: ", len(val_dataset_tokenized))

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Parameter Size (pretrained):  76381184
Training Dataset length:  60892
Validation Dataset length:  3383


In [14]:
# define training arguments and trainer
training_args = train_args(output_dir="/results/epoch5", eval_stra="epoch", learning_r=5e-5, 
                           batch_size=32, grad_step=1, num_train_epochs=5, save_steps=1000,
                           logging_dir="./logs/epoch5", logging_steps=500, save_total_limit=2)

train_trainer = trainer_train(model, training_args, train_dataset_tokenized, val_dataset_tokenized, tokenizer)

# train the model
print("Finetuning the pretrained model")
train_trainer.train()

  trainer = Seq2SeqTrainer(


Finetuning the pretrained model


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
1,1.5394,0.956963
2,0.9569,0.773835
3,0.7961,0.689845
4,0.7215,0.651697
5,0.6813,0.63922


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


TrainOutput(global_step=4760, training_loss=0.8909465180725611, metrics={'train_runtime': 4273.494, 'train_samples_per_second': 71.244, 'train_steps_per_second': 1.114, 'total_flos': 1.032069618597888e+16, 'train_loss': 0.8909465180725611, 'epoch': 5.0})

In [15]:
# save the fintuned model
model.save_pretrained("./finetuned/epoch5")
tokenizer.save_pretrained("./finetuned/epoch5")

('./finetuned/epoch5/tokenizer_config.json',
 './finetuned/epoch5/special_tokens_map.json',
 './finetuned/epoch5/vocab.json',
 './finetuned/epoch5/source.spm',
 './finetuned/epoch5/target.spm',
 './finetuned/epoch5/added_tokens.json')

### Part 3) Testing finetuned model's performance on test data split


In [16]:
# define trainer for evaluation
def trainer_evaluate(model, tokenizer, test_dataset):

    eval_trainer = Seq2SeqTrainer(
        model=model,
        args = Seq2SeqTrainingArguments(
            output_dir="./results/test/",
            per_device_eval_batch_size=32,
            predict_with_generate=True,
            disable_tqdm=False,
        ), 
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    return eval_trainer

def compute_chrf(predictions, references):
    chrf = evaluate.load("chrf")
    chrf_score = chrf.compute(predictions=predictions, references=references, word_order=2)
    return chrf_score

In [17]:
# read test and benchmark data
test_dataset = prepare_data("./test_split/test.eng_Latn", "./test_split/test.mai_Deva", "test")

# choose random 1200 examples from test dataset for faster inference and evaluation
test_dataset = test_dataset.shuffle(seed=42).select(range(1200))
test_dataset_tokenized = tokenize_dataset(test_dataset, tokenizer)

# generate predictions for english to hindi
eval_trainer = trainer_evaluate(model, tokenizer, test_dataset_tokenized)
test_dataset_mai_pred, test_dataset_mai_lab, _ = eval_trainer.predict(test_dataset_tokenized)

# decode the predictions and references
test_dataset_mai_pred = tokenizer.batch_decode(test_dataset_mai_pred, skip_special_tokens=True)
test_dataset_mai_ref = tokenizer.batch_decode(test_dataset_mai_lab, skip_special_tokens=True)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

  eval_trainer = Seq2SeqTrainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [20]:
# print the predictions and references for comparison
print("Testing results for test dataset\n")
print("Test dataset size:", len(test_dataset))
for i in range(2):
    print("English Text : ", test_dataset[i]['source_text'])
    print("Maithili Reference (Original) : ", test_dataset[i]['target_text'])
    print("Maithili Reference (Decoded): ", test_dataset_mai_ref[i])
    print("Maithili Prediction: ",test_dataset_mai_pred[i])
    print("\n")

# calculate chrF++ score for hindi to maithili overlap
chrf_score_mai = compute_chrf(test_dataset_mai_pred, test_dataset_mai_ref)
print(f"chrF++ score for English-Maithili test data split: {chrf_score_mai['score']}")

Testing results for test dataset

Test dataset size: 1200
English Text :  The car makes Kirby faster, while the dolphin allows him to go underwater.
Maithili Reference (Original) :  कार किर्बीकेँ तेज बनबैत अछि, जखनकि डॉल्फिन ओकरा पानिक नीचाँ जाय दैत अछि।
Maithili Reference (Decoded):  कार किर्बी तेज बनब , नकि डॉल्फिन रा पानिक नीचाँ जाय
Maithili Prediction:  कार किरबी तेजी बनब , नकि फिल्फिन हुनका पातमे जाइत


English Text :  Both teams will have Friday off before the Penguins battle the Rangers and the Devils face the Sabres on Saturday.
Maithili Reference (Original) :  दुनू टीमकेँ शुक्रदिन छुट्टी भेटतनि जेकरा बाद शनिदिन पहिने पेंगुइनक मैच रेंजर्ससँ आ डेविल्सक मैच सबरेजसँ होयत।
Maithili Reference (Decoded):  दुनू टीम ्रदिन ट्टी टतनि रा बाद शनन पने पेंगुइनक म रेंजर् आ ल्सक म सबर हो
Maithili Prediction:  दुनू टीम पेंगुइन ्डर्स ्ध  पने फेंगुइन   आ सॉफ्ट शन् सबर


chrF++ score for English-Maithili test data split: 34.759289318711595


### Part 4) Testing finetuned model's performance on benchmark datasets

In [22]:
# prepare and tokenize the in22 benchmark datasets
in22_mai_test = prepare_data("/kaggle/input/in22-test-data/IN22_test/gen/test.eng_Latn", "/kaggle/input/in22-test-data/IN22_test/gen/test.mai_Deva", "test")
in22_mai_test_tokenized = tokenize_dataset(in22_mai_test, tokenizer)

# generate predictions for english to maithili
eval_trainer = trainer_evaluate(model, tokenizer, in22_mai_test_tokenized)
in22_mai_test_pred, in22_mai_test_lab, _ = eval_trainer.predict(in22_mai_test_tokenized)

# decode the predictions and references
in22_mai_test_pred = tokenizer.batch_decode(in22_mai_test_pred, skip_special_tokens=True)
in22_mai_test_ref = tokenizer.batch_decode(in22_mai_test_lab, skip_special_tokens=True)

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

  eval_trainer = Seq2SeqTrainer(
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

In [23]:
print("Testing results for IN22 benchmark dataset\n")
print("IN22 benchmark dataset size:", len(in22_mai_test))

# print the predictions and references for comparison
for i in range(2):
    print("English Text: ", in22_mai_test[i]["source_text"])
    print("Maithili Reference (Original) : ", in22_mai_test[i]['target_text'])
    print("Maithili Reference (Decoded): ", in22_mai_test_ref[i])
    print("Maithili Prediction: ", in22_mai_test_pred[i])
    print("\n")

# calculate chrF++ score for hindi to maithili overlap
chrf_score_mai = compute_chrf(in22_mai_test_pred, in22_mai_test_ref)
print(f"chrF++ score for English-Maithili IN22 benchmark dataset: {chrf_score_mai['score']}")

Testing results for IN22 benchmark dataset

IN22 benchmark dataset size: 1024
English Text:  An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Maithili Reference (Original) :  रूप सर्विसवला व्यक्तिसँ सम्बन्धित बहुत रास लक्षणक समूह होयत छै जेना हुनक जूता, कपड़ा, टाई, गहना, केश, श्रृंगार, घड़ी, प्रसाधन सामग्री, सेंट इत्यादि।
Maithili Reference (Decoded):  रूप सर्ला व्यक् सम्बन् ब रास लक् सम होयत  जेना हुनक ा, कपड़ा, टाई, गहना, , श्रृंगार, ़ी, प्रसाधन सामग्री, सेंट इत्या
Maithili Prediction:  एकटा उपकरण सेवा व्यक्ति सम्बन् एक टा  , जेना हुनकर पाथ, कपड़, ार, गारी, ्पाली, बन्द, ्सा, ्सा, ्सा, आ सम्बन् सम्मिलित


English Text:  Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.
Maithili Reference (Original) :  महाराष्ट्रके औरंगाबादमे स्थि