In [14]:
import torch
device = torch.device("mps")

import evaluate
import sacrebleu
from tqdm import tqdm

from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

### Part 1) Preparing and tokenizing the training datasets


In [3]:
# function to prepare the training and any other dataset
def prepare_data(eng_file_path, mai_file_path, data_type):
    # load the english and maithili texts files
    with open(eng_file_path, "r", encoding="utf-8") as en_file:
        eng_texts = en_file.readlines()

    with open(mai_file_path, "r", encoding="utf-8") as maithili_file:
        mai_texts = maithili_file.readlines()

    assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

    # clean the text files
    eng_texts_cleaned = [text.strip() for text in eng_texts]
    mai_texts_cleaned = [text.strip() for text in mai_texts]

    # create the dataset
    data = {
        "source_text": eng_texts_cleaned,
        "target_text": mai_texts_cleaned, 
    }
    dataset = Dataset.from_dict(data)

    # split the dataset into train, validation and test sets
    if data_type == "train":
        train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
        val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Test set size: {len(test_dataset)}")

        return train_dataset, val_dataset, test_dataset
    elif data_type == "test":
        return dataset

# preprocessor function for tokenizer
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# function to tokenize the data
def tokenize_dataset(dataset, tokenizer):
    dataset_tokenized = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return dataset_tokenized

# function to move the dataset to device
def move_to_device(batch):
    # move each tensor in the batch to the MPS device
    for key in batch:
        batch[key] = torch.tensor(batch[key]).to(device)
    return batch

def save_dataset(dataset, file_path):
    with open(file_path, "w", encoding="utf-8") as f:
        for line in dataset: 
            f.write(line + "\n")


In [4]:
# prepare the data
train_dataset, val_dataset, test_dataset = prepare_data("./dataset/train/bpcc/train.eng_Latn", "./dataset/train/bpcc/train.mai_Deva", "train")

# tokenize the data
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi") # load the tokenizer
train_dataset_tokenized = tokenize_dataset(train_dataset, tokenizer)
val_dataset_tokenized = tokenize_dataset(val_dataset, tokenizer)
test_dataset_tokenized = tokenize_dataset(test_dataset, tokenizer)

# # saving dataset to csv (backup)
# save_dataset(train_dataset["source_text"], "./dataset/training_split/bpcc/train.eng_Latn")
# save_dataset(train_dataset["target_text"], "./dataset/training_split/bpcc/train.mai_Deva")
# save_dataset(val_dataset["source_text"], "./dataset/training_split/bpcc/val.eng_Latn")
# save_dataset(val_dataset["target_text"], "./dataset/training_split/bpcc/val.mai_Deva")
save_dataset(test_dataset["source_text"], "./dataset/training_split/bpcc/test.eng_Latn")
save_dataset(test_dataset["target_text"], "./dataset/training_split/bpcc/test.mai_Deva")


Training set size: 60892
Validation set size: 3383
Test set size: 3383


Map:   0%|          | 0/60892 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

In [5]:
# checking the tokenization and vocab subwords
print("Source text: ", train_dataset_tokenized[0]["source_text"])
print("Target text: ", train_dataset_tokenized[0]["target_text"])
print("Source tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["input_ids"]))
print("Target tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["labels"]))

# # save the tokenized output to a text file
# with open("dataset/training_split/tokenized_output.txt", "w", encoding="utf-8") as f:
#     f.write(train_dataset_tokenized[0]["source_text"] + "\n")
#     f.write(train_dataset_tokenized[0]["target_text"] + "\n")
#     f.write(" ".join(tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["input_ids"])) + "\n")
#     f.write(" ".join(tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["labels"])) + "\n")

Source text:  The Division has been held by two generations of the Jenkins family.
Target text:  ई प्रभाग जेनकिन्स परिवारक दू पीढ़ी द्वारा चलाओल गेल अछि।
Source tokens:  ['▁The', '▁Division', '▁has', '▁been', '▁held', '▁by', '▁two', '▁generations', '▁of', '▁the', '▁J', 'en', 'kin', 's', '▁family', '.', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pa

### Part 2) Loading the pretrained model and testing its performance on benchmark datasets


In [63]:
# load the model
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
print("Parameter Size (pretrained): ", model.num_parameters())

# prepare the in22 benchmark datasets
in22_hin_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset/test/IN22_test/gen/test.hin_Deva", "test")
in22_mai_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset//test/IN22_test/gen/test.mai_Deva", "test")

print("English Text: ", in22_hin_test[0]["source_text"])
print("Hindi Text: ", in22_hin_test[0]["target_text"])
print("Maithili Text: ", in22_mai_test[0]["target_text"])

# tokenize the in22 benchmark datasets
in22_hin_test_tokenized = tokenize_dataset(in22_hin_test, tokenizer)
in22_mai_test_tokenized = tokenize_dataset(in22_mai_test, tokenizer)


Parameter Size (pretrained):  76381184
English Text:  An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Hindi Text:  सेवा संबंधी लोगों के लिए भेष कई गुणों का संयोजन है, जैसे कि उनके जूते, कपड़े, टाई, आभूषण, केश शैली, मेक-अप, घड़ी, कॉस्मेटिक, इत्र, आदि।
Maithili Text:  रूप सर्विसवला व्यक्तिसँ सम्बन्धित बहुत रास लक्षणक समूह होयत छै जेना हुनक जूता, कपड़ा, टाई, गहना, केश, श्रृंगार, घड़ी, प्रसाधन सामग्री, सेंट इत्यादि।


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

In [7]:
#testing the pretrained model on the in22 benchmark datasets

# generate predictions for english to hindi
def generate_predictions(model, tokenizer, test_dataset):
    predictions = []
    references = []
    
    model.eval()  # set model to evaluation mode
    for example in tqdm(test_dataset, desc="Generating predictions: ", unit="example"):
        inputs = torch.tensor(example['input_ids']).unsqueeze(0).to(model.device)  # move input to device
        attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(model.device) # move attention mask to device
        
        with torch.no_grad():
            output = model.generate(inputs, attention_mask=attention_mask, max_length=128) # generate the output
        
        prediction = tokenizer.decode(output[0], skip_special_tokens=True) # decode the output
        target = example['target_text']  # if the target is already a string
        
        predictions.append(prediction)
        references.append(target)
    
    return predictions, references

# define trainer for evaluation
def trainer_evaluate(model, tokenizer, test_dataset):

    eval_trainer = Seq2SeqTrainer(
        model=model,
        args = Seq2SeqTrainingArguments(
            output_dir="./results/test/",
            per_device_eval_batch_size=32,
            predict_with_generate=True,
            disable_tqdm=False,
        ), 
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    return eval_trainer

def compute_chrf(predictions, references):
    chrf = evaluate.load("chrf")
    chrf_score = chrf.compute(predictions=predictions, references=references, word_order=2)
    return chrf_score


In [8]:
# move the model and in22 testdata to the MPS device
model = model.to(device)
in22_hin_test_tokenized = in22_hin_test_tokenized.with_transform(move_to_device)

# generate predictions for english to hindi
eval_trainer = trainer_evaluate(model, tokenizer, in22_hin_test_tokenized)
hin_pred, hin_lab, _ = eval_trainer.predict(in22_hin_test_tokenized)

# decode the predictions and references
hin_pred = tokenizer.batch_decode(hin_pred, skip_special_tokens=True)
hin_ref = tokenizer.batch_decode(hin_lab, skip_special_tokens=True)

  0%|          | 0/32 [00:00<?, ?it/s]

In [9]:
# print the predictions and references for comparison
for i in range(2):
    print("English Text: ", in22_hin_test[i]["source_text"])
    print("Hindi Reference: ", hin_pred[i])
    print("Hindi Prediction: ", hin_ref[i])
    print("\n")

# calculate chrF++ score for english to hindi translation
chrf_score_hin = compute_chrf(hin_pred, hin_ref)
print(f"chrF++ score for English-Hindi translation: {chrf_score_hin['score']}")

# calculate chrF++ score for hindi to maithili overlap
mai_ref = in22_mai_test["target_text"]
chrf_score_mai = compute_chrf(hin_ref, mai_ref)
print(f"chrF++ score for Hindi-Maithili overlap: {chrf_score_mai['score']}")


English Text:  An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Hindi Reference:  एक फैशन सेवा व्यक्ति से संबंधित गुणों का एक गुच्छा है, उनके जूते, कपड़े, टाई, बाल शैली, बनाएँ- अप, उपभोग, इत्र, इत्र, इत्यादि.
Hindi Prediction:  सेवा संबंधी लोगों के लिए  कई गों का संयोजन है, जैसे कि उनके , कपड़े, टाई, आ,  शैली, मेक-अप, ़ी, कॉस्मेटिक, इत्र, आ


English Text:  Ajanta, located in the Aurangabad District of Maharashtra has twenty-nine caitya and vihara caves decorated with sculptures and paintings from the first century B.C.E. to the fifth century C.E.
Hindi Reference:  पू. पहली सदी से लेकर सा. यु.
Hindi Prediction:  महाराष्ट्र के औरंगाबाद जिले में स्थित ंता में उन्तीस ्य और ार गुफाएँ हैं जो पहली ाब्दी ई.पू. से ले कर पाीं ाब्दी ्वी तक की मूर


chrF++ score for English-Hindi translation: 27.315089809805986
chrF++ score for Hindi-Maithili overlap: 21.939127134885133


### Part 3) Setting up LoRA and fine tuning the pretrained model


In [64]:
# funtion to configure LoRA
def configure_lora(rank, lora_alpha, lora_dropout, task_type, bias, target_modules):

    lora_config = LoraConfig(
        r=rank,  # rank of the low-rank adaptation
        lora_alpha=lora_alpha,  # scaling factor for the LoRA layers
        lora_dropout=lora_dropout,  # dropout for the LoRA layers
        task_type=task_type, # task type
        bias=bias,  # set bias as 'none', 'all', or 'lora_only'
        target_modules=target_modules  # specify the target modules
    )

    return lora_config

# function to define training arguments
def train_args(output_dir, eval_stra, learning_r, batch_size, grad_step, num_train_epochs, 
            save_steps, logging_dir, logging_steps, save_total_limit):

    training_args = Seq2SeqTrainingArguments(
        output_dir=output_dir,  # directory to save results
        eval_strategy=eval_stra,  # strategy for evaluation
        learning_rate=learning_r,  # learnin rate for fine-tuning
        per_device_train_batch_size=batch_size,  # batch size
        gradient_accumulation_steps= grad_step, # step for backpropagation
        num_train_epochs=num_train_epochs,  # number of training epochs
        save_steps=save_steps,  # save checkpoints after this many steps
        logging_dir=logging_dir,  # directory for logs
        logging_steps=logging_steps, # log after this many steps
        save_total_limit=save_total_limit,  # limit number of saved checkpoints
    )

    return training_args

# function to define the training trainer
def trainer_train(model, training_args, train_dataset, val_dataset, tokenizer):

    trainer = Seq2SeqTrainer(
        model=model,  # model to be fine-tuned
        args=training_args,  # training arguments
        train_dataset=train_dataset,  # tokenized training dataset
        eval_dataset=val_dataset, # tokenized validation dataset
        tokenizer=tokenizer  # tokenizer for tokenization
    )

    return trainer

# function to print the trainable parameters
def get_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"After adding LoRA: Trainable Params: {trainable_params}, All Params: {all_param},  Trainable %: {100 * trainable_params / all_param:.2f}"
    )

In [None]:
# configure LoRA
print(f"Before adding LoRA, All Params: {model.num_parameters()}")

lora_config = configure_lora(16, 32, 0.1, "SEQ_2_SEQ_LM", "lora_only", ["q_proj", "k_proj", "v_proj"])
lora_model = get_peft_model(model, lora_config)
get_trainable_parameters(model)

# define training arguments and trainer
training_args = train_args("./results/train/epoch2", "epoch", 5e-5, 16, 2, 2, 1000, "./logs", 500, 2)
train_dataset_tokenized = train_dataset_tokenized.with_transform(move_to_device)
val_dataset_tokenized = val_dataset_tokenized.with_transform(move_to_device)
train_trainer = trainer_train(lora_model, training_args, train_dataset_tokenized, val_dataset_tokenized, tokenizer)

# train the model

print("Finetuning the model using LoRA...")
train_trainer.train()

# save the model
lora_model.save_pretrained("./finetuned/epoch2")
tokenizer.save_pretrained("./finetuned/epoch2")

Before adding LoRA, All Params: 76381184
After adding LoRA: Trainable Params: 912384, All Params: 77265920,  Trainable %: 1.18
Finetuning the model using LoRA...


  0%|          | 0/3806 [00:00<?, ?it/s]

{'loss': 3.2021, 'grad_norm': 0.3349359631538391, 'learning_rate': 4.343142406726222e-05, 'epoch': 0.26}
{'loss': 2.3476, 'grad_norm': 0.4420273005962372, 'learning_rate': 3.686284813452444e-05, 'epoch': 0.53}




{'loss': 2.2809, 'grad_norm': 0.6663594841957092, 'learning_rate': 3.0294272201786656e-05, 'epoch': 0.79}


  0%|          | 0/423 [00:00<?, ?it/s]

{'eval_loss': 2.198612928390503, 'eval_runtime': 70.1139, 'eval_samples_per_second': 48.25, 'eval_steps_per_second': 6.033, 'epoch': 1.0}
{'loss': 2.2413, 'grad_norm': 0.5167800188064575, 'learning_rate': 2.3725696269048873e-05, 'epoch': 1.05}
{'loss': 2.2147, 'grad_norm': 0.7370030283927917, 'learning_rate': 1.7157120336311088e-05, 'epoch': 1.31}
{'loss': 2.2063, 'grad_norm': 0.5104997754096985, 'learning_rate': 1.0588544403573306e-05, 'epoch': 1.58}
{'loss': 2.1934, 'grad_norm': 0.5723661184310913, 'learning_rate': 4.019968470835523e-06, 'epoch': 1.84}




  0%|          | 0/423 [00:00<?, ?it/s]

{'eval_loss': 2.1613428592681885, 'eval_runtime': 70.0914, 'eval_samples_per_second': 48.266, 'eval_steps_per_second': 6.035, 'epoch': 2.0}
{'train_runtime': 7020.9441, 'train_samples_per_second': 17.346, 'train_steps_per_second': 0.542, 'train_loss': 2.36861842642066, 'epoch': 2.0}


('./finetuned/epoch2/tokenizer_config.json',
 './finetuned/epoch2/special_tokens_map.json',
 './finetuned/epoch2/vocab.json',
 './finetuned/epoch2/source.spm',
 './finetuned/epoch2/target.spm',
 './finetuned/epoch2/added_tokens.json')

In [None]:
# after finetuning, merge LoRA weights into the model
print("Merging LoRA weights into the base model...")
lora_model.merge_lora_weights()

# save the model with merged weights
lora_model.save_pretrained("./finetuned_merged_model/")

# print the number of parameters after merging
print(f"Model parameters after merging: {lora_model.num_parameters()}")