In [None]:
import torch
device = torch.device("mps")

import evaluate
import sacrebleu
from tqdm import tqdm

from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments

Collecting evaluate
  Using cached evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting tabulate>=0.8.9 (from sacrebleu)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting lxml (from sacrebleu)
  Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (3.8 kB)
Using cached evaluate-0.4.3-py3-none-any.whl (84 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading lxml-5.3.0-cp39-cp39-macosx_10_9_universal2.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: tabulate, portalocker, lxml, sacrebleu, evaluate


  Referenced from: <843938F4-8FEE-3058-B0A3-50B73FAF02AB> /opt/miniconda3/envs/work/lib/python3.9/site-packages/torchvision/image.so
  warn(


### Part 1) Preparing and tokenizing the training datasets


In [54]:
# function to prepare the training and any other dataset
def prepare_data(eng_file_path, mai_file_path, data_type):
    # load the english and maithili texts files
    with open(eng_file_path, "r", encoding="utf-8") as en_file:
        eng_texts = en_file.readlines()

    with open(mai_file_path, "r", encoding="utf-8") as maithili_file:
        mai_texts = maithili_file.readlines()

    assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

    # clean the text files
    eng_texts_cleaned = [text.strip() for text in eng_texts]
    mai_texts_cleaned = [text.strip() for text in mai_texts]

    # create the dataset
    data = {
        "source_text": eng_texts_cleaned,
        "target_text": mai_texts_cleaned, 
    }
    dataset = Dataset.from_dict(data)

    # split the dataset into train, validation and test sets
    if data_type == "train":
        train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
        val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

        print(f"Training set size: {len(train_dataset)}")
        print(f"Validation set size: {len(val_dataset)}")
        print(f"Test set size: {len(test_dataset)}")

        return train_dataset, val_dataset, test_dataset
    elif data_type == "test":
        return dataset

# preprocessor function for tokenizer
def preprocess_function(examples, tokenizer):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# function to tokenize the data
def tokenize_data(dataset, tokenizer):
    dataset_tokenized = dataset.map(lambda x: preprocess_function(x, tokenizer), batched=True)
    return dataset_tokenized

# function to move the dataset to device
def move_to_device(batch):
    # move each tensor in the batch to the MPS device
    for key in batch:
        batch[key] = torch.tensor(batch[key]).to(device)
    return batch


In [55]:
# prepare the data
train_dataset, val_dataset, test_dataset = prepare_data("./dataset/train/bpcc/train.eng_Latn", "./dataset/train/bpcc/train.mai_Deva", "train")

# tokenize the data
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi") # load the tokenizer
train_dataset_tokenized = tokenize_data(train_dataset, tokenizer)
val_dataset_tokenized = tokenize_data(val_dataset, tokenizer)
test_dataset_tokenized = tokenize_data(test_dataset, tokenizer)

# # saving dataset to csv (backup)
# train_dataset_tokenized.to_csv("./dataset/training/bpcc/train_dataset.csv")
# val_dataset_tokenized.to_csv("./dataset/training/bpcc/val_dataset.csv")
# test_dataset_tokenized.to_csv("./dataset/training/bpcc/test_dataset.csv")


Training set size: 60892
Validation set size: 3383
Test set size: 3383


Map:   0%|          | 0/60892 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

In [56]:
# checking the tokenization and vocab subwords
print("Source text: ", train_dataset_tokenized[0]["source_text"])
print("Target text: ", train_dataset_tokenized[0]["target_text"])
print("Source tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["input_ids"]))
print("Target tokens: ", tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["labels"]))

# save the tokenized output to a text file
with open("dataset/training/tokenized_output.txt", "w", encoding="utf-8") as f:
    f.write(train_dataset_tokenized[0]["source_text"] + "\n")
    f.write(train_dataset_tokenized[0]["target_text"] + "\n")
    f.write(" ".join(tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["input_ids"])) + "\n")
    f.write(" ".join(tokenizer.convert_ids_to_tokens(train_dataset_tokenized[0]["labels"])) + "\n")

Source text:  Although he admired Godard's "revolutionary" early phase, he thought his later phase was "alien".
Target text:  यद्यपि ओ गोडार्डक "क्रान्तिकारी" प्रारम्भिक चरणक प्रशंसा कयलनि, मुदा हुनक सोच छल जे हुनक बादक चरण "बेगाना" छल।
Source tokens:  ['▁Although', '▁he', '▁admired', '▁God', 'ard', "'", 's', '▁"', 'r', 'evolutionary', '"', '▁early', '▁phase', ',', '▁he', '▁thought', '▁his', '▁later', '▁phase', '▁was', '▁"', 'alien', '".', '</s>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>',

In [57]:
# move the training and validation dataset to the MPS device
train_dataset_tokenized = train_dataset_tokenized.with_transform(move_to_device)
val_dataset_tokenized = val_dataset_tokenized.with_transform(move_to_device)

### Part 2) Loading the pretrained model and testing its performance on benchmark datasets


In [None]:
# load the model to device
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# prepare the benchmark datasets
in22_hin_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset/test/IN22_test/gen/test.hin_Deva", "test")
in22_mai_test = prepare_data("./dataset/test/IN22_test/gen/test.eng_Latn", "./dataset//test/IN22_test/gen/test.mai_Deva", "test")

print("English Text: ", in22_hin_test[0]["source_text"])
print("Hindi Text: ", in22_hin_test[0]["target_text"])
print("Maithili Text: ", in22_mai_test[0]["target_text"])

# tokenize the benchmark datasets
in22_hin_test_tokenized = tokenize_data(in22_hin_test, tokenizer)
in22_mai_test_tokenized = tokenize_data(in22_mai_test, tokenizer)


English Text:  An appearance is a bunch of attributes related to the service person, like their shoes, clothes, tie, jewellery, hairstyle, make-up, watch, cosmetics, perfume, etc.
Hindi Text:  सेवा संबंधी लोगों के लिए भेष कई गुणों का संयोजन है, जैसे कि उनके जूते, कपड़े, टाई, आभूषण, केश शैली, मेक-अप, घड़ी, कॉस्मेटिक, इत्र, आदि।
Maithili Text:  रूप सर्विसवला व्यक्तिसँ सम्बन्धित बहुत रास लक्षणक समूह होयत छै जेना हुनक जूता, कपड़ा, टाई, गहना, केश, श्रृंगार, घड़ी, प्रसाधन सामग्री, सेंट इत्यादि।


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

In [63]:
#testing the model on the benchmark datasets

# generate predictions for english to hindi
def generate_predictions(model, tokenizer, test_dataset):
    predictions = []
    references = []
    
    model.eval()  # set model to evaluation mode
    for example in tqdm(test_dataset, desc="Generating predictions: ", unit="example"):
        inputs = torch.tensor(example['input_ids']).unsqueeze(0).to(model.device)  # move input to device
        attention_mask = torch.tensor(example['attention_mask']).unsqueeze(0).to(model.device) # move attention mask to device
        
        with torch.no_grad():
            output = model.generate(inputs, attention_mask=attention_mask, max_length=128) # generate the output
        
        prediction = tokenizer.decode(output[0], skip_special_tokens=True) # decode the output
        target = example['target_text']  # if the target is already a string
        
        predictions.append(prediction)
        references.append(target)
    
    return predictions, references

hin_predictions, hin_references = generate_predictions(model, tokenizer, in22_hin_test_tokenized)




In [73]:
hin_predictions

NameError: name 'hin_predictions' is not defined

In [None]:
# Load the chrF++ metric from the evaluate library
chrf = evaluate.load("chrf")

# calculate chrF++ score for english to hindi translation
chrf_score_hin = chrf.compute(predictions=predictions, references=references)
print(f"chrF++ score: {chrf_score_hin['score']}")

# calclate the chrF++ score for hindi to maithili overlap
mai_references = [example["target_text"] for example in in22_mai_test]
chrF_score_mai = chrf.compute(predictions=hin_predictions, references=mai_references)
print(f"chrF++ score: {chrF_score_mai['score']}")

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

NameError: name 'hin_predictions' is not defined

### Setting up LoRA


In [None]:


print(f"Before adding LoRA, Parameter Size: {model.num_parameters()}")

# defining the LoRA configuration
lora_config = LoraConfig(
    r=8,  # The rank of the low-rank adaptation
    lora_alpha=16,  # Scaling factor for the LoRA layers
    lora_dropout=0.1,  # Dropout for the LoRA layers
    task_type="SEQ_2_SEQ_LM",
    bias="none",  # You can set bias as 'none', 'all', or 'lora_only'
    target_modules=["q_proj", "v_proj"]  # Specify the target modules
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print(f"After adding LoRA, Parameter Size: {model.num_parameters()}")


Before adding LoRA, Parameter Size: 76381184
After adding LoRA, Parameter Size: 76676096


### Setup training arguments and trainer


In [None]:
# Define training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Where to save results
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=5e-5,  # Learning rate for fine-tuning
    per_device_train_batch_size=16,  # Batch size (adjust based on GPU memory)
    # gradient_accumulation_steps=2, # backpropagate every 2 steps
    num_train_epochs=1,  # Number of training epochs
    save_steps=1000,  # Save checkpoints after this many steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,
    save_total_limit=2,  # Limit number of saved checkpoints
)

trainer = Seq2SeqTrainer(
    model=model,  # LoRA fine-tuned model
    args=training_args,  # Training parameters
    train_dataset=train_dataset_tokenized,  # Tokenized training dataset
    eval_dataset=val_dataset_tokenized,
    tokenizer=tokenizer  # Tokenizer for handling tokenization during training
)


In [None]:
trainer.train()

model.save_pretrained("./finetuned_epoch")
tokenizer.save_pretrained("./finetuned_epoch")


  0%|          | 0/1691 [00:00<?, ?it/s]

{'loss': 6.3508, 'grad_norm': 2.3712217807769775, 'learning_rate': 4.704316972205796e-05, 'epoch': 0.06}
{'loss': 3.9726, 'grad_norm': 1.2145708799362183, 'learning_rate': 4.408633944411591e-05, 'epoch': 0.12}
{'loss': 2.9302, 'grad_norm': 0.5059202313423157, 'learning_rate': 4.112950916617386e-05, 'epoch': 0.18}
{'loss': 2.7166, 'grad_norm': 0.5383468270301819, 'learning_rate': 3.817267888823182e-05, 'epoch': 0.24}
{'loss': 2.599, 'grad_norm': 0.3628119230270386, 'learning_rate': 3.521584861028977e-05, 'epoch': 0.3}
{'loss': 2.5404, 'grad_norm': 0.3662428855895996, 'learning_rate': 3.225901833234772e-05, 'epoch': 0.35}
{'loss': 2.52, 'grad_norm': 0.34807419776916504, 'learning_rate': 2.9302188054405678e-05, 'epoch': 0.41}
{'loss': 2.4881, 'grad_norm': 0.35338684916496277, 'learning_rate': 2.634535777646363e-05, 'epoch': 0.47}
{'loss': 2.4844, 'grad_norm': 0.3414859473705292, 'learning_rate': 2.3388527498521585e-05, 'epoch': 0.53}
{'loss': 2.4701, 'grad_norm': 0.4289063811302185, 'lear



{'loss': 2.4595, 'grad_norm': 0.5130706429481506, 'learning_rate': 1.7474866942637493e-05, 'epoch': 0.65}
{'loss': 2.4485, 'grad_norm': 0.32000532746315, 'learning_rate': 1.4518036664695447e-05, 'epoch': 0.71}
{'loss': 2.4441, 'grad_norm': 0.30320510268211365, 'learning_rate': 1.15612063867534e-05, 'epoch': 0.77}
{'loss': 2.424, 'grad_norm': 0.323080837726593, 'learning_rate': 8.604376108811355e-06, 'epoch': 0.83}
{'loss': 2.4135, 'grad_norm': 0.47815823554992676, 'learning_rate': 5.647545830869308e-06, 'epoch': 0.89}
{'loss': 2.4244, 'grad_norm': 0.3734165132045746, 'learning_rate': 2.6907155529272622e-06, 'epoch': 0.95}


  0%|          | 0/846 [00:00<?, ?it/s]

{'eval_loss': 2.3473432064056396, 'eval_runtime': 158.4219, 'eval_samples_per_second': 42.709, 'eval_steps_per_second': 5.34, 'epoch': 1.0}
{'train_runtime': 4269.1797, 'train_samples_per_second': 12.678, 'train_steps_per_second': 0.396, 'train_loss': 2.8314020121894172, 'epoch': 1.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/source.spm',
 './fine_tuned_model/target.spm',
 './fine_tuned_model/added_tokens.json')

In [None]:
# Example input text
input_text = "Penicillin is an effective treatment for syphilis in pregnancy but there is no agreement on which dose or route of delivery is most effective"

# Tokenize the input text
encoded_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

# Generate prediction
output_sequences = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=128,
    num_beams=5,
    early_stopping=True
)

# Decode the generated sequences
predicted_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print(predicted_text)

प्राक प्राक सार्रा सारारा सारारारा स्रारा स्रारारा स्रारारा स्रारारारारारारा क स्रारारारारारारारारारारारा स स्रारारारा क स्रा


In [1]:
model

NameError: name 'model' is not defined