In [136]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from adapters import BnConfig, AutoAdapterModel
from datasets import Dataset
from peft import LoraConfig, get_peft_model

import torch
device = torch.device("mps")

### Preparing the dataset


In [137]:
# loading the english and maithili texts files
with open("./dataset/train/bpcc/train.eng_Latn", "r", encoding="utf-8") as en_file:
    eng_texts = en_file.readlines()

with open("./dataset/train/bpcc/train.mai_Deva", "r", encoding="utf-8") as maithili_file:
    mai_texts = maithili_file.readlines()

assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

# cleaning the text files
eng_texts_c = [text.strip() for text in eng_texts]
mai_texts_c = [text.strip() for text in mai_texts]

# creating the dataset
data = {
    "source_text": eng_texts_c,
    "target_text": mai_texts_c
}
dataset = Dataset.from_dict(data)

# creating the train, val, test set
train_dataset, temp_dataset = dataset.train_test_split(test_size=0.2).values()
val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")


Training set size: 54126
Validation set size: 6766
Test set size: 6766


### Tokenizing the data


In [None]:
# loading the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# defining the preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# applying preprocessing function to each split
train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True)
val_dataset_tokenized = val_dataset.map(preprocess_function, batched=True)
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)

# saving dataset to csv (backup)
train_dataset_tokenized.to_csv("./dataset/training/bpcc/train_dataset.csv")
val_dataset_tokenized.to_csv("./dataset/training/bpcc/val_dataset.csv")
test_dataset_tokenized.to_csv("./dataset/training/bpcc/test_dataset.csv")

Map:   0%|          | 0/54126 [00:00<?, ? examples/s]

Map:   0%|          | 0/6766 [00:00<?, ? examples/s]

Map:   0%|          | 0/6766 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/55 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/7 [00:00<?, ?ba/s]

14677570

In [139]:
# moving the dataset to MPS (for training purposes)
def move_to_device(batch):
    # move each tensor in the batch to the MPS device
    for key in batch:
        batch[key] = torch.tensor(batch[key]).to(device)
    return batch

train_dataset_tokenized = train_dataset_tokenized.with_transform(move_to_device)
val_dataset_tokenized = val_dataset_tokenized.with_transform(move_to_device)

### Setting up LoRA


In [140]:
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

print(f"Before adding LoRA, Parameter Size: {model.num_parameters()}")

# defining the LoRA configuration
lora_config = LoraConfig(
    r=8,  # The rank of the low-rank adaptation
    lora_alpha=16,  # Scaling factor for the LoRA layers
    lora_dropout=0.1,  # Dropout for the LoRA layers
    task_type="SEQ_2_SEQ_LM",
    bias="none",  # You can set bias as 'none', 'all', or 'lora_only'
    target_modules=["q_proj", "v_proj"]  # Specify the target modules
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print(f"After adding LoRA, Parameter Size: {model.num_parameters()}")


Before adding LoRA, Parameter Size: 76381184
After adding LoRA, Parameter Size: 76676096


### Setup training arguments and trainer


In [None]:
# Define training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Where to save results
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=5e-5,  # Learning rate for fine-tuning
    per_device_train_batch_size=16,  # Batch size (adjust based on GPU memory)
    gradient_accumulation_steps=2,
    num_train_epochs=1,  # Number of training epochs
    save_steps=1000,  # Save checkpoints after this many steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,
    save_total_limit=2,  # Limit number of saved checkpoints
)

trainer = Seq2SeqTrainer(
    model=model,  # LoRA fine-tuned model
    args=training_args,  # Training parameters
    train_dataset=train_dataset_tokenized,  # Tokenized training dataset
    eval_dataset=val_dataset_tokenized,
    tokenizer=tokenizer  # Tokenizer for handling tokenization during training
)


In [None]:
trainer.train()

model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


  0%|          | 0/1480 [00:00<?, ?it/s]

{'loss': 6.344, 'grad_norm': 2.259830951690674, 'learning_rate': 4.662162162162162e-05, 'epoch': 0.07}
{'loss': 4.0051, 'grad_norm': 1.1423373222351074, 'learning_rate': 4.324324324324325e-05, 'epoch': 0.14}
{'loss': 2.992, 'grad_norm': 0.47160857915878296, 'learning_rate': 3.986486486486487e-05, 'epoch': 0.2}
{'loss': 2.7376, 'grad_norm': 0.4085295796394348, 'learning_rate': 3.648648648648649e-05, 'epoch': 0.27}
{'loss': 2.6265, 'grad_norm': 0.39469751715660095, 'learning_rate': 3.310810810810811e-05, 'epoch': 0.34}
{'loss': 2.5549, 'grad_norm': 0.3498665392398834, 'learning_rate': 2.9729729729729733e-05, 'epoch': 0.41}
{'loss': 2.5244, 'grad_norm': 0.3561296761035919, 'learning_rate': 2.635135135135135e-05, 'epoch': 0.47}
{'loss': 2.4922, 'grad_norm': 0.34938687086105347, 'learning_rate': 2.2972972972972976e-05, 'epoch': 0.54}
{'loss': 2.5033, 'grad_norm': 0.3200814425945282, 'learning_rate': 1.9594594594594595e-05, 'epoch': 0.61}
{'loss': 2.4628, 'grad_norm': 0.39211922883987427, 'l



{'loss': 2.4606, 'grad_norm': 0.3618454039096832, 'learning_rate': 1.2837837837837838e-05, 'epoch': 0.74}
{'loss': 2.4286, 'grad_norm': 0.36493194103240967, 'learning_rate': 9.45945945945946e-06, 'epoch': 0.81}
{'loss': 2.4656, 'grad_norm': 0.2945733964443207, 'learning_rate': 6.081081081081082e-06, 'epoch': 0.88}
{'loss': 2.4524, 'grad_norm': 0.3679347038269043, 'learning_rate': 2.702702702702703e-06, 'epoch': 0.95}


ValueError: Trainer: evaluation requires an eval_dataset.