In [10]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from adapters import BnConfig, AutoAdapterModel
from datasets import Dataset
from peft import LoraConfig, get_peft_model

import torch
device = torch.device("mps")

### Preparing the dataset


In [11]:
# loading the english and maithili texts files
with open("./dataset/train/bpcc/train.eng_Latn", "r", encoding="utf-8") as en_file:
    eng_texts = en_file.readlines()

with open("./dataset/train/bpcc/train.mai_Deva", "r", encoding="utf-8") as maithili_file:
    mai_texts = maithili_file.readlines()

assert len(eng_texts) == len(mai_texts), "The number of sentences in both files must be the same."

# cleaning the text files
eng_texts_c = [text.strip() for text in eng_texts]
mai_texts_c = [text.strip() for text in mai_texts]

# creating the dataset
data = {
    "source_text": eng_texts_c,
    "target_text": mai_texts_c
}
dataset = Dataset.from_dict(data)

# creating the train, val, test set
train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1).values()
val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5).values()

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")


Training set size: 60892
Validation set size: 3383
Test set size: 3383


### Tokenizing the data


In [9]:
# loading the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")

# defining the preprocessing function
def preprocess_function(examples):
    inputs = tokenizer(examples["source_text"], truncation=True, padding="max_length", max_length=128)
    targets = tokenizer(examples["target_text"], truncation=True, padding="max_length", max_length=128)
    inputs["labels"] = targets["input_ids"]
    return inputs

# applying preprocessing function to each split
train_dataset_tokenized = train_dataset.map(preprocess_function, batched=True)
val_dataset_tokenized = val_dataset.map(preprocess_function, batched=True)
test_dataset_tokenized = test_dataset.map(preprocess_function, batched=True)

# saving dataset to csv (backup)
train_dataset_tokenized.to_csv("./dataset/training/bpcc/train_dataset.csv")
val_dataset_tokenized.to_csv("./dataset/training/bpcc/val_dataset.csv")
test_dataset_tokenized.to_csv("./dataset/training/bpcc/test_dataset.csv")

Map:   0%|          | 0/60892 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

Map:   0%|          | 0/3383 [00:00<?, ? examples/s]

Creating CSV from Arrow format:   0%|          | 0/61 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

7350853

In [143]:
# moving the dataset to MPS (for training purposes)
def move_to_device(batch):
    # move each tensor in the batch to the MPS device
    for key in batch:
        batch[key] = torch.tensor(batch[key]).to(device)
    return batch

train_dataset_tokenized = train_dataset_tokenized.with_transform(move_to_device)
val_dataset_tokenized = val_dataset_tokenized.with_transform(move_to_device)

### Setting up LoRA


In [144]:
model_name = "Helsinki-NLP/opus-mt-en-hi"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

print(f"Before adding LoRA, Parameter Size: {model.num_parameters()}")

# defining the LoRA configuration
lora_config = LoraConfig(
    r=8,  # The rank of the low-rank adaptation
    lora_alpha=16,  # Scaling factor for the LoRA layers
    lora_dropout=0.1,  # Dropout for the LoRA layers
    task_type="SEQ_2_SEQ_LM",
    bias="none",  # You can set bias as 'none', 'all', or 'lora_only'
    target_modules=["q_proj", "v_proj"]  # Specify the target modules
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)
print(f"After adding LoRA, Parameter Size: {model.num_parameters()}")


Before adding LoRA, Parameter Size: 76381184
After adding LoRA, Parameter Size: 76676096


### Setup training arguments and trainer


In [None]:
# Define training parameters
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Where to save results
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    learning_rate=5e-5,  # Learning rate for fine-tuning
    per_device_train_batch_size=16,  # Batch size (adjust based on GPU memory)
    # gradient_accumulation_steps=2, # backpropagate every 2 steps
    num_train_epochs=1,  # Number of training epochs
    save_steps=1000,  # Save checkpoints after this many steps
    logging_dir="./logs",  # Directory for logs
    logging_steps=100,
    save_total_limit=2,  # Limit number of saved checkpoints
)

trainer = Seq2SeqTrainer(
    model=model,  # LoRA fine-tuned model
    args=training_args,  # Training parameters
    train_dataset=train_dataset_tokenized,  # Tokenized training dataset
    eval_dataset=val_dataset_tokenized,
    tokenizer=tokenizer  # Tokenizer for handling tokenization during training
)


In [None]:
trainer.train()

model.save_pretrained("./finetuned_epoch")
tokenizer.save_pretrained("./finetuned_epoch")


  0%|          | 0/1691 [00:00<?, ?it/s]

{'loss': 6.3508, 'grad_norm': 2.3712217807769775, 'learning_rate': 4.704316972205796e-05, 'epoch': 0.06}
{'loss': 3.9726, 'grad_norm': 1.2145708799362183, 'learning_rate': 4.408633944411591e-05, 'epoch': 0.12}
{'loss': 2.9302, 'grad_norm': 0.5059202313423157, 'learning_rate': 4.112950916617386e-05, 'epoch': 0.18}
{'loss': 2.7166, 'grad_norm': 0.5383468270301819, 'learning_rate': 3.817267888823182e-05, 'epoch': 0.24}
{'loss': 2.599, 'grad_norm': 0.3628119230270386, 'learning_rate': 3.521584861028977e-05, 'epoch': 0.3}
{'loss': 2.5404, 'grad_norm': 0.3662428855895996, 'learning_rate': 3.225901833234772e-05, 'epoch': 0.35}
{'loss': 2.52, 'grad_norm': 0.34807419776916504, 'learning_rate': 2.9302188054405678e-05, 'epoch': 0.41}
{'loss': 2.4881, 'grad_norm': 0.35338684916496277, 'learning_rate': 2.634535777646363e-05, 'epoch': 0.47}
{'loss': 2.4844, 'grad_norm': 0.3414859473705292, 'learning_rate': 2.3388527498521585e-05, 'epoch': 0.53}
{'loss': 2.4701, 'grad_norm': 0.4289063811302185, 'lear



{'loss': 2.4595, 'grad_norm': 0.5130706429481506, 'learning_rate': 1.7474866942637493e-05, 'epoch': 0.65}
{'loss': 2.4485, 'grad_norm': 0.32000532746315, 'learning_rate': 1.4518036664695447e-05, 'epoch': 0.71}
{'loss': 2.4441, 'grad_norm': 0.30320510268211365, 'learning_rate': 1.15612063867534e-05, 'epoch': 0.77}
{'loss': 2.424, 'grad_norm': 0.323080837726593, 'learning_rate': 8.604376108811355e-06, 'epoch': 0.83}
{'loss': 2.4135, 'grad_norm': 0.47815823554992676, 'learning_rate': 5.647545830869308e-06, 'epoch': 0.89}
{'loss': 2.4244, 'grad_norm': 0.3734165132045746, 'learning_rate': 2.6907155529272622e-06, 'epoch': 0.95}


  0%|          | 0/846 [00:00<?, ?it/s]

{'eval_loss': 2.3473432064056396, 'eval_runtime': 158.4219, 'eval_samples_per_second': 42.709, 'eval_steps_per_second': 5.34, 'epoch': 1.0}
{'train_runtime': 4269.1797, 'train_samples_per_second': 12.678, 'train_steps_per_second': 0.396, 'train_loss': 2.8314020121894172, 'epoch': 1.0}


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/source.spm',
 './fine_tuned_model/target.spm',
 './fine_tuned_model/added_tokens.json')

In [None]:
# Example input text
input_text = "Penicillin is an effective treatment for syphilis in pregnancy but there is no agreement on which dose or route of delivery is most effective"

# Tokenize the input text
encoded_input = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

# Generate prediction
output_sequences = model.generate(
    input_ids=encoded_input["input_ids"],
    attention_mask=encoded_input["attention_mask"],
    max_length=128,
    num_beams=5,
    early_stopping=True
)

# Decode the generated sequences
predicted_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)

print(predicted_text)

प्राक प्राक सार्रा सारारा सारारारा स्रारा स्रारारा स्रारारा स्रारारारारारारा क स्रारारारारारारारारारारारा स स्रारारारा क स्रा


In [1]:
model

NameError: name 'model' is not defined