# Reducing GPU VRAM usage
1. LoRA + gradient checkpointing + max_len=64 + batch=1

2. Use adamw_bnb_8bit optimizer if available.

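3. Turn off predict_with_generate during training (use it only for final evaluation). Note that generation-based metrics such as BLEU need generated token ids, so they can only be computed in evaluations that run with it enabled.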

## LoRA PEFT
```python
from peft import LoraConfig, get_peft_model

# Illustrative snippet: `model` must already be loaded before this runs.
# The adapter configuration targets the modules named q_proj and v_proj.
config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
# Rebinds `model` to the PEFT-wrapped version returned by get_peft_model.
model = get_peft_model(model, config)
```

## Optimizer: adamw_bnb_8bit
```python
# Illustrative snippet: `...` stands in for the remaining training arguments.
args_tr = Seq2SeqTrainingArguments(
    ...
    optim="adamw_bnb_8bit"
)
```

# Translation - Practice

## Load datasets

### IITB EN-HI dataset

In [None]:
from datasets import load_dataset

# Hub identifier of the IITB English-Hindi corpus used throughout the notebook.
IITB_DATASET_ID = "cfilt/iitb-english-hindi"
iitb_en_hi = load_dataset(IITB_DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

### Train dataset and Eval dataset (Subset of IITB EN-HI) {400000 pairs}

In [None]:
from datasets import Dataset


def _translation_subset(split, start=None, stop=None):
    """Build a flat {"en", "hi"} Dataset from rows ``[start:stop]`` of a split.

    Rows are sliced first and the "translation" column is read from the
    resulting batch, so this does not rely on string-indexing a column object
    (which only works on `datasets` releases that return a lazy column).

    Args:
        split: A split of `iitb_en_hi` whose "translation" column holds
            dicts with "en" and "hi" entries.
        start: First row index to include (None = from the beginning).
        stop: Row index to stop before (None = to the end).

    Returns:
        A Dataset with two string columns, "en" and "hi".
    """
    pairs = split[start:stop]["translation"]
    return Dataset.from_dict({
        "en": [pair["en"] for pair in pairs],
        "hi": [pair["hi"] for pair in pairs],
    })


# 400,000 training pairs taken from the middle of the corpus; full validation split.
train_en_hi = _translation_subset(iitb_en_hi['train'], 1000000, 1400000)
valid_en_hi = _translation_subset(iitb_en_hi['validation'])
# test_en_hi = _translation_subset(iitb_en_hi['test'])

### Convert to CSV and save

In [None]:
# Keep `train_en_hi` / `valid_en_hi` bound to the Dataset objects: the
# tokenization cell further down calls Dataset.map(..., batched=True) on them,
# which breaks if these names are rebound to pandas DataFrames.
# to_pandas() takes no name argument, so none is passed.
train_en_hi_df = train_en_hi.to_pandas()
valid_en_hi_df = valid_en_hi.to_pandas()

In [None]:
import pandas as pd

# Write each split to its own CSV file, without the pandas index column.
for split_rows, csv_path in (
    (train_en_hi, 'train_en_hi.csv'),
    (valid_en_hi, 'valid_en_hi.csv'),
):
    pd.DataFrame(split_rows).to_csv(csv_path, index=False)

### Preprocess function (Old Way)

In [None]:
def preprocess_function(examples, tokenizer, max_length=64):
    """Tokenize a batch of English -> Hindi sentence pairs for seq2seq training.

    Sources and targets are encoded in a single tokenizer call via
    ``text_target``, which replaces the deprecated
    ``tokenizer.as_target_tokenizer()`` context manager.

    Args:
        examples: Batch mapping with an "en" entry (source sentences) and a
            "hi" entry (target sentences).
        tokenizer: Tokenizer callable that accepts ``text_target``,
            ``max_length`` and ``truncation`` keyword arguments.
        max_length: Truncation limit handed to the tokenizer.

    Returns:
        The tokenizer output for the source sentences; the tokenizer stores
        the target token ids in it under "labels".
    """
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["hi"],
        max_length=max_length,
        truncation=True,
    )
    return model_inputs

### Download MBart-Large-50 model and tokenizer
### Attach Peft Model with base model

In [None]:
!pip install peft -q

In [None]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
from peft import LoraConfig, get_peft_model

# One checkpoint id shared by the model and its tokenizer.
MBART_CHECKPOINT = "facebook/mbart-large-50"

model = MBartForConditionalGeneration.from_pretrained(MBART_CHECKPOINT)
tokenizer = MBart50TokenizerFast.from_pretrained(
    MBART_CHECKPOINT, src_lang="en_XX", tgt_lang="hi_IN"
)

# Enable gradient checkpointing on the base model before it is wrapped.
model.gradient_checkpointing_enable()

# LoRA adapter settings, collected in one place before building the config.
lora_settings = dict(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)
config = LoraConfig(**lora_settings)

# Wrap the base model with the adapter and report the trainable parameter count.
model = get_peft_model(model, config)
model.print_trainable_parameters()


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

### Tokenize Source and Target Languages

In [None]:
# Source = English, target = Hindi (assigned in that order).
tokenizer.src_lang, tokenizer.tgt_lang = "en_XX", "hi_IN"


def _tokenize_batch(batch):
    """Apply preprocess_function to one batch using the notebook's tokenizer."""
    return preprocess_function(batch, tokenizer)


train_tokenized = train_en_hi.map(_tokenize_batch, batched=True)
valid_tokenized = valid_en_hi.map(_tokenize_batch, batched=True)

Map:   0%|          | 0/600000 [00:00<?, ? examples/s]



Map:   0%|          | 0/520 [00:00<?, ? examples/s]

### Optionally Save to disk

In [None]:
# Persist each tokenized split in a directory named after its variable.
for tokenized_split, target_dir in (
    (train_tokenized, "train_tokenized"),
    (valid_tokenized, "valid_tokenized"),
):
    tokenized_split.save_to_disk(target_dir)

Saving the dataset (0/2 shards):   0%|          | 0/859083 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/520 [00:00<?, ? examples/s]

### Configure Trainer Arguments

In [None]:
from transformers import Seq2SeqTrainingArguments

out_dir = "outputs/mbart_en_hi"
batch_size = 1              # one sequence per device per step to keep VRAM usage minimal
lr = 3e-5
num_epochs = 2

args_tr = Seq2SeqTrainingArguments(
    output_dir=out_dir,

    eval_strategy="steps",
    eval_steps=2000,             # evaluate every 2000 steps (adjust as needed)
    save_steps=2000,             # save a checkpoint on the same 2000-step schedule

    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=500,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    gradient_accumulation_steps=8,   # effective batch size = batch_size * 8
    learning_rate=lr,

    num_train_epochs=num_epochs,

    fp16=True,                       # mixed precision (reduces VRAM usage, speeds up)

    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,

    # BLEU is the model-selection metric, so every evaluation must hand
    # generated token ids (not logits) to compute_metrics.
    predict_with_generate=True,
    generation_max_length=64,        # same limit as the tokenization max_length

    # Set to 0 to disable worker subprocesses if system RAM is tight
    dataloader_num_workers=1,
    report_to="none",

    optim="adamw_bnb_8bit"
)


### Define compute_metrics function for evaluation purpose

In [None]:
!pip install sacrebleu -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from sacrebleu import corpus_bleu as metric
def compute_metrics(eval_pred):
    """Compute corpus-level BLEU for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``. Predictions may be token
            ids, or logits (optionally wrapped in a tuple whose first item is
            the logits); labels use -100 for ignored positions.

    Returns:
        ``{"bleu": <float>}`` with the corpus BLEU score.
    """
    import numpy as np

    preds, labels = eval_pred
    # Some models return a tuple of outputs; the first item holds the predictions.
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    # Logits (batch, seq, vocab) arrive when generation is disabled; take the
    # greedy token per position. This is a teacher-forced approximation only.
    if preds.ndim == 3:
        preds = preds.argmax(axis=-1)

    # -100 marks ignored/padded positions and cannot be decoded; swap in the pad id.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.asarray(labels)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    # `metric` is sacrebleu's corpus_bleu function: it takes the hypotheses and
    # a list of reference streams, and returns an object exposing `.score`.
    bleu = metric(decoded_preds, [decoded_labels])
    return {"bleu": bleu.score}

### Define Data Collator function to run on data batches

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator handed to the trainer to assemble batches; built from the
# notebook's tokenizer and the (PEFT-wrapped) model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

### Instantiate Trainer for training

In [None]:
from transformers import Seq2SeqTrainer

# `processing_class` is the current name of the deprecated `tokenizer`
# argument; passing `tokenizer=` triggers a deprecation warning on this call.
trainer = Seq2SeqTrainer(
    model=model,
    args=args_tr,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,  # tokenized IITB validation split
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


### Clean GPU cache and disable WANDB logging

In [None]:
import gc

import torch

# Collect unreachable Python objects first so any GPU tensors they still hold
# are freed; empty_cache() can only release memory that is no longer in use.
gc.collect()
torch.cuda.empty_cache()

In [None]:
!export WANDB_DISABLED=true

### Run training

In [None]:
# Start fine-tuning with the `trainer` built in the earlier cell.
trainer.train()

OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 800.12 MiB is free. Process 12915 has 13.96 GiB memory in use. Of the allocated memory 13.67 GiB is allocated by PyTorch, and 157.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)