# Fine tune Mbart-50-large fro en2hi translation task

### Install required libraries

In [2]:
!pip install -q bitsandbytes peft evaluate datasets 

In [None]:
!pip install -q git+https://github.com/huggingface/transformers.git

## Load and configure models

### Load base model and tokenizer

In [2]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import bitsandbytes as bnb
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="hi_IN")
base_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

### Configure LORA for quantization and attach Peft model with base model

In [None]:
from peft import LoftQConfig, LoraConfig, get_peft_model

loftq_config = LoftQConfig(loftq_bits=4)
lora_config = LoraConfig(r=12,target_modules=["q_proj", "v_proj"], lora_alpha=16, task_type="SEQ_2_SEQ_LM", init_lora_weights="loftq", loftq_config=loftq_config, use_rslora=True)
peft_model = get_peft_model(base_model, lora_config)
peft_model.config.use_cache = False
# peft_model.config.pretraining_tp = 1

### save loaded models (Optional)

In [5]:
base_model.save_pretrained("/content/drive/MyDrive/mbart-large-50-en-hi/base_model")
tokenizer.save_pretrained("/content/drive/MyDrive/mbart-large-50-en-hi/tokenizer")
peft_model.save_pretrained("/content/drive/MyDrive/mbart-large-50-en-hi/peft_model")

### Load Peft model from checkpoint directory (Optional)

In [3]:
from peft import PeftModel
peft_model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/checkpoint-6500")

### List trainable paramters of peft model

In [6]:
peft_model.print_trainable_parameters()

trainable params: 0 || all params: 612,648,960 || trainable%: 0.0000


## Configure Training Arguments

In [None]:
from multiprocessing import log_to_stderr
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="mbart-large-50-en-hi-checkpoints",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,

    do_eval=False,    # eval_strategy="steps", eval_steps=500,


    # load_best_model_at_end=True,
    metric_for_best_model="accuracy", # compute metrics in trainer intialization should return same metric
    greater_is_better=True,
    gradient_accumulation_steps=2,

    # save_strategy="no",  dafault:  save_strategy="steps" save_steps=500 save_total_limit=None

    dataloader_num_workers=2,
    logging_dir='.logs/',
    torch_empty_cache_steps=500,

    report_to="none",
    optim="adamw_bnb_8bit",

)

### Compute metric for evaluation and training

In [None]:
import numpy as np
from evaluate import load
metric = load("accuracy")


def compute_metric(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

## Prepare Data

### Load dataset from disk

In [None]:
from datasets import Dataset
train_dataset = Dataset.from_csv("/content/train_en_hi.csv")
# eval_dataset = Dataset.from_csv("/content/valid_en_hi.csv")

### Filter non empty rows (either src or tgt)

In [None]:
train_dataset = train_dataset.filter(lambda example: all(v is not None for v in example.values()))
# eval_dataset = eval_dataset.filter(lambda example: all(v is not None for v in example.values()))

### Unicode normalization

In [None]:
import unicodedata
def unicode_normalization(sent, type="NFKC"):
  return unicodedata.normalize("NFKC", sent)

train_dataset = train_dataset.map(lambda ex: {"en": unicode_normalization(ex["en"]), "hi": unicode_normalization(ex["hi"])})
# eval_dataset = eval_dataset.map(lambda ex: {"en": unicode_normalization(ex["en"]), "hi": unicode_normalization(ex["hi"])})

In [None]:
print(train_dataset["en"][:2])
print(train_dataset["hi"][:2])

['The company decided to treat the unfunded income as profit.', 'Unguentum is an ointment used to apply on the wound']
['कंपनी ने अनिधिक आय को लाभ के रूप में दर्शाना तय किया। ', 'मरहम एक लेप है जो जखमों पर लगाने के काम आता है. ']


### Filter on the basis of length of both source and target sentences and ratio of lengths of source and of target sentences

In [None]:
def ratio_length_filters(src, tgt, min_len=1, max_len=64, ratio=3.0):
  ls = len(src.split())
  lt = len(tgt.split())
  if ls < min_len or ls > max_len:
    return False
  if lt < min_len or lt > max_len:
    return False
  if max(ls/lt if lt != 0 else 999, lt/ls if ls != 0 else 999) > ratio:
    return False
  return True

train_dataset = train_dataset.filter(lambda ex: ratio_length_filters(ex["en"], ex["hi"]))
# eval_dataset = eval_dataset.filter(lambda ex: ratio_length_filters(ex["en"], ex["hi"]))

Filter:   0%|          | 0/399289 [00:00<?, ? examples/s]

### Deduplication of rows

In [None]:
def deduplication(dataset):
  seen = set()
  out = []
  for pair in dataset:
    if tuple((pair['en'], pair['hi'])) in seen:
      continue
    seen.add(tuple((pair['en'], pair['hi'])))
    out.append(pair)
  return out

train_dataset = Dataset.from_list(deduplication(train_dataset))
# eval_dataset = Dataset.from_list(deduplication(eval_dataset))

In [None]:
print(train_dataset["en"][:2])
print(train_dataset["hi"][:2])
print(train_dataset)

['The company decided to treat the unfunded income as profit.', 'Unguentum is an ointment used to apply on the wound']
Dataset({
    features: ['en', 'hi'],
    num_rows: 382956
})


## Tokenize Data

In [None]:
def preprocess_function(examples, tokenizer, max_length=96):
    inputs = [ex for ex in examples["en"]]
    targets = [ex for ex in examples["hi"]]
    model_inputs = tokenizer(inputs, text_target=targets, max_length=max_length, truncation=True, return_tensors="pt", padding=True)
    return model_inputs

In [None]:
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "hi_IN"
train_dataset = train_dataset.map(lambda ex: preprocess_function(ex, tokenizer), batched=True, remove_columns=["en", "hi"])
# eval_dataset = eval_dataset.map(lambda ex: preprocess_function(ex, tokenizer), batched=True, remove_columns=["en", "hi"])

Map:   0%|          | 0/382956 [00:00<?, ? examples/s]

In [None]:
print(train_dataset.column_names)
# print(eval_dataset.column_names)

['input_ids', 'attention_mask', 'labels']


## Configure Trainer Arguments and Data Collator for data loading

In [None]:
from transformers import Trainer, DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=peft_model)

trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    compute_metrics=compute_metric,
    data_collator=data_collator
)

## Run Training

In [None]:
trainer.train()

Step,Training Loss
500,10.781
1000,9.0443
1500,8.7532
2000,8.6187
2500,8.2118
3000,8.1094
3500,8.0561
4000,8.0856
4500,7.8949
5000,7.7548


KeyboardInterrupt: 

## Save trained model

In [None]:
trainer.model.save_pretrained("mbart-large-50-en-hi-uni")

## Empty RAM Space

In [None]:
del base_model
del peft_model
del tokenizer
del train_dataset
del eval_dataset
del data_collator
del trainer

NameError: name 'base_model' is not defined

# Inference

### Load Models

In [None]:
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import bitsandbytes as bnb
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="hi_IN")
base_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
peft_model = PeftModel.from_pretrained(base_model, "/content/drive/MyDrive/checkpoint-6500")

### Test data

In [19]:
test = [
  {
    "english": "Hello.",
    "hindi": "नमस्ते।"
  },
  {
    "english": "How are you?",
    "hindi": "आप कैसे हैं?"
  },
  {
    "english": "I am fine.",
    "hindi": "मैं ठीक हूँ।"
  },
  {
    "english": "What is your name?",
    "hindi": "आपका नाम क्या है?"
  },
  {
    "english": "My name is John.",
    "hindi": "मेरा नाम जॉन है।"
  },
  {
    "english": "Where are you going?",
    "hindi": "आप कहाँ जा रहे हैं?"
  },
  {
    "english": "I am going home.",
    "hindi": "मैं घर जा रहा हूँ।"
  },
  {
    "english": "Thank you.",
    "hindi": "धन्यवाद।"
  },
  {
    "english": "Good morning.",
    "hindi": "शुभ प्रभात।"
  },
  {
    "english": "Good night.",
    "hindi": "शुभ रात्रि।"
  },
  {
    "english": "Please help me.",
    "hindi": "कृपया मेरी मदद करें।"
  },
  {
    "english": "What time is it?",
    "hindi": "अभी क्या समय हुआ है?"
  },
  {
    "english": "I don't understand.",
    "hindi": "मुझे समझ में नहीं आया।"
  },
  {
    "english": "Can you speak English?",
    "hindi": "क्या आप अंग्रेज़ी बोल सकते हैं?"
  }
]

### Get predictions

In [21]:
for d in test:
    src =  d['english'].strip().lower()
    tgt = d['hindi']
    src_tokens = tokenizer(src, return_tensors="pt", padding=True, truncation=True).to(peft_model.device)
    translated_tokens = peft_model.generate(**src_tokens)
    tgt_trans = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    print(f"Source: {src}")
    print(f"Target: {tgt}")
    print(f"Translation: {tgt_trans}")
    print()

Source: hello.
Target: नमस्ते।
Translation: आओ।

Source: how are you?
Target: आप कैसे हैं?
Translation: आप कैसा हैं?

Source: i am fine.
Target: मैं ठीक हूँ।
Translation: मैं ठीक हूँ।

Source: what is your name?
Target: आपका नाम क्या है?
Translation: आपका नाम क्या है?

Source: my name is john.
Target: मेरा नाम जॉन है।
Translation: मेरी नाम हरी है।

Source: where are you going?
Target: आप कहाँ जा रहे हैं?
Translation: कहाँ जा रहे हो?

Source: i am going home.
Target: मैं घर जा रहा हूँ।
Translation: मैं घर जा रहा हूँ।

Source: thank you.
Target: धन्यवाद।
Translation: धन्यवाद।

Source: good morning.
Target: शुभ प्रभात।
Translation: सुबह अच्छी है।

Source: good night.
Target: शुभ रात्रि।
Translation: अच्छा रात।

Source: please help me.
Target: कृपया मेरी मदद करें।
Translation: कृपया मदद करें।

Source: what time is it?
Target: अभी क्या समय हुआ है?
Translation: क्या समय है?

Source: i don't understand.
Target: मुझे समझ में नहीं आया।
Translation: मैं समझ नहीं पाता।

Source: can you speak engl

# Mount Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')