In [1]:
# Step 1: Install required packages (clean installation)
!pip install --quiet git+https://github.com/VarunGumma/IndicTransToolkit.git --use-pep517
!pip install --quiet transformers sentencepiece accelerate
# Install the appropriate bitsandbytes version for GPU
!pip install --quiet bitsandbytes

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Building wheel for IndicTransToolkit (pyproject.toml) ... [?25l[?25hdone
  Building wheel for indic-nlp-library-IT2 (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.0/76.0 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m:00:01

In [2]:
import torch
import warnings
import os
from IndicTransToolkit.processor import IndicProcessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training

# Suppress warnings
warnings.filterwarnings("ignore")
os.environ["WANDB_DISABLED"] = "true"

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Set model name
model_name = "prajdabre/rotary-indictrans2-en-indic-dist-200M"

# Initialize processor
ip = IndicProcessor(inference=True)

# Initialize tokenizer with correct settings
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load dataset
raw_datasets = load_dataset("cfilt/iitb-english-hindi")
print(f"Dataset loaded: {raw_datasets}")

tokenizer_config.json:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.04k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/prajdabre/rotary-indictrans2-en-indic-dist-200M:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/759k [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/190M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/85.7k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/500k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/520 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2507 [00:00<?, ? examples/s]

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 1659083
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 520
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 2507
    })
})


In [4]:
# Define preprocessing function with proper target tokenization
def preprocess_function(examples):
    # Unpack the dict-of-lists
    translations = examples["translation"]
    en_sentences = [t["en"] for t in translations]
    hi_sentences = [t["hi"] for t in translations]
    
    # Transliterate & normalize
    inputs = ip.preprocess_batch(en_sentences, src_lang="eng_Latn", tgt_lang="hin_Deva")
    targets = ip.preprocess_batch(hi_sentences, src_lang="hin_Deva", tgt_lang="hin_Deva")
    
    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True)
    
    # Tokenize targets - properly handle target tokenization
    labels = tokenizer(targets, max_length=128, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

# Process datasets
print("Processing datasets...")
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=["translation"],  # Drop the original dict-column
)




Processing datasets...


Map:   0%|          | 0/1659083 [00:00<?, ? examples/s]

Map:   0%|          | 0/520 [00:00<?, ? examples/s]

Map:   0%|          | 0/2507 [00:00<?, ? examples/s]

In [11]:
# # Create smaller datasets for experimentation
# small_train = tokenized_datasets["train"].shuffle(seed=42).select(range(50_000))  # 50k for training
# small_val = tokenized_datasets["test"].shuffle(seed=42).select(range(2_500))  # 2.5k for validation
# print(f"Training samples: {len(small_train)}, Validation samples: {len(small_val)}")


Training samples: 50000, Validation samples: 2500


In [5]:
# Load model for training
print("Loading model for training...")
# Set this environment variable to avoid copying issues
os.environ["HF_8BIT_SKIP_CONVERT_CHECK"] = "1"

# Load model in regular fp16 mode instead of 8-bit
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map=None,  # Load to CPU first
    trust_remote_code=True,
    state_dict=None  # Will force loading weights normally
)
if device == "cuda":
    model = model.to(device)

Loading model for training...


config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

configuration_rotary_indictrans.py:   0%|          | 0.00/6.56k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/prajdabre/rotary-indictrans2-en-indic-dist-200M:
- configuration_rotary_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_rotary_indictrans.py:   0%|          | 0.00/68.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/prajdabre/rotary-indictrans2-en-indic-dist-200M:
- modeling_rotary_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/847M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

In [6]:
# Prepare model for training with LoRA
if device == "cuda":
    # Skip prepare_model_for_kbit_training since we're not loading in 8-bit/4-bit
    # If you're not using quantization, you can omit this line
    # model = prepare_model_for_kbit_training(model)
    
    # Identify the correct target modules for this model architecture
    # Let's get the correct module names from the model
    target_modules = []
    for name, module in model.named_modules():
        if "q_proj" in name or "v_proj" in name:
            module_name = name.split('.')[-1]
            if module_name not in target_modules:
                target_modules.append(module_name)
    
    print(f"Using LoRA target modules: {target_modules}")
    
    # Configure LoRA
    lora_cfg = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=target_modules,
        lora_dropout=0.1,
        bias="none",
        task_type="SEQ_2_SEQ_LM",
    )
    
    # Apply LoRA to model
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

Using LoRA target modules: ['v_proj', 'q_proj']
trainable params: 884,736 || all params: 212,661,248 || trainable%: 0.4160


In [7]:
# Data collator for dynamic padding
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Configure training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./lora_iitb",
    per_device_train_batch_size=8 if device == "cuda" else 4,
    gradient_accumulation_steps=4,
    num_train_epochs=1,
    learning_rate=3e-4,
    optim="paged_adamw_8bit" if device == "cuda" else "adamw_torch",
    fp16=device == "cuda",
    report_to=["none"],
    save_strategy="steps",
    save_steps=5000,
    evaluation_strategy="steps",
    eval_steps=5000,
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [None]:
# Initialize trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # train_dataset=small_train,
    # eval_dataset=small_val,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Run training
print("Starting training...")
trainer.train()
print("Training completed!")


Starting training...


Step,Training Loss,Validation Loss


In [17]:
# Save the fine-tuned model
model.save_pretrained("./final_lora_iitb")
tokenizer.save_pretrained("./final_lora_iitb")
print("Model saved!")


Model saved!


In [18]:
test_sentences = [
    """As I wandered through the bustling city streets on a warm, sunny afternoon, 
    the world around me seemed to hum with life. The vibrant colors of flowers blooming 
    in the meticulously maintained parks painted a vivid contrast against the steel and 
    glass of the towering skyscrapers that loomed above, reflecting the bright blue sky 
    in their countless windows. The sunlight, filtering through the canopy of leaves in 
    the small, scattered patches of greenery that dotted the city, cast dappled shadows 
    on the pavement below, where the rhythmic clatter of footsteps and the distant hum 
    of traffic merged into a soothing, urban symphony. Street performers, positioned at 
    nearly every corner, played their guitars, violins, and saxophones, their melodies 
    blending into a medley of sounds that flowed like a river through the streets, carried
    by the laughter and chatter of people spilling out of bustling cafés and restaurants, 
    their faces alight with the joy of an afternoon well spent. Children raced through the 
    playgrounds, their laughter carried on the gentle breeze that occasionally swept 
    through the streets, bringing with it the scent of freshly baked bread from a nearby 
    bakery and the tang of street food sizzling on grills as vendors called out to passersby. 
    In the midst of all this vibrant activity, I found myself swept up in the energy of the 
    city, its pulse echoing in my ears as I walked along, watching the ever-changing tapestry
    of life unfold before me. The city was a living, breathing organism, constantly in motion, 
    and I couldn’t help but marvel at how different this world felt from the one I had known 
    for so long. It hadn’t been that long ago, only a few months, that I had lived in the 
    countryside, surrounded by rolling hills and quiet fields that stretched as far as the 
    eye could see. The mornings there had always been peaceful, almost meditative in their 
    stillness, with only the soft rustle of leaves in the trees, the gentle whisper of the 
    wind as it passed through the tall grasses, and the occasional chirping of birds breaking 
    the silence. I would wake early, just as the first rays of sunlight crept over the horizon, 
    casting a soft, golden light over the landscape. The air would be cool and crisp, filled 
    with the earthy scent of dew-soaked soil and the faint aroma of wildflowers. Those mornings 
    had their own kind of beauty, a beauty that came from the quiet simplicity of nature, from 
    the feeling of being alone in the world, surrounded by nothing but the sounds of the earth 
    waking up. I would sit on the porch of my small cottage, with a cup of steaming coffee in 
    hand, and watch as the mist slowly lifted from the fields, revealing the distant outline 
    of the forest, where the trees stood tall and silent, like sentinels guarding the secrets 
    of the land. Sometimes, if I was lucky, I’d catch a glimpse of a deer or two grazing in the 
    distance, their sleek bodies moving gracefully through the tall grass, unaware of my presence. 
    But despite the peace and tranquility of the countryside, there had always been a part of me 
    that longed for something more, something beyond the quiet, predictable rhythm of rural life. 
    I had spent years surrounded by nature’s beauty, and while I had loved it, I had also begun 
    to feel a sense of restlessness, a yearning for the energy and excitement that only a city 
    could provide. So, when the opportunity came to move to the city, I had taken it without 
    hesitation, packing up my life into a few boxes and leaving behind the familiar comforts of 
    the countryside for the unknown adventures that awaited me in the bustling streets of urban 
    life. Now, as I walked through those very streets, I realized just how much my life had changed 
    in such a short time. The cit y was everything I had hoped it would be, and more. Every day 
    brought something new, something unexpected. One day, I might stumble upon a hidden café tucked 
    away in a narrow alley, its walls covered in ivy, serving the best espresso I had ever tasted. 
    The next, I might find myself standing in the middle of a street festival, surrounded by food 
    stalls offering dishes from every corner of the world, the air filled with the scent of spices 
    and grilled meats, while performers danced and played music in the center of the crowd, their 
    movements a celebration of culture and tradition. The people, too, were different. In the 
    countryside, I had known everyone—every face was familiar, every story already told. But here, 
    in the city, every person I passed was a mystery, a new story waiting to be discovered. There 
    were the artists, sitting in cafés sketching scenes from their imagination; the businesspeople, 
    hurrying to meetings with phones pressed to their ears, their faces a mask of determination; 
    the students, gathered in groups, discussing everything from philosophy to the latest fashion 
    trends; and the tourists, cameras slung around their necks, eyes wide with wonder as they took 
    in the sights of the city. Each of them was on their own journey, their paths crossing with 
    mine for just a brief moment before they continued on their way, leaving behind only the faintest 
    trace of their presence. Yet, despite all the excitement and energy of the city, there were 
    still moments when I found myself missing the quiet solitude of the countryside. There were 
    times, late at night, when the city had finally quieted down, and I would lie awake in my small 
    apartment, listening to the distant sounds of cars passing by on the streets below, that I 
    would think back to those early mornings in the countryside, to the feeling of peace that came 
    from being alone with nature. I would remember the way the first light of dawn had turned the 
    sky a soft shade of pink, the way the wind had whispered through the trees, and the way the 
    world had felt so still, as if it were holding its breath, waiting for the day to begin. But 
    then the sun would rise, casting its golden light over the city once again, and I would step 
    outside, greeted by the sights and sounds of life unfolding around me, and I would remember 
    why I had chosen to leave the quiet behind. The city, with all its noise, its chaos, and its 
    constant movement, had a magic of its own, a magic that drew me in and made me feel alive in 
    a way that the countryside never could. It was a place where anything seemed possible, where 
    every corner held the promise of something new, something unexpected. And as I continued to 
    walk through the streets that day, I knew that, despite the occasional longing for the peace 
    of the countryside, I had found a new home in the heart of the city, where the pulse of life 
    beat strong and steady, carrying me along with it."""
]


In [19]:
# Load the processor for inference
ip_test = IndicProcessor(inference=True)

# Load the fine-tuned model for inference (merge LoRA weights)
from peft import PeftModel, PeftConfig

# Load base model
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    trust_remote_code=True,
).to(device)

# Load LoRA adapter
peft_model = PeftModel.from_pretrained(base_model, "./final_lora_iitb")

In [23]:
# Merge weights (optional for faster inference)
merged_model = peft_model.merge_and_unload()

# Prepare input for translation
batch = ip_test.preprocess_batch(test_sentences, src_lang="eng_Latn", tgt_lang="hin_Deva")
batch = tokenizer(
    batch, padding="longest", truncation=True, max_length=2048, return_tensors="pt"
).to(device)

# Generate translations
with torch.inference_mode():
    outputs = merged_model.generate(
        **batch,
        num_beams=5,
        length_penalty=1.0,
        repetition_penalty=1.2,
        max_new_tokens=2048,
        early_stopping=True
    )

# Decode outputs
outputs = tokenizer.batch_decode(
    outputs, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
outputs = ip_test.postprocess_batch(outputs, lang="hin_Deva")

In [24]:
print("Translation results:")
for i, (src, tgt) in enumerate(zip(test_sentences, outputs)):
    print(f"Translation {i+1}: {tgt}")

Translation results:
Translation 1: जैसे ही मैं एक गर्म, धूप भरी दोपहर को शहर की व्यस्त सड़कों पर घूम रहा था, मेरे चारों ओर की दुनिया जीवन से गूंज रही थी। सावधानीपूर्वक बनाए गए पार्कों में खिलते फूलों के जीवंत रंग उन ऊँचे गगनचुंबी इमारतों के स्टील और कांच के खिलाफ एक जीवंत विरोधाभास को चित्रित करते थे जो ऊपर झुके हुए थे, उनकी अनगिनत खिड़कियों में चमकीले नीले आकाश को प्रतिबिंबित करते थे। धूप, छोटे, बिखरे हुए हरियाली के टुकड़ों में पत्तियों की छतरी के माध्यम से छांटते हुए, नीचे फुटपाथ पर छायाएँ डालते हुए, जहां कदमों की लयबद्ध गड़गड़ाहट और दूर की ट्रैफिक की गूंज एक शांत, शहरी सिम्फनी में मिल जाती थी। लगभग हर कोने पर स्थित, सड़क के कलाकार अपने गिटार, वायलिन और सैक्सोफोन बजाते थे, उनकी धुनें सड़कों से एक नदी की तरह बहने वाली ध्वनियों के मिश्रण में मिल जाती थीं, जो भीड़भाड़ वाले कैफे और रेस्तरां से बाहर निकलने वाले लोगों की हंसी और बातचीत से भरी होती थीं, उनके चेहरे एक अच्छी तरह से बिताई गई दोपहर की खुशी से चमकते थे। बच्चे खेल के मैदानों में दौड़ते थे, उनकी हंसी कभी-कभी सड़कों पर बहने वाली ह

In [25]:
# assume raw_datasets["test"] is still around
test_src = [t["en"] for t in raw_datasets["test"]["translation"]]
test_ref = [t["hi"] for t in raw_datasets["test"]["translation"]]


In [26]:
from tqdm.auto import tqdm

# batch size for inference
bsz = 16
predictions = []

for i in tqdm(range(0, len(test_src), bsz)):
    batch_src = test_src[i : i+bsz]
    # preprocess + tokenize
    inputs = ip_test.preprocess_batch(batch_src, src_lang="eng_Latn", tgt_lang="hin_Deva")
    tok = tokenizer(inputs, padding="longest", truncation=True, max_length=512, return_tensors="pt").to(device)
    # generate
    with torch.inference_mode():
        outs = merged_model.generate(**tok, num_beams=5, max_new_tokens=256)
    # decode + postprocess
    dec = tokenizer.batch_decode(outs, skip_special_tokens=True)
    dec = ip_test.postprocess_batch(dec, lang="hin_Deva")
    predictions.extend(dec)


  0%|          | 0/157 [00:00<?, ?it/s]

In [28]:
!pip install --quiet evaluate sacrebleu bert-score


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [30]:
import evaluate
import numpy as np

bleu = evaluate.load("sacrebleu")
chrf = evaluate.load("chrf")
bertscore = evaluate.load("bertscore")

# sacreBLEU expects a list of lists of references
bleu_res    = bleu.compute(predictions=predictions, references=[[r] for r in test_ref])
chrf_res    = chrf.compute(predictions=predictions, references=test_ref)
bertscore_res = bertscore.compute(predictions=predictions, references=test_ref, lang="hi")

print(f"BLEU:    {bleu_res['score']:.2f}")
print(f"chrF:    {chrf_res['score']:.2f}")
print(f"BERTScore F1: {np.mean(bertscore_res['f1']):.4f}")


BLEU:    26.67
chrF:    53.98
BERTScore F1: 0.8694
