In [1]:
# Cell 1 — installs (run this first)
!pip install -q transformers datasets evaluate sacrebleu sentencepiece accelerate


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
#create dataset in a DataFrame and save as a proper CSV (fields will be quoted)
import pandas as pd

pairs = [
    ("Hello!", "नमस्ते!"),
    ("How are you?", "आप कैसे हैं?"),
    ("I am fine, thank you.", "मैं ठीक हूँ, धन्यवाद।"),
    ("What is your name?", "आपका नाम क्या है?"),
    ("My name is Kishan.", "मेरा नाम किशन है।"),
    ("Where are you from?", "आप कहाँ से हैं?"),
    ("I am from India.", "मैं भारत से हूँ।"),
    ("I love learning languages.", "मुझे भाषाएँ सीखना पसंद है।"),
    ("Do you speak English?", "क्या आप अंग्रेज़ी बोलते हैं?"),
    ("Yes, a little.", "हाँ, थोड़ा।"),
    ("Please help me.", "कृपया मेरी मदद कीजिए।"),
    ("How much does this cost?", "इसकी कीमत कितनी है?"),
    ("I need water.", "मुझे पानी चाहिए।"),
    ("Turn left here.", "यहाँ बाएँ मुड़ें।"),
    ("Turn right at the next junction.", "अगले चौराहे पर दाएँ मुड़ें।"),
    ("This is delicious.", "यह स्वादिष्ट है।"),
    ("I will be back soon.", "मैं जल्द वापस आऊँगा।"),
    ("Can you repeat that?", "क्या आप उसे दोहरा सकते हैं?"),
    ("I don't understand.", "मुझे समझ नहीं आया।"),
    ("Where is the restroom?", "शौचालय कहाँ है?"),
    ("Call the doctor!", "डॉक्टर को बुलाइए!"),
    ("I am learning to code.", "मैं कोडिंग सीख रहा हूँ।"),
    ("Open the window.", "खिड़की खोलिए।"),
    ("Close the door.", "दरवाज़ा बंद करो।"),
    ("What time is it?", "कितने बजे हैं?"),
    ("I am hungry.", "मैं भूखा हूँ।"),
    ("I am tired.", "मैं थका हुआ हूँ।"),
    ("Please speak slowly.", "कृपया धीरे बोलें।"),
    ("Congratulations!", "बधाई हो!"),
    ("Happy birthday!", "जन्मदिन मुबारक!"),
    ("I like this song.", "मुझे यह गीत पसंद है।"),
    ("Do you have a pen?", "क्या आपके पास पेन है?"),
    ("I lost my bag.", "मैंने अपना बैग खो दिया।"),
    ("Where can I buy a ticket?", "मैं टिकट कहाँ खरीद सकता/सकती हूँ?"),
    ("I am studying at college.", "मैं कॉलेज में पढ़ता/पढ़ती हूँ।"),
    ("This is my friend.", "यह मेरा/मेरी दोस्त है।"),
    ("Excuse me.", "माफ़ कीजिए।"),
    ("How long will it take?", "यह कितना समय लेगा?"),
    ("Please write it down.", "कृपया इसे लिख दें।"),
]

df = pd.DataFrame(pairs, columns=["en", "hi"])
df.to_csv("en_hi_pairs.csv", index=False, encoding="utf-8")
print("Saved en_hi_pairs.csv with", len(df), "pairs.")
df.head(6)


Saved en_hi_pairs.csv with 39 pairs.


Unnamed: 0,en,hi
0,Hello!,नमस्ते!
1,How are you?,आप कैसे हैं?
2,"I am fine, thank you.","मैं ठीक हूँ, धन्यवाद।"
3,What is your name?,आपका नाम क्या है?
4,My name is Kishan.,मेरा नाम किशन है।
5,Where are you from?,आप कहाँ से हैं?


In [3]:
# Cell 3 — imports and configuration
import random
import numpy as np
import pandas as pd
import torch

from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq,
                          Seq2SeqTrainingArguments, Seq2SeqTrainer)
import evaluate

# Config - change MODEL_NAME to `Helsinki-NLP/opus-mt-hi-en` for reverse direction
MODEL_NAME = "Helsinki-NLP/opus-mt-en-hi"
CSV_PATH = "en_hi_pairs.csv"
OUTPUT_DIR = "fine_tuned_en_hi"
MAX_SOURCE_LENGTH = 128
MAX_TARGET_LENGTH = 128
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


<torch._C.Generator at 0x7b439264fa10>

In [4]:
# Cell 4 — load CSV, split, build huggingface Datasets
df = pd.read_csv(CSV_PATH)
assert 'en' in df.columns and 'hi' in df.columns, "CSV must have 'en' and 'hi' columns"
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

n = len(df)
n_train = int(0.8 * n)
n_val = int(0.1 * n)
train_df = df.iloc[:n_train].reset_index(drop=True)
val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
test_df = df.iloc[n_train + n_val:].reset_index(drop=True)

train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

print("Sizes -> train:", len(train_ds), "val:", len(val_ds), "test:", len(test_ds))
train_df.head(3)


Sizes -> train: 31 val: 3 test: 5


Unnamed: 0,en,hi
0,Where can I buy a ticket?,मैं टिकट कहाँ खरीद सकता/सकती हूँ?
1,Excuse me.,माफ़ कीजिए।
2,My name is Kishan.,मेरा नाम किशन है।


In [5]:
# Cell 5 — tokenizer, preprocess function (handles label padding -> -100)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess_batch(batch):
    inputs = batch["en"]
    targets = batch["hi"]
    model_inputs = tokenizer(inputs, max_length=MAX_SOURCE_LENGTH, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")
    # replace padding token id's in labels by -100 so they are ignored by loss
    labels_ids = labels["input_ids"]
    labels_ids = [[(token if token != tokenizer.pad_token_id else -100) for token in lbl] for lbl in labels_ids]
    model_inputs["labels"] = labels_ids
    return model_inputs

train_tok = train_ds.map(preprocess_batch, batched=True, batch_size=8, remove_columns=train_ds.column_names)
val_tok = val_ds.map(preprocess_batch, batched=True, batch_size=8, remove_columns=val_ds.column_names)
test_tok = test_ds.map(preprocess_batch, batched=True, batch_size=8, remove_columns=test_ds.column_names)

print("Tokenized datasets ready.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]



Map:   0%|          | 0/31 [00:00<?, ? examples/s]



Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenized datasets ready.


In [6]:
# Cell 6 (REPLACEMENT) — model, data collator, adaptive trainer args
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, TrainingArguments
import inspect
import numpy as np

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Use fp16 if CUDA available
use_fp16 = torch.cuda.is_available()

# Desired training arguments (we'll filter to only the accepted ones)
desired_training_args = {
    "output_dir": OUTPUT_DIR,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 8,
    "fp16": use_fp16,
    "predict_with_generate": True,
    "save_total_limit": 2,
    "seed": SEED,
    "learning_rate": 5e-5,
    "remove_unused_columns": True,
    "push_to_hub": False,
    "report_to": "none"
}

def make_args(cls, desired_kwargs):
    # inspect constructor parameters and only keep allowed ones
    sig = inspect.signature(cls.__init__)
    allowed = [p for p in desired_kwargs.keys() if p in sig.parameters]
    filtered = {k: desired_kwargs[k] for k in allowed}
    return cls(**filtered), allowed

# Try Seq2SeqTrainingArguments first (preferred), else fallback to TrainingArguments
try:
    training_args, used = make_args(Seq2SeqTrainingArguments, desired_training_args)
    print("Using Seq2SeqTrainingArguments with args:", used)
except Exception as e:
    training_args, used = make_args(TrainingArguments, desired_training_args)
    print("Falling back to TrainingArguments with args:", used, "error:", str(e))

# Prepare metric
bleu = evaluate.load("sacrebleu")

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # replace -100 with pad token id
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds, references=[[r] for r in decoded_labels])
    return {"bleu": result["score"]}


pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/306M [00:00<?, ?B/s]

Using Seq2SeqTrainingArguments with args: ['output_dir', 'save_strategy', 'logging_strategy', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'num_train_epochs', 'fp16', 'predict_with_generate', 'save_total_limit', 'seed', 'learning_rate', 'remove_unused_columns', 'push_to_hub', 'report_to']


Downloading builder script: 0.00B [00:00, ?B/s]

In [7]:
# Cell 7 — create Trainer and train
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

train_result = trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("Training finished. Results:", train_result.metrics)


  trainer = Seq2SeqTrainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.


Step,Training Loss
8,1.4022
16,0.7049
24,0.3378
32,0.1957
40,0.1176
48,0.0609
56,0.064
64,0.0533




Training finished. Results: {'train_runtime': 81.0656, 'train_samples_per_second': 3.059, 'train_steps_per_second': 0.789, 'total_flos': 8406794502144.0, 'train_loss': 0.3670429144985974, 'epoch': 8.0}


In [8]:
# Cell 8 — evaluate on test set and show a few example translations
metrics = trainer.evaluate(test_tok)
print("Eval on test:", metrics)

def translate_texts(texts, max_length=128):
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=MAX_SOURCE_LENGTH)
    if torch.cuda.is_available():
        model.to("cuda")
        inputs = {k:v.to("cuda") for k,v in inputs.items()}
    outputs = model.generate(**inputs, max_length=max_length, num_beams=4, early_stopping=True)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded

examples = [
    "Hello!",
    "Where is the restroom?",
    "I am learning to code.",
    "Please help me."
]
print("Examples ->", translate_texts(examples))


Eval on test: {'eval_loss': 1.2038506269454956, 'eval_bleu': 10.539181489019553, 'eval_runtime': 2.4722, 'eval_samples_per_second': 2.022, 'eval_steps_per_second': 0.404, 'epoch': 8.0}
Examples -> ['नमस्ते!', 'शौचालय कहाँ है?', 'मैं कोडिंग सीख रहा हूँ।', 'कृपया मेरी मदद करो।']


In [9]:
# Cell 9 — quick inference loop (interactive)
print("Type English sentences (enter 'quit' to stop). The model translates to Hindi.")
while True:
    txt = input("EN > ")
    if not txt or txt.lower().strip() in ["quit","exit"]:
        break
    print("HI >", translate_texts([txt])[0])


Type English sentences (enter 'quit' to stop). The model translates to Hindi.
EN > how are you?
HI > आप कैसे हैं?


KeyboardInterrupt: Interrupted by user