In [1]:
# Core NLP + Training (for mBART)
!pip install -q transformers datasets accelerate sentencepiece evaluate sacrebleu rouge-score

# Indic normalization (for Hindi cleaning - optional but useful)
!pip install -q indic-nlp-library

# Utilities
!pip install -q pandas tqdm


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.1/121.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Upgrade core libraries (mBART needs latest tokenizer support)
!pip install -U transformers accelerate sentencepiece datasets evaluate rouge-score


Collecting transformers
  Downloading transformers-5.0.0-py3-none-any.whl.metadata (37 kB)
Collecting datasets
  Downloading datasets-4.5.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.3.5-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-23.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Downloading transformers-5.0.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-4.5.0-py3-none-any.whl (515 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading huggingface_hub-1.3.5-py3-none-any.whl (536 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow

In [3]:
# Core
import json
import pandas as pd
import random

# Text cleaning (light normalization)
import re
import unicodedata


In [4]:
def load_json_file(path, lang_code):
    import json
    import pandas as pd

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    rows = []

    for item in data:
        # Safe key lookup
        source = item.get("sourceText") or item.get("sourcetext") or item.get("source") or item.get("source_text")
        target = item.get("targetText") or item.get("targettext") or item.get("target") or item.get("target_text")

        if source is None or target is None:
            continue

        rows.append({
            "text": source.strip(),      # mBART input
            "summary": target.strip(),   # mBART label
            "lang_code": lang_code       # hi_IN or en_XX
        })

    df = pd.DataFrame(rows)
    print(f"Loaded {len(df)} rows from {path}")
    return df


In [5]:
df_hindi = load_json_file("sum_hindi.json", "hi_IN")
df_english = load_json_file("sum_eng.json", "en_XX")


Loaded 394 rows from sum_hindi.json
Loaded 329 rows from sum_eng.json


In [6]:
df_all = pd.concat([df_hindi, df_english]).sample(frac=1).reset_index(drop=True)

print(df_all.head())
print(len(df_all))


                                                text  \
0  वेल्स से बाहर रहने वाले पाठकों के लिए यह बात स...   
1  डॉ. फोर्ड की सार्वजनिक गवाही से पहले, जिसमें उ...   
2  "Today, everything is disappearing." Mr. Adams...   
3  At the Davos World Economic Forum, U.S. Presid...   
4  अमेरिका इसे लेकर हमलावर दिख रहा है। अमेरिकी वि...   

                                             summary lang_code  
0  वेल्श भाषा में कुछ शब्दों के नकारात्मक अर्थ हो...     hi_IN  
1  विश्लेषकों ने पहले ही राजनीतिक प्रतिक्रिया की ...     hi_IN  
2  Mr. Adams expressed concern that Harlem’s cult...     en_XX  
3  At Davos, Trump’s remark about buying Greenlan...     en_XX  
4  अमेरिका इस मामले में आक्रामक है; अमेरिकी वित्त...     hi_IN  
723


In [7]:
print("Empty text:", df_all["text"].isnull().sum())
print("Empty summary:", df_all["summary"].isnull().sum())


Empty text: 0
Empty summary: 0


In [8]:
print("Blank text:", (df_all["text"].str.strip() == "").sum())
print("Blank summary:", (df_all["summary"].str.strip() == "").sum())


Blank text: 0
Blank summary: 0


In [9]:
import re
import unicodedata

def clean_text_mbart(text):
    if not isinstance(text, str):
        return ""

    # Normalize unicode form (safe)
    text = unicodedata.normalize("NFKC", text)

    # Remove control characters
    text = re.sub(r"[\x00-\x1f\x7f]", " ", text)

    # Collapse multiple spaces/newlines
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [10]:
# Clean source text and summary safely for mBART
df_all["text"] = df_all["text"].apply(clean_text_mbart)
df_all["summary"] = df_all["summary"].apply(clean_text_mbart)

print("Cleaning completed!")
df_all.head()


Cleaning completed!


Unnamed: 0,text,summary,lang_code
0,वेल्स से बाहर रहने वाले पाठकों के लिए यह बात स...,वेल्श भाषा में कुछ शब्दों के नकारात्मक अर्थ हो...,hi_IN
1,"डॉ. फोर्ड की सार्वजनिक गवाही से पहले, जिसमें उ...",विश्लेषकों ने पहले ही राजनीतिक प्रतिक्रिया की ...,hi_IN
2,"""Today, everything is disappearing."" Mr. Adams...",Mr. Adams expressed concern that Harlem’s cult...,en_XX
3,"At the Davos World Economic Forum, U.S. Presid...","At Davos, Trump’s remark about buying Greenlan...",en_XX
4,अमेरिका इसे लेकर हमलावर दिख रहा है। अमेरिकी वि...,अमेरिका इस मामले में आक्रामक है; अमेरिकी वित्त...,hi_IN


In [11]:
df_all["text"] = df_all["text"].str.strip()
df_all["summary"] = df_all["summary"].str.strip()


In [12]:
import unicodedata

def normalize_text(text):
    if not isinstance(text, str):
        return ""
    return unicodedata.normalize("NFKC", text)

df_all["text"] = df_all["text"].apply(normalize_text)
df_all["summary"] = df_all["summary"].apply(normalize_text)


In [13]:
df_all["text"] = df_all["text"].str.replace(r"\s+", " ", regex=True)
df_all["summary"] = df_all["summary"].str.replace(r"\s+", " ", regex=True)


In [14]:
# Remove rows with empty text or summary
before = len(df_all)

df_all = df_all[
    (df_all["text"].str.len() > 0) &
    (df_all["summary"].str.len() > 0)
]

print("Removed empty rows:", before - len(df_all))


Removed empty rows: 0


In [15]:
before = len(df_all)

df_all = df_all.drop_duplicates(
    subset=["text", "summary", "lang_code"]
)

print("Removed duplicate rows:", before - len(df_all))


Removed duplicate rows: 1


In [16]:
# Group all rows by summary sentence
groups = df_all.groupby("summary")

unique_summaries = list(groups.groups.keys())
print("Unique summary sentences:", len(unique_summaries))



Unique summary sentences: 719


In [17]:
import random
random.seed(42)

random.shuffle(unique_summaries)

n = len(unique_summaries)

train_summaries = set(unique_summaries[:int(0.8 * n)])
val_summaries   = set(unique_summaries[int(0.8 * n):int(0.9 * n)])
test_summaries  = set(unique_summaries[int(0.9 * n):])



In [18]:
def assign_split(row):
    if row["summary"] in train_summaries:
        return "train"
    elif row["summary"] in val_summaries:
        return "val"
    else:
        return "test"

df_all["split"] = df_all.apply(assign_split, axis=1)



In [19]:
train_df2 = df_all[df_all["split"] == "train"]
val_df2   = df_all[df_all["split"] == "val"]
test_df2  = df_all[df_all["split"] == "test"]

print("Train size:", len(train_df2))
print("Validation size:", len(val_df2))
print("Test size:", len(test_df2))


Train size: 578
Validation size: 72
Test size: 72


In [20]:
train_summaries_set = set(train_df2["summary"])
val_summaries_set   = set(val_df2["summary"])
test_summaries_set  = set(test_df2["summary"])

print("Train ∩ Val overlap:", len(train_summaries_set & val_summaries_set))
print("Train ∩ Test overlap:", len(train_summaries_set & test_summaries_set))
print("Val ∩ Test overlap:", len(val_summaries_set & test_summaries_set))


Train ∩ Val overlap: 0
Train ∩ Test overlap: 0
Val ∩ Test overlap: 0


In [21]:
train_df2.to_csv("train.csv", index=False)
val_df2.to_csv("val.csv", index=False)
test_df2.to_csv("test.csv", index=False)

print("Saved train.csv, val.csv, test.csv")


Saved train.csv, val.csv, test.csv


In [22]:
print(pd.read_csv("train.csv").head(2))


                                                text  \
0  डॉ. फोर्ड की सार्वजनिक गवाही से पहले, जिसमें उ...   
1  "Today, everything is disappearing." Mr. Adams...   

                                             summary lang_code  split  
0  विश्लेषकों ने पहले ही राजनीतिक प्रतिक्रिया की ...     hi_IN  train  
1  Mr. Adams expressed concern that Harlem’s cult...     en_XX  train  


In [23]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments
)


In [24]:
from datasets import Dataset
import torch
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration

# Convert to HF Dataset
train_dataset2 = Dataset.from_pandas(train_df2.reset_index(drop=True))
val_dataset2   = Dataset.from_pandas(val_df2.reset_index(drop=True))
test_dataset2  = Dataset.from_pandas(test_df2.reset_index(drop=True))

# Load mBART model & tokenizer
model2_name = "facebook/mbart-large-50-many-to-many-mmt"

tokenizer2 = MBart50TokenizerFast.from_pretrained(model2_name)
model2 = MBartForConditionalGeneration.from_pretrained(model2_name)

device2 = "cuda" if torch.cuda.is_available() else "cpu"
model2.to(device2)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]



sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/516 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [33]:
def preprocess_function(examples):

    lang_map = {
        "hin": "hi_IN",
        "eng": "en_XX"
    }

    input_ids = []
    attention_masks = []
    labels = []

    for i in range(len(examples["text"])):

        src = examples["text"][i]
        tgt = examples["summary"][i]
        lang = examples["lang_code"][i]

        if src is None or tgt is None:
            continue

        if len(src.strip()) == 0 or len(tgt.strip()) == 0:
            continue

        # Safe language mapping
        mbart_lang = lang_map.get(lang, lang)

        # Set source language
        tokenizer2.src_lang = mbart_lang

        # Tokenize source + target together (NEW WAY)
        model_inputs = tokenizer2(
            src,
            text_target=tgt,
            max_length=512,
            truncation=True,
            padding="max_length"
        )

        input_ids.append(model_inputs["input_ids"])
        attention_masks.append(model_inputs["attention_mask"])
        labels.append(model_inputs["labels"])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels
    }


In [31]:
print(train_dataset2["lang_code"][:20])


['hi_IN', 'en_XX', 'hi_IN', 'en_XX', 'en_XX', 'hi_IN', 'hi_IN', 'hi_IN', 'en_XX', 'en_XX', 'en_XX', 'hi_IN', 'en_XX', 'en_XX', 'en_XX', 'hi_IN', 'hi_IN', 'en_XX', 'en_XX', 'hi_IN']


In [34]:
tokenized2_train = train_dataset2.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset2.column_names
)

tokenized2_val = val_dataset2.map(
    preprocess_function,
    batched=True,
    remove_columns=val_dataset2.column_names
)

print("Tokenization successful!")




Map:   0%|          | 0/578 [00:00<?, ? examples/s]

Map:   0%|          | 0/72 [00:00<?, ? examples/s]

Tokenization successful!


In [35]:
print(tokenized2_train[0].keys())



dict_keys(['input_ids', 'attention_mask', 'labels'])


In [36]:
# Freeze encoder layers of mBART summarizer
for param in model2.model.encoder.parameters():
    param.requires_grad = False

print("mBART summarizer encoder frozen")



mBART summarizer encoder frozen


In [37]:
from transformers import DataCollatorForSeq2Seq

# Data collator for summarization (mBART)
summarizer_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer2,
    model=model2
)


In [39]:
from transformers import TrainingArguments

# Training arguments for mBART summarization
summarizer_training_args = TrainingArguments(
    output_dir="./mbart_summarizer",

    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    gradient_accumulation_steps=8,
    learning_rate=2e-5,

    num_train_epochs=3,

    fp16=True,
    optim="adafactor",

    eval_strategy="no",   # <-- FIXED

    logging_steps=50,

    save_steps=1000,
    save_total_limit=2,

    report_to="none"
)



In [41]:
from transformers import DataCollatorForSeq2Seq

# Data collator for mBART summarization
summarizer_data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer2,   # mBART tokenizer
    model=model2            # mBART model
)


In [42]:
summarizer_trainer = Trainer(
    model=model2,
    args=summarizer_training_args,

    train_dataset=tokenized2_train,
    eval_dataset=tokenized2_val,

    data_collator=summarizer_data_collator
)


In [43]:
# Train mBART Summarization Model
summarizer_trainer.train()



Step,Training Loss
50,81.47375
100,76.813516
150,75.941201
200,77.261201


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=219, training_loss=77.5162832878496, metrics={'train_runtime': 325.9561, 'train_samples_per_second': 5.32, 'train_steps_per_second': 0.672, 'total_flos': 1878901508800512.0, 'train_loss': 77.5162832878496, 'epoch': 3.0})

In [44]:
# Save mBART Summarization Model (Separate from Translation)

summarizer_model_path = "mbart_hi_en_summarizer_finetuned"

model2.save_pretrained(summarizer_model_path)
tokenizer2.save_pretrained(summarizer_model_path)

print("Summarization model saved at:", summarizer_model_path)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Summarization model saved at: mbart_hi_en_summarizer_finetuned


In [45]:
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
import torch

# Path for SUMMARIZATION model (mBART)
summarizer_model_path = "mbart_hi_en_summarizer_finetuned"

# Load tokenizer & model (Summarization only)
summarizer_tokenizer = MBart50TokenizerFast.from_pretrained(summarizer_model_path)
summarizer_model = MBartForConditionalGeneration.from_pretrained(summarizer_model_path)

# Device setup
summarizer_device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer_model.to(summarizer_device)
summarizer_model.eval()

print(" Summarization model (mBART) loaded successfully")


Loading weights:   0%|          | 0/516 [00:00<?, ?it/s]

 Summarization model (mBART) loaded successfully


In [51]:
import torch

# Hindi / English text (AFTER translation)
text = """
हम रोज सबेरे जल्दी उठते हैं और घर का सारा काम निपटाकर खेत की ओर जाते हैं। पिताजी अपने बूढ़े शरीर से कड़ी मेहनत करते हैं, लेकिन कभी बोझ नहीं बनते। गांव में पढ़ाई पर जोर है, पर गरीब बच्चों को कठिनाई होती है।

"""

# Set language (IMPORTANT for mBART)
summarizer_tokenizer.src_lang = "hi_IN"   # use "en_XX" for English

# Tokenize
inputs = summarizer_tokenizer(
    text,
    return_tensors="pt",
    max_length=512,
    truncation=True
).to(summarizer_device)

# Generate summary
with torch.no_grad():
    summary_ids = summarizer_model.generate(
        **inputs,
        max_length=150,
        min_length=40,
        num_beams=4,
        early_stopping=True
    )

# Decode
summary = summarizer_tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True
)[0]

print(" Input (Translated Text):", text)
print(" Summary:", summary)


 Input (Translated Text): 
हम रोज सबेरे जल्दी उठते हैं और घर का सारा काम निपटाकर खेत की ओर जाते हैं। पिताजी अपने बूढ़े शरीर से कड़ी मेहनत करते हैं, लेकिन कभी बोझ नहीं बनते। गांव में पढ़ाई पर जोर है, पर गरीब बच्चों को कठिनाई होती है।


 Summary: दिन-प्रतिदिन जल्दी उठकर घर की सारी काम-सुलझाकर खेत की ओर जाते हैं, पिता अपने पुराने शरीर से कड़ी मेहनत करते हैं, लेकिन कभी बोझ नहीं बनते। गांव में पढ़ाई कठिनाई में पड़ती है, जबकि गरीब बच्चे कठिनाई में पड़ते हैं।


In [50]:
import torch

text = """
It is important to understand the difference between a summary and a paraphrase.
A paraphrase is simply a rewriting of a passage in your own words. A summary, on
the other hand, contains only the main idea and the supporting ideas of a passage.
A summary will be much shorter than a paraphrase.
"""

# Set source language
summarizer_tokenizer.src_lang = "en_XX"

# Tokenize
inputs = summarizer_tokenizer(
    text,
    return_tensors="pt",
    max_length=512,
    truncation=True
).to(summarizer_device)

# Target language token (VERY IMPORTANT for mBART)
forced_lang_token = summarizer_tokenizer.lang_code_to_id["en_XX"]

with torch.no_grad():
    summary_ids = summarizer_model.generate(
        **inputs,

        forced_bos_token_id=forced_lang_token,

        max_length=60,               # shorter → summary forced
        min_length=20,

        num_beams=6,                 # stronger search
        length_penalty=1.4,          # encourages compression
        no_repeat_ngram_size=3,      # prevents copying
        repetition_penalty=1.3,      # reduces reuse

        early_stopping=True
    )

summary = summarizer_tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True
)[0]

print("Input:", text)
print("Summary:", summary)


Input: 
It is important to understand the difference between a summary and a paraphrase.
A paraphrase is simply a rewriting of a passage in your own words. A summary, on
the other hand, contains only the main idea and the supporting ideas of a passage.
A summary will be much shorter than a paraphrase.

Summary: A paraphrase is simply rewriting a passage in your own words. A summary, on the other hand, contains only the main idea and supporting ideas of a passage. It will be shorter than a paragraph.
