In [1]:
# Cell 1 install & upgrade
!pip install -q transformers datasets sentencepiece accelerate evaluate
!pip install -q --upgrade transformers accelerate


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m103.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Cell 2 checkup the GPU
import torch

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {DEVICE} {'✅' if DEVICE.type=='cuda' else 'X no GPU found'}")


Running on cuda ✅


In [3]:
# Cell 3 Peeking at the header / column names of each CSV
import pandas as pd
from pathlib import Path

base = Path("/content")
csv_files = [
    "Conversation.csv",
    "chatbot_dataset.csv",
]

for fn in csv_files:
    f = base / fn
    if not f.exists():
        print(f"X {fn} not found")
        continue
    try:
        # reading zero rows to get columns
        df = pd.read_csv(f, nrows=0)
        cols = df.columns.tolist()
    except Exception as e:
        cols = f"Error reading: {e}"
    print(f"{fn}: {cols}")


Conversation.csv: ['Unnamed: 0', 'question', 'answer']
chatbot_dataset.csv: ['User ID', 'User Utterance', 'Bot Response', 'Timestamp', 'Context/Session ID', 'Entities', 'User Feedback', 'Conversation Outcome', 'User Profile', 'Channel/Platform', 'Language', 'User Emotion/Sentiment', 'Location', 'User Segment']


In [4]:
# Cell 4  loading & normalizing CSVs into prompt/response pairs
import pandas as pd
from pathlib import Path
from datasets import Dataset

base = Path("/content")

def load_csv(f):
    try:
        return pd.read_csv(f)
    except Exception:
        return pd.read_csv(f, engine="python", on_bad_lines="skip")

dfs = []

for fn in [
    "Conversation.csv",
    "chatbot_dataset.csv"
]:
    f = base/fn
    if not f.exists():
        print(f"⚠️ {fn} missing, skipping")
        continue
    df = load_csv(f)
    name = fn.lower()
    if name=="conversation.csv":
        df2 = df[["question","answer"]].rename(columns={"question":"prompt","answer":"response"})
    elif name=="chatbot_dataset.csv":
        df2 = df[["User Utterance","Bot Response"]].rename(
                  columns={"User Utterance":"prompt","Bot Response":"response"})
    else:
        continue

    dfs.append(df2.astype(str))

combined = pd.concat(dfs, ignore_index=True) \
             .dropna() \
             .sample(frac=1, random_state=42) \
             .reset_index(drop=True)

print(f"✅ Combined {len(combined)} examples from all CSVs")

hf_ds = Dataset.from_pandas(combined)
print("✅ hf_ds ready with columns:", hf_ds.column_names)


✅ Combined 13725 examples from all CSVs
✅ hf_ds ready with columns: ['prompt', 'response']


In [5]:
# Cell 5 Loading & normalizing our TXT + JSON files into prompt/response pairs
import pandas as pd, json
from pathlib import Path
from datasets import Dataset

base = Path("/content")

txt_dfs = []

# 1 dialogs.txt  (split on "__eou__")
p = base/"dialogs.txt"
if p.exists():
    rows = []
    for line in open(p, encoding="utf-8", errors="ignore"):
        turns = [u.strip() for u in line.split("__eou__") if u.strip()]
        for i in range(len(turns)-1):
            rows.append({"prompt":turns[i], "response":turns[i+1]})
    txt_dfs.append(pd.DataFrame(rows))
    print(f"✅ Loaded {len(rows)} pairs from dialogs.txt")

# 2 human_chat.txt (Human 1 → Human 2)
p = base/"human_chat.txt"
if p.exists():
    lines = [l.strip() for l in open(p, encoding="utf-8", errors="ignore")]
    rows = []
    for i in range(len(lines)-1):
        if lines[i].startswith("Human 1:") and lines[i+1].startswith("Human 2:"):
            q = lines[i].split(":",1)[1].strip()
            a = lines[i+1].split(":",1)[1].strip()
            rows.append({"prompt":q,"response":a})
    txt_dfs.append(pd.DataFrame(rows))
    print(f"✅ Loaded {len(rows)} pairs from human_chat.txt")

# 3 chatbot dataset.txt / chatbot_dataset.txt (tab-sep)
for fn in ["chatbot dataset.txt","chatbot_dataset.txt"]:
    p = base/fn
    if p.exists():
        df = pd.read_csv(p, sep="\t", names=["prompt","response"],
                         engine="python", on_bad_lines="skip")
        txt_dfs.append(df.astype(str))
        print(f"✅ Loaded {len(df)} pairs from {fn}")

# 4 input_texts.txt + label_texts.txt
ipt, lab = base/"input_texts.txt", base/"label_texts.txt"
if ipt.exists() and lab.exists():
    ins = [l.strip() for l in open(ipt) if l.strip()]
    outs= [l.strip() for l in open(lab) if l.strip()]
    df = pd.DataFrame({"prompt":ins,"response":outs})
    txt_dfs.append(df)
    print(f"✅ Loaded {len(df)} pairs from input_texts/label_texts")

# 5 intents.json files
for fn in ["intents.json","intents-2.json","intents-3.json","intents-4.json"]:
    p = base/fn
    if p.exists():
        j = json.load(open(p, encoding="utf-8"))
        rows = []
        for intent in j.get("intents",[]):
            resp = intent.get("responses",[""])[0]
            for pat in intent.get("patterns",[]):
                rows.append({"prompt":pat, "response":resp})
        df = pd.DataFrame(rows)
        txt_dfs.append(df)
        print(f"✅ Loaded {len(df)} patterns from {fn}")

# Combining all TXT/JSON DataFrames
combined_txt = pd.concat(txt_dfs, ignore_index=True)
combined_txt = (
    combined_txt
    .dropna()
    .astype(str)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(f"🔗 Combined TXT/JSON → {len(combined_txt)} examples")

hf_ds_txt = Dataset.from_pandas(combined_txt)
print("✅ hf_ds_txt ready with columns:", hf_ds_txt.column_names)


✅ Loaded 0 pairs from dialogs.txt
✅ Loaded 730 pairs from human_chat.txt
✅ Loaded 566 pairs from chatbot dataset.txt
✅ Loaded 3982 pairs from input_texts/label_texts
✅ Loaded 347 patterns from intents.json
✅ Loaded 232 patterns from intents-2.json
✅ Loaded 405 patterns from intents-3.json
✅ Loaded 1539 patterns from intents-4.json
🔗 Combined TXT/JSON → 7801 examples
✅ hf_ds_txt ready with columns: ['prompt', 'response']


In [6]:
# Cell 6
import csv
import pandas as pd

# 1) Merging the CSV‐derived and TXT/JSON‐derived DataFrames
df_all = pd.concat([combined, combined_txt], ignore_index=True)

# 2) Adding T5‐style prompt prefix and EOS token
df_all["prompt"]   = "question: " + df_all["prompt"].str.strip()
df_all["response"] = df_all["response"].str.strip() + " </s>"


out_path = "/content/clean_combined.csv"
df_all.to_csv(out_path, index=False, quoting=csv.QUOTE_ALL)
print(f"Wrote {len(df_all)} rows to {out_path}")



Wrote 21526 rows to /content/clean_combined.csv


In [None]:
# Cell 7 LOAD CLEAN CSV VIA PANDAS + CONVERT TO HF DATASET

import pandas as pd
from datasets import Dataset
from transformers import T5TokenizerFast

# 1 Read your cleaned file with pandas
df = pd.read_csv("/content/clean_combined.csv")
print(f"✅ Loaded {len(df)} rows from clean_combined.csv")

# 2 Convert to an in-memory HF Dataset
ds = Dataset.from_pandas(df)

#    If you get a leftover index column, drop it:
if "__index_level_0__" in ds.column_names:
    ds = ds.remove_columns("__index_level_0__")

print("Dataset features:", ds.column_names)

# 3 Preparing tokenizer and map function
tok = T5TokenizerFast.from_pretrained("t5-small")
MAX_IN, MAX_OUT = 64, 64

def to_features(batch):
    enc = tok(batch["prompt"],  max_length=MAX_IN,  truncation=True, padding="max_length")
    dec = tok(batch["response"], max_length=MAX_OUT, truncation=True, padding="max_length")
    enc["labels"] = dec["input_ids"]
    return enc

# 4 Tokenizing & split
ds = ds.map(
    to_features,
    batched=True,
    remove_columns=["prompt", "response"]
)
ds = ds.train_test_split(test_size=0.1, seed=42)

print(ds)


✅ Loaded 21526 rows from clean_combined.csv
Dataset features: ['prompt', 'response']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/21526 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 19373
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2153
    })
})


In [None]:
# Cell 8 Fine tuning with HuggingFace Seq2SeqTrainer

import torch
from transformers import (
    T5ForConditionalGeneration,
    Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# 1 Device & model
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)

# 2 Preparing the Trainer args
args = Seq2SeqTrainingArguments(
    output_dir                = "t5_finetuned",
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    num_train_epochs          = 5,
    learning_rate             = 5e-4,
    label_smoothing_factor    = 0.1,


    do_eval                   = True,
    eval_steps                = 500,
    save_steps                = 500,

    predict_with_generate     = True,
    fp16                      = True,
    logging_steps             = 100,
    report_to                 = "none",
)

# 3 DataCollator will pad and shift labels → decoder_input_ids
data_collator = DataCollatorForSeq2Seq(
    tokenizer   = tok,
    model       = model,
    return_tensors = "pt",
)

# 4 Building the trainer
trainer = Seq2SeqTrainer(
    model           = model,
    args            = args,
    train_dataset   = ds["train"],
    eval_dataset    = ds["test"],
    tokenizer       = tok,
    data_collator   = data_collator,
)

# 5 Launching training
trainer.train()

# 6 Saving
model.save_pretrained("best_t5")
tok.save_pretrained("best_t5")
print("✅ Training complete – checkpoint saved in best_t5/")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
100,3.3275
200,2.5483
300,2.4776
400,2.5021
500,2.4618
600,2.4469
700,2.4318
800,2.3596
900,2.3349
1000,2.3221


✅ Training complete – checkpoint saved in best_t5/


In [None]:
# Cell 9 Chatting with our fine tuned model
import torch, textwrap
from transformers import T5TokenizerFast, T5ForConditionalGeneration

DEVICE   = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_dir= "best_t5"

tokenizer = T5TokenizerFast.from_pretrained(model_dir, local_files_only=True)
model     = T5ForConditionalGeneration.from_pretrained(model_dir, local_files_only=True).to(DEVICE)
model.eval()

print("Chatbot ready! (type 'exit' or blank to quit)\n")
while True:
    user = input("you: ").strip()
    if not user or user.lower() in {"exit","quit"}:
        print("Goodbye! 👋")
        break

    batch = tokenizer("question: "+user,
                      return_tensors="pt",
                      truncation=True, max_length=64).to(DEVICE)

    out = model.generate(**batch,
                         max_length=64,
                         do_sample=True, top_p=0.9, temperature=0.7,
                         no_repeat_ngram_size=3, repetition_penalty=1.2,
                         eos_token_id=tokenizer.eos_token_id)

    reply = tokenizer.decode(out[0], skip_special_tokens=True)
    print("bot:", textwrap.fill(reply, 80))


Chatbot ready! (type 'exit' or blank to quit)

you: Explain program documentation. Why is it important
bot: Program documentation is a type of software that uses the knowledge and
experience to provide information about the components of a program. It is
important to establish a consistent understanding of the basic principles and
functions of
you: What is software testing
bot: Software testing is the process of identifying and analyzing the data and
activities of a software system. It involves the identification of tens of
software, including the tests they use to make certain that the system is
functioning as planned. Software testing can include testing, testing, and
testing, validation
you: Can you explain doubly linked list
bot: A doubly linked list is a linear data structure that is used to trace the
information of a single member of sacrament in a computer system. It is commonly
used for various functions, such as memory storage, memory storage and other
connected devices, such 

In [None]:
import pandas as pd
from datasets import Dataset
import evaluate
from transformers import T5TokenizerFast, T5ForConditionalGeneration, Seq2SeqTrainer

# Reading & spliting our clean CSV
raw_df = pd.read_csv("/content/clean_combined.csv")
splits = Dataset.from_pandas(raw_df).train_test_split(test_size=0.1, seed=42)
raw_test = splits["test"]

# Tokenizing and remove the text columns
tok = T5TokenizerFast.from_pretrained("t5-small")
MAX_IN, MAX_OUT = 64, 64

def to_features(batch):
    enc = tok(batch["prompt"],  max_length=MAX_IN,  truncation=True, padding="max_length")
    dec = tok(batch["response"], max_length=MAX_OUT, truncation=True, padding="max_length")
    enc["labels"] = dec["input_ids"]
    return enc

# applying to the training and eval datasets
tokenized = splits.map(
    to_features,
    batched=True,
    remove_columns=["prompt", "response"]
)

# Fine tuning as before
model = T5ForConditionalGeneration.from_pretrained("t5-small").cuda()
trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(
        output_dir="t5_finetuned",
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        learning_rate=5e-4,
        label_smoothing_factor=0.1,
        do_eval=True,
        eval_steps=500,
        save_steps=500,
        predict_with_generate=True,
        fp16=True,
        logging_steps=100,
        report_to="none",
    ),
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tok,
    data_collator=DataCollatorForSeq2Seq(tok, model),
)
trainer.train()


Map:   0%|          | 0/19373 [00:00<?, ? examples/s]

Map:   0%|          | 0/2153 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,3.3275
200,2.5483
300,2.4776
400,2.5021
500,2.4618
600,2.4469
700,2.4318
800,2.3596
900,2.3349
1000,2.3221


OverflowError: out of range integral type conversion attempted

In [None]:
# METRICS
import numpy as np
import evaluate


pred_out   = trainer.predict(tokenized["test"])
raw_preds  = pred_out.predictions
raw_labels = pred_out.label_ids


if isinstance(raw_preds, tuple):
    raw_preds = raw_preds[0]


if raw_preds.ndim == 3:
    pred_ids = np.argmax(raw_preds, axis=-1)
else:
    pred_ids = raw_preds


pad = tok.pad_token_id
label_ids = np.where(raw_labels == -100, pad, raw_labels)


vocab_max = tok.vocab_size - 1

def clip_and_pythonify(arr):

    clipped = np.clip(arr, 0, vocab_max)
    return clipped.astype(int).tolist()

pred_ids  = clip_and_pythonify(pred_ids)
label_ids = clip_and_pythonify(label_ids)

# Decode per example
pred_strs  = [tok.decode(seq, skip_special_tokens=True) for seq in pred_ids]
label_strs = [tok.decode(seq, skip_special_tokens=True) for seq in label_ids]

# Compute ROUGE-L
rouge     = evaluate.load("rouge")
rouge_res = rouge.compute(
    predictions=pred_strs,
    references =label_strs,
    rouge_types=["rougeL"],
    use_stemmer=True
)

# Compute BLEU
import evaluate

bleu = evaluate.load("bleu")
bleu_res = bleu.compute(
    predictions=pred_strs,
    references=[[r] for r in label_strs]
)


# Compute Exact-Match
exact_matches = sum(p == r for p, r in zip(pred_strs, label_strs))
em_score      = exact_matches / len(label_strs)


# Print all metrics
print(f"→ ROUGE-L F1:  {rouge_res['rougeL']:.4f}")
print(f"→ BLEU:        {bleu_res['bleu']:.4f}")
print(f"→ Exact-Match: {em_score:.2%}  ({exact_matches}/{len(label_strs)})")



→ ROUGE-L F1:  0.1617
→ BLEU:        0.1060
→ Exact-Match: 1.95%  (42/2153)


In [19]:
# Hyperparameter Sweep
!pip install -q rouge_score

import os
import torch
import numpy as np
import pandas as pd
import evaluate

from sklearn.model_selection import train_test_split
from datasets import Dataset, load_from_disk
from transformers import (
    T5TokenizerFast,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# Load tokenizer
tok = T5TokenizerFast.from_pretrained("t5-small")

# Load & split the cleaned CSV for raw references
df = pd.read_csv(
    "/content/clean_combined.csv",
    engine="python",
    on_bad_lines="skip",
    dtype=str
)
df_train, df_test = train_test_split(df, test_size=0.10, random_state=42)

# Convert raw pandas → HF Dataset
raw_train = Dataset.from_pandas(df_train.reset_index(drop=True))
raw_test  = Dataset.from_pandas(df_test.reset_index(drop=True))

# Load tokenized splits (must exist already, otherwise you need to run tokenization once)
TOKENIZED_TRAIN_DIR = "tokenized_train"
TOKENIZED_TEST_DIR  = "tokenized_test"

if os.path.isdir(TOKENIZED_TRAIN_DIR) and os.path.isdir(TOKENIZED_TEST_DIR):
    print("→ Loading tokenized splits from disk...")
    tokenized_train = load_from_disk(TOKENIZED_TRAIN_DIR)
    tokenized_test  = load_from_disk(TOKENIZED_TEST_DIR)
else:
    raise RuntimeError(
        "tokenized_train/ and tokenized_test/ not found. "
        "Run the tokenization step once before invoking this sweep."
    )

# Build a “tokenized” dict for Trainer
tokenized = {
    "train": tokenized_train,
    "test": tokenized_test
}

# Prepare ground‐truth reference strings from df_test
references = df_test["response"].tolist()

# Data collator (model will be bound later)
data_collator = DataCollatorForSeq2Seq(tokenizer=tok, model=None)

# Load ROUGE
rouge = evaluate.load("rouge")

# Device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Defining our hyperparameter grid
grid = [
    {"learning_rate": 1e-4, "per_device_train_batch_size": 8},
    {"learning_rate": 3e-4, "per_device_train_batch_size": 8},
    {"learning_rate": 5e-4, "per_device_train_batch_size": 8},
    {"learning_rate": 1e-3, "per_device_train_batch_size": 8},
    {"learning_rate": 5e-4, "per_device_train_batch_size": 4},
]

results = []

# Looping over each combo
for hp in grid:
    lr = hp["learning_rate"]
    bs = hp["per_device_train_batch_size"]
    run_name = f"lr{lr}_bs{bs}"
    base_dir = f"tmp/{run_name}"
    final_ckpt = os.path.join(base_dir, "checkpoint-final")

    print(f"\n🔶 Sweep combo: lr={lr}, bs={bs}")

    # Case A checkpoint final already exists = skip training entirely
    if os.path.isdir(final_ckpt) and os.listdir(final_ckpt):
        print(f"→ Found existing checkpoint-final for {run_name}.  Skipping training.")
        model = T5ForConditionalGeneration.from_pretrained(final_ckpt).to(DEVICE)
        trainer_needed = False

    # Case B no checkpoint final = train from scratch, then save
    else:
        print(f"→ No checkpoint-final for {run_name}.  Training now…")
        model = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)
        data_collator.model = model

        args = Seq2SeqTrainingArguments(
            output_dir=base_dir,
            num_train_epochs=5,
            save_steps=500,
            eval_steps=500,
            learning_rate=lr,
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            predict_with_generate=True,
            fp16=torch.cuda.is_available(),
            logging_steps=100,
            report_to="none",
            do_eval=True,
            save_total_limit=1,
        )

        trainer = Seq2SeqTrainer(
            model=model,
            args=args,
            train_dataset=tokenized["train"],
            eval_dataset=tokenized["test"],
            tokenizer=tok,
            data_collator=data_collator,
        )

        try:
            train_out = trainer.train()
            train_metrics = train_out.metrics


            os.makedirs(final_ckpt, exist_ok=True)
            trainer.save_model(final_ckpt)
            tok.save_pretrained(final_ckpt)

            trainer_needed = True

        except Exception as e:
            # if evaluation or generation fails we will already have
            #  tokenized_train/ & tokenized_test/
            #  checkpoint-final/ (model weights)
            print(f" ERROR during training of {run_name}: {e}")
            print(" tokenized data & checkpoint-final are on disk; re‐run to resume.")
            raise


    if not trainer_needed:
        model = T5ForConditionalGeneration.from_pretrained(final_ckpt).to(DEVICE)
        trainer = Seq2SeqTrainer(
            model=model,
            args=Seq2SeqTrainingArguments(
                output_dir=os.path.join(base_dir, "eval"),
                per_device_eval_batch_size=bs,
                predict_with_generate=True,
                fp16=torch.cuda.is_available(),
                report_to="none",
            ),
            tokenizer=tok,
            data_collator=DataCollatorForSeq2Seq(tok, model=model),
            eval_dataset=tokenized["test"],
        )

    # Evaluating on test split
    eval_metrics = trainer.evaluate()
    eval_loss = eval_metrics.get("eval_loss", None)
    print(f"→ Eval loss for {run_name}: {eval_loss}")

    # Generating on test split
    pred_out = trainer.predict(tokenized["test"])
    preds_np = pred_out.predictions  # shape (num_examples, seq_len)

    # Robust decode
    generated_texts = []
    vocab_size = tok.vocab_size
    for seq in preds_np:
        filtered_ids = [int(x) for x in seq if 0 <= int(x) < vocab_size]
        if not filtered_ids:
            filtered_ids = [tok.pad_token_id]
        generated_texts.append(tok.decode(filtered_ids, skip_special_tokens=True))

    # Computing ROUGE L
    r = rouge.compute(
        predictions=generated_texts,
        references=references,
        rouge_types=["rougeL"],
        use_stemmer=True
    )
    rougeL_score = r["rougeL"]
    if hasattr(rougeL_score, "mid"):
        rougeL_val = rougeL_score.mid.fmeasure * 100
    else:
        rougeL_val = float(rougeL_score) * 100

    # Recording results
    train_loss = train_metrics.get("train_loss", None) if trainer_needed else None
    results.append({
        "learning_rate": lr,
        "bs": bs,
        "train_loss": train_loss,
        "eval_loss":  eval_loss,
        "rougeL":     rougeL_val
    })

# Print sweep table
df = pd.DataFrame(results)
print("\n### Hyperparameter Sweep Results (Train + Eval)")
print(df.to_markdown(index=False))


→ Loading tokenized splits from disk...

🔶 Sweep combo: lr=0.0001, bs=8
→ Found existing checkpoint-final for lr0.0001_bs8.  Skipping training.


  trainer = Seq2SeqTrainer(


→ Eval loss for lr0.0001_bs8: 0.8918564319610596

🔶 Sweep combo: lr=0.0003, bs=8
→ No checkpoint-final for lr0.0003_bs8.  Training now…


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,2.0268
200,1.1915
300,1.1787
400,1.142
500,1.0612
600,1.0782
700,1.0586
800,1.0248
900,1.0475
1000,1.0377


→ Eval loss for lr0.0003_bs8: 0.829267144203186

🔶 Sweep combo: lr=0.0005, bs=8
→ No checkpoint-final for lr0.0005_bs8.  Training now…


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,1.8411
200,1.1587
300,1.137
400,1.1021
500,1.0186
600,1.0427
700,1.0269
800,0.9792
900,1.0175
1000,1.0011


→ Eval loss for lr0.0005_bs8: 0.807678759098053

🔶 Sweep combo: lr=0.001, bs=8
→ No checkpoint-final for lr0.001_bs8.  Training now…


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,1.7321
200,1.1328
300,1.09
400,1.0551
500,0.9781
600,1.0129
700,0.9954
800,0.9421
900,0.9869
1000,0.9668


→ Eval loss for lr0.001_bs8: 0.785469114780426

🔶 Sweep combo: lr=0.0005, bs=4
→ No checkpoint-final for lr0.0005_bs4.  Training now…


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,1.8989
200,1.243
300,1.1302
400,1.1303
500,1.106
600,1.103
700,1.0585
800,1.0846
900,0.957
1000,1.0274


Step,Training Loss
100,1.8989
200,1.243
300,1.1302
400,1.1303
500,1.106
600,1.103
700,1.0585
800,1.0846
900,0.957
1000,1.0274


→ Eval loss for lr0.0005_bs4: 0.795783281326294

### Hyperparameter Sweep Results (Train + Eval)
|   learning_rate |   bs |   train_loss |   eval_loss |   rougeL |
|----------------:|-----:|-------------:|------------:|---------:|
|          0.0001 |    8 |   nan        |    0.891856 |  12.1094 |
|          0.0003 |    8 |     0.894863 |    0.829267 |  14.0308 |
|          0.0005 |    8 |     0.859541 |    0.807679 |  15.0313 |
|          0.001  |    8 |     0.823373 |    0.785469 |  16.3519 |
|          0.0005 |    4 |     0.834802 |    0.795783 |  15.9599 |
