<a href="https://colab.research.google.com/github/ktanguy/University_chatbot_assistant/blob/main/University_chatbot_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ============================================
# Cell 1 — Environment Setup (Run FIRST)
# --------------------------------------------
# What this does:
# 1) Upgrades pip (the Python package manager).
# 2) Installs a known-good, compatible set of libraries:
#    - numpy / pandas / scipy (matching binary versions)
#    - tensorflow (we'll use TF with Hugging Face)
#    - transformers, datasets, evaluate (Hugging Face stack)
#    - nltk, rouge-score (text preprocessing + evaluation)
# 3) Force a runtime RESTART so Colab reloads C-extensions
#    against the just-installed NumPy. This prevents
#    the "numpy.dtype size changed" error.
#
# After the restart:
#   -> Run your next cell to MOUNT Google Drive.
#   -> Then continue with loading your dataset, etc.
# ============================================

# (Optional) Make TensorFlow logs less noisy
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# 1) Upgrade pip so installs are smooth
!pip -q install --upgrade pip

# 2) Install a compatible set of packages
#    These versions play nicely together in Colab.
!pip -q install \
  "numpy==1.26.4" \
  "pandas==2.2.2" \
  "scipy==1.13.1" \
  "tensorflow==2.16.1" \
  "transformers==4.44.2" \
  "datasets==2.21.0" \
  "evaluate==0.4.2" \
  "nltk==3.9.1" \
  "rouge-score==0.1.2"

# 3) RESTART the runtime so Python reloads binary extensions
import time, sys
print("\nEnvironment ready. The runtime will now restart to apply changes...")
time.sleep(1)

# This kills the current Python process; Colab auto-reconnects.
import os
os.kill(os.getpid(), 9)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.8 MB[0m [31m16.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'rouge-score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge-score'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency 

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [9]:
import json, os


DATASET_PATH = "/content/drive/MyDrive/UniversityChatbot/intents.json"

assert os.path.exists(DATASET_PATH), f"File not found at: {DATASET_PATH}"

with open(DATASET_PATH, "r") as f:
    data = json.load(f)


print(" Loaded JSON with keys:", list(data.keys()))
print("Number of intents:", len(data.get("intents", [])))
print("First intent example:")
print(json.dumps(data["intents"][0], indent=2)[:800])


 Loaded JSON with keys: ['intents']
Number of intents: 39
First intent example:
{
  "intent": "greeting",
  "text": [
    "Hi",
    "How are you?",
    "Is anyone there?",
    "Hello",
    "Good day",
    "What's up",
    "how are ya",
    "heyy",
    "whatsup",
    "??? ??? ??"
  ],
  "responses": [
    "Hello!",
    "Good to see you again!",
    "Hi there, how can I help?"
  ],
  "extension": {
    "function": "",
    "entities": false,
    "responses": []
  },
  "context": {
    "in": "",
    "out": "GreetingUserRequest",
    "clear": false
  },
  "entityType": "NA",
  "entities": []
}


In [10]:
# =========================
# SAFE TRAIN/VAL/TEST SPLIT + T5 PAIRS (BEGINNER-FRIENDLY)
# Expects: df with columns ["intent", "text", "response"]
# =========================

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# ---------- 0) Sanity checks & light cleaning ----------
# If the loader cell created different column names, normalize them here.
expected_cols = {"intent", "text", "response"}
if not expected_cols.issubset(df.columns):
    # Try to auto-fix common naming issues (case / spaces)
    rename_map = {}
    for col in df.columns:
        c = col.strip().lower()
        if c in ["intent", "tag"]:  rename_map[col] = "intent"
        elif c in ["pattern", "patterns", "question", "query", "text"]: rename_map[col] = "text"
        elif c in ["answer", "answers", "response", "responses", "reply"]: rename_map[col] = "response"
    df = df.rename(columns=rename_map)

# Validate again
missing = expected_cols - set(df.columns)
assert not missing, f"Your dataframe is missing columns: {missing}. Found columns: {list(df.columns)}"

# Drop rows with missing fields and strip whitespace
df = df.dropna(subset=["intent", "text", "response"]).copy()
df["intent"]   = df["intent"].astype(str).str.strip()
df["text"]     = df["text"].astype(str).str.strip()
df["response"] = df["response"].astype(str).str.strip()

# Remove empty texts/responses (just in case)
df = df[(df["text"] != "") & (df["response"] != "")].reset_index(drop=True)

print(f"Data after cleaning: {len(df)} pairs")
print(df.head(3))

# ---------- 1) Handle rare intents to avoid stratify errors ----------
counts = df["intent"].value_counts()
RARE_MIN = 3  # intents with <3 examples go entirely to TRAIN
rare_mask = df["intent"].map(counts) < RARE_MIN
rare_df   = df[rare_mask].copy()
common_df = df[~rare_mask].copy()

print("\nSummary of intent frequencies:")
print(counts.head(10))
print(f"\nTotal pairs: {len(df)} | Rare (<{RARE_MIN}) kept in TRAIN: {len(rare_df)} | Common: {len(common_df)}")

# ---------- 2) Split common intents only (so stratify works) ----------
def safe_stratified_split(frame, test_size, desc):
    """Try a stratified split; if it fails, fall back to a plain split."""
    if len(frame) == 0:
        return frame.copy(), frame.copy()
    try:
        left, right = train_test_split(
            frame,
            test_size=test_size,
            random_state=42,
            shuffle=True,
            stratify=frame["intent"]
        )
        return left, right
    except Exception as e:
        print(f"[{desc}] Stratified split failed ({type(e).__name__}: {e}). Falling back to non-stratified split.")
        left, right = train_test_split(
            frame,
            test_size=test_size,
            random_state=42,
            shuffle=True
        )
        return left, right

# 20% test from common
train_common, test_common = safe_stratified_split(common_df, test_size=0.20, desc="COMMON->TEST")

# From the remaining common train, carve ~10% of the original as VAL (≈12.5% of the remaining 80%)
val_size_rel = 0.125 if len(train_common) > 0 else 0.0
if val_size_rel > 0:
    train_common, val_common = safe_stratified_split(train_common, test_size=val_size_rel, desc="TRAIN_COMMON->VAL")
else:
    val_common = pd.DataFrame(columns=train_common.columns if len(train_common)>0 else common_df.columns)

# ---------- 3) Add all rare examples to TRAIN ----------
train_df = pd.concat([train_common, rare_df], ignore_index=True)
val_df   = val_common.reset_index(drop=True)
test_df  = test_common.reset_index(drop=True)

# Shuffle train for good measure
train_df = train_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# ---------- 4) Report split stats ----------
def report_split(name, frame):
    print(f"\n{name} shape: {frame.shape}")
    if len(frame):
        print(frame["intent"].value_counts().head(15))

report_split("TRAIN", train_df)
report_split("VAL",   val_df)
report_split("TEST",  test_df)

print("\nNote:")
print(f"- Intents with < {RARE_MIN} total examples were kept entirely in TRAIN to avoid stratification errors.")
print("- Some very-rare intents may not appear in VAL/TEST — document this in your report.")

# ---------- 5) Build T5-style (input_text, target_text) columns ----------
def to_t5_pair(row):
    # You can tweak this prompt later; keep it simple while training.
    return pd.Series({
        "input_text":  f"domain: university | intent: {row['intent']} | user: {row['text']}",
        "target_text": row["response"]
    })

train_pairs = train_df.apply(to_t5_pair, axis=1)
val_pairs   = val_df.apply(to_t5_pair,   axis=1)
test_pairs  = test_df.apply(to_t5_pair,  axis=1)

train_df_proc = pd.concat([train_df.reset_index(drop=True), train_pairs], axis=1)
val_df_proc   = pd.concat([val_df.reset_index(drop=True),   val_pairs],   axis=1)
test_df_proc  = pd.concat([test_df.reset_index(drop=True),  test_pairs],  axis=1)

# Final sanity check
for name, frame in [("train_df_proc", train_df_proc), ("val_df_proc", val_df_proc), ("test_df_proc", test_df_proc)]:
    assert {"input_text","target_text"}.issubset(frame.columns), f"{name} is missing required columns"
print("\nPrepared T5 pairs. Example rows:")
print(train_df_proc[["input_text","target_text"]].head(3))


AssertionError: Your dataframe is missing columns: {'text', 'intent', 'response'}. Found columns: []

In [3]:

import random
import pandas as pd



pairs = []
for intent in data.get("intents", []):
    tag = intent.get("tag", "unknown")
    patterns = intent.get("patterns", [])
    responses = intent.get("responses", [])
    if not patterns or not responses:
        continue


    for p in patterns:
        target = responses[0]
        pairs.append({
            "tag": tag,
            "input_text": f"question: {p.strip()}",
            "target_text": target.strip()
        })

df = pd.DataFrame(pairs)
print("Total pairs:", len(df))
df.head(10)


Total pairs: 0


In [5]:
# =========================
# Robust, stratified train/val/test split with rare-intent handling
# =========================
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# df must already exist with columns: ["intent", "text", "response"]

# 1) Count how many examples per intent
counts = df["intent"].value_counts()

# 2) Define "rare" intents: fewer than 3 examples total
#    We keep all rare examples in TRAIN to avoid stratification errors.
RARE_MIN = 3
rare_mask   = df["intent"].map(counts) < RARE_MIN
rare_df     = df[rare_mask].copy()
common_df   = df[~rare_mask].copy()

print(f"Total pairs: {len(df)} | Rare pairs (kept in train only): {len(rare_df)} | Common pairs: {len(common_df)}")
print("\nRare intents (kept in train only):")
print(df.loc[rare_mask, "intent"].value_counts())

# 3) Stratified split on common intents (those with >= RARE_MIN)
#    First: common -> train_common + test (20% test)
if len(common_df["intent"].unique()) > 1:
    train_common, test_common = train_test_split(
        common_df,
        test_size=0.20,
        random_state=42,
        shuffle=True,
        stratify=common_df["intent"]
    )
else:
    # Fallback if somehow only one common intent remains
    train_common, test_common = common_df, pd.DataFrame(columns=common_df.columns)

# 4) From train_common, carve out a validation set (~10% of total)
#    Since we already took 20% test, taking 12.5% of train_common ≈ 10% of original common
if len(train_common) > 0 and len(train_common["intent"].unique()) > 1:
    train_common, val_common = train_test_split(
        train_common,
        test_size=0.125,  # 12.5% of the remaining 80% ≈ 10% overall
        random_state=42,
        shuffle=True,
        stratify=train_common["intent"]
    )
else:
    val_common = pd.DataFrame(columns=train_common.columns)

# 5) Add all rare samples to TRAIN
train_df = pd.concat([train_common, rare_df], ignore_index=True)
val_df   = val_common.reset_index(drop=True)
test_df  = test_common.reset_index(drop=True)

# 6) Shuffle train for good measure
train_df = train_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

# 7) Report splits and intent coverage
def counts_by_split(name, frame):
    print(f"\n{name} shape: {frame.shape}")
    print(frame["intent"].value_counts())

counts_by_split("TRAIN", train_df)
counts_by_split("VAL",   val_df)
counts_by_split("TEST",  test_df)

print("\nNote:")
print("- Intents with < 3 total examples were kept entirely in TRAIN to avoid stratification errors.")
print("- This means some very-rare intents may not appear in VAL/TEST (that’s okay; document this in your report).")


KeyError: 'intent'

In [6]:
# =========================================
# Build seq2seq fields for T5 fine-tuning
# =========================================

def build_io(df):
    df = df.copy()
    # A simple, informative prompt template for T5
    # You can remove `intent:` if you want pure text-to-text, but keeping it helps conditioning.
    df["input_text"]  = "intent: " + df["intent"].astype(str) + " | user: " + df["text"].astype(str)
    df["target_text"] = df["response"].astype(str)
    return df

train_df_io = build_io(train_df)
val_df_io   = build_io(val_df)
test_df_io  = build_io(test_df)

# Sanity check
print(train_df_io[["intent","text","response","input_text","target_text"]].head(3))


NameError: name 'train_df' is not defined

In [7]:
# ==== CELL 5: Tokenize inputs/targets for T5-small (TensorFlow) ====
from transformers import AutoTokenizer
import numpy as np

MODEL_NAME = "t5-small"   # small and fast; good for seq2seq

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Set some safe max lengths (adjust if your texts are long)
MAX_INPUT_LEN  = 64
MAX_TARGET_LEN = 64

def tokenize_pairs(df_slice):
    """
    Convert a pandas slice with columns 'input_text', 'target_text'
    into (input_ids, attention_mask, labels) numpy arrays padded
    to fixed lengths so we can use tf.data easily.
    """
    enc_inputs = tokenizer(
        df_slice["input_text"].tolist(),
        padding="max_length",
        truncation=True,
        max_length=MAX_INPUT_LEN,
        return_tensors="np"
    )

    with tokenizer.as_target_tokenizer():
        enc_targets = tokenizer(
            df_slice["target_text"].tolist(),
            padding="max_length",
            truncation=True,
            max_length=MAX_TARGET_LEN,
            return_tensors="np"
        )

    # Important: For seq2seq training in TF, labels should be -100 for padded tokens
    labels = enc_targets["input_ids"].copy()
    labels[labels == tokenizer.pad_token_id] = -100

    return enc_inputs["input_ids"], enc_inputs["attention_mask"], labels

Xtr_ids, Xtr_mask, ytr = tokenize_pairs(train_df)
Xv_ids,  Xv_mask,  yv  = tokenize_pairs(val_df)
Xt_ids,  Xt_mask,  yt  = tokenize_pairs(test_df)

print("Train shapes:", Xtr_ids.shape, Xtr_mask.shape, ytr.shape)
print("Val shapes:",   Xv_ids.shape,  Xv_mask.shape,  yv.shape)
print("Test shapes:",  Xt_ids.shape,  Xt_mask.shape,  yt.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

NameError: name 'train_df' is not defined

In [None]:
# ==== CELL 6: Build tf.data.Dataset pipelines ====
import tensorflow as tf

def make_tf_dataset(input_ids, attn_mask, labels, batch_size=16, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((
        {
            "input_ids": input_ids,
            "attention_mask": attn_mask
        },
        labels
    ))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(input_ids), seed=42)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

BATCH = 16  # default; will be overridden in experiments if needed

train_ds = make_tf_dataset(Xtr_ids, Xtr_mask, ytr, batch_size=BATCH, shuffle=True)
val_ds   = make_tf_dataset(Xv_ids,  Xv_mask,  yv,  batch_size=BATCH, shuffle=False)
test_ds  = make_tf_dataset(Xt_ids,  Xt_mask,  yt,  batch_size=BATCH, shuffle=False)

for batch in train_ds.take(1):
    print("Batch keys:", batch[0].keys(), "input_ids shape:", batch[0]["input_ids"].shape)


In [None]:
# ==== CELL 7: Build & compile the model (TensorFlow) ====
from transformers import TFAutoModelForSeq2SeqLM
import math

# Load TF model
tf.keras.backend.clear_session()
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# Loss is automatically computed inside the model when 'labels' are present.
# But Keras requires a compiled loss to show metrics. We pass a dummy loss;
# the model will override it using internal seq2seq loss (cross-entropy).
# We also track accuracy on the token-level (optional).
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)

model.compile(optimizer=optimizer)  # internal loss used (labels -> teacher forcing)

# Quick param count
model.summary()


In [None]:
# ==== CELL 8: Simple evaluation helpers (BLEU, F1, Perplexity) ====
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from sklearn.metrics import f1_score

def generate_texts(model, tokenizer, inputs, max_new_tokens=32):
    """
    Use model.generate to produce outputs for a batch of inputs.
    Returns list of decoded strings.
    """
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True
    )
    texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return texts

def dataset_to_numpy_inputs(ds):
    """Collect a whole tf.data.Dataset into numpy arrays for generation."""
    all_input_ids = []
    all_attention = []
    all_labels    = []
    for batch in ds:
        X, Y = batch
        all_input_ids.append(X["input_ids"].numpy())
        all_attention.append(X["attention_mask"].numpy())
        all_labels.append(Y.numpy())
    return {
        "input_ids": np.vstack(all_input_ids),
        "attention_mask": np.vstack(all_attention)
    }, np.vstack(all_labels)

def decode_labels_to_text(labels_np, tokenizer):
    """Convert label ids (with -100 for padding) back to strings for metric calc."""
    # replace -100 (ignore index) back to pad_token_id for decoding
    lab = labels_np.copy()
    lab[lab == -100] = tokenizer.pad_token_id
    texts = tokenizer.batch_decode(lab, skip_special_tokens=True)
    return texts

def compute_metrics(model, ds, tokenizer, label_texts=None):
    """
    Compute BLEU, F1 (word-level, rough), and perplexity from average loss.
    """
    # 1) Perplexity from loss over the dataset
    losses = []
    for X, Y in ds:
        out = model(X, labels=Y, training=False)
        # out.loss is mean cross-entropy for the batch
        losses.append(float(out.loss.numpy()))
    avg_loss = float(np.mean(losses))
    ppl = math.exp(avg_loss)

    # 2) Generate predictions
    X_np, Y_np = dataset_to_numpy_inputs(ds)
    pred_texts = generate_texts(model, tokenizer, X_np, max_new_tokens=32)
    true_texts = decode_labels_to_text(Y_np, tokenizer)

    # Prepare for BLEU
    smoothie = SmoothingFunction().method4
    refs = [[t.split()] for t in true_texts]   # list of reference lists
    hyps = [p.split()] for p in pred_texts

    bleu = corpus_bleu(refs, hyps, smoothing_function=smoothie)

    # 3) Rough F1: we’ll compute token-level macro F1 on whitespace tokens (very rough)
    # Build a vocabulary of tokens seen in references+pairs (for a simple mapping)
    # For a simple “set overlap” style: label present token vs predicted present token
    # (Not a perfect metric, but illustrates additional evaluation per rubric.)
    from collections import Counter

    def to_binary_bag(tokens, vocab):
        c = Counter(tokens)
        return np.array([1 if v in c else 0 for v in vocab], dtype=int)

    # Build a small vocab from the validation set (limit size)
    vocab = []
    for t in true_texts:
        for tok in t.split():
            if tok not in vocab:
                vocab.append(tok)
            if len(vocab) > 2000:
                break
        if len(vocab) > 2000: break

    y_true_bin, y_pred_bin = [], []
    for t, p in zip(true_texts, pred_texts):
        y_true_bin.append(to_binary_bag(t.split(), vocab))
        y_pred_bin.append(to_binary_bag(p.split(), vocab))

    if y_true_bin and y_pred_bin:
        y_true_bin = np.vstack(y_true_bin)
        y_pred_bin = np.vstack(y_pred_bin)
        # Compute macro F1 across the token indicators
        f1_macro = f1_score(y_true_bin, y_pred_bin, average="macro", zero_division=0)
    else:
        f1_macro = 0.0

    return {
        "avg_loss": avg_loss,
        "perplexity": ppl,
        "bleu": bleu,
        "f1_macro_bow": f1_macro,
        "samples": list(zip(true_texts[:5], pred_texts[:5]))  # preview some pairs
    }


In [None]:
# ==== CELL 9: Train once (baseline run) ====
EPOCHS = 5
BATCH  = 16  # used above; redefining for clarity

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1
)

# Plot training curves (loss only; accuracy not meaningful for seq2seq)
import matplotlib.pyplot as plt
plt.plot(history.history["loss"], label="train")
plt.plot(history.history["val_loss"], label="val")
plt.xlabel("Epoch"); plt.ylabel("Loss"); plt.title("T5-small Fine-tuning Loss")
plt.legend(); plt.show()


In [None]:
# ==== CELL 10: Baseline evaluation ====
baseline_metrics_val  = compute_metrics(model, val_ds,  tokenizer)
baseline_metrics_test = compute_metrics(model, test_ds, tokenizer)

import pandas as pd
pd.DataFrame([{
    "split": "val",
    "avg_loss": baseline_metrics_val["avg_loss"],
    "perplexity": baseline_metrics_val["perplexity"],
    "BLEU": baseline_metrics_val["bleu"],
    "F1_macro(bag)": baseline_metrics_val["f1_macro_bow"]
},{
    "split": "test",
    "avg_loss": baseline_metrics_test["avg_loss"],
    "perplexity": baseline_metrics_test["perplexity"],
    "BLEU": baseline_metrics_test["bleu"],
    "F1_macro(bag)": baseline_metrics_test["f1_macro_bow"]
}])


In [None]:
# ==== CELL 11: Run 3 experiments and make a results table ====
def reset_model(lr):
    tf.keras.backend.clear_session()
    m = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    m.compile(optimizer=opt)
    return m

experiments = [
    {"name": "ExpA_baseline", "lr": 2e-4, "batch": 16, "epochs": 4},
    {"name": "ExpB_higherLR", "lr": 5e-4, "batch": 16, "epochs": 4},
    {"name": "ExpC_biggerBatch_lessEpochs", "lr": 2e-4, "batch": 32, "epochs": 3},
]

rows = []
for cfg in experiments:
    print(f"\n🚀 Running {cfg['name']}  (lr={cfg['lr']}, batch={cfg['batch']}, epochs={cfg['epochs']})")

    # rebuild datasets if batch size changes
    tr_ds = make_tf_dataset(Xtr_ids, Xtr_mask, ytr, batch_size=cfg["batch"], shuffle=True)
    v_ds  = make_tf_dataset(Xv_ids,  Xv_mask,  yv,  batch_size=cfg["batch"], shuffle=False)
    te_ds = make_tf_dataset(Xt_ids,  Xt_mask,  yt,  batch_size=cfg["batch"], shuffle=False)

    exp_model = reset_model(cfg["lr"])
    cb = [tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=1, restore_best_weights=True)]
    h = exp_model.fit(tr_ds, validation_data=v_ds, epochs=cfg["epochs"], callbacks=cb, verbose=1)

    # Evaluate
    m_val  = compute_metrics(exp_model, v_ds,  tokenizer)
    m_test = compute_metrics(exp_model, te_ds, tokenizer)

    rows.append({
        "Experiment": cfg["name"],
        "LR": cfg["lr"],
        "Batch": cfg["batch"],
        "Epochs": cfg["epochs"],
        "Val_Loss": m_val["avg_loss"],
        "Val_PPL":  m_val["perplexity"],
        "Val_BLEU": m_val["bleu"],
        "Val_F1":   m_val["f1_macro_bow"],
        "Test_Loss": m_test["avg_loss"],
        "Test_PPL":  m_test["perplexity"],
        "Test_BLEU": m_test["bleu"],
        "Test_F1":   m_test["f1_macro_bow"],
    })

results_df = pd.DataFrame(rows)
results_df.sort_values("Test_BLEU", ascending=False).reset_index(drop=True)


In [None]:
# ==== CELL 12: Simple interactive demo ====
def answer(question, max_new_tokens=48):
    # Prepare a single input
    enc = tokenizer(
        [f"question: {question}"],
        padding=True, truncation=True, max_length=MAX_INPUT_LEN, return_tensors="tf"
    )
    out = model.generate(
        input_ids=enc["input_ids"],
        attention_mask=enc["attention_mask"],
        max_new_tokens=max_new_tokens,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)

print("Try me:")
print("Q: How do I apply for admission?")
print("A:", answer("How do I apply for admission?"))
