<a href="https://colab.research.google.com/github/ktanguy/University_chatbot_assistant/blob/main/University_chatbot_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎓 University Chatbot — FLAN-T5 (TensorFlow, Hugging Face, Gradio)

This notebook fine-tunes **FLAN-T5 Small** on a **University FAQ** dataset (intents → (user, bot) pairs), evaluates with **BLEU**, and provides a **Gradio** demo.

**Fixes included:**
- Proper `pad_token` fallback for T5
- Masked loss (ignore padding via `-100`)
- Deterministic seeds (Python / NumPy / TensorFlow)
- sacrebleu reference shape
- Optional GPU memory-growth for TF
- Clean section ordering for “Run All”

> Update `INTENTS_PATH` in the Config cell if your file lives somewhere else in Drive.



In [1]:
!pip install -q sacrebleu==2.4.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
!pip install -q "tensorflow==2.15.1" "keras==2.15.0" "transformers==4.41.2" "huggingface_hub==0.23.4"
print(" Installed versions. Now go to Runtime > Restart runtime, then run the next cells.")


[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.15.1 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.15.1[0m[31m
[0m Installed versions. Now go to Runtime > Restart runtime, then run the next cells.


In [2]:
# If the runtime is fresh, run this once (takes a few minutes).
!pip install -q transformers==4.42.0 tensorflow==2.13.0 accelerate==0.29.0 \
               gradio==4.0.0 datasets==2.18.0 evaluate==0.4.0 sacrebleu==2.4.0 \
               pandas numpy




[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.13.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.13.0[0m[31m
[0m

In [3]:
# Optional: Use if your intents.json is in Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
import os, json, random, csv
import numpy as np
import pandas as pd
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
import sacrebleu

# Optional: soften GPU memory spikes
gpus = tf.config.list_physical_devices('GPU')
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception as e:
        print("GPU memory growth not set:", e)

print("TF version:", tf.__version__)


TF version: 2.19.0


In [7]:
# === Paths ===
BASE_DIR = "/content/university_chatbot_nb"
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "models", "flan_t5_small_finetuned")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Prefer the merged dataset saved from your data-extraction notebook
INTENTS_PATH = "/content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json"



In [8]:
import os
if not os.path.exists(INTENTS_PATH):
    print("Merged dataset not found. Falling back to original intents.json")
    INTENTS_PATH = "/content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json"
print("Using:", INTENTS_PATH)


Using: /content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json


In [11]:
import os, json

print("Exists?", os.path.exists(INTENTS_PATH), "→", INTENTS_PATH)

with open(INTENTS_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

num_intents = len(data.get("intents", []))
num_texts   = sum(len(it.get("text", [])) for it in data.get("intents", []))
print(f"Intents: {num_intents} | Total user texts: {num_texts}")


Exists? True → /content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json
Intents: 39 | Total user texts: 412


In [12]:
# Expecting an intents.json in the common "intents" format:
# {"intents": [{"tag":"...", "text":["u1","u2"], "responses":["r1","r2"]}, ...]}

with open(INTENTS_PATH, "r", encoding="utf-8") as f:
    intents = json.load(f)

pairs = []
for intent in intents.get("intents", []):
    responses = intent.get("responses", [])
    texts     = intent.get("text", [])
    if not responses or not texts:
        continue
    # use the first response for supervised training (you can expand this later)
    resp = str(responses[0]).strip()
    for t in texts:
        t_clean = str(t).strip()
        if t_clean and resp:
            pairs.append((t_clean, resp))

# Train / test split (80/20)
random.shuffle(pairs)
split = int(0.8 * len(pairs))
train_pairs = pairs[:split]
test_pairs  = pairs[split:]

train_csv = os.path.join(DATA_DIR, "train.csv")
test_csv  = os.path.join(DATA_DIR, "test.csv")

with open(train_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["user", "bot"]); w.writerows(train_pairs)
with open(test_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["user", "bot"]); w.writerows(test_pairs)

print(f"Total pairs: {len(pairs)} | Train: {len(train_pairs)} | Test: {len(test_pairs)}")
print("Train CSV:", train_csv)
print("Test  CSV:", test_csv)


Total pairs: 412 | Train: 329 | Test: 83
Train CSV: /content/university_chatbot_nb/data/train.csv
Test  CSV: /content/university_chatbot_nb/data/test.csv


In [14]:
import re
import pandas as pd

TRAIN = "/content/university_chatbot_nb/data/train.csv"
TEST  = "/content/university_chatbot_nb/data/test.csv"

# Minimal ALU-aware fallback texts (edit as you like)
ALU_KB = {
    "fee": "Please check the Student Portal > Finance for current fees and payment instructions.",
    "hostel": "ALU Kigali has limited on-campus rooms and partner hostels. See Student Services for availability and rates.",
    "address": "ALU Kigali, Bumbogo Innovation City, Gasabo, Kigali. Refer to the official site for directions.",
    "calendar": "See the Academic Calendar on the Student Portal for term dates and exams.",
    "canteen": "ALU provides on-campus canteen/food options; availability and menus may vary by term.",
}

# Helpers
TAG_RE = re.compile(r"<[^>]+>")
PLACEHOLDER_PATTERNS = [
    r"ADD\s+YOUR.*",
    r"ADD\s+YOU\s+OWN\s+ANSWERS.*",
    r"ADD\s+YOU\s+GOOGLE\s+MAP.*",
]
def looks_placeholder(text: str) -> bool:
    if not text: return True
    T = text.upper()
    if any(re.search(p, T) for p in PLACEHOLDER_PATTERNS): return True
    # anchor without real URL
    if ("HREF=" in T or "TARGET=" in T) and ("HTTP" not in T and "HTTPS" not in T):
        return True
    return False

def clean_html_and_placeholders(user: str, bot: str) -> str:
    # Strip HTML
    s = TAG_RE.sub("", str(bot))
    # Remove obvious placeholder phrases
    for p in PLACEHOLDER_PATTERNS:
        s = re.sub(p, "", s, flags=re.IGNORECASE)
    s = re.sub(r"\s{2,}", " ", s).strip()

    # If it still looks empty/placeholder, replace with ALU KB based on user text keywords
    if not s or looks_placeholder(bot):
        u = (user or "").lower()
        if any(k in u for k in ["fee","tuition","payment","invoice"]):
            return ALU_KB["fee"]
        if any(k in u for k in ["hostel","accommodation","residence","housing"]):
            return ALU_KB["hostel"]
        if any(k in u for k in ["address","location","where"]):
            return ALU_KB["address"]
        if any(k in u for k in ["calendar","exam","timetable","schedule"]):
            return ALU_KB["calendar"]
        if any(k in u for k in ["canteen","cafeteria","food","menu"]):
            return ALU_KB["canteen"]
        # generic fallback
        return "Please check the Student Portal or contact Student Services for the latest official information."
    return s

def clean_file(path):
    df = pd.read_csv(path).dropna()
    before = len(df)
    df["bot"] = [clean_html_and_placeholders(u, b) for u, b in zip(df["user"], df["bot"])]
    # Drop rows that became empty (should be none, but safe)
    df = df[df["bot"].str.strip() != ""].drop_duplicates().reset_index(drop=True)
    after = len(df)
    df.to_csv(path, index=False)
    print(f"Cleaned {path}: {before} -> {after}")

clean_file(TRAIN)
clean_file(TEST)

# Quick peek after cleaning
print("\nCleaned train sample:")
print(pd.read_csv(TRAIN).sample(min(5, len(pd.read_csv(TRAIN)))))


Cleaned /content/university_chatbot_nb/data/train.csv: 329 -> 327
Cleaned /content/university_chatbot_nb/data/test.csv: 83 -> 83

Cleaned train sample:
                              user  \
153      what are branches in UNI?   
315                 scholarship it   
120            Who is computer HOD   
119  fees for non-Ac room for boys   
285                  See you later   

                                                   bot  
153  Our university offers Information Technology, ...  
315  Please check the Student Portal or contact Stu...  
120  All engineering departments have only one hod ...  
119  Please check the Student Portal > Finance for ...  
285                               Sad to see you go :(  


In [15]:
import pandas as pd

train_df = pd.read_csv("/content/university_chatbot_nb/data/train.csv")
test_df  = pd.read_csv("/content/university_chatbot_nb/data/test.csv")

print("Train sample:")
print(train_df.sample(min(5, len(train_df))))

print("\nTest sample:")
print(test_df.sample(min(5, len(test_df))))


Train sample:
                                  user  \
102             what is the hostel fee   
0    How many floors does college have   
232                         how are ya   
322                list of scholarship   
84              Information Technology   

                                                   bot  
102  Please check the Student Portal > Finance for ...  
0                        My College has total 2 floors  
232                                             Hello!  
322  Please check the Student Portal or contact Stu...  
84   Our university offers Information Technology, ...  

Test sample:
                                      user  \
29                                    exam   
67                                     Bye   
25                information about sports   
56  what documents do I need for admission   
45                                     sem   

                                                  bot  
29  See the Academic Calendar on the Student

In [17]:
# === Model & Training Hyperparams ===
MODEL_NAME = "google/flan-t5-small"   # or another model if you prefer
MAX_LEN    = 128
BATCH_SIZE = 8
EPOCHS     = 5
LR         = 5e-5
SEED       = 42

import tensorflow as tf
import numpy as np
import random

# Set random seeds for reproducibility
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Model name:", MODEL_NAME)


Model name: google/flan-t5-small


In [24]:
# Upgrade Torch to a safe version (>=2.6). Use CPU wheels to avoid CUDA conflicts.
!pip install -q --upgrade torch==2.8.0+cpu torchvision==0.23.0+cpu torchaudio==2.8.0+cpu \
  -f https://download.pytorch.org/whl/cpu/torch_stable.html

import torch
print("Torch version:", torch.__version__)


[31mERROR: Could not find a version that satisfies the requirement torch==2.8.0+cpu (from versions: 2.2.0, 2.2.0+cpu, 2.2.1, 2.2.1+cpu, 2.2.2, 2.2.2+cpu, 2.3.0, 2.3.0+cpu, 2.3.1, 2.3.1+cpu, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.8.0+cpu[0m[31m
[0mTorch version: 2.8.0+cu126


In [26]:
# Sanity check: you’re already on a safe Torch version (>=2.6)
import torch, transformers
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)


Torch: 2.8.0+cu126
Transformers: 4.56.2


In [1]:
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Download ONLY TF files for t5-small so we avoid any PT files entirely
local_repo = snapshot_download(
    repo_id="t5-small",
    allow_patterns=["tokenizer.json","spiece.model","*.json","tf_model.h5"]
)

tokenizer = AutoTokenizer.from_pretrained(local_repo)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load native TensorFlow weights (no extra flags needed)
model = TFAutoModelForSeq2SeqLM.from_pretrained(local_repo)

print(" Loaded t5-small with native TF weights.")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/242M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


 Loaded t5-small with native TF weights.


In [27]:
def load_pairs_csv(path):
    df = pd.read_csv(path).dropna().astype(str)
    return df["user"].tolist(), df["bot"].tolist()

def make_tf_dataset(tokenizer, sources, targets, max_len=128, batch_size=8):
    # Encode user inputs
    enc = tokenizer(
        sources,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="np",
    )
    # Encode target responses
    dec = tokenizer(
        targets,
        truncation=True,
        padding="max_length",
        max_length=max_len,
        return_tensors="np",
    )

    input_ids      = enc["input_ids"]
    attention_mask = enc["attention_mask"]
    labels         = dec["input_ids"]

    # Replace PAD tokens with -100 so Hugging Face ignores them automatically
    labels = np.where(labels == tokenizer.pad_token_id, -100, labels)

    # Hugging Face models can compute loss internally if "labels" is in the input dict
    features = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels,
    }

    ds = tf.data.Dataset.from_tensor_slices(features)
    ds = ds.shuffle(len(sources)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


# Create datasets
train_src, train_tgt = load_pairs_csv(train_csv)
val_src,   val_tgt   = load_pairs_csv(test_csv)

train_ds = make_tf_dataset(tokenizer, train_src, train_tgt, max_len=MAX_LEN, batch_size=BATCH_SIZE)
val_ds   = make_tf_dataset(tokenizer, val_src,   val_tgt,   max_len=MAX_LEN, batch_size=BATCH_SIZE)

len(train_src), len(val_src)


(329, 83)

In [29]:
# Hugging Face automatically computes the masked loss when labels are provided
optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
model.compile(optimizer=optimizer)  # no custom loss needed

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(
        filepath=os.path.join(MODEL_DIR, "ckpt"),
        save_weights_only=True,
        save_best_only=True,
        monitor="val_loss"
    )
]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [30]:
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

# Save training history for reference
hist = {k: [float(x) for x in v] for k, v in (history.history or {}).items()}
with open(os.path.join(MODEL_DIR, "history.json"), "w") as f:
    json.dump(hist, f, indent=2)

print(" Model and tokenizer saved to:", MODEL_DIR)


 Model and tokenizer saved to: /content/university_chatbot_nb/models/flan_t5_small_finetuned


In [31]:
def generate_response(prompt, max_len=128):
    out = model.generate(**tokenizer(prompt, return_tensors="tf", truncation=True, max_length=max_len))
    return tokenizer.decode(out[0], skip_special_tokens=True)

df_val = pd.read_csv(test_csv).dropna().astype(str)

preds = []
refs  = []
for _, row in df_val.iterrows():
    preds.append(generate_response(row["user"]))
    refs.append(row["bot"])

# sacrebleu expects list of reference corpora → wrap refs in [refs]
refs_corpus = [refs]
import sacrebleu
bleu = sacrebleu.corpus_bleu(preds, refs_corpus)
print(f"BLEU Score: {bleu.score:.2f}")

qual_df = pd.DataFrame({"user": df_val["user"], "ref": refs, "pred": preds})
qual_path = os.path.join(MODEL_DIR, "qualitative_samples.csv")
qual_df.to_csv(qual_path, index=False)
print("Saved qualitative samples to:", qual_path)

# Display a few sample predictions
qual_df.sample(min(5, len(qual_df)))




BLEU Score: 19.77
Saved qualitative samples to: /content/university_chatbot_nb/models/flan_t5_small_finetuned/qualitative_samples.csv


Unnamed: 0,user,ref,pred
30,what is the name of your developers,College students,You can contact at: NUMBER
0,where is college located,"<a target=""_blank"" href=""ADD YOU GOOGLE MAP LI...","For College detail visit a target=""_blank"" href="""
22,maximum number of seats,"For IT, Computer and extc 60 per branch and se...","For seat detail visit a target=""_blank"" href="""
31,where is college,"<a target=""_blank"" href=""ADD YOU GOOGLE MAP LI...",College is open 8am-5pm Monday-Saturday!
18,can you tell me the courses available in UNI?,"Our university offers Information Technology, ...","For more information visit a target=""_blank"" h..."


In [32]:
import gradio as gr

WHITELIST_HINTS = ("admission", "fees", "hostel", "course", "placement",
                   "timetable", "exam", "department", "semester", "library", "scholarship", "contact")

def respond(user_input):
    text = user_input.strip()
    if len(text) < 2:
        return "Please ask a full question related to university information."
    if not any(w in text.lower() for w in WHITELIST_HINTS):
        return "I can help with university questions (admissions, fees, hostel, courses, timetable, etc.). Please rephrase."
    inputs  = tokenizer(text, return_tensors="tf", truncation=True, max_length=MAX_LEN)
    output  = model.generate(**inputs, max_length=MAX_LEN, num_beams=4)
    answer  = tokenizer.decode(output[0], skip_special_tokens=True)
    return answer

demo = gr.Interface(
    fn=respond,
    inputs="text",
    outputs="text",
    title="🎓 University Chatbot",
    description="Ask about admissions, fees, hostel, courses, timetable, placements, and more."
)


demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cccb28ac0a2b3c49a9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


