<a href="https://colab.research.google.com/github/ktanguy/University_chatbot_assistant/blob/main/University_chatbot_assistant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎓 University Chatbot — FLAN-T5 (TensorFlow, Hugging Face, Gradio)

This notebook fine-tunes **FLAN-T5 Small** on a **University FAQ** dataset (intents → (user, bot) pairs), evaluates with **BLEU**, and provides a **Gradio** demo.

**Fixes included:**
- Proper `pad_token` fallback for T5
- Masked loss (ignore padding via `-100`)
- Deterministic seeds (Python / NumPy / TensorFlow)
- sacrebleu reference shape
- Optional GPU memory-growth for TF
- Clean section ordering for “Run All”

> Update `INTENTS_PATH` in the Config cell if your file lives somewhere else in Drive.



In [2]:
!pip install -q sacrebleu==2.4.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.4/57.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
!pip install -q "tensorflow==2.15.1" "keras==2.15.0" "transformers==4.41.2" "huggingface_hub==0.23.4"
print(" Installed versions. Now go to Runtime > Restart runtime, then run the next cells.")


[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.15.1 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.15.1[0m[31m
[0m Installed versions. Now go to Runtime > Restart runtime, then run the next cells.


In [4]:
# If the runtime is fresh, run this once (takes a few minutes).
!pip install -q transformers==4.42.0 tensorflow==2.13.0 accelerate==0.29.0 \
               gradio==4.0.0 datasets==2.18.0 evaluate==0.4.0 sacrebleu==2.4.0 \
               pandas numpy




[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement tensorflow==2.13.0 (from versions: 2.16.0rc0, 2.16.1, 2.16.2, 2.17.0rc0, 2.17.0rc1, 2.17.0, 2.17.1, 2.18.0rc0, 2.18.0rc1, 2.18.0rc2, 2.18.0, 2.18.1, 2.19.0rc0, 2.19.0, 2.19.1, 2.20.0rc0, 2.20.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow==2.13.0[0m[31m
[0m

In [5]:
# Optional: Use if your intents.json is in Drive
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
import os, json, random, csv
import numpy as np
import pandas as pd
import tensorflow as tf

from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
import sacrebleu

# Optional: soften GPU memory spikes
gpus = tf.config.list_physical_devices('GPU')
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception as e:
        print("GPU memory growth not set:", e)

print("TF version:", tf.__version__)


TF version: 2.19.0


In [7]:
# === Paths ===
BASE_DIR = "/content/university_chatbot_nb"
DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "models", "flan_t5_small_finetuned")
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

# Prefer the merged dataset saved from your data-extraction notebook
INTENTS_PATH = "/content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json"



In [8]:
import os
if not os.path.exists(INTENTS_PATH):
    print("Merged dataset not found. Falling back to original intents.json")
    INTENTS_PATH = "/content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json"
print("Using:", INTENTS_PATH)


Using: /content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json


In [9]:
import os, json

print("Exists?", os.path.exists(INTENTS_PATH), "→", INTENTS_PATH)

with open(INTENTS_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

num_intents = len(data.get("intents", []))
num_texts   = sum(len(it.get("text", [])) for it in data.get("intents", []))
print(f"Intents: {num_intents} | Total user texts: {num_texts}")


Exists? True → /content/drive/MyDrive/UniversityChatbot/intents_merged_tagged.json
Intents: 39 | Total user texts: 412


In [10]:
# Expecting an intents.json in the common "intents" format:
# {"intents": [{"tag":"...", "text":["u1","u2"], "responses":["r1","r2"]}, ...]}

with open(INTENTS_PATH, "r", encoding="utf-8") as f:
    intents = json.load(f)

pairs = []
for intent in intents.get("intents", []):
    responses = intent.get("responses", [])
    texts     = intent.get("text", [])
    if not responses or not texts:
        continue
    # use the first response for supervised training (you can expand this later)
    resp = str(responses[0]).strip()
    for t in texts:
        t_clean = str(t).strip()
        if t_clean and resp:
            pairs.append((t_clean, resp))

# Train / test split (80/20)
random.shuffle(pairs)
split = int(0.8 * len(pairs))
train_pairs = pairs[:split]
test_pairs  = pairs[split:]

train_csv = os.path.join(DATA_DIR, "train.csv")
test_csv  = os.path.join(DATA_DIR, "test.csv")

with open(train_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["user", "bot"]); w.writerows(train_pairs)
with open(test_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f); w.writerow(["user", "bot"]); w.writerows(test_pairs)

print(f"Total pairs: {len(pairs)} | Train: {len(train_pairs)} | Test: {len(test_pairs)}")
print("Train CSV:", train_csv)
print("Test  CSV:", test_csv)


Total pairs: 412 | Train: 329 | Test: 83
Train CSV: /content/university_chatbot_nb/data/train.csv
Test  CSV: /content/university_chatbot_nb/data/test.csv


In [21]:
import re, html
import pandas as pd
from collections import Counter

TRAIN = "/content/university_chatbot_nb/data/train.csv"
TEST  = "/content/university_chatbot_nb/data/test.csv"

# --- ALU knowledge snippets (succinct, safe, no HTML) ---
ALU_KB = {
    "admission":  "Admissions are subject to change. Please check the official ALU Admissions page for the most current requirements and deadlines.",
    "fees":       "Tuition varies by programme and year. Check the Student Portal > Finance for current fees and payment instructions.",
    "installment":"Installment options may be available. See Student Finance on the Student Portal for approved plans.",
    "scholarship":"Scholarships and financial aid may be available. Review the Scholarships/Financial Aid section on the admissions or finance pages.",
    "hostel":     "Limited on-campus or partner housing may be available. Contact Student Services for current availability, rates, and application steps.",
    "address":    "ALU Kigali is located at Bumbogo Innovation City, Gasabo, Kigali. Refer to the official site for directions.",
    "calendar":   "See the Academic Calendar on the Student Portal for term dates, add/drop windows, and exams.",
    "timetable":  "Class schedules are published each term. Check your timetable on the Student Portal.",
    "exam":       "Exam schedules are listed on the Academic Calendar and communicated via the Student Portal.",
    "programme":  "See the Programmes page for current offerings, entry requirements, and learning modes.",
    "library":    "Students have access to digital resources and on-site study spaces. Use your student credentials to access the library portal.",
    "canteen":    "On-campus food options are available; offerings and hours may vary by term. Check campus notices or Student Services.",
    "contact":    "Use the Student Portal or the Contact page for official emails and support channels.",
    "careers":    "Career Services supports internships and job readiness. Visit the Careers page or portal to get started.",
    "general":    "Please check the Student Portal or contact Student Services for the latest official information."
}

# --- Keyword routing for replacements (ordered by specificity -> general) ---
ROUTES = [
    ("installment",  ["installment","installments","instalment","instalments","payment plan"]),
    ("scholarship",  ["scholarship","financial aid","bursary","aid","funding"]),
    ("fees",         ["fee","fees","tuition","invoice","payment","pay","bank","card","tuition cost","costs"]),
    ("hostel",       ["hostel","housing","accommodation","residence","dorm"]),
    ("address",      ["address","location","where is","direction","map","campus"]),
    ("calendar",     ["calendar","academic calendar"]),
    ("timetable",    ["timetable","schedule","class time","class schedule","time table"]),
    ("exam",         ["exam","examination","midterm","final","assessment","tests"]),
    ("programme",    ["programme","program","course","degree","major","specialization","track"]),
    ("library",      ["library","ebook","e-book","database","journal","catalog","catalogue"]),
    ("canteen",      ["canteen","cafeteria","food","menu","dining"]),
    ("contact",      ["contact","email","phone","helpdesk","support","whatsapp"]),
    ("careers",      ["career","internship","jobs","employer","placement"]),
    ("admission",    ["admission","admissions","apply","application","offer","enroll","enrol","join"]),
]

# --- Placeholder/HTML cleanup rules ---
TAG_RE = re.compile(r"<[^>]+>")
PLACEHOLDER_PATTERNS = [
    r"ADD\s+YOUR.*",
    r"ADD\s+YOU\s+OWN\s+ANSWERS.*",
    r"ADD\s+YOU\s+GOOGLE\s+MAP.*",
    r"CLICK\s+HERE.*",
    r"\bhere\b\s*$",                # bare 'here' ending
    r"visit\s+.*(link|site|page).*",# vague link directives
]
JUNK_SNIPPETS = [
    "target=\"_blank\"", "href=", "</a>", "<a", "http://", "https://"
]

# answers that are too generic to be useful
GENERIC_RESP_PATTERNS = [
    r"visit the (official )?(site|page)",
    r"for more (info|information).*",
    r"contact us.*",
    r"refer to.*",
    r"see above.*",
]

def looks_placeholder(text: str) -> bool:
    if not text or str(text).strip() == "":
        return True
    T = str(text)
    # any HTML tag?
    if TAG_RE.search(T): return True
    # obvious placeholders
    for p in PLACEHOLDER_PATTERNS:
        if re.search(p, T, flags=re.IGNORECASE):
            return True
    # anchor/link artifacts without meaningful content
    if any(s in T for s in JUNK_SNIPPETS):
        return True
    # too short / non-informative
    if len(T.strip()) < 12:
        return True
    # generic boilerplate
    for p in GENERIC_RESP_PATTERNS:
        if re.search(p, T, flags=re.IGNORECASE):
            return True
    return False

def normalize_text(s: str) -> str:
    s = html.unescape(str(s or ""))
    s = TAG_RE.sub("", s)                         # strip HTML
    s = re.sub(r"\s{2,}", " ", s)                 # collapse spaces
    s = re.sub(r"[ \t]*\n+[ \t]*", " ", s)        # collapse newlines
    s = re.sub(r"\s*([?!.,;:])\s*", r"\1 ", s)    # tidy punctuation spacing
    s = re.sub(r"[!?]{3,}", "!!", s)              # limit punctuation runs
    s = s.strip()
    return s

def route_to_kb(user: str) -> str:
    u = (user or "").lower()
    for key, kws in ROUTES:
        if any(kw in u for kw in kws):
            return ALU_KB[key]
    return ALU_KB["general"]

def clean_answer(user: str, bot: str, stats: Counter) -> str:
    raw = str(bot or "")
    # quick remove obvious placeholder phrases (pre-normalization)
    for p in PLACEHOLDER_PATTERNS:
        raw = re.sub(p, "", raw, flags=re.IGNORECASE)
    raw = normalize_text(raw)

    # if after normalization it's weak, route to KB
    if looks_placeholder(raw):
        routed = route_to_kb(user)
        stats["replaced_with_kb"] += 1
        # track which bucket we used
        bucket_used = next((k for k, kws in ROUTES if any(kw in (user or "").lower() for kw in kws)), "general")
        stats[f"kb_{bucket_used}"] += 1
        return routed

    return raw

def clean_file(path, preview_path=None):
    df = pd.read_csv(path).dropna()
    before = len(df)
    stats = Counter()

    # 1) dedupe exact pairs
    df = df.drop_duplicates(subset=["user","bot"]).reset_index(drop=True)

    # 2) clean/route
    df["bot"] = [clean_answer(u, b, stats) for u, b in zip(df["user"], df["bot"])]

    # 3) drop rows that are still empty (very unlikely now)
    df = df[df["bot"].astype(str).str.strip() != ""].reset_index(drop=True)
    after = len(df)
    df.to_csv(path, index=False)

    # small preview for debugging/report
    if preview_path:
        df.sample(min(10, len(df))).to_csv(preview_path, index=False)

    print(f"Cleaned {path}: {before} -> {after}")
    print("Replacements:", stats["replaced_with_kb"])
    # print top buckets used
    for k, v in sorted(stats.items()):
        if k.startswith("kb_"):
            print(f"  {k}: {v}")

# Run cleaning + save previews
clean_file(TRAIN, preview_path="/content/university_chatbot_nb/data/train_clean_preview.csv")
clean_file(TEST,  preview_path="/content/university_chatbot_nb/data/test_clean_preview.csv")

print("\nCleaned train sample:")
print(pd.read_csv("/content/university_chatbot_nb/data/train_clean_preview.csv").head())


Cleaned /content/university_chatbot_nb/data/train.csv: 324 -> 322
Replacements: 17
  kb_address: 10
  kb_general: 7
Cleaned /content/university_chatbot_nb/data/test.csv: 83 -> 83
Replacements: 4
  kb_address: 1
  kb_general: 3

Cleaned train sample:
                                            user                                                bot
0  courses offered in (your univrsity(UNI) name)  Our university offers Information Technology, ...
1                                    hostel fees  Tuition varies by programme and year. Check th...
2                                        address  ALU Kigali is located at Bumbogo Innovation Ci...
3                               name of extc hod  Different school wise hod are different. So be...
4                                     how are ya  Please check the Student Portal or contact Stu...


In [22]:
import pandas as pd

train_df = pd.read_csv("/content/university_chatbot_nb/data/train.csv")
test_df  = pd.read_csv("/content/university_chatbot_nb/data/test.csv")

print("Train sample:")
print(train_df.sample(min(5, len(train_df))))

print("\nTest sample:")
print(test_df.sample(min(5, len(test_df))))


Train sample:
                            user                                                bot
196  Does college provide hostel  Limited on-campus or partner housing may be av...
299               seat allotment  For IT, Computer and extc 60 per branch and se...
228               principal name  XYZ is college principal and if you need any h...
227                         exam              Here is the Academic Calendar website
134     Is scholarship available  Scholarships and financial aid may be availabl...

Test sample:
                                   user                                                bot
15                            more info                         You can contact at: NUMBER
52                              shut up                    please use appropriate language
63                 how much is the fees  Tuition varies by programme and year. Check th...
72  what is the name of your developers                                   College students
30           

In [23]:
# === Model & Training Hyperparams ===
MODEL_NAME = "google/flan-t5-small"   # or another model if you prefer
MAX_LEN    = 128
BATCH_SIZE = 8
EPOCHS     = 5
LR         = 5e-5
SEED       = 42

import tensorflow as tf
import numpy as np
import random

# Set random seeds for reproducibility
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

print("Model name:", MODEL_NAME)


Model name: google/flan-t5-small


In [24]:
# Upgrade Torch to a safe version (>=2.6). Use CPU wheels to avoid CUDA conflicts.
!pip install -q --upgrade torch==2.8.0+cpu torchvision==0.23.0+cpu torchaudio==2.8.0+cpu \
  -f https://download.pytorch.org/whl/cpu/torch_stable.html

import torch
print("Torch version:", torch.__version__)


[31mERROR: Could not find a version that satisfies the requirement torch==2.8.0+cpu (from versions: 2.2.0, 2.2.0+cpu, 2.2.1, 2.2.1+cpu, 2.2.2, 2.2.2+cpu, 2.3.0, 2.3.0+cpu, 2.3.1, 2.3.1+cpu, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.7.1, 2.8.0)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.8.0+cpu[0m[31m
[0mTorch version: 2.8.0+cu126


In [25]:
# Sanity check: you’re already on a safe Torch version (>=2.6)
import torch, transformers
print("Torch:", torch.__version__)
print("Transformers:", transformers.__version__)


Torch: 2.8.0+cu126
Transformers: 4.57.0


In [26]:
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Download ONLY TF files for t5-small so we avoid any PT files entirely
local_repo = snapshot_download(
    repo_id="t5-small",
    allow_patterns=["tokenizer.json","spiece.model","*.json","tf_model.h5"]
)

tokenizer = AutoTokenizer.from_pretrained(local_repo)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load native TensorFlow weights (no extra flags needed)
model = TFAutoModelForSeq2SeqLM.from_pretrained(local_repo)

print(" Loaded t5-small with native TF weights.")


Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


 Loaded t5-small with native TF weights.


In [27]:
import numpy as np, pandas as pd, tensorflow as tf

MAX_LEN    = 128
BATCH_SIZE = 8

def load_pairs_csv(path):
    df = pd.read_csv(path).dropna().astype(str)
    return df["user"].tolist(), df["bot"].tolist()

def make_tf_dataset(tokenizer, sources, targets, max_len=128, batch_size=8):
    enc = tokenizer(sources, truncation=True, padding="max_length",
                    max_length=max_len, return_tensors="np")
    dec = tokenizer(targets, truncation=True, padding="max_length",
                    max_length=max_len, return_tensors="np")

    features = {
        "input_ids": enc["input_ids"],
        "attention_mask": enc["attention_mask"],
        "labels": np.where(dec["input_ids"] == tokenizer.pad_token_id, -100, dec["input_ids"]),
    }
    ds = tf.data.Dataset.from_tensor_slices(features)
    return ds.shuffle(len(sources)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_src, train_tgt = load_pairs_csv("/content/university_chatbot_nb/data/train.csv")
val_src,   val_tgt   = load_pairs_csv("/content/university_chatbot_nb/data/test.csv")

train_ds = make_tf_dataset(tokenizer, train_src, train_tgt, max_len=MAX_LEN, batch_size=BATCH_SIZE)
val_ds   = make_tf_dataset(tokenizer, val_src,   val_tgt,   max_len=MAX_LEN, batch_size=BATCH_SIZE)

len(train_src), len(val_src)


(322, 83)

In [28]:
LR = 5e-5
EPOCHS = 5

optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
model.compile(optimizer=optimizer)  # HF computes masked loss because "labels" exist

callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True),
    tf.keras.callbacks.ModelCheckpoint(
        filepath="/content/university_chatbot_nb/models/t5_small_finetuned/ckpt",
        save_weights_only=True, save_best_only=True, monitor="val_loss"
    ),
]

history = model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS, callbacks=callbacks)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:
import os, json, re, sacrebleu

MODEL_DIR = "/content/university_chatbot_nb/models/t5_small_finetuned"
os.makedirs(MODEL_DIR, exist_ok=True)
model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)
with open(os.path.join(MODEL_DIR, "history.json"), "w") as f:
    json.dump({k: [float(x) for x in v] for k, v in (history.history or {}).items()}, f, indent=2)

# Clean decoded outputs
TAG_RE = re.compile(r"<[^>]+>")
BAD = ["ADD YOUR","ADD YOU OWN ANSWERS","target=\"_blank\"","href=","</a>","<a"]
def clean_answer(text: str) -> str:
    s = TAG_RE.sub("", str(text))
    for b in BAD: s = s.replace(b, "")
    return re.sub(r"\s{2,}", " ", s).strip()

def generate_response(prompt, max_len=128):
    out = model.generate(**tokenizer(prompt, return_tensors="tf", truncation=True, max_length=max_len))
    return clean_answer(tokenizer.decode(out[0], skip_special_tokens=True))

# BLEU on test
import pandas as pd
df_val = pd.read_csv("/content/university_chatbot_nb/data/test.csv").dropna().astype(str)
preds = [generate_response(q) for q in df_val["user"]]
refs  = df_val["bot"].tolist()
bleu  = sacrebleu.corpus_bleu(preds, [refs])
print(f"BLEU: {bleu.score:.2f}")

# Peek a few predictions
pd.DataFrame({"user": df_val["user"], "ref": refs, "pred": preds}).sample(5)




BLEU: 0.36


Unnamed: 0,user,ref,pred
30,dumb ass,please use appropriate language,I'm not sure I'm going to be a fan of this post.
0,I love you,"I am not program for this, please ask appropri...",I love you and your family!
22,number of seats in each branch,"For IT, Computer and extc 60 per branch and se...",The University of New York has a number of bra...
31,holiday list,Academic calender is given to you by your clas...,Check out the official website for the latest ...
18,Sports activities,Our university encourages all-round developmen...,"Sport activities include sports activities, sp..."


In [30]:
import gradio as gr

def respond(user_input):
    text = user_input.strip()
    if len(text) < 2:
        return "Please ask a full question related to university information."
    return generate_response(text, max_len=MAX_LEN)

demo = gr.Interface(
    fn=respond,
    inputs="text",
    outputs="text",
    title="🎓 University Chatbot (t5-small)",
    description=(
        "Ask about admissions, fees, accommodation/hostels, programmes, timetable, "
        "library, or contacts. Developed by Tanguy Kwizera for ALU."
    )
)

# 🚀 Launch with public link
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://748ddf784fe2a213ae.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


