# Koa-Chatbot — Colab Demo (FastAPI + ML Inference)

This notebook starts the Koa FastAPI backend and sends requests to `/chat` to verify the ML inference pipeline is running.

In [None]:
!git clone https://github.com/kokoc30/Koa-Chatbot.git
%cd Koa-Chatbot
!ls


In [None]:
import re
from pathlib import Path

# Packages that must not be (re)installed from requirements.txt.
# NOTE(review): presumably the runtime already provides torch (a later cell
# imports it without installing it) -- confirm before changing this list.
blocked = ("torch", "torchvision", "torchaudio")


def filter_requirements(lines, blocked=blocked):
    """Return the requirement lines to install, without the blocked packages.

    Blank lines and ``#`` comment lines are dropped and every kept line is
    stripped of surrounding whitespace.  A line is considered blocked only when
    its leading project name (the token before any extras, version specifier,
    marker or URL) equals one of ``blocked`` after lower-casing and collapsing
    runs of ``-``, ``_`` and ``.``.  The previous prefix test
    (``startswith("torch")``) also removed unrelated packages such as
    ``torchmetrics``.

    Lines that do not start with a project name (for example pip options like
    ``--extra-index-url ...``) are kept unchanged.

    Args:
        lines: iterable of raw lines from a requirements file.
        blocked: iterable of project names to exclude.

    Returns:
        list[str]: the stripped lines that should be installed, in input order.
    """
    def _normalise(name):
        return re.sub(r"[-_.]+", "-", name).lower()

    blocked_names = {_normalise(name) for name in blocked}
    kept = []
    for raw in lines:
        line = raw.strip()
        if not line or line.startswith("#"):
            continue
        name_match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", line)
        if name_match and _normalise(name_match.group(0)) in blocked_names:
            continue
        kept.append(line)
    return kept


req_path = Path("requirements.txt")
reqs = req_path.read_text().splitlines() if req_path.exists() else []
filtered = filter_requirements(reqs)

# Write the filtered list to its own file so the install cell can use `-r`.
Path("requirements_colab.txt").write_text("\n".join(filtered) + ("\n" if filtered else ""))
print("Installing (filtered):", len(filtered), "packages")

!pip -q install -r requirements_colab.txt
!pip -q install accelerate sentencepiece huggingface_hub


In [None]:
import torch
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))

!nvidia-smi -L


In [None]:
import getpass
import os

from huggingface_hub import login

# Quieten Hugging Face tooling: no download progress bars, errors only.
os.environ.update(
    {
        "HF_HUB_DISABLE_PROGRESS_BARS": "1",
        "TRANSFORMERS_VERBOSITY": "error",
    }
)

# The token is optional; an empty answer skips the login step.
hf_token = getpass.getpass("Hugging Face token (hidden, optional): ").strip()
if hf_token:
    login(hf_token)

# The model id is mandatory and is entered via getpass so it is not echoed.
koa_model_id = getpass.getpass("Model ID (hidden): ").strip()
if koa_model_id:
    # Exported so the patched inference/chat.py can read it from the environment.
    os.environ["KOA_MODEL_ID"] = koa_model_id
else:
    raise ValueError("Model ID is required to run the demo.")


In [None]:
import re
from pathlib import Path


def patch_chat_source(source: str) -> str:
    """Return the text of ``inference/chat.py`` rewritten for this demo.

    Four textual patches are applied:

    1. ``BASE_MODEL_NAME`` is read from the ``KOA_MODEL_ID`` environment
       variable instead of a hardcoded string literal (``import os`` is
       prepended when the file does not already contain that line).
    2. The two loading ``print`` calls no longer interpolate the model id.
    3. ``torch_dtype=torch.bfloat16`` becomes ``torch_dtype=torch.float16``.
    4. transformers logging is set to error level right after the
       ``from transformers import ...`` line, unless already present.

    Args:
        source: current contents of ``inference/chat.py``.

    Returns:
        The patched source text.  Patching already-patched text returns it
        unchanged, so the cell can be re-run.

    Raises:
        RuntimeError: if no ``BASE_MODEL_NAME = "<literal>"`` line was found
            and the file is not already patched.  Previously this case was
            silent and the hardcoded model would have been used.
    """
    # 1) Make the model configurable via env (no hardcoded model id).
    if "import os\n" not in source:
        source = "import os\n" + source

    env_assignment = (
        'BASE_MODEL_NAME = os.environ.get("KOA_MODEL_ID")\n'
        'if not BASE_MODEL_NAME:\n'
        '    raise ValueError("KOA_MODEL_ID is not set")'
    )
    # Accept single- or double-quoted literals and an optional trailing
    # comment.  Only spaces/tabs are allowed around the value so the match
    # never runs across line breaks.  A callable replacement avoids any escape
    # processing of the replacement text.
    source = re.sub(
        r'^BASE_MODEL_NAME[ \t]*=[ \t]*(["\']).*?\1[ \t]*(?:#.*)?$',
        lambda _match: env_assignment,
        source,
        flags=re.MULTILINE,
    )
    if 'BASE_MODEL_NAME = os.environ.get("KOA_MODEL_ID")' not in source:
        raise RuntimeError(
            "Could not find a BASE_MODEL_NAME string assignment to patch in inference/chat.py"
        )

    # 2) Reduce logging that would reveal the model id.
    source = source.replace(
        'print(f"[chat] Loading tokenizer {BASE_MODEL_NAME}...")',
        'print("[chat] Loading tokenizer...")',
    )
    source = source.replace(
        'print(f"[chat] Loading base model {BASE_MODEL_NAME}...")',
        'print("[chat] Loading base model...")',
    )

    # 3) Make dtype Colab-safe.
    source = source.replace("torch_dtype=torch.bfloat16", "torch_dtype=torch.float16")

    # 4) Disable transformers logs inside the runtime (extra safe).
    transformers_import = "from transformers import AutoTokenizer, AutoModelForCausalLM"
    if transformers_import in source and "transformers.utils import logging" not in source:
        source = source.replace(
            transformers_import,
            "from transformers import AutoTokenizer, AutoModelForCausalLM\nfrom transformers.utils import logging as hf_logging\nhf_logging.set_verbosity_error()",
        )

    return source


chat_path = Path("inference/chat.py")
# Explicit encoding so the round-trip does not depend on the locale default.
txt = patch_chat_source(chat_path.read_text(encoding="utf-8"))
chat_path.write_text(txt, encoding="utf-8")
print("Patched inference/chat.py for private model selection + quieter logs.")


In [None]:
# Print the first 120 lines of the API server module for inspection
# before it is started in the next cell.
!sed -n '1,120p' inference/api_server.py


In [None]:
PORT = 9010
!nohup python -m uvicorn inference.api_server:app --host 127.0.0.1 --port 9010 --log-level warning > uvicorn.log 2>&1 &
print("Server started in background. Logs -> uvicorn.log")



In [None]:
import time

import requests

# Number of readiness probes; each waits up to 1s for a reply and then sleeps
# 1s, which is the same budget the loop had before.
STARTUP_ATTEMPTS = 45

# Poll /docs until the server answers with HTTP 200.
for _ in range(STARTUP_ATTEMPTS):
    try:
        r = requests.get(f"http://127.0.0.1:{PORT}/docs", timeout=1)
        if r.status_code == 200:
            print("Server is up ✅  /docs reachable")
            break
    except requests.RequestException:
        # Connection errors and timeouts are expected while the server is
        # still starting.  Unlike the previous bare `except:`, this does not
        # swallow KeyboardInterrupt, so the cell can still be interrupted.
        pass
    time.sleep(1)
else:
    # The loop finished without a successful probe.  Say so instead of ending
    # silently, but do not raise: the server may simply need longer to start.
    print(f"Server did not answer on port {PORT} after {STARTUP_ATTEMPTS} attempts; check uvicorn.log")



In [None]:
import requests

# Smoke test: send one short prompt to /chat.  The long timeout leaves room
# for a slow first response.
payload = {"message": "Hi Koa! In one sentence, what can you do?"}
r = requests.post(f"http://127.0.0.1:{PORT}/chat", json=payload, timeout=300)

print("Status:", r.status_code)
# Fail with a clear HTTPError on 4xx/5xx instead of an opaque JSON decode
# error or a misleading "Reply: None".
r.raise_for_status()
print("Reply:", r.json().get("reply"))


In [None]:
import requests

# Second check: a multi-line summarisation prompt sent to the same endpoint.
text = """Summarize this:
Koa is a machine-learning chat assistant with a FastAPI backend and a responsive web UI.
It supports configurable prompts and is built for fast, streaming-style responses.
The system is designed for easy local deployment and web use.
"""

r = requests.post(f"http://127.0.0.1:{PORT}/chat", json={"message": text}, timeout=300)
# Surface HTTP errors explicitly; without this a failed request would show up
# as a JSON decode error or as a printed `None`.
r.raise_for_status()
print(r.json().get("reply"))


In [None]:
# Closing banner for the demo.  It only reports the configured port and does
# not itself contact the server.
banner = f"✅ Koa backend is running and responding to /chat on port {PORT}."
print(banner)


In [None]:
# Clean-up: stop the background server by matching its full command line
# (-f), i.e. any process whose command contains the uvicorn app spec below.
!pkill -f "uvicorn inference.api_server:app"
