In [6]:
# Single Colab cell: Upgraded GUVI multilingual chatbot (Streamlit app) + optional ngrok
# --------------------------------------------------------------------
# 1) Paste into Google Colab and run.
# 2) Replace NGROK_AUTH_TOKEN with your ngrok authtoken (or leave empty to skip tunnel).
# 3) Upload your Excel KB to /content/guvi_qa_table.xlsx (or change EXCEL_PATH inside app_code to match your file name).
# --------------------------------------------------------------------

# Install dependencies
!pip install -q streamlit transformers sentence-transformers langdetect pyngrok pandas openpyxl python-docx datasets accelerate

# ---------- Config: set your ngrok token & HF token here (or leave blank) ----------
import os

NGROK_AUTH_TOKEN = "PASTE_YOUR_NGROK_TOKEN_HERE"
HF_TOKEN = ""  # If you need to access private HF models or rate-limit, put your token here.

# app.py only reads the token from the HF_TOKEN environment variable, and the
# Streamlit subprocess inherits this notebook's environment, so export the
# value here; otherwise a token entered above would never reach the app.
if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN

# ---------- App file content (app.py) ----------
app_code = r'''
import os
import streamlit as st
import pandas as pd
from langdetect import detect
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer, util
import torch
import numpy as np
from datetime import datetime

# ---------- SETTINGS ----------
# Location of the knowledge-base workbook; edit when the upload lives elsewhere.
EXCEL_PATH = "/content/guvi_qa_table.xlsx"

# Model ids: NLLB for translation, a multilingual MiniLM for sentence embeddings.
NLLB_MODEL = "facebook/nllb-200-distilled-600M"
EMBED_MODEL = "paraphrase-multilingual-MiniLM-L12-v2"

# A best-match cosine score below this value is treated as low KB confidence.
SCORE_THRESHOLD = 0.28

# Text generator: small default model; point GEN_MODEL at a fine-tuned repo when ready.
# Set USE_GENERATOR to False to answer from the KB only.
GEN_MODEL = "google/flan-t5-small"
USE_GENERATOR = True

# Hugging Face token, supplied externally through the HF_TOKEN environment variable.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Short language codes mapped to the matching NLLB language codes.
SHORT_TO_NLLB = dict(
    ta="tam_Taml",
    hi="hin_Deva",
    te="tel_Telu",
    ml="mal_Mlym",
    en="eng_Latn",
)

# ---------- Load KB ----------
@st.cache_data
def load_kb(path=EXCEL_PATH):
    """Load the question/answer knowledge base from an Excel workbook.

    The result always has exactly the string columns "Question" and "Answer"
    when column normalisation runs. If the workbook does not already use
    those exact headers, common aliases are matched case-insensitively
    (ignoring surrounding whitespace). When no usable pair is found, the
    question column (or, failing that, the first column) is duplicated so
    that both columns exist.

    Args:
        path: Path of the Excel file to read.

    Returns:
        A DataFrame with rows lacking a question or answer removed and the
        index reset.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the workbook contains no columns at all.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel KB not found at {path}. Upload the file or change EXCEL_PATH.")
    df = pd.read_excel(path)
    # Normalize columns if needed
    if "Question" not in df.columns or "Answer" not in df.columns:
        question_aliases = {"question", "q", "query", "prompt"}
        answer_aliases = {"answer", "a", "response", "reply"}
        qcol = None
        acol = None
        for c in df.columns:
            # Headers may be numbers or carry stray spaces; compare on a
            # normalised string instead of calling .lower() on the raw label.
            key = str(c).strip().lower()
            if key in question_aliases:
                qcol = c
            if key in answer_aliases:
                acol = c
        if qcol is not None and acol is not None:
            df = df[[qcol, acol]].rename(columns={qcol: "Question", acol: "Answer"})
        elif len(df.columns) == 0:
            raise ValueError(f"Excel KB at {path} has no columns; expected Question and Answer columns.")
        else:
            # Fallback: duplicate a single column into both roles. Selecting
            # just that column avoids creating two columns named "Question".
            source = qcol if qcol is not None else df.columns[0]
            df = df[[source]].rename(columns={source: "Question"})
            df["Answer"] = df["Question"]
    df = df.dropna(subset=["Question", "Answer"])
    df = df.astype({"Question": str, "Answer": str})
    return df.reset_index(drop=True)

# ---------- Load models ----------
@st.cache_resource
def load_translation_model_and_tokenizer():
    """Load the tokenizer and seq2seq model identified by ``NLLB_MODEL``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    nllb_tokenizer = AutoTokenizer.from_pretrained(NLLB_MODEL)
    nllb_seq2seq = AutoModelForSeq2SeqLM.from_pretrained(NLLB_MODEL)
    return (nllb_tokenizer, nllb_seq2seq)

@st.cache_resource
def load_embedder():
    """Build the sentence-embedding model identified by ``EMBED_MODEL``."""
    sentence_encoder = SentenceTransformer(EMBED_MODEL)
    return sentence_encoder

@st.cache_resource
def load_generator(model_name=GEN_MODEL, hf_token=None):
    """Create a ``text2text-generation`` pipeline for ``model_name``.

    Args:
        model_name: Hugging Face model id to load.
        hf_token: Optional access token; an empty value is treated as
            "no token".

    Returns:
        The transformers pipeline, placed on GPU 0 when CUDA is available
        and on the CPU otherwise.
    """
    device = 0 if torch.cuda.is_available() else -1
    # `token` replaces the deprecated `use_auth_token` keyword; the notebook
    # installs an unpinned transformers release, so use the current name.
    return pipeline(
        "text2text-generation",
        model=model_name,
        device=device,
        token=hf_token if hf_token else None,
    )

# ---------- NLLB translate helper ----------
def lang_to_nllb(code_or_short):
    """Map a short language code to its NLLB code.

    Codes missing from ``SHORT_TO_NLLB`` are returned unchanged.
    """
    return SHORT_TO_NLLB.get(code_or_short, code_or_short)

def translate_nllb(text, tgt_nllb, src_nllb=None, tokenizer=None, model=None):
    """Translate ``text`` into the language ``tgt_nllb`` with the NLLB model.

    Args:
        text: Text to translate.
        tgt_nllb: Target language code (for example ``"eng_Latn"``).
        src_nllb: Optional source language code. It is applied to the
            tokenizer only when the tokenizer recognises it; otherwise the
            tokenizer's current source language is used.
        tokenizer: Tokenizer to use; loaded together with the model when
            either of the two is missing.
        model: Seq2seq model to use; see ``tokenizer``.

    Returns:
        The decoded translation of the first generated sequence.
    """
    if tokenizer is None or model is None:
        tokenizer, model = load_translation_model_and_tokenizer()

    unk_id = tokenizer.unk_token_id

    # Resolve the target-language token id. Some tokenizer versions expose
    # `lang_code_to_id`, others do not, so fall back to a plain token lookup.
    bos_id = None
    try:
        bos_id = tokenizer.lang_code_to_id.get(tgt_nllb, None)
    except Exception:
        bos_id = None
    if bos_id is None:
        for tokstr in (f"<{tgt_nllb}>", tgt_nllb):
            tok_id = tokenizer.convert_tokens_to_ids(tokstr)
            if tok_id != unk_id:
                bos_id = tok_id
                break

    # Tell the tokenizer which language the input is written in. Without
    # this the input is always tagged with the tokenizer's default source
    # language, whatever the caller passed as `src_nllb`.
    previous_src = getattr(tokenizer, "src_lang", None)
    src_applied = False
    if src_nllb and tokenizer.convert_tokens_to_ids(src_nllb) != unk_id:
        tokenizer.src_lang = src_nllb
        src_applied = True
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    finally:
        # The tokenizer object is shared between calls; put its source
        # language back so one request does not leak into the next.
        if src_applied and previous_src is not None:
            tokenizer.src_lang = previous_src

    gen_kwargs = {"max_length": 512}
    if bos_id is not None and bos_id != unk_id:
        gen_kwargs["forced_bos_token_id"] = bos_id
    translated = model.generate(**inputs, **gen_kwargs)
    return tokenizer.batch_decode(translated, skip_special_tokens=True)[0]

# ---------- Embedding prep & search ----------
@st.cache_resource
def prepare_embeddings(kb_questions):
    """Encode the KB questions with the shared embedder and return them as a tensor."""
    encoder = load_embedder()
    kb_vectors = encoder.encode(kb_questions, convert_to_tensor=True)
    return kb_vectors

def semantic_search(query_en, kb_questions, kb_embs, top_k=3):
    """Rank KB questions by cosine similarity to ``query_en``.

    Args:
        query_en: Query text to embed.
        kb_questions: KB question strings, index-aligned with ``kb_embs``.
        kb_embs: Pre-computed embeddings of ``kb_questions``.
        top_k: Maximum number of matches to return.

    Returns:
        A list of ``(index, score, question)`` tuples, highest score first.
    """
    query_vec = load_embedder().encode(query_en, convert_to_tensor=True)
    sims = util.pytorch_cos_sim(query_vec, kb_embs)[0].cpu().numpy()
    # Negating the scores makes the ascending argsort yield a descending ranking.
    ranked = np.argsort(-sims)[:top_k]
    results = []
    for pos in ranked:
        row = int(pos)
        results.append((row, float(sims[row]), kb_questions[row]))
    return results

# ---------- Utility: build generator prompt ----------
def build_generation_prompt(user_query, top_contexts):
    """Assemble the generator prompt from the user question and KB context.

    Args:
        user_query: The question to answer.
        top_contexts: Iterable of ``(question, answer)`` pairs; each becomes
            a numbered "Context N" section, starting at 1.

    Returns:
        The full prompt string.
    """
    sections = []
    for number, (kb_question, kb_answer) in enumerate(top_contexts, start=1):
        sections.append(f"Context {number} Q: {kb_question}\nA: {kb_answer}")
    ctx_text = "\n\n".join(sections)
    return (
        "You are a helpful assistant for GUVI. Use the following context to answer the user's question.\n\n"
        f"{ctx_text}\n\n"
        f"User question: {user_query}\n\n"
        "Answer concisely and mention if the KB doesn't contain enough info."
    )

# ---------- Streamlit UI ----------
# Page chrome: configure the page and render the title before anything else.
st.set_page_config(page_title="GUVI Multilingual Chatbot (Upgraded)", layout="wide")
st.title("🤖 GUVI Multilingual Chatbot — Upgraded (KB + Generator + Logging)")

# Load the knowledge base; a missing workbook is shown as an error and the
# script run stops here instead of failing further down.
try:
    df = load_kb()
except FileNotFoundError as e:
    st.error(str(e))
    st.stop()

# Load the translation and embedding models, then embed every KB question once.
# `questions` and `answers` are index-aligned lists taken from the same rows.
tokenizer_nllb, nllb_model = load_translation_model_and_tokenizer()
embedder = load_embedder()
questions = df["Question"].astype(str).tolist()
answers = df["Answer"].astype(str).tolist()
kb_embs = prepare_embeddings(questions)

# Generator (optional): a load failure is reported as a warning and leaves
# gen_pipe as None, so the app can still answer from the KB alone.
gen_pipe = None
if USE_GENERATOR:
    try:
        gen_pipe = load_generator(GEN_MODEL, hf_token=os.environ.get("HF_TOKEN","") or None)
    except Exception as ex:
        st.warning(f"Failed to load generator model {GEN_MODEL}: {ex}")
        gen_pipe = None

# Sidebar summary of the loaded KB and the configured models.
st.sidebar.header("KB & Model Info")
st.sidebar.write(f"Loaded {len(questions)} Q/A rows.")
st.sidebar.write(f"Embedding model: {EMBED_MODEL}")
st.sidebar.write(f"Translation model: {NLLB_MODEL}")
st.sidebar.write(f"Generator model: {GEN_MODEL} (enabled={USE_GENERATOR})")
if st.sidebar.checkbox("Show sample rows"):
    st.sidebar.write(df.head())

user_input = st.text_input("Ask about GUVI (type in Tamil/Hindi/Telugu/Malayalam/English):", "")

# Option widgets are created on every script run, outside the button branch.
# st.button() is True only for the run triggered by the click, so a widget
# created inside that branch disappears on the next rerun and changing it
# discards the answer; its value could never take effect. Set the options
# first, then press "Ask".
KB_ONLY = "KB only"
ALWAYS_GENERATE = "Always use generator"
top_k = st.sidebar.slider("Top-K contexts for generator", 1, 6, 3)
generator_ready = USE_GENERATOR and gen_pipe is not None
if generator_ready:
    gen_trigger = st.sidebar.selectbox("Answer source", ["Auto (KB unless low confidence)", ALWAYS_GENERATE, KB_ONLY])
else:
    # generator disabled or not loaded; rely on KB
    gen_trigger = KB_ONLY
show_matches = st.checkbox("Show top matches (debug)")

if st.button("Ask") and user_input.strip():
    # Detect the input language; fall back to English when detection fails.
    try:
        detected_short = detect(user_input)
    except Exception:
        detected_short = "en"
    src_nllb = lang_to_nllb(detected_short)
    tgt_nllb = "eng_Latn"

    # Translate to English for search
    if detected_short != "en":
        try:
            query_en = translate_nllb(user_input, tgt_nllb, src_nllb, tokenizer=tokenizer_nllb, model=nllb_model)
        except Exception:
            # Best effort: try a smaller translation pipeline, then give up
            # and search with the untranslated text.
            try:
                small_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-mul-en")
                query_en = small_pipe(user_input)[0]["translation_text"]
            except Exception:
                query_en = user_input
    else:
        query_en = user_input

    # Semantic search top_k
    matches = semantic_search(query_en, questions, kb_embs, top_k=top_k)
    if not matches:
        # Nothing to index into below; stop this run with a clear message.
        st.warning("The knowledge base returned no matches, so no answer is available.")
        st.stop()
    top_contexts = [(questions[i], answers[i]) for i, _, _ in matches]

    best_idx, best_score, best_q = matches[0]
    best_answer_en = answers[best_idx]
    low_confidence = best_score < SCORE_THRESHOLD

    # Decide whether to use the KB answer or the generator. "KB only" is
    # honoured even when the best match scores below the threshold.
    final_answer_en = best_answer_en
    used_generator = False
    wants_generator = gen_trigger == ALWAYS_GENERATE or (gen_trigger != KB_ONLY and low_confidence)
    if generator_ready and wants_generator:
        prompt = build_generation_prompt(query_en, top_contexts)
        try:
            gen_out = gen_pipe(prompt, max_length=256, do_sample=False)
            final_answer_en = gen_out[0]["generated_text"] if isinstance(gen_out, list) else str(gen_out)
            used_generator = True
        except Exception as ex:
            st.warning(f"Generator failed: {ex}. Falling back to KB answer.")
            final_answer_en = best_answer_en
            used_generator = False

    # Translate back to user's language if needed
    if detected_short != "en":
        try:
            answer_local = translate_nllb(final_answer_en, lang_to_nllb(detected_short), src_nllb="eng_Latn", tokenizer=tokenizer_nllb, model=nllb_model)
        except Exception:
            # Same best-effort chain as above; the English answer is the last resort.
            try:
                fallback_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-mul")
                answer_local = fallback_pipe(final_answer_en)[0]["translation_text"]
            except Exception:
                answer_local = final_answer_en
    else:
        answer_local = final_answer_en

    # Display confidence and answer
    if low_confidence:
        st.info(f"Low KB confidence (score={best_score:.3f}).")
    st.markdown("**Answer:**")
    st.write(answer_local)

    # Show debug matches if wanted
    if show_matches:
        for i, s, _ in matches:
            st.write(f"- (score={s:.3f}) Q: {questions[i]}")

    # Logging for evaluation: one CSV row per answered query.
    log_row = {
        "timestamp": datetime.utcnow().isoformat(),
        "user_query": user_input,
        "query_en": query_en,
        "detected_lang": detected_short,
        "best_score": best_score,
        "best_q_idx": int(best_idx),
        "used_generator": used_generator,
        "answer_en": final_answer_en,
        "answer_local": answer_local
    }
    log_path = "/content/guvi_chat_logs.csv"
    df_logs = pd.DataFrame([log_row])
    # Append to the log; write the header row only when the file is new.
    df_logs.to_csv(log_path, mode="a", header=not os.path.exists(log_path), index=False)
    st.sidebar.write(f"Logged query to {log_path}")
'''

# Write app.py to disk
# Persist the Streamlit source held in app_code as app.py (UTF-8).
with open("app.py", mode="w", encoding="utf-8") as app_file:
    app_file.write(app_code)

# ---------- Create requirements.txt ----------
# One package name per line, each line newline-terminated.
_required_packages = [
    "streamlit",
    "transformers",
    "sentence-transformers",
    "langdetect",
    "pyngrok",
    "pandas",
    "openpyxl",
    "python-docx",
    "datasets",
    "accelerate",
]
requirements = "\n".join(_required_packages) + "\n"
with open("requirements.txt", "w") as req_file:
    req_file.write(requirements)



In [7]:
from pyngrok import ngrok
import time
import subprocess

# Launch the Streamlit app in the background and, unless disabled, expose it
# through an ngrok tunnel.

# Kill any old tunnels, and stop a Streamlit process left over from a previous
# run of this cell so the new one can bind port 8501.
ngrok.kill()
_previous = globals().get("process")
if _previous is not None and _previous.poll() is None:
    _previous.terminate()
    try:
        _previous.wait(timeout=5)
    except subprocess.TimeoutExpired:
        _previous.kill()

# Start Streamlit app in background
process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", "8501"])

# Wait a bit for Streamlit to start, then report an early exit rather than
# tunnelling to a port nothing new is serving.
time.sleep(5)
if process.poll() is not None:
    print(f"⚠️ Streamlit exited early with code {process.returncode}; check app.py and whether port 8501 is already in use.")

# Token handling:
#   empty string      -> skip the tunnel, as the notebook header documents
#   placeholder/unset -> do not touch the token; rely on any saved ngrok config
#   anything else     -> register the token before connecting
_placeholder = "PASTE_YOUR_NGROK_TOKEN_HERE"
_ngrok_token = globals().get("NGROK_AUTH_TOKEN", _placeholder)
public_url = None
if not _ngrok_token:
    print("NGROK_AUTH_TOKEN is empty; skipping the tunnel. Streamlit is listening on http://localhost:8501")
else:
    if _ngrok_token != _placeholder:
        ngrok.set_auth_token(_ngrok_token)
    # Start ngrok tunnel
    public_url = ngrok.connect(8501)
    print("✅ Public Streamlit URL:", public_url)


✅ Public Streamlit URL: NgrokTunnel: "https://be43c878d5bd.ngrok-free.app" -> "http://localhost:8501"
