#### 07 - Serving + Demo (Portfolio Style)

This notebook packages the RAG pipeline into a single callable `ask()` function:
1) Embed the question (Azure OpenAI)
2) Retrieve top-k chunks (Option A brute-force or Option B Vector Search)
3) Build a grounded prompt
4) Generate an answer (Azure OpenAI)
5) Log the request/response to `rag_query_logs`
6) (Optional) Evaluate the answer and write to `rag_evaluations`

The return value of `ask()` is a JSON-like dict (`query_id`, `question`, `answer`, `sources`, `eval`) that can later back an API or a simple UI.

In [0]:
%run ./00_install_deps_and_restart

In [0]:
%run ./00_constants

In [0]:
%run ./00_utils

Collecting openai<2.0.0,>=1.0.0
  Downloading openai-1.109.1-py3-none-any.whl.metadata (29 kB)
Collecting anyio<5,>=3.5.0 (from openai<2.0.0,>=1.0.0)
  Downloading anyio-4.12.1-py3-none-any.whl.metadata (4.3 kB)
Collecting httpx<1,>=0.23.0 (from openai<2.0.0,>=1.0.0)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai<2.0.0,>=1.0.0)
  Downloading jiter-0.12.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting sniffio (from openai<2.0.0,>=1.0.0)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting tqdm>4 (from openai<2.0.0,>=1.0.0)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/57.7 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai<2.0.0

In [0]:
%run ./00_init_openai_client

In [0]:
from datetime import datetime
from typing import Any, Dict, List, Optional

# Helpers: validate chunk schema + compact sources

def _validate_chunks(chunks: List[Dict[str, Any]]):
    if not isinstance(chunks, list):
        raise TypeError("retrieved_chunks must be a list[dict]")
    if len(chunks) == 0:
        return
    required = {"chunk_id", "doc_id", "title", "url", "chunk_index", "chunk_text", "category", "score"}
    missing = required - set(chunks[0].keys())
    if missing:
        raise ValueError(f"retrieved_chunks items missing keys: {missing}")

def _compact_sources(chunks: List[Dict[str, Any]]):
    # Return a minimal source list for the final API-like payload
    sources = []
    for c in chunks:
        sources.append({
            "title": c.get("title"),
            "url": c.get("url"),
            "category": c.get("category"),
            "chunk_id": c.get("chunk_id"),
            "chunk_index": c.get("chunk_index"),
            "score": c.get("score"),
        })
    return sources

In [0]:
# Unified pipeline function ask():
# embed → retrieve → prompt → Azure OpenAI → log → (optional) evaluate → return a JSON-like dict.
# The cells after it provide a simple notebook “UI” demo and a few SQL queries that show metrics.

def ask(
    question: str,
    *,
    k: int = 6,
    retriever: str = "A",     # "A" brute-force, "B" vector search
    do_eval: bool = True,     # run LLM judge + write to rag_evaluations
    filters: Optional[dict] = None,  # optional metadata filters for Vector Search
    temperature: float = 0.2
) -> dict:
    """
    End-to-end RAG call:
      - Embedding (Azure OpenAI)
      - Retrieval (A or B)
      - Prompt assembly
      - Chat completion (Azure OpenAI)
      - Logging (rag_query_logs)
      - Optional evaluation (rag_evaluations)

    Args:
      question: Non-empty user question.
      k: Number of chunks to retrieve; must be a positive integer.
      retriever: "A" (brute-force) or "B" (vector search), case-insensitive.
      do_eval: When True, run the LLM judge and persist its scores.
      filters: Passed to the retriever only when retriever is "B";
               ignored for "A".
      temperature: Sampling temperature for the chat completion.

    Returns:
      {
        query_id: str,
        question: str,
        answer: str,
        sources: [...],
        eval: {...} | None
      }

    Raises:
      ValueError: If question is empty/not a string, k is not a positive
                  integer, or retriever is not "A"/"B".
    """
    # Validate every argument up front so no paid API call is made for a
    # request that cannot succeed.
    if not isinstance(question, str) or not question.strip():
        raise ValueError("question must be a non-empty string")
    if isinstance(k, bool) or not isinstance(k, int) or k < 1:
        raise ValueError("k must be a positive integer")
    option = retriever.upper() if isinstance(retriever, str) else None
    if option not in ("A", "B"):
        # Previously any unknown value silently fell back to brute-force.
        raise ValueError(f'retriever must be "A" or "B", got {retriever!r}')

    # 1) Embed
    # NOTE(review): the recorded output of the demo cell shows a TypeError
    # raised from the ask() call, but the traceback is cut off before the
    # failing frame. embed_texts receives the bare string here - confirm it
    # accepts a single str (rather than a list of texts) and that its return
    # value is what retrieve_top_k expects.
    q_emb = embed_texts(question)

    # 2) Retrieve (returns list[dict])
    if option == "B":
        chunks = retrieve_top_k(q_emb, option="B", k=k, filters=filters)
    else:
        chunks = retrieve_top_k(q_emb, option="A", k=k)

    _validate_chunks(chunks)

    # 3) Prompt
    prompt = build_prompt(question, chunks)

    # 4) Call LLM
    resp = aoai.chat.completions.create(
        model=CHAT_DEPLOYMENT,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature
    )
    # message.content can be None (e.g. a filtered completion); normalise to
    # an empty string so the documented `answer: str` contract always holds.
    answer = resp.choices[0].message.content or ""

    # 5) Log
    rag_event = {
        "question": question,
        "top_k": k,
        "retrieved_chunks": chunks,
        "prompt": prompt,
        "answer": answer,
        "embedding_deployment": EMBEDDING_DEPLOYMENT,
        "chat_deployment": CHAT_DEPLOYMENT
    }
    query_id = log_rag_event(rag_event)

    # 6) Optional evaluation
    eval_result = None
    if do_eval:
        # judge_rag expects chunk_text to exist; _validate_chunks above
        # enforces that key on the retrieved chunks.
        eval_scores = judge_rag(question, answer, chunks)
        eval_id = write_evaluation(
            query_id=query_id,
            question=question,
            answer=answer,
            scores=eval_scores,
            evaluator="llm_judge_v1"
        )
        eval_result = {
            "evaluation_id": eval_id,
            **eval_scores
        }

    return {
        "query_id": query_id,
        "question": question,
        "answer": answer,
        "sources": _compact_sources(chunks),
        "eval": eval_result
    }

In [0]:
# Notebook “UI” demo: send one question through the whole pipeline, then show
# the query id, the generated answer and the URLs of the first three sources.
# NOTE(review): the output saved with this cell is a TypeError traceback
# raised from the ask() call - re-run once the cause is fixed so the stored
# output shows a successful answer.

question = "How do I resize an Azure virtual machine?"
result = ask(question, retriever="A", k=6, do_eval=True)

print("Query ID:", result["query_id"])
print("\nAnswer:\n", result["answer"])
print("\nTop sources:")

for source in result["sources"][:3]:
    print("-", source["url"])

Trace(request_id=tr-12937a50144345fb9de7aa1f46b2f8f7)

[0;31m---------------------------------------------------------------------------[0m
[0;31mTypeError[0m                                 Traceback (most recent call last)
File [0;32m<command-5573401567029324>, line 4[0m
[1;32m      1[0m [38;5;66;03m# Notebook “UI” demo cell[39;00m
[1;32m      3[0m question [38;5;241m=[39m [38;5;124m"[39m[38;5;124mHow do I resize an Azure virtual machine?[39m[38;5;124m"[39m
[0;32m----> 4[0m result [38;5;241m=[39m ask(question, k[38;5;241m=[39m[38;5;241m6[39m, retriever[38;5;241m=[39m[38;5;124m"[39m[38;5;124mA[39m[38;5;124m"[39m, do_eval[38;5;241m=[39m[38;5;28;01mTrue[39;00m)
[1;32m      6[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;124mQuery ID:[39m[38;5;124m"[39m, result[[38;5;124m"[39m[38;5;124mquery_id[39m[38;5;124m"[39m])
[1;32m      7[0m [38;5;28mprint[39m([38;5;124m"[39m[38;5;130;01m\n[39;00m[38;5;124mAnswer:[39m[38;5;130;01m\n[39;00m[38;5;124m"[39m, result[[38;5;124m"[39m[38;5

In [0]:
# Try a few more questions. For each one, print a preview of the answer
# (at most 400 characters) and the judge's faithfulness score.
qs = [
    "What is an availability set used for?",
    "How do I change VM size in Azure?",
    "How do I create a VM from an image?",
]
for q in qs:
    out = ask(q, k=6, retriever="A", do_eval=True)

    # Guard against a missing answer so the slice below cannot fail.
    answer_text = out["answer"] or ""
    # Only mark the preview as truncated when text was actually cut off.
    preview = answer_text[:400] + (" ..." if len(answer_text) > 400 else "")
    # eval is None when evaluation is skipped; .get() tolerates a judge
    # result that lacks the faithfulness key.
    faithfulness = (out["eval"] or {}).get("faithfulness")

    print("\n======================")
    print("Q:", q)
    print("A:", preview)
    print("Faithfulness:", faithfulness)

In [0]:
# Latest logs: the 20 most recent requests with the URL of the top-ranked chunk.
#
# ask() logs a request even when retrieval returned no chunks, and indexing
# an empty array with retrieved_chunks[0] raises under ANSI SQL mode.
# try_element_at(array, 1) reads the same first element (its index is
# 1-based) but yields NULL instead of failing when the array is empty.

spark.sql(f"""
SELECT
  created_at,
  query_id,
  question,
  top_k,
  try_element_at(retrieved_chunks, 1).url AS top_source
FROM {RAG_LOG_TABLE}
ORDER BY created_at DESC
LIMIT 20
""").display()

In [0]:
# Latest evaluations: the 20 most recent judge results, newest first.

latest_evaluations_sql = f"""
SELECT
  created_at,
  query_id,
  retrieval_relevance,
  answer_relevance,
  faithfulness,
  notes
FROM {RAG_EVAL_TABLE}
ORDER BY created_at DESC
LIMIT 20
"""
spark.sql(latest_evaluations_sql).display()