In [0]:
%pip install -U databricks-langchain==0.12.1 langchain==1.2.0 langchain-community==0.4.1 langchain-openai==1.1.6 faiss-cpu sentence-transformers langchain-classic rank_bm25 transformers scikit-learn spacy

In [0]:
dbutils.library.restartPython()

In [0]:
!python -m spacy download en_core_web_sm

### Define Foundation Terminology

**Foundation:** terminology glossary construction, term extraction, preprocessing and standardization, term embeddings, and vector indexing.

- NER, TF-IDF, KeyBERT
- Term Normalization
- Text Standardization
- Semantic Chunking

In [0]:
# 1. Terminology glossary for RAG variants.
#    Each entry carries the canonical term, its accepted aliases, a definition,
#    and context tags (used as disambiguation clues). Keep it up to date.
GLOSSARY = [
    {
        "term": "Retrieval-Augmented Generation",
        "synonyms": ["RAG", "retrieval augmented generation", "retrieval-augmented generation"],
        "definition": (
            "A generation framework that retrieves external evidence and conditions an LLM on it."
        ),
        "context_tags": ["LLM", "search", "grounding"],
    },
    {
        "term": "Self-RAG",
        "synonyms": ["Self RAG", "self-rag", "self-reflective RAG", "self-refining RAG"],
        "definition": (
            "A RAG approach where the model explicitly self-checks (e.g., reflect, critique, verify) during generation, "
            "deciding when to retrieve more evidence and how to revise its answer based on feedback signals."
        ),
        "context_tags": [
            "reflection",
            "self-critique",
            "verification",
            "iterative retrieval",
            "hallucination mitigation",
        ],
    },
    {
        "term": "Corrective RAG",
        "synonyms": ["C-RAG", "CRAG", "Corrective-RAG", "corrective rag"],
        "definition": (
            "A RAG approach that detects low-quality retrieval or unsupported generations and applies corrective actions "
            "(e.g., re-retrieve, rewrite queries, filter evidence, or re-rank) to improve factual grounding."
        ),
        "context_tags": [
            "retrieval quality",
            "re-ranking",
            "query rewriting",
            "evidence filtering",
            "robustness",
        ],
    },
    {
        "term": "Knowledge Graph RAG",
        "synonyms": ["KG-RAG", "KGRAG", "KG RAG", "knowledge-graph RAG", "graph RAG"],
        "definition": (
            "A RAG approach that retrieves and reasons over a knowledge graph (entities/relations) as structured evidence, "
            "often combining graph traversal with text retrieval to support multi-hop and relational questions."
        ),
        "context_tags": [
            "knowledge graph",
            "multi-hop reasoning",
            "entity linking",
            "graph traversal",
            "structured grounding",
        ],
    },
    {
        "term": "Entity Linking",
        "synonyms": ["EL", "entity resolution", "mention linking", "entity disambiguation"],
        "definition": (
            "The process of mapping a text mention (e.g., 'Apple') to a canonical entity (e.g., Apple Inc.) in a KB/KG "
            "to support structured retrieval and reduce ambiguity."
        ),
        "context_tags": ["KG-RAG", "disambiguation", "knowledge base", "information extraction"],
    },
    {
        "term": "Evidence Grounding",
        "synonyms": ["grounding", "source grounding", "evidence-based generation"],
        "definition": (
            "Constraining or evaluating generation based on retrieved evidence so claims are supported by sources; "
            "commonly paired with citation, attribution, or entailment checks."
        ),
        "context_tags": ["factuality", "citations", "attribution", "hallucination mitigation"],
    },
]

In [0]:
import re
from typing import List, Dict, Any

class TerminologyProcessor:
    """Glossary-driven terminology normalizer and term extractor.

    The processor is built from a list of glossary entries (dicts with a
    ``"term"`` key and optional ``"synonyms"`` / ``"context_tags"`` keys) and
    offers two operations:

    * ``standardize_text`` rewrites every known alias to its canonical term.
    * ``extract_terms`` lists the canonical terms that occur in a text.

    Attributes:
        glossary: The glossary entries passed to the constructor.
        standard_term_map: Lower-cased canonical term -> canonical term.
        alias_to_entries_map: Lower-cased alias -> list of entries that use it
            (more than one entry means the alias is ambiguous).
    """

    def __init__(self, glossary: List[Dict[str, Any]]):
        self.glossary = glossary
        self.standard_term_map = {}
        self.alias_to_entries_map = {}
        self._build_mappings()

    def _build_mappings(self):
        """Build mappings; one alias may map to multiple terminology entries to handle ambiguity."""
        for entry in self.glossary:
            standard_term = entry["term"]
            self.standard_term_map[standard_term.lower()] = standard_term

            for alias in [standard_term] + entry.get("synonyms", []):
                alias_lower = alias.lower()
                if not alias_lower:
                    # An empty alias would match at every position of the text.
                    continue
                entries = self.alias_to_entries_map.setdefault(alias_lower, [])
                if entry not in entries:
                    entries.append(entry)

    def _compile_alias_pattern(self):
        """Compile one alternation over all aliases, longest alias first.

        Returns ``None`` when there are no aliases. Aliases containing Latin
        letters are guarded with letter lookarounds so they never match inside
        a larger word; other aliases (e.g. CJK terms) are matched verbatim.
        """
        if not self.alias_to_entries_map:
            return None

        alternatives = []
        for key_lower in sorted(self.alias_to_entries_map, key=len, reverse=True):
            escaped = re.escape(key_lower)
            if re.search(r"[a-zA-Z]", key_lower):
                escaped = r"(?<![a-zA-Z])" + escaped + r"(?![a-zA-Z])"
            alternatives.append("(?:" + escaped + ")")
        return re.compile("|".join(alternatives), flags=re.IGNORECASE)

    def standardize_text(self, text: str, context_window: int = 10) -> str:
        """Replace every known alias in ``text`` with its canonical term.

        The substitution is done in a single pass over the original text, so
        text produced by one replacement is never scanned again. This keeps
        the operation idempotent: "Self-RAG" stays "Self-RAG" instead of having
        its "RAG" suffix expanded a second time, and "Evidence Grounding" is
        not turned into "Evidence Evidence Grounding" by the "grounding" alias.

        Args:
            text: Text to standardize. Empty or ``None`` input is returned as is.
            context_window: Number of characters inspected on each side of an
                ambiguous alias when looking for disambiguation clues.

        Returns:
            The text with aliases rewritten to canonical terms.
        """
        pattern = self._compile_alias_pattern()
        if not text or pattern is None:
            return text

        def replacer(match: re.Match) -> str:
            matched = match.group(0)
            possible_entries = self.alias_to_entries_map.get(matched.lower())
            if not possible_entries:
                # Case folding produced a key we do not know; leave the text alone.
                return matched
            if len(possible_entries) == 1:
                return possible_entries[0]["term"]

            # --- Context-based disambiguation ---
            source = match.string
            start = max(0, match.start() - context_window)
            end = min(len(source), match.end() + context_window)
            context_snippet = source[start:end]
            for entry in possible_entries:
                clues = entry.get("context_tags", []) + [entry["term"]]
                if any(clue in context_snippet for clue in clues):
                    return entry["term"]
            # If no context clue is found, fall back to the first definition
            return possible_entries[0]["term"]

        return pattern.sub(replacer, text)

    def extract_terms(self, text: str) -> List[str]:
        """Return the canonical terms found in ``text``, sorted alphabetically.

        Matching is a case-insensitive substring search (deliberately without
        word boundaries).
        """
        text_lower = text.lower()
        found_terms = {
            original_standard_term
            for standard_term_lower, original_standard_term in self.standard_term_map.items()
            if standard_term_lower and standard_term_lower in text_lower
        }
        return sorted(found_terms)

In [0]:
# Step 1: build the processor once from the glossary; later cells reuse it.
term_processor = TerminologyProcessor(GLOSSARY)
print("--- 1. Initialize the terminology processor with the glossary ---")

# Step 2: preprocessing, i.e. rewrite aliases to their canonical terms.
print("--- 2. Data Preprocessing: Terminology Standardization ---")

# A user query that uses an abbreviation.
user_query = "I want to learn about applications of using RAG."
processed_query = term_processor.standardize_text(user_query)
print("Original query:", user_query)
print("Standardized query:", processed_query)

# A document sentence that mentions a hyphenated variant.
document_text = "Recently I studied Self-RAG."
processed_document = term_processor.standardize_text(document_text)
print("Original document:", document_text)
print("Standardized document:", processed_document)

In [0]:
# Step 3: list the canonical terms present in the standardized texts
# (useful downstream for vectorization or metadata tagging).
print("\n--- 3. Term Extraction ---")

extracted_terms_query = term_processor.extract_terms(processed_query)
print("Extracted terms from query:", extracted_terms_query)

extracted_terms_document = term_processor.extract_terms(processed_document)
print("Extracted terms from document:", extracted_terms_document)

In [0]:
# Step 4: purely explanatory output; nothing is embedded or stored in this cell.
for _line in (
    "\n--- 4. Simulated Vector Storage and Retrieval Augmentation (Conceptual) ---",
    "In a real application, we would use an embedding model (e.g., SentenceTransformers) to convert the standardized text and terms into vectors.",
    "These vectors would then be stored in a dedicated vector database (e.g., FAISS, Pinecone, or Weaviate) for efficient similarity search.",
    "During retrieval, the user query is first standardized and vectorized, then used to query the vector database to fetch relevant documents.",
):
    print(_line)

In [0]:
# 5. Simulated retrieval augmentation: query expansion
def enhance_query_for_retrieval(query: str, processor: TerminologyProcessor) -> List[str]:
    """Expand a query into a sorted keyword list using the glossary.

    The result contains the standardized query itself, every canonical term
    found in it, and all synonyms of those terms.
    """
    normalized = processor.standardize_text(query)
    keywords = {normalized}

    for found in processor.extract_terms(normalized):
        keywords.add(found)
        # Only the first glossary entry with this canonical term contributes synonyms.
        owner = next((item for item in processor.glossary if item["term"] == found), None)
        if owner is not None:
            keywords.update(owner.get("synonyms", []))

    return sorted(keywords)

In [0]:
# Step 5: expand one sample query and show the keywords a retriever could use.
print("\n--- 5. Simulated Retrieval Augmentation: Query Expansion ---")
original_query_for_retrieval = "I want to know what a RAG does in an LLM?"
expanded_keywords = enhance_query_for_retrieval(original_query_for_retrieval, term_processor)
print("Original retrieval query:", original_query_for_retrieval)
print("Expanded retrieval keyword list:", expanded_keywords)

### Detect Synonyms to Extend Terminology Glossary

Detect Synonyms by Similarity

- FAISS
- Legal-BERT, ChatLaw-Text2Vec
- Sentence Transformers + PEFT (LoRA) Fine-Tuning

In [0]:
import spacy
import re

DASHES = r"[-\u2010\u2011\u2012\u2013\u2014\u2212]"  # common dash chars


def alias_to_pattern(alias: str):
    """Convert a glossary alias into a spaCy token pattern.

    Words are matched case-insensitively (``LOWER``); every dash in the alias
    becomes its own token matched by the ``DASHES`` regex, so any of the
    listed dash characters is accepted. All dashes are split out, not just the
    first one, so aliases such as "a-b-c" or "x-y z-w" never leave a dash
    buried inside a single ``LOWER`` token.

    Args:
        alias: The alias text, e.g. "KG-RAG" or "Self RAG".

    Returns:
        A list of spaCy token-pattern dicts (empty for an empty alias).
    """
    pattern = []
    # The capturing group keeps the dash separators in the split result.
    for piece in re.split(f"({DASHES})", alias):
        if not piece:
            continue
        if re.fullmatch(DASHES, piece):
            pattern.append({"TEXT": {"REGEX": DASHES}})
        else:
            # normal phrase: token-by-token, case-insensitive
            pattern.extend({"LOWER": token.lower()} for token in piece.split())
    return pattern

# Loaded spaCy pipelines keyed by model name, so the model is read from disk
# once per kernel instead of once per call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name):
    """Return the cached spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def extract_terms_with_ruler(text, glossary, model_name="en_core_web_sm"):
    """Find glossary terms and synonyms in ``text`` with a spaCy entity ruler.

    Args:
        text: Text to scan.
        glossary: List of entries with a "term" key and optional "synonyms".
        model_name: spaCy model to use; it is loaded once and then reused.

    Returns:
        The set of matched surface strings (entities labelled "TERM").
    """
    nlp = _get_spacy_pipeline(model_name)

    # The pipeline is shared between calls: reuse the existing ruler and reset
    # its patterns so that each call reflects only the glossary passed in.
    if "entity_ruler" in nlp.pipe_names:
        ruler = nlp.get_pipe("entity_ruler")
        ruler.clear()
    else:
        ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = [
        {"label": "TERM", "pattern": alias_to_pattern(alias)}
        for entry in glossary
        for alias in [entry.get("term")] + entry.get("synonyms", [])
        if alias
    ]

    # prefer longer patterns first (helps with overlaps like KG-RAG vs RAG)
    patterns.sort(key=lambda p: len(p["pattern"]), reverse=True)
    ruler.add_patterns(patterns)

    doc = nlp(text)
    return {ent.text for ent in doc.ents if ent.label_ == "TERM"}

In [0]:
# Example: standardize a query that uses a hyphenated alias, then run the
# entity-ruler extractor on the standardized text.
# NOTE: this rebinds `query` and `processed_query` from earlier cells.
query = "Examples of KG-RAG" 
processed_query = term_processor.standardize_text(query)
print(processed_query)
# Optional manual override of the standardized text for experimentation:
# processed_query = "Examples of Knowledge Graph Retrieval-Augmented Generation"
candidates = extract_terms_with_ruler(processed_query, GLOSSARY)
print(f"Automatically extracted term candidates: {candidates}")


### Term Glossary Indexing

In [0]:
from sentence_transformers import SentenceTransformer, util

# It is recommended to load the model once during project initialization
# to avoid repeated loading overhead.
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def map_synonyms_by_similarity(
    main_terms: list,
    candidates: list,
    threshold: float = 0.8,
    model: "SentenceTransformer | None" = None,
    model_name: str = "paraphrase-MiniLM-L6-v2",
) -> dict:
    """
    Map candidate terms to the closest standard terms by computing
    cosine similarity between embeddings.

    Args:
        main_terms (list): List of standard (canonical) terms.
        candidates (list): List of candidate synonyms to be matched.
        threshold (float): Similarity threshold above which a candidate
                           is considered a synonym (strictly greater than).
        model (SentenceTransformer | None): An already-loaded embedding model.
            Pass one to avoid reloading the model on every call; when omitted,
            ``model_name`` is loaded for this call.
        model_name (str): Model to load when ``model`` is not provided.

    Returns:
        dict: A dictionary mapping each standard term to a list of
              successfully matched synonyms (empty lists when nothing matches
              or when either input list is empty).
    """
    matched_synonyms = {term: [] for term in main_terms}

    # Nothing to compare: return before paying for a model load.
    if not main_terms or not candidates:
        return matched_synonyms

    if model is None:
        model = SentenceTransformer(model_name)

    # Encode both lists in one batch, then split the result back apart.
    embeddings = model.encode(main_terms + candidates, convert_to_tensor=True)
    term_embeddings = embeddings[:len(main_terms)]
    candidate_embeddings = embeddings[len(main_terms):]

    # Rows are standard terms, columns are candidates.
    similarity_matrix = util.cos_sim(term_embeddings, candidate_embeddings)

    for i, term in enumerate(main_terms):
        for j, candidate in enumerate(candidates):
            if float(similarity_matrix[i][j]) > threshold:
                matched_synonyms[term].append(candidate)

    return matched_synonyms

In [0]:
# Compare the short form "RAG" against every canonical term and synonym
# in the glossary (canonical terms first, then all synonyms in entry order).
main_terms_to_map = ["RAG"]

all_possible_synonyms = [item["term"] for item in GLOSSARY]
all_possible_synonyms += [
    alias
    for item in GLOSSARY
    for alias in item.get("synonyms", [])
]

optimized_mapped_synonyms = map_synonyms_by_similarity(main_terms_to_map, all_possible_synonyms)
print("\nOptimized matched synonyms:", optimized_mapped_synonyms)

In [0]:
# Load the sentence-embedding model once; `model` is reused by the FAISS
# index-building and search cells below.
model_name = 'paraphrase-MiniLM-L6-v2'

print(f"\nAttempting to load model '{model_name}'...")
model = SentenceTransformer(model_name)

In [0]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Dict, Tuple, List


def build_term_vector_index(
    term_glossary: Dict[str, dict],
    model: SentenceTransformer,
    use_cosine: bool = True
) -> Tuple[faiss.Index, List[str]]:
    """
    Embed every canonical term and synonym and store the vectors in a flat FAISS index.

    Args:
        term_glossary (dict):
            Mapping of canonical term -> dict that may hold a 'synonyms' list.
        model (SentenceTransformer):
            A loaded SentenceTransformer model used to encode the terms.
        use_cosine (bool):
            When True, embeddings are normalized and an inner-product index is
            used (equivalent to cosine similarity); otherwise an L2 index is used.

    Returns:
        tuple:
            (faiss_index, indexed_terms)
            - faiss_index: FAISS index containing all embeddings
            - indexed_terms: sorted, de-duplicated terms aligned with index rows

    Raises:
        ValueError: If the glossary yields no terms at all.
    """
    # Gather canonical terms followed by their synonyms (only list-valued ones).
    collected: List[str] = []
    for canonical, details in term_glossary.items():
        collected.append(canonical)
        aliases = details.get("synonyms", [])
        if isinstance(aliases, list):
            collected += aliases

    # Sorting the unique set gives a deterministic row order.
    indexed_terms = sorted(set(collected))
    if len(indexed_terms) == 0:
        raise ValueError("No terms found in glossary.")

    print("Generating term embeddings...")
    vectors = model.encode(
        indexed_terms,
        convert_to_numpy=True,
        normalize_embeddings=use_cosine,
        show_progress_bar=True
    )
    vectors = vectors.astype("float32")
    dim = vectors.shape[1]

    # Inner product on normalized vectors equals cosine similarity.
    index = faiss.IndexFlatIP(dim) if use_cosine else faiss.IndexFlatL2(dim)
    index.add(vectors)

    metric = "cosine similarity" if use_cosine else "L2 distance"
    print(f"FAISS index built successfully. "
          f"Vectors: {index.ntotal}, Dimension: {dim}, Metric: {metric}")

    return index, indexed_terms

In [0]:
from typing import List, Dict, Any

def convert_glossary_to_term_dict(
    glossary: List[Dict[str, Any]],
    deduplicate: bool = True,
    keep_term_as_synonym: bool = False
) -> Dict[str, Dict[str, list]]:
    """
    Reshape the list-style glossary into the dict layout used for FAISS indexing.

    Args:
        glossary: list of entries, each with a "term" and optional "synonyms"
        deduplicate: drop repeated synonyms, keeping the first occurrence
        keep_term_as_synonym: put the canonical term at the front of its synonyms

    Returns:
        dict: {canonical_term: {"synonyms": [...]}}; entries without a term are
        skipped and a non-list "synonyms" value is treated as empty.
    """
    result = {}

    for item in glossary:
        canonical = item.get("term")
        if not canonical:
            continue

        aliases = item.get("synonyms", [])
        if not isinstance(aliases, list):
            aliases = []

        if keep_term_as_synonym:
            aliases = [canonical] + aliases

        if deduplicate:
            # dict keys keep insertion order, so this removes repeats in place order
            aliases = list(dict.fromkeys(aliases))

        result[canonical] = {"synonyms": aliases}

    return result

In [0]:
# Reshape the list-style glossary into {term: {"synonyms": [...]}} for indexing.
term_glossary = convert_glossary_to_term_dict(GLOSSARY)
print(term_glossary)

In [0]:
# Build the FAISS index over all terms and synonyms; with use_cosine=True the
# builder normalizes embeddings and uses an inner-product index.
index, indexed_terms = build_term_vector_index(
    term_glossary=term_glossary,
    model=model,
    use_cosine=True
)

In [0]:
def search_terms(
    query: str,
    model: SentenceTransformer,
    index: faiss.Index,
    indexed_terms: List[str],
    top_k: int = 5,
    use_cosine: bool = True
):
    """
    Look up the terms in the FAISS index that are closest to a query.

    Args:
        query (str): Input query text.
        model (SentenceTransformer): Model used to embed the query.
        index (faiss.Index): FAISS index to search.
        indexed_terms (list): Terms aligned with index rows.
        top_k (int): Number of results to request.
        use_cosine (bool): Normalize the query embedding when True.

    Returns:
        list of (term, score) tuples; slots FAISS reports as -1 are dropped.
    """
    encoded = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=use_cosine
    )
    scores, indices = index.search(encoded.astype("float32"), top_k)

    # Only the first (and only) query row is relevant.
    return [
        (indexed_terms[row], float(value))
        for value, row in zip(scores[0], indices[0])
        if row != -1
    ]

In [0]:
# Quick sanity check: the vector count should equal the number of indexed terms.
print("\n--- Index Build Successful ---")
print("Number of vectors in FAISS index:", index.ntotal)
print("Indexed terms:", indexed_terms)

In [0]:
# --- Part 2: Define our core retrieval function ---

def search_similar_terms(
    query_text: str,
    model: SentenceTransformer,
    index: faiss.Index,
    term_list: list,
    k: int = 5
):
    """
    Retrieve and print the top-k most similar terms to a query text from a FAISS index.

    The query is encoded to match the index metric: for an inner-product index
    (the cosine setup, where stored vectors are normalized) the query embedding
    is normalized as well and results are reported as similarity scores, where
    larger is better. For any other metric the raw embedding is used and
    results are reported as distances, where smaller is better.

    Args:
        query_text (str): The user input query term/text.
        model (SentenceTransformer): The embedding model used to encode the query.
        index (faiss.Index): The FAISS index object.
        term_list (list): The term list aligned with the order of vectors in the FAISS index.
        k (int): The number of most similar results to request.
    """
    print(f"\n--- Running Retrieval ---")
    print(f"Query: '{query_text}'")

    uses_inner_product = index.metric_type == faiss.METRIC_INNER_PRODUCT

    # 1) Encode the query the same way the indexed vectors were encoded
    query_vector = model.encode(
        [query_text],
        convert_to_numpy=True,
        normalize_embeddings=uses_inner_product
    ).astype("float32")

    # 2) Search in the FAISS index
    # index.search returns two arrays: D (distances/scores) and I (indices)
    distances, indices = index.search(query_vector, k)

    if uses_inner_product:
        label, hint = "score", "larger = more similar"
    else:
        label, hint = "distance", "smaller = more similar"

    # 3) Parse and print results
    print("Results:")
    rank = 0
    for value, idx in zip(distances[0], indices[0]):
        # FAISS pads with -1 when fewer than k vectors exist; indexing
        # term_list with -1 would silently report the last term.
        if idx < 0:
            continue
        rank += 1
        term = term_list[int(idx)]
        print(f"  Top {rank}: term='{term}', {label}={float(value):.4f} ({hint})")


# 4) === Demo ===
# Each query is run against the same index with k=3:
#   Case 1: "RAG"               - a synonym for a canonical term
#   Case 2: "Vector Store"      - semantically related query (core advantage)
#   Case 3: "Language Model"    - a broader term; may match more specific related terms (if present)
#   Case 4: "Transformer model" - a paraphrase, to probe tolerance to minor noise
for _demo_query in ("RAG", "Vector Store", "Language Model", "Transformer model"):
    search_similar_terms(query_text=_demo_query, model=model, index=index, term_list=indexed_terms, k=3)

### Enhancing Queries and Answers Based on the Glossary

Hybrid retrieval (BM25 + vectors), query expansion (MultiQuery), hypothetical document embeddings (HyDE), cross-encoder re-ranking

- Multi-Query Retriever
- HyDE
- Hybrid Search
- BGE-Reranker

In [0]:
## Query Expansion and Rewriting
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

In [0]:
from langchain_core.documents import Document

# 1. Prepare sample documents
# We create some example text containing technical terminology.
# NOTE(review): this sample text covers CNNs, Transformers and LLMs, while
# later demo queries ask about RAG / Self-RAG; confirm the corpus is the intended one.
doc_text = """
Convolutional Neural Networks (CNNs) are a key model in deep learning,
especially effective in the field of image recognition.
Their core idea is to automatically extract local image features
through convolutional layers and pooling layers.

Unlike CNNs, Transformer models were originally applied to
natural language processing (NLP) tasks such as machine translation.
Today, they have also been successfully applied to computer vision,
known as Vision Transformers.

Large Language Models (LLMs) are a major focus of current AI research.
Based on the Transformer architecture, they are capable of
understanding and generating human-like text,
demonstrating strong reasoning capabilities.
""".strip()

# Wrap the text in a single Document, then split it into small overlapping
# chunks (100 characters with 20 characters of overlap).
documents = [Document(page_content=doc_text, metadata={"source": "sample_tech_doc"})]
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20
)
docs = text_splitter.split_documents(documents)

In [0]:
from databricks_langchain import ChatDatabricks, DatabricksEmbeddings

# Embed the chunks through a Databricks embedding endpoint and index them in
# an in-memory FAISS vector store.
EMBEDDING_MODEL = "databricks-bge-large-en"
embeddings = DatabricksEmbeddings(endpoint=EMBEDDING_MODEL)
vectorstore = FAISS.from_documents(docs, embeddings)

In [0]:
from langchain_classic.retrievers.multi_query import MultiQueryRetriever
from databricks_langchain import ChatDatabricks, DatabricksEmbeddings

# Chat model used by MultiQueryRetriever; the retriever wraps the FAISS
# vector store's default retriever.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"
llm = ChatDatabricks(endpoint=LLM_ENDPOINT_NAME, temperature=0.2)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(),
    llm=llm
)

In [0]:
# Run the multi-query retriever and print the text of every returned chunk.
# NOTE: rebinds `query`, which later cells also reuse.
query = "What is a RAG?"
retrieved_docs = retriever_from_llm.invoke(query)
for doc in retrieved_docs:
    print(doc.page_content)

### Hybrid retrieval

In [0]:
from langchain_community.retrievers import BM25Retriever

# Keyword (BM25) retriever over the same chunks; return the top 3 hits.
bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 3
print("BM25 retriever built successfully.")

In [0]:
# Pair the BM25 retriever with a FAISS retriever limited to 3 hits; the list
# is handed to MergerRetriever in a following cell.
print("\nInitializing MergerRetriever...")
faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
retriever_list = [bm25_retriever, faiss_retriever]

In [0]:
from langchain_classic.retrievers import MergerRetriever
# MergerRetriever combines the results of the listed retrievers.
# NOTE(review): it appears to interleave results rather than deduplicate them;
# confirm against the library documentation.
merged_retriever = MergerRetriever(retrievers=retriever_list)
print("MergerRetriever initialized successfully.")

In [0]:
# NOTE(review): this cell rebuilds `retriever_list` and `merged_retriever`
# exactly as the two cells before it do; it looks like a leftover duplicate.
print("\nInitializing MergerRetriever...")
retriever_list = [bm25_retriever, faiss_retriever]
# MergerRetriever combines the results of the listed retrievers.
# NOTE(review): it appears to interleave results rather than deduplicate them;
# confirm against the library documentation.
merged_retriever = MergerRetriever(retrievers=retriever_list)
print("MergerRetriever initialized successfully.")

In [0]:
# --- 4. Run a query and compare results ---
# `query` is rebound here and read by the comparison cells that follow.
query = "Self-RAG"
print(f"\n\n--- Running Hybrid Retrieval ---")
print(f"Query: '{query}'")

In [0]:
# For comparison, inspect each retriever's results individually first.
# Only the first 50 characters of every hit are printed.
print("\n--- Individual Retriever Results ---")

# Keyword-based hits
bm25_results = bm25_retriever.invoke(query)
print(f"[BM25 Keyword Retrieval Results] (total {len(bm25_results)}):")
for doc in bm25_results:
    print(f"  - {doc.page_content[:50]}...")

# Embedding-based hits
faiss_results = faiss_retriever.invoke(query)
print(f"\n[FAISS Vector Retrieval Results] (total {len(faiss_results)}):")
for doc in faiss_results:
    print(f"  - {doc.page_content[:50]}...")

In [0]:
# Now inspect the merged (hybrid) results.
print("\n--- MergerRetriever Hybrid Results ---")

# MergerRetriever interleaves the hits of its retrievers and can return the
# same chunk once per retriever, so duplicates are removed here (by chunk
# text, keeping the first occurrence) to make the "deduplicated" label true.
_seen_contents = set()
merged_results = []
for doc in merged_retriever.invoke(query):
    if doc.page_content not in _seen_contents:
        _seen_contents.add(doc.page_content)
        merged_results.append(doc)

print(f"[Final Hybrid Results] (total {len(merged_results)}, deduplicated):")
for doc in merged_results:
    print(f"  - {doc.page_content[:50]}...")

### Response Generation and Evaluation

- Prompt Engineering
- Structured Output
- Output Parser
- Post-processing 
- LLM-as-a-judge

In [0]:
import html
import re
from typing import List, Dict, Any, Optional

from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
# ChatDatabricks is imported from `databricks_langchain`, the module provided by
# the `databricks-langchain` package installed at the top of this notebook and
# already used by the retrieval cells; `langchain_databricks` is not installed here.
from databricks_langchain import ChatDatabricks


# --- 1) Preparation ---
# Chat model used to produce the structured answer.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"
llm = ChatDatabricks(endpoint=LLM_ENDPOINT_NAME, temperature=0.2)


# --- 2) Define the expected output structure ---
class TerminologyInAnswer(BaseModel):
    """A structured model containing the main answer and the technical terms used."""
    # Free-text answer to the user's question.
    answer: str = Field(description="A detailed and accurate answer to the user's question.")
    # Glossary terms the answer uses; defaults to an empty list when omitted.
    standard_terms_used: List[str] = Field(
        default_factory=list,
        description="Standard technical terms from the official glossary explicitly used in the answer.",
        examples=[["RAG", "Self-RAG"]],
    )


# --- 3) Structured output chain (parameterized) ---
# Prompt with a single `{question}` placeholder, piped into the LLM configured
# to return a TerminologyInAnswer object instead of raw text.
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI expert with deep technical knowledge. Provide a structured answer."),
    ("human", "Please explain: {question}")
])

structured_llm_chain = prompt | llm.with_structured_output(TerminologyInAnswer)


def glossary_list_to_def_dict(glossary_list: List[Dict[str, Any]], include_synonyms: bool = True) -> Dict[str, str]:
    """
    Flatten a list-style glossary into {term_or_synonym: definition}.

    Entries without a definition contribute nothing. A canonical term always
    takes (or overwrites) its own key; a synonym is added only when its key is
    still free. Names and definitions are stripped of surrounding whitespace.
    """
    mapping: Dict[str, str] = {}

    for item in glossary_list or []:
        canonical = (item.get("term") or "").strip()
        meaning = (item.get("definition") or "").strip()
        if not meaning:
            continue

        if canonical:
            mapping[canonical] = meaning

        if not include_synonyms:
            continue

        for raw_alias in item.get("synonyms", []) or []:
            alias = (raw_alias or "").strip()
            if alias:
                # never displace a definition that is already registered
                mapping.setdefault(alias, meaning)

    return mapping


class TermEnhancer:
    """
    Wraps known terms with <abbr title="...">term</abbr> in one pass.

    - Single compiled regex for all terms (longest term first)
    - Never touches text inside HTML tags (e.g. attribute values)
    - Leaves text between <abbr ...> and </abbr> untouched, so running the
      enhancer on already-enhanced text does not nest <abbr> elements
    - Optional case-insensitive matching and whole-word matching

    Args:
        term_defs: Mapping of term -> definition; empty keys/values are ignored.
        case_sensitive: When False, terms match regardless of case and the
            definition is looked up case-insensitively (for terms that differ
            only by case, the first one in ``term_defs`` wins).
        whole_words: When True, a term only matches when it is not directly
            preceded or followed by an ASCII letter or digit. Hyphens are not
            treated as word characters, so "Self-RAG" still matches.
    """

    _ABBR_OPEN = re.compile(r"<abbr\b", re.IGNORECASE)
    _ABBR_CLOSE = re.compile(r"</abbr\s*>", re.IGNORECASE)

    def __init__(self, term_defs: Dict[str, str], *, case_sensitive: bool = True, whole_words: bool = False):
        self.case_sensitive = case_sensitive
        self.term_defs = {k: v for k, v in (term_defs or {}).items() if k and v}

        # Definitions keyed the same way matched text is normalized in `enhance`.
        self._lookup: Dict[str, str] = {}
        for term, definition in self.term_defs.items():
            self._lookup.setdefault(self._normalize(term), definition)

        if not self.term_defs:
            self._regex = None
            return

        # Prefer longer matches first so "KG-RAG" beats "RAG" when both exist.
        terms_sorted = sorted(self.term_defs.keys(), key=len, reverse=True)
        escaped = [re.escape(t) for t in terms_sorted]
        if whole_words:
            # Lookarounds instead of \b so terms may start or end with punctuation.
            escaped = [r"(?<![A-Za-z0-9])" + e + r"(?![A-Za-z0-9])" for e in escaped]

        flags = 0 if case_sensitive else re.IGNORECASE
        self._regex = re.compile("|".join(escaped), flags=flags)

    def _normalize(self, term: str) -> str:
        """Return the lookup key for ``term`` under the configured case mode."""
        return term if self.case_sensitive else term.lower()

    def enhance(self, text: str) -> str:
        """Return ``text`` with every known term wrapped in an <abbr> element.

        The matched spelling is kept as the visible text; the definition is
        HTML-escaped before being placed in the ``title`` attribute. Empty
        input, or an enhancer without terms, returns ``text`` unchanged.
        """
        if not text or not self._regex:
            return text

        # Split on HTML tags; only substitute in non-tag parts.
        # This prevents replacing inside attributes like title="...".
        parts = re.split(r"(<[^>]+>)", text)

        def repl(match: re.Match) -> str:
            term = match.group(0)
            definition = self._lookup.get(self._normalize(term))
            if not definition:
                return term
            safe_def = html.escape(definition, quote=True)
            return f'<abbr title="{safe_def}">{term}</abbr>'

        abbr_depth = 0
        for i, part in enumerate(parts):
            if i % 2:
                # Odd indices are tags: track whether we are inside an <abbr>.
                if self._ABBR_CLOSE.match(part):
                    abbr_depth = max(0, abbr_depth - 1)
                elif self._ABBR_OPEN.match(part):
                    abbr_depth += 1
            elif abbr_depth == 0:
                parts[i] = self._regex.sub(repl, part)

        return "".join(parts)


# --- 4) Execute ---
# Ask the structured chain one question; the result is a TerminologyInAnswer.
question = "what a self-RAG is"
structured_response: TerminologyInAnswer = structured_llm_chain.invoke({"question": question})

print("--- Structured object returned by the LLM ---")
print(structured_response)
print("\nAnswer content:", structured_response.answer)
print("Standard terms:", structured_response.standard_terms_used)

# --- 5) Enhance output with glossary definitions ---
# Build a {term_or_synonym: definition} map from GLOSSARY and wrap every
# occurrence in the answer with an <abbr> tooltip (case-sensitive matching).
term_to_definition = glossary_list_to_def_dict(GLOSSARY, include_synonyms=True)
enhancer = TermEnhancer(term_to_definition, case_sensitive=True)

final_output = enhancer.enhance(structured_response.answer)

print("\n--- Final enhanced output ---")
print(final_output)


In [0]:
def enhance_text_with_definitions(text: str, term_glossary: Dict[str, str]) -> str:
    """
    Surround each glossary term found in ``text`` with <abbr title="...">term</abbr>.

    How it works:
    - All terms are compiled into one case-sensitive alternation, longest first,
      so a longer term wins over a shorter term it contains
    - Definitions are HTML-escaped before being placed in the title attribute
    - Anything inside an HTML tag (e.g. attribute values) is left untouched

    Empty text, an empty glossary, or a glossary with no usable entries
    returns ``text`` unchanged.
    """
    if not text or not term_glossary:
        return text

    # Discard entries with an empty term or an empty definition.
    definitions = {name: meaning for name, meaning in term_glossary.items() if name and meaning}
    if not definitions:
        return text

    # Longest first ("Transformer Architecture" before "Transformer").
    ordered_terms = sorted(definitions, key=len, reverse=True)
    matcher = re.compile("|".join(map(re.escape, ordered_terms)))

    def wrap(found: re.Match) -> str:
        name = found.group(0)
        title = html.escape(definitions.get(name, ""), quote=True)
        return f'<abbr title="{title}">{name}</abbr>'

    # re.split with a capturing group alternates text / tag / text / ...,
    # so the even positions hold the visible text.
    segments = re.split(r"(<[^>]+>)", text)
    segments[::2] = [matcher.sub(wrap, visible) for visible in segments[::2]]

    return "".join(segments)


In [0]:
# Stand-alone example with a small hand-written glossary.
# NOTE: this rebinds `term_to_definition` and `final_output` from the previous cell.
term_to_definition = {
    "Large Language Models": "Neural networks trained on massive text corpora to understand and generate language.",
    "Transformer Architecture": "A neural network architecture based on self-attention mechanisms.",
    "RAG": "Retrieval-Augmented Generation, which combines retrieval with text generation."
}

# Sample answer text that mentions all three terms.
llm_answer_text = (
    "Large Language Models are typically based on the Transformer Architecture, "
    "while RAG is a dominant approach for retrieving and generating knowledge."
)

final_output = enhance_text_with_definitions(llm_answer_text, term_to_definition)
print(final_output)

### Evaluation

In [0]:
from typing import List, Dict
from pydantic import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
# The install cell pins `databricks-langchain`, whose import name is
# `databricks_langchain`. The older `langchain_databricks` module belongs to a
# separate package that this notebook does not install.
from databricks_langchain import ChatDatabricks


# -----------------------
# 1) Databricks LLM setup
# -----------------------
# Name of the Databricks model serving endpoint used by the evaluator.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"

# Use a separate evaluator LLM (lower temperature for consistency)
evaluator_llm = ChatDatabricks(endpoint=LLM_ENDPOINT_NAME, temperature=0.0)


# -----------------------------------------
# 2) Define a structured evaluation model
# -----------------------------------------
# Schema for the evaluator's structured result; it is passed to
# `evaluator_llm.with_structured_output(...)` when the evaluation chain is built.
# NOTE(review): the class docstring and the Field descriptions are presumably
# forwarded to the model as schema text, so treat them as prompt content and
# confirm before rewording them.
class TerminologyEvaluation(BaseModel):
    """A structured model for evaluating terminology consistency."""
    # Integer score. The 1..5 range is stated only in the description text;
    # no validator on this field enforces it.
    consistency_score: int = Field(
        description="Score from 1 to 5 (5=fully consistent, 1=severely inconsistent)."
    )
    # Overall pass/fail verdict, reported separately from the numeric score.
    is_consistent: bool = Field(
        description="Whether the answer is overall compliant with terminology standards."
    )
    # Free-text justification for the score.
    reasoning: str = Field(
        description="Explanation of the score, highlighting strengths and issues."
    )
    # Optional field: defaults to a fresh empty list when omitted.
    suggestions_for_improvement: List[str] = Field(
        default_factory=list,
        description="Concrete suggestions to improve terminology usage."
    )


# -----------------------
# 3) Glossary (data-driven)
# -----------------------
# You can expand this anytime; prompt remains unchanged.
# Each entry holds the standard "term" plus a list of accepted "aliases";
# these are the two keys that format_glossary() reads.
# NOTE(review): this rebinds the module-level name GLOSSARY that the first cell
# of the notebook defines with a different schema ("synonyms", "definition",
# "context_tags"). After this cell runs, any cell that expects the earlier
# schema will see this list instead — consider a distinct name; confirm
# against the cells that follow.
GLOSSARY: List[Dict[str, object]] = [
    {"term": "Convolutional Neural Network", "aliases": ["CNN"]},
    {"term": "Transformer Model", "aliases": ["Transformer", "Transformer Architecture"]},
    {"term": "Large Language Model", "aliases": ["LLM", "Large Language Models"]},
]

def format_glossary(glossary: List[Dict[str, object]]) -> str:
    """
    Turn glossary entries into a newline-separated bullet list.

    Each entry must provide "term". When "aliases" is present and non-empty,
    the aliases are appended in parentheses, comma-separated; a missing,
    ``None`` or empty "aliases" value yields the bare term. An empty glossary
    yields an empty string.
    """
    def _bullet(entry: Dict[str, object]) -> str:
        term = entry["term"]
        names = entry.get("aliases", []) or []
        if not names:
            return f"- {term}"
        return f"- {term} (aliases: {', '.join(names)})"

    return "\n".join(map(_bullet, glossary))

# Rendered once here; supplied to the prompt's {glossary_text} slot at invoke time.
glossary_text = format_glossary(GLOSSARY)


# -----------------------
# 4) Build evaluation chain
# -----------------------
# System message: restricts the evaluator to terminology-only judgments.
_EVAL_SYSTEM_MESSAGE = (
    "You are a strict technical documentation quality evaluator. "
    "You ONLY evaluate terminology usage based on the provided glossary and criteria."
)

# Human message template with two slots: {glossary_text} and {answer_text}.
_EVAL_HUMAN_MESSAGE = """Evaluate the terminology consistency and correctness in the answer.

Evaluation Criteria:
1) Accuracy: Are standard terms used correctly?
2) Compliance: Does the answer avoid unofficial or ambiguous aliases when a standard term should be used?
3) Completeness: Does the answer use the most appropriate standard terms when needed?

Authoritative Terminology Glossary:
{glossary_text}

Answer to Evaluate:
{answer_text}

Rules:
- Score must be an integer 1..5.
- Set is_consistent=true only if terminology is largely compliant (minor issues ok).
- Give concrete rewrite suggestions (phrases to replace), not vague advice.

Return your result as structured output.
"""

evaluation_prompt = ChatPromptTemplate.from_messages([
    ("system", _EVAL_SYSTEM_MESSAGE),
    ("human", _EVAL_HUMAN_MESSAGE),
])

# Prompt -> evaluator LLM configured with TerminologyEvaluation as its output schema.
_structured_evaluator = evaluator_llm.with_structured_output(TerminologyEvaluation)
evaluation_chain = evaluation_prompt | _structured_evaluator


# -----------------------
# 5) Run evaluations
# -----------------------
# Sample answer that uses the glossary's standard terms.
good_answer = (
    "A Large Language Model (LLM) is built on a Transformer Model, while a "
    "Convolutional Neural Network (CNN) is widely used in image-related domains."
)

# Sample answer that uses informal wording instead of the glossary's terms.
bad_answer = (
    "A big model is based on a transformer-style architecture, and a conv net "
    "is very strong at picture processing."
)


def _evaluate_and_report(header: str, answer_text: str) -> TerminologyEvaluation:
    """
    Print `header`, run the evaluation chain on `answer_text`, print the result.

    The printed report is the full result object followed by its score,
    consistency flag and suggestions. Returns the chain's result so callers
    can keep it.
    """
    print(header)
    result: TerminologyEvaluation = evaluation_chain.invoke({
        "glossary_text": glossary_text,
        "answer_text": answer_text
    })
    print(result)
    print("\nScore:", result.consistency_score)
    print("Consistent:", result.is_consistent)
    print("Suggestions:", result.suggestions_for_improvement)
    return result


good_eval: TerminologyEvaluation = _evaluate_and_report(
    "--- Evaluating [Good Answer] ---", good_answer
)
bad_eval: TerminologyEvaluation = _evaluate_and_report(
    "\n--- Evaluating [Needs Improvement Answer] ---", bad_answer
)