In [0]:
# %pip install -U -qqqq 
# backoff 
# databricks-langchain 
# langgraph==0.5.3 
# uv 
# databricks-agents 
# mlflow-skinny[databricks] 
# chromadb 
# sentence-transformers 
# langchain-huggingface
# langchain-chroma 
# wikipedia 
# faiss-cpu


%pip install -U transformers sentence-transformers faiss-cpu scikit-learn spacy

In [0]:
dbutils.library.restartPython()

In [0]:
!python -m spacy download en_core_web_sm

In [0]:
# 1. Terminology glossary for RAG variants (keep it updated, including context tags).
#
# Every entry is a dict with the same four keys:
#   "term"         - canonical (standardized) name of the concept
#   "synonyms"     - alternative spellings / abbreviations of the term
#   "definition"   - short human-readable description
#   "context_tags" - keywords associated with the concept
GLOSSARY = [
    {
    "term": "Retrieval-Augmented Generation",
    "synonyms": ["RAG", "retrieval augmented generation", "retrieval-augmented generation"],
    "definition": "A generation framework that retrieves external evidence and conditions an LLM on it.",
    "context_tags": ["LLM", "search", "grounding"],
    },
    {
        "term": "Self-RAG",
        "synonyms": ["Self RAG", "self-rag", "self-reflective RAG", "self-refining RAG"],
        "definition": (
            "A RAG approach where the model explicitly self-checks (e.g., reflect, critique, verify) during generation, "
            "deciding when to retrieve more evidence and how to revise its answer based on feedback signals."
        ),
        "context_tags": ["reflection", "self-critique", "verification", "iterative retrieval", "hallucination mitigation"],
    },
    {
        "term": "Corrective RAG",
        "synonyms": ["C-RAG", "CRAG", "Corrective-RAG", "corrective rag"],
        "definition": (
            "A RAG approach that detects low-quality retrieval or unsupported generations and applies corrective actions "
            "(e.g., re-retrieve, rewrite queries, filter evidence, or re-rank) to improve factual grounding."
        ),
        "context_tags": ["retrieval quality", "re-ranking", "query rewriting", "evidence filtering", "robustness"],
    },
    {
        "term": "Knowledge Graph RAG",
        "synonyms": ["KG-RAG", "KGRAG", "KG RAG", "knowledge-graph RAG", "graph RAG"],
        "definition": (
            "A RAG approach that retrieves and reasons over a knowledge graph (entities/relations) as structured evidence, "
            "often combining graph traversal with text retrieval to support multi-hop and relational questions."
        ),
        "context_tags": ["knowledge graph", "multi-hop reasoning", "entity linking", "graph traversal", "structured grounding"],
    },
    {
        "term": "Entity Linking",
        "synonyms": ["EL", "entity resolution", "mention linking", "entity disambiguation"],
        "definition": (
            "The process of mapping a text mention (e.g., 'Apple') to a canonical entity (e.g., Apple Inc.) in a KB/KG "
            "to support structured retrieval and reduce ambiguity."
        ),
        "context_tags": ["KG-RAG", "disambiguation", "knowledge base", "information extraction"],
    },
    {
        "term": "Evidence Grounding",
        "synonyms": ["grounding", "source grounding", "evidence-based generation"],
        "definition": (
            "Constraining or evaluating generation based on retrieved evidence so claims are supported by sources; "
            "commonly paired with citation, attribution, or entailment checks."
        ),
        "context_tags": ["factuality", "citations", "attribution", "hallucination mitigation"],
    },
]

In [0]:
import re
from typing import List, Dict, Any

class TerminologyProcessor:
    """Glossary-driven terminology normalizer.

    Builds lookup tables from a list of glossary entries (dicts with a ``"term"``
    key and optional ``"synonyms"`` / ``"context_tags"`` lists) and uses them to
    rewrite aliases to their canonical term and to detect canonical terms in text.

    Attributes:
        glossary: The glossary entries passed to the constructor.
        standard_term_map: Lower-cased canonical term -> canonical term as written.
        alias_to_entries_map: Lower-cased alias (canonical term or synonym) -> list
            of glossary entries that use this alias (more than one = ambiguous).
    """

    def __init__(self, glossary: List[Dict[str, Any]]):
        self.glossary = glossary
        self.standard_term_map = {}
        self.alias_to_entries_map = {}
        self._build_mappings()

    def _build_mappings(self):
        """Build mappings; one alias may map to multiple terminology entries to handle ambiguity."""
        for entry in self.glossary:
            standard_term = entry["term"]
            if standard_term.strip():
                self.standard_term_map[standard_term.lower()] = standard_term

            for alias in [standard_term] + list(entry.get("synonyms") or []):
                alias_lower = alias.lower()
                # A blank alias would compile to an empty regex that matches at
                # every position of the text, so it is never registered.
                if not alias_lower.strip():
                    continue
                entries = self.alias_to_entries_map.setdefault(alias_lower, [])
                if entry not in entries:
                    entries.append(entry)

    def standardize_text(self, text: str, context_window: int = 10) -> str:
        """
        Rewrite every known alias in ``text`` to its canonical glossary term.

        All aliases are combined into ONE alternation regex (longest alias first)
        and applied in a single pass over the original text. Replaced text is
        therefore never scanned again, so a canonical term that itself contains a
        shorter alias (e.g. "Self-RAG" contains "RAG", "Evidence Grounding"
        contains "grounding") is left intact instead of being expanded a second
        time.

        Args:
            text: Input text.
            context_window: Number of characters inspected on each side of a match
                when an alias belongs to more than one glossary entry.

        Returns:
            The text with aliases replaced by canonical terms.
        """
        # Longest first: at a given position the regex engine tries alternatives
        # in order, so the most specific alias wins over its own substrings.
        keys = [
            key
            for key in sorted(self.alias_to_entries_map.keys(), key=len, reverse=True)
            if key
        ]
        if not keys:
            return text

        alternatives = []
        for key_lower in keys:
            escaped = re.escape(key_lower)
            if re.search(r"[a-zA-Z]", key_lower):
                # Latin-letter aliases: lookarounds avoid matching inside a larger word.
                alternatives.append(r"(?<![a-zA-Z])(" + escaped + r")(?![a-zA-Z])")
            else:
                # Non-Latin aliases (e.g. Chinese): match exactly.
                alternatives.append("(" + escaped + ")")
        # Exactly one capturing group per alias, so group i belongs to keys[i - 1].
        pattern = re.compile("|".join(alternatives), flags=re.IGNORECASE)

        def replacer(match: re.Match) -> str:
            possible_entries = self.alias_to_entries_map[keys[match.lastindex - 1]]
            if len(possible_entries) == 1:
                return possible_entries[0]["term"]

            # --- Context-based disambiguation (case-sensitive clue lookup) ---
            context_snippet = text[
                max(0, match.start() - context_window) : min(len(text), match.end() + context_window)
            ]
            for entry in possible_entries:
                clues = entry.get("context_tags", []) + [entry["term"]]
                if any(clue in context_snippet for clue in clues):
                    return entry["term"]
            # If no context clue is found, fall back to the first definition
            return possible_entries[0]["term"]

        return pattern.sub(replacer, text)

    def extract_terms(self, text: str) -> List[str]:
        """
        Extract known standardized terms from text.

        Matching is a case-insensitive plain substring test on the canonical
        terms only (synonyms are not searched), without word boundaries.

        Returns:
            Sorted list of canonical terms that occur in ``text``.
        """
        text_lower = text.lower()
        found_terms = {
            original_standard_term
            for standard_term_lower, original_standard_term in self.standard_term_map.items()
            if standard_term_lower in text_lower
        }
        return sorted(found_terms)

In [0]:
# 1. Initialize the terminology processor with the glossary.
#    `term_processor` is reused by later cells in this notebook.
term_processor = TerminologyProcessor(GLOSSARY)
print("--- 1. Initialize the terminology processor with the glossary ---")

# 2. Data preprocessing: terminology standardization.
#    Glossary aliases found in the text are rewritten to their canonical term.
#    `processed_query` and `processed_document` feed the term-extraction cell below.
print("--- 2. Data Preprocessing: Terminology Standardization ---")
user_query = "I want to learn about applications of using RAG."
processed_query = term_processor.standardize_text(user_query)
print(f"Original query: {user_query}")
print(f"Standardized query: {processed_query}")

document_text = "Recently I studied Self-RAG."
processed_document = term_processor.standardize_text(document_text)
print(f"Original document: {document_text}")
print(f"Standardized document: {processed_document}")

In [0]:
# 3. Term extraction (for downstream vectorization or metadata tagging).
#    Operates on the standardized strings from the preprocessing cell, so that
#    cell must have been run first.
print("\n--- 3. Term Extraction ---")
extracted_terms_query = term_processor.extract_terms(processed_query)
print(f"Extracted terms from query: {extracted_terms_query}")

extracted_terms_document = term_processor.extract_terms(processed_document)
print(f"Extracted terms from document: {extracted_terms_document}")

In [0]:
# 4. Simulated vector storage and retrieval augmentation (conceptual).
#    Narrative only: this cell prints a description of the embedding /
#    vector-store stage and computes nothing.
print("\n--- 4. Simulated Vector Storage and Retrieval Augmentation (Conceptual) ---")
print("In a real application, we would use an embedding model (e.g., SentenceTransformers) to convert the standardized text and terms into vectors.")
print("These vectors would then be stored in a dedicated vector database (e.g., FAISS, Pinecone, or Weaviate) for efficient similarity search.")
print("During retrieval, the user query is first standardized and vectorized, then used to query the vector database to fetch relevant documents.")

In [0]:
# 5. Simulated retrieval augmentation: query expansion
def enhance_query_for_retrieval(query: str, processor: TerminologyProcessor) -> List[str]:
    """Return a sorted keyword list for retrieval, widened with glossary synonyms.

    The list always contains the standardized query itself. For every canonical
    term detected in the standardized query, the term and the synonyms of the
    first glossary entry carrying that term are added as well.
    """
    normalized = processor.standardize_text(query)
    keywords = {normalized}

    for canonical in processor.extract_terms(normalized):
        keywords.add(canonical)
        # Only the first glossary entry with this canonical term contributes synonyms.
        first_match = next(
            (candidate for candidate in processor.glossary if candidate["term"] == canonical),
            None,
        )
        if first_match is not None:
            keywords.update(first_match.get("synonyms", []))

    return sorted(keywords)

In [0]:
# Demo: expand a sample question into retrieval keywords using the glossary.
print("\n--- 5. Simulated Retrieval Augmentation: Query Expansion ---")
original_query_for_retrieval = "I want to know what a RAG does in an LLM?"
expanded_keywords = enhance_query_for_retrieval(original_query_for_retrieval, term_processor)
print(f"Original retrieval query: {original_query_for_retrieval}")
print(f"Expanded retrieval keyword list: {expanded_keywords}")

In [0]:
import spacy
import re

DASHES = r"[-\u2010\u2011\u2012\u2013\u2014\u2212]"  # common dash chars

def alias_to_pattern(alias: str):
    """Convert a glossary alias into a token pattern (list of token-attribute dicts).

    Whitespace-separated words become case-insensitive ``{"LOWER": ...}`` tokens.
    Every dash character in the alias becomes its own ``{"TEXT": {"REGEX": DASHES}}``
    token, so an alias with several dashes ("end-to-end RAG") yields one dash
    token per dash instead of leaving a word such as "to-end" glued together.

    Args:
        alias: Canonical term or synonym, e.g. "Self-RAG" or "graph RAG".

    Returns:
        List of token-attribute dicts, one per expected token.
    """
    pattern = []
    # re.split returns [alias] when there is no dash, so plain phrases take the
    # same path as dashed ones.
    for position, segment in enumerate(re.split(DASHES, alias)):
        if position:
            # One dash separated this segment from the previous one.
            pattern.append({"TEXT": {"REGEX": DASHES}})
        pattern.extend({"LOWER": word.lower()} for word in segment.split())
    return pattern

# spaCy pipelines are expensive to load; keep one loaded instance per model name.
_TERM_NLP_CACHE = {}


def _get_term_nlp(model_name: str = "en_core_web_sm"):
    """Load the named spaCy pipeline on first use and reuse it afterwards."""
    if model_name not in _TERM_NLP_CACHE:
        _TERM_NLP_CACHE[model_name] = spacy.load(model_name)
    return _TERM_NLP_CACHE[model_name]


def extract_terms_with_ruler(text, glossary):
    """Find glossary terms/aliases in ``text`` with a spaCy entity ruler.

    The pipeline is loaded once and cached; on later calls the existing
    ``entity_ruler`` pipe is cleared and refilled from ``glossary``, so calls
    with different glossaries do not leak patterns into each other.

    Args:
        text: Text to scan.
        glossary: Iterable of dicts with a "term" key and an optional
            "synonyms" list.

    Returns:
        Set of matched surface strings (entities labelled "TERM").
    """
    nlp = _get_term_nlp()

    # avoid adding duplicate pipes when the cached pipeline is reused
    if "entity_ruler" in nlp.pipe_names:
        ruler = nlp.get_pipe("entity_ruler")
        ruler.clear()
    else:
        ruler = nlp.add_pipe("entity_ruler", before="ner", config={"overwrite_ents": True})

    patterns = []
    for entry in glossary:
        for alias in [entry.get("term")] + list(entry.get("synonyms") or []):
            if not alias:
                continue
            token_pattern = alias_to_pattern(alias)
            if not token_pattern:
                # whitespace-only alias: nothing to match on
                continue
            patterns.append({"label": "TERM", "pattern": token_pattern})

    # prefer longer patterns first (helps with overlaps like KG-RAG vs RAG)
    patterns.sort(key=lambda p: len(p["pattern"]), reverse=True)
    ruler.add_patterns(patterns)

    doc = nlp(text)
    return {ent.text for ent in doc.ents if ent.label_ == "TERM"}

In [0]:
# Example: standardize a query, then extract term candidates with the entity ruler.
# NOTE: this rebinds `processed_query` from the earlier standardization cell.
query = "Examples of KG-RAG" 
processed_query = term_processor.standardize_text(query)
print(processed_query)
# Manual override of the standardized query, left disabled:
# processed_query = "Examples of Knowledge Graph Retrieval-Augmented Generation"
candidates = extract_terms_with_ruler(processed_query, GLOSSARY)
print(f"Automatically extracted term candidates: {candidates}")


### Configure LLM and Embeddings

In [0]:
from sentence_transformers import SentenceTransformer, util

# It is recommended to load the model once during project initialization
# to avoid repeated loading overhead.
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def map_synonyms_by_similarity(main_terms: list, candidates: list, threshold: float = 0.8, model=None) -> dict:
    """
    Map candidate terms to the closest standard terms by computing
    cosine similarity between embeddings.

    Args:
        main_terms (list): List of standard (canonical) terms.
        candidates (list): List of candidate synonyms to be matched.
        threshold (float): Similarity threshold; a candidate is kept only when
                           its similarity is strictly greater than this value.
        model: Optional already-loaded embedding model exposing
               ``encode(texts, convert_to_tensor=True)``. Pass one to avoid
               re-loading the model on every call; when omitted,
               "paraphrase-MiniLM-L6-v2" is loaded as before.

    Returns:
        dict: A dictionary mapping each standard term to a list of
              successfully matched synonyms.
    """
    matched_synonyms = {term: [] for term in main_terms}

    # Nothing to compare: return before any model is loaded.
    if not main_terms or not candidates:
        return matched_synonyms

    if model is None:
        model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

    # Encode terms and candidates in one batch, then split the result again.
    embeddings = model.encode(list(main_terms) + list(candidates), convert_to_tensor=True)
    term_embeddings = embeddings[:len(main_terms)]
    candidate_embeddings = embeddings[len(main_terms):]

    # Rows correspond to main_terms, columns to candidates.
    similarity_matrix = util.cos_sim(term_embeddings, candidate_embeddings)

    for i, term in enumerate(main_terms):
        for j, candidate in enumerate(candidates):
            if similarity_matrix[i][j] > threshold:
                matched_synonyms[term].append(candidate)

    return matched_synonyms

In [0]:
# Demo: match every glossary term and synonym against the single main term "RAG".
main_terms_to_map = ["RAG"]
# Candidate pool = all canonical terms followed by all synonyms from GLOSSARY.
all_possible_synonyms = [
    entry["term"]
    for entry in GLOSSARY
] + [
    synonym
    for entry in GLOSSARY
    for synonym in entry.get("synonyms", [])
]
optimized_mapped_synonyms = map_synonyms_by_similarity(
    main_terms_to_map,
    all_possible_synonyms
)
print("\nOptimized matched synonyms:", optimized_mapped_synonyms)

In [0]:
# Load the embedding model once; `model` is passed to build_term_vector_index
# in a later cell.
model_name = 'paraphrase-MiniLM-L6-v2'

print(f"\nAttempting to load model '{model_name}'...")
model = SentenceTransformer(model_name)

In [0]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import Dict, Tuple, List


def build_term_vector_index(
    term_glossary: Dict[str, dict],
    model: SentenceTransformer,
    use_cosine: bool = True
) -> Tuple[faiss.Index, List[str]]:
    """
    Embed every canonical term and synonym and store the vectors in a flat FAISS index.

    Args:
        term_glossary (dict):
            Mapping of canonical term -> dict that may hold a 'synonyms' list.
            Non-list 'synonyms' values are ignored.
        model (SentenceTransformer):
            Loaded embedding model used to encode the terms.
        use_cosine (bool):
            When true, embeddings are normalized and an inner-product index is
            built; otherwise an L2 index is built on the raw embeddings.

    Returns:
        tuple:
            (faiss_index, indexed_terms)
            - faiss_index: FAISS index holding one vector per indexed term
            - indexed_terms: sorted, de-duplicated terms; position i matches index row i

    Raises:
        ValueError: If the glossary yields no terms at all.
    """
    # Gather canonical terms plus their list-typed synonyms.
    gathered: List[str] = []
    for canonical_term, info in term_glossary.items():
        gathered.append(canonical_term)
        extra = info.get("synonyms", [])
        gathered += extra if isinstance(extra, list) else []

    # Sorting the de-duplicated set fixes the row order of the index.
    indexed_terms = sorted(set(gathered))
    if len(indexed_terms) == 0:
        raise ValueError("No terms found in glossary.")

    print("Generating term embeddings...")
    raw_vectors = model.encode(
        indexed_terms,
        convert_to_numpy=True,
        normalize_embeddings=use_cosine,
        show_progress_bar=True
    )
    embeddings = raw_vectors.astype("float32")
    dim = embeddings.shape[1]

    # Inner product on normalized vectors equals cosine similarity.
    index_factory = faiss.IndexFlatIP if use_cosine else faiss.IndexFlatL2
    index = index_factory(dim)
    index.add(embeddings)

    metric = "cosine similarity" if use_cosine else "L2 distance"
    print(f"FAISS index built successfully. "
          f"Vectors: {index.ntotal}, Dimension: {dim}, Metric: {metric}")

    return index, indexed_terms

In [0]:
from typing import List, Dict, Any

def convert_glossary_to_term_dict(
    glossary: List[Dict[str, Any]],
    deduplicate: bool = True,
    keep_term_as_synonym: bool = False
) -> Dict[str, Dict[str, list]]:
    """
    Reshape the list-style glossary into a dict keyed by canonical term.

    Entries without a non-empty "term" are dropped; a "synonyms" value that is
    not a list is treated as an empty list. When several entries share a term,
    the last one wins.

    Args:
        glossary: list of entries shaped like the GLOSSARY items
        deduplicate: drop repeated synonyms, keeping the first occurrence
        keep_term_as_synonym: put the canonical term at the front of its synonyms

    Returns:
        dict: {canonical_term: {"synonyms": [...]}}
    """
    converted = {}

    for record in glossary:
        canonical = record.get("term")
        if canonical:
            aliases = record.get("synonyms", [])
            if not isinstance(aliases, list):
                aliases = []

            if keep_term_as_synonym:
                aliases = [canonical, *aliases]

            if deduplicate:
                # dict keys keep insertion order, so first occurrences survive in order
                aliases = list(dict.fromkeys(aliases))

            converted[canonical] = {"synonyms": aliases}

    return converted

In [0]:
# Convert the list-based GLOSSARY into the {term: {"synonyms": [...]}} layout
# expected by build_term_vector_index.
term_glossary = convert_glossary_to_term_dict(GLOSSARY)
print(term_glossary)

In [0]:
# Build the FAISS index from the converted glossary using the model loaded above.
# `indexed_terms[i]` is the term stored at row i of `index`.
index, indexed_terms = build_term_vector_index(
    term_glossary=term_glossary,
    model=model,
    use_cosine=True
)

In [0]:
def search_terms(
    query: str,
    model: SentenceTransformer,
    index: faiss.Index,
    indexed_terms: List[str],
    top_k: int = 5,
    use_cosine: bool = True
):
    """
    Search the FAISS index for the most similar terms to a query.

    Args:
        query (str): Input query text.
        model (SentenceTransformer): SentenceTransformer model.
        index (faiss.Index): FAISS index.
        indexed_terms (list): Terms aligned with index rows.
        top_k (int): Maximum number of results to return. Values larger than the
            number of indexed vectors are clamped; values <= 0 give an empty list.
        use_cosine (bool): Whether the query embedding is normalized before the
            search. Should match how ``index`` was built.

    Returns:
        list of (term, score) tuples, in the order returned by ``index.search``.
        The score is the raw value reported by the index.
    """
    # Never ask for more neighbours than the index holds, and skip the search
    # (and the encoding work) when there is nothing to fetch.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = model.encode(
        [query],
        convert_to_numpy=True,
        normalize_embeddings=use_cosine
    ).astype("float32")

    scores, indices = index.search(query_embedding, k)

    results = []
    for score, idx in zip(scores[0], indices[0]):
        # -1 marks a slot for which the index returned no neighbour
        if idx == -1:
            continue
        results.append((indexed_terms[int(idx)], float(score)))

    return results

In [0]:
# Summary of the index built earlier: vector count and the terms behind each row.
print("\n--- Index Build Successful ---")
print("Number of vectors in FAISS index:", index.ntotal)
print("Indexed terms:", indexed_terms)