## **Config (paths, model names, imports)**


In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Set LangChain configuration from environment variables
os.environ['LANGSMITH_TRACING'] = os.getenv('LANGSMITH_TRACING')
os.environ['LANGSMITH_ENDPOINT'] = os.getenv('LANGSMITH_ENDPOINT')
os.environ['LANGSMITH_API_KEY'] = os.getenv('LANGSMITH_API_KEY')    
os.environ['LANGSMITH_PROJECT'] = os.getenv('LANGSMITH_PROJECT')

#API KEY
os.environ['GROQ_API_KEY'] = os.getenv('GROQ_API_KEY')

In [2]:
# Necessary packages
from langchain_community.document_loaders import DirectoryLoader, TextLoader
import yaml
from pathlib import Path
from typing import Dict, List, Any

## **Extract metadata from mkdocs.yml**


In [None]:
# Load mkdocs.yml configuration
# Get the project root directory by checking for fastapi directory
current_dir = Path.cwd()
# Check if fastapi directory exists in current dir or parent
if (current_dir / "fastapi").exists():
    project_root = current_dir
elif (current_dir.parent / "fastapi").exists():
    project_root = current_dir.parent
else:
    # Fallback: assume parent directory (notebook is in rag-learning/rag-learning/)
    project_root = current_dir.parent

mkdocs_path = project_root / "fastapi" / "docs" / "en" / "mkdocs.yml"
docs_base_path = project_root / "fastapi" / "docs" / "en" / "docs"

# Convert to string for compatibility
mkdocs_path = str(mkdocs_path)
docs_base_path = str(docs_base_path)

# Pre-process the YAML file to handle the problematic !!python/name: tag
# We'll replace it with a null value since we only need the 'nav' section
import re

with open(mkdocs_path, 'r') as f:
    yaml_content = f.read()
    
# Replace the problematic Python tag with null using regex
# This handles the tag whether it's on its own or part of a value
yaml_content = re.sub(r'!!python/name:[^\s]+', 'null', yaml_content)

# Now parse the cleaned YAML
mkdocs_config = yaml.safe_load(yaml_content)

# Extract navigation structure
nav_structure = mkdocs_config.get('nav', [])

# Build a mapping from file paths to their metadata
def extract_nav_metadata(nav_items: List[Any], parent_path: List[str] = None) -> Dict[str, Dict[str, Any]]:
    """
    Recursively extract navigation metadata from mkdocs nav structure.
    Returns a dictionary mapping file paths to their metadata.
    """
    if parent_path is None:
        parent_path = []
    
    metadata_map = {}
    
    for item in nav_items:
        if isinstance(item, dict):
            # Handle dictionary items (e.g., {"Tutorial - User Guide": [...]})
            for key, value in item.items():
                if isinstance(value, list):
                    # Recursive case: nested navigation
                    new_path = parent_path + [key]
                    metadata_map.update(extract_nav_metadata(value, new_path))
                elif isinstance(value, str):
                    # Leaf case: key is section name, value is file path
                    file_path = value
                    metadata_map[file_path] = {
                        'section': key,
                        'category_path': parent_path + [key],
                        'top_level_category': parent_path[0] if parent_path else key,
                        'subcategory': parent_path[-1] if len(parent_path) > 1 else None,
                    }
        elif isinstance(item, str):
            # Handle simple string items (e.g., "tutorial/first-steps.md")
            file_path = item
            metadata_map[file_path] = {
                'section': None,
                'category_path': parent_path.copy(),
                'top_level_category': parent_path[0] if parent_path else 'Root',
                'subcategory': parent_path[-1] if parent_path else None,
            }
    
    return metadata_map

# Create the metadata mapping
nav_metadata_map = extract_nav_metadata(nav_structure)

# Display some examples
print(f"Total documents in navigation: {len(nav_metadata_map)}")
print("\nExample metadata entries:")
for i, (path, meta) in enumerate(list(nav_metadata_map.items())[:5]):
    print(f"\n{i+1}. {path}")
    print(f"   Category Path: {' > '.join(meta['category_path'])}")
    print(f"   Top Level: {meta['top_level_category']}")
    if meta['subcategory']:
        print(f"   Subcategory: {meta['subcategory']}")


Total documents in navigation: 144

Example metadata entries:

1. index.md
   Category Path: FastAPI
   Top Level: FastAPI

2. features.md
   Category Path: 
   Top Level: Root

3. learn/index.md
   Category Path: Learn
   Top Level: Learn
   Subcategory: Learn

4. python-types.md
   Category Path: Learn
   Top Level: Learn
   Subcategory: Learn

5. async.md
   Category Path: Learn
   Top Level: Learn
   Subcategory: Learn


## **Build dense vs lightweight corpora**


In [None]:
# Helper function to enrich documents with mkdocs metadata
def enrich_document_metadata(doc, metadata_map: Dict[str, Dict[str, Any]], docs_base_path: str):
    """Add mkdocs navigation metadata to a document based on its file path."""
    # Get the relative path from the document's source
    source_path = doc.metadata.get('source', '')
    
    # Convert absolute path to relative path from docs base
    if source_path.startswith(docs_base_path):
        relative_path = source_path[len(docs_base_path):].lstrip('/')
        
        # Normalize paths for comparison (handle both with/without .md extension)
        relative_path_normalized = relative_path.replace('.md', '')
        
        # Try exact match first
        if relative_path in metadata_map:
            nav_meta = metadata_map[relative_path]
        else:
            # Try matching by normalized path (without .md)
            nav_meta = None
            for nav_path, meta in metadata_map.items():
                nav_path_normalized = nav_path.replace('.md', '')
                # Match if paths are the same when normalized
                if nav_path_normalized == relative_path_normalized:
                    nav_meta = meta
                    break
                # Also try if the relative path ends with the nav path
                if relative_path.endswith(nav_path) or nav_path in relative_path:
                    nav_meta = meta
                    break
        
        # Add metadata if found
        if nav_meta:
            doc.metadata['section'] = nav_meta.get('section')
            doc.metadata['category_path'] = ' > '.join(nav_meta.get('category_path', []))
            doc.metadata['top_level_category'] = nav_meta.get('top_level_category')
            doc.metadata['subcategory'] = nav_meta.get('subcategory')
    
    return doc

# Dense corpus: tutorials + advanced guides
dense_loader = DirectoryLoader(
    docs_base_path,
    glob="tutorial/**/*.md",
    loader_cls=TextLoader,
)
dense_docs = dense_loader.load()
for d in dense_docs:
    d.metadata["corpus"] = "dense_docs"
    # Enrich with mkdocs metadata
    enrich_document_metadata(d, nav_metadata_map, docs_base_path)

# Lightweight corpus: e.g. fastapi-best-practices repo
faq_path = project_root / "fastapi-best-practices"
faq_loader = DirectoryLoader(
    str(faq_path),
    glob="README.md",
    loader_cls=TextLoader,
)
faq_docs = faq_loader.load()
for d in faq_docs:
    d.metadata["corpus"] = "faq_docs"

# Display enriched metadata
print("Sample enriched document metadata:")
print(dense_docs[2].metadata)

Sample enriched document metadata:
{'source': '/Users/dimitar/Desktop/Software_Dev/rag-learning/fastapi/docs/en/docs/tutorial/query-params.md', 'corpus': 'dense_docs', 'section': None, 'category_path': 'Learn > Tutorial - User Guide', 'top_level_category': 'Learn', 'subcategory': 'Tutorial - User Guide'}


## **Chunk + embed + index (two vector stores)**


In [5]:
from typing import Optional
from datetime import date
from pydantic import BaseModel, Field

class FastAPISearch(BaseModel):
    """
    Structured query for searching over FastAPI documentation chunks.
    The LLM will fill this from a natural-language question.
    """

    # What to search for semantically
    text: str = Field(
        ...,
        description=(
            "Main semantic search query over the document content. "
            "Use natural language describing the user's problem or question."
        ),
    )

    # High-level doc classification
    top_level_category: Optional[str] = Field(
        None,
        description=(
            "Top-level documentation category, such as 'Learn', "
            "'Reference', or 'Tutorials'. "
            "Use when the user seems to want conceptual/how-to material "
            "vs pure API reference."
        ),
    )

    subcategory: Optional[str] = Field(
        None,
        description=(
            "More specific documentation subcategory, such as "
            "'Tutorial - User Guide', 'Advanced User Guide', etc. "
            "Use when the user implicitly asks for guides or tutorials."
        ),
    )

    corpus: Optional[str] = Field(
        None,
        description=(
            "Internal corpus label (for example 'dense_docs', 'api_reference', "
            "'examples'). Use when the user implicitly wants a particular type "
            "of docs (e.g., reference vs prose)."
        ),
    )

    # Optional filters you might add later if your metadata has them
    earliest_publish_date: Optional[date] = Field(
        None,
        description=(
            "Earliest publish date for documents, inclusive. "
            "Use only if the user explicitly cares about recent or old docs."
        ),
    )

    latest_publish_date: Optional[date] = Field(
        None,
        description=(
            "Latest publish date for documents, exclusive. "
            "Use only if the user explicitly limits the time range."
        ),
    )

    def pretty_print(self) -> None:
        # Access model_fields from the class, not the instance (Pydantic v2.11+)
        model_fields = self.__class__.model_fields
        for field_name in model_fields.keys():
            field_value = getattr(self, field_name, None)
            field_info = model_fields[field_name]
            # Get default value, handling Pydantic v2 FieldInfo structure
            default_value = getattr(field_info, 'default', None)
            # Skip if value is None or equals the default
            if field_value is not None and field_value != default_value:
                print(f"{field_name}: {field_value}")


In [6]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

system = """You are an assistant that converts natural language questions
into structured search queries for the FastAPI documentation.

Your job is to:
- Extract a semantic search text that captures the user's problem or topic.
- Set metadata fields (top_level_category, subcategory, corpus, etc.)
  only when they are clearly implied by the question.

If you are unsure about a field, leave it empty (null).
Do NOT invent metadata values that are not supported by the schema.
Do NOT try to expand or reinterpret unknown acronyms; keep them as-is."""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0)  # or any chat model
structured_llm = llm.with_structured_output(FastAPISearch)
query_analyzer = prompt | structured_llm


In [7]:
query_analyzer.invoke({"question": "what is error 422"}).pretty_print()

text: error 422
top_level_category: Reference


## **Implement routing (which corpus, which strategy)**


## **Implement query translation (multi-query, maybe HyDE later)**


## **Retrieval + fusion + answer generation**


## **Small evaluation loop**
