# © Artur Czarnecki. All rights reserved.
# Intergrax framework – proprietary and confidential.
# Use, modification, or distribution without written permission is prohibited.

In [1]:
# Prepend the directory two levels above the current working directory to the
# module search path (index 0, so it takes precedence over other entries).
# NOTE(review): presumably this lets `import intergrax` resolve from a source
# checkout when the notebook is run from its own folder — confirm against the repo layout.
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

# Hybrid Multi-Source RAG with Intergrax + LangGraph

This notebook demonstrates a **practical, end-to-end RAG workflow** that combines multiple knowledge sources into a single in-memory vector index and exposes it through a LangGraph-based agent.

We will use **Intergrax** components together with **LangGraph** to:

1. **Ingest content from multiple sources:**
   - Local PDF files (from a given directory),
   - Local DOCX/Word files (from a given directory),
   - Live web results using the Intergrax `WebSearchExecutor`.

2. **Build a unified RAG corpus:**
   - Normalize all documents into a common internal format,
   - (Optionally) attach basic metadata about origin (pdf / docx / web),
   - Split documents into chunks suitable for embedding.

3. **Create an in-memory vector index:**
   - Use an Intergrax embedding manager (e.g. OpenAI / Ollama),
   - Store embeddings in an **in-memory Chroma** collection via Intergrax vectorstore manager,
   - Keep everything ephemeral (no persistence, perfect for “ad-hoc research” scenarios).

4. **Answer user questions with a RAG agent:**
   - The user provides a natural language question,
   - LangGraph orchestrates the flow: load → merge → index → retrieve → answer,
   - An Intergrax `RagAnswerer` (or `WindowedAnswerer`) generates a **single, structured report**:
     - short summary of the relevant information,
     - key insights and conclusions,
     - optionally: recommendations / action items.

---

## What this notebook showcases

- How to combine **local files + web search** in a single RAG pipeline.
- How to plug Intergrax components (loaders, splitter, embeddings, vectorstore, RAG answerer, websearch) into a **LangGraph `StateGraph`**.
- How to build a **temporary, in-memory vector index** for one-off research tasks (no database setup required).
- A clean, production-oriented pattern that can be reused in:
  - internal knowledge explorers,
  - “research bot” agents,
  - prototype assistants for teams or clients.

All code and comments in this notebook are in **English** to keep it ready for public documentation, GitHub examples, and international collaborators.


## 1. Environment and configuration

In this section we prepare the environment for the hybrid multi-source RAG agent.

The goals of this step:

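- Define the configuration values used throughout the notebook (base paths, model names, RAG parameters) as notebook-level constants; API keys are read from environment variables or a `.env` file when the web search layer is set up in the next section.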
- Import the core building blocks from:
  - the Intergrax framework (LLM adapter, embedding manager, vectorstore manager, document loaders, RAG answerer, websearch executor),
  - LangGraph (for defining and running the `StateGraph`),
  - Standard Python modules (`os`, `pathlib`, `typing`, etc.).
- Define the base directories for local documents:
  - one directory for PDF files (e.g. `./data/pdf`),
  - one directory for DOCX files (e.g. `./data/docx`).
- Decide on the core RAG parameters:
  - chunk size and overlap for splitting documents,
  - embedding model name,
  - LLM model name used by the answerer,
  - number of retrieved documents (`top_k`) during similarity search.
- Initialize the core Intergrax components that will be reused across the notebook:
  - an embedding manager (e.g. OpenAI-based or Ollama-based),
  - a vectorstore manager backed by an **in-memory** Chroma collection (no persistence),
  - an LLM adapter used by the RAG answerer (created in the next section, together with the web search layer),
  - a simple text splitter for turning documents into chunks.

At the end of this section we want to have a small configuration block that:

1. Reads configuration (API keys, paths, model names),
2. Instantiates the main Intergrax services (document loader, text splitter, embeddings, vectorstore),
3. Is easy to adjust for different environments (OpenAI vs local models, different directories, different Chroma settings).

In [3]:
from pathlib import Path
import os

from intergrax.rag.documents_loader import IntergraxDocumentsLoader
from intergrax.rag.documents_splitter import IntergraxDocumentsSplitter
from intergrax.rag.embedding_manager import IntergraxEmbeddingManager
from intergrax.rag.vectorstore_manager import IntergraxVectorstoreManager, VSConfig

import intergrax.logging  # initializes logging format/levels for the framework

# ---- Tenant / corpus configuration (for metadata + filtering) ----
# NOTE(review): in this cell these labels are only printed; presumably they are
# attached to chunk metadata later in the notebook — confirm where they are consumed.
TENANT = "intergrax"
CORPUS = "hybrid-multi-source"
VERSION = "v1"

# ---- Base directories for local documents ----
# Relative paths, resolved against the notebook's working directory.
# You can adjust these to your actual layout.
# For the hybrid demo we assume:
#   ../documents/hybrid-corpus/pdf
#   ../documents/hybrid-corpus/docx
BASE_DOCS_DIR = Path("../documents/hybrid-corpus")
PDF_DIR = BASE_DOCS_DIR / "pdf"
DOCX_DIR = BASE_DOCS_DIR / "docx"

# ---- Core RAG parameters ----
# CHUNK_SIZE / CHUNK_OVERLAP are declared for reference only: the splitter
# instantiated in this cell is not given these values.
CHUNK_SIZE = 800
CHUNK_OVERLAP = 150
# Number of documents to retrieve during similarity search.
TOP_K = 8

# Embedding model configuration (using your existing Ollama-based setup)
EMBED_PROVIDER = "ollama"
EMBED_MODEL_NAME = "rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest"
# Passed to the embedding manager as `assume_ollama_dim`.
EMBED_DIM = 1536  # assumed dimension for this model

# Vectorstore configuration
# VS_PERSIST_DIR is forwarded to VSConfig as `chroma_persist_directory`.
# For *ephemeral* usage you can set `chroma_persist_directory=None`
# or point it to a throwaway path. For now we keep a dedicated collection.
VS_COLLECTION_NAME = "hybrid_multi_source_rag"
VS_PERSIST_DIR = None  # set to e.g. "chroma_db/hybrid_multi_source_rag_v1" if you want persistence

# ---- Core Intergrax components (shared by the rest of the notebook) ----

# Document loader, used later to build the hybrid corpus.
# `docx_mode="paragraphs"` loads Word files in finer-grained segments.
doc_loader = IntergraxDocumentsLoader(docx_mode="paragraphs", verbose=True)

# Splitter. Its chunking configuration currently lives inside the class, so
# CHUNK_SIZE / CHUNK_OVERLAP are not passed here; wire them in once the
# splitter exposes chunk_size/overlap parameters.
splitter = IntergraxDocumentsSplitter(verbose=True)

# Embedding manager configured for the Ollama provider.
embed_manager = IntergraxEmbeddingManager(
    provider=EMBED_PROVIDER,
    model_name=EMBED_MODEL_NAME,
    assume_ollama_dim=EMBED_DIM,
    verbose=True,
)

# Vectorstore manager on top of a Chroma collection.
vs_config = VSConfig(
    chroma_persist_directory=VS_PERSIST_DIR,
    collection_name=VS_COLLECTION_NAME,
    provider="chroma",
)
vectorstore = IntergraxVectorstoreManager(verbose=True, config=vs_config)

# Create both local source folders up front (no-op when they already exist).
for _docs_dir in (PDF_DIR, DOCX_DIR):
    os.makedirs(_docs_dir, exist_ok=True)


# Print the effective configuration, one entry per line.
print(
    "Environment initialized.",
    f"TENANT={TENANT}, CORPUS={CORPUS}, VERSION={VERSION}",
    f"PDF_DIR={PDF_DIR}",
    f"DOCX_DIR={DOCX_DIR}",
    f"Vectorstore collection={VS_COLLECTION_NAME}, persist={VS_PERSIST_DIR}",
    sep="\n",
)

2025-11-18 16:28:52,641 [INFO] [intergraxEmbeddingManager] Loading model 'rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest' (provider=ollama)
2025-11-18 16:28:53,175 [INFO] HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-18 16:28:53,177 [INFO] [intergraxEmbeddingManager] Loaded. Embedding dim = 1536


[intergraxVectorstoreManager] Initialized provider=chroma, collection=hybrid_multi_source_rag
[intergraxVectorstoreManager] Existing count: 0
Environment initialized.
TENANT=intergrax, CORPUS=hybrid-multi-source, VERSION=v1
PDF_DIR=..\documents\hybrid-corpus\pdf
DOCX_DIR=..\documents\hybrid-corpus\docx
Vectorstore collection=hybrid_multi_source_rag, persist=None


## 2. Web search setup (Intergrax WebSearchExecutor)

In this section we configure the **web search layer** that will provide live web documents as one of the sources for the hybrid RAG corpus.

We:

- Load API keys from the environment,
- Initialize:
  - `OpenAIChatResponsesAdapter` (LLM used by web search),
  - `WebSearchExecutor` with `GoogleCSEProvider`,
  - `WebSearchContextBuilder` for building condensed context from web docs,
  - (optionally) `WebSearchAnswerer` for standalone web-only QA,
- Provide a small async helper function that returns **serialized web documents** (plain dicts) ready to be merged with local PDF/DOCX documents later in the notebook.

In [4]:

from typing import List, Dict, Any
import os

from dotenv import load_dotenv
from openai import Client

from intergrax.llm_adapters import OpenAIChatResponsesAdapter
from intergrax.websearch.service.websearch_executor import WebSearchExecutor
from intergrax.websearch.context.websearch_context_builder import WebSearchContextBuilder
from intergrax.websearch.service.websearch_answerer import WebSearchAnswerer
from intergrax.websearch.providers.google_cse_provider import GoogleCSEProvider

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# --- Environment variables for OpenAI + Google CSE ---

# Keep the original fallback for OpenAI: when no key is configured the variable
# is set to an empty string instead of being left undefined. An already-set
# value is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "")

# The Google CSE settings have no sensible fallback. Assigning the result of
# `os.getenv(...)` back into `os.environ` fails with an opaque
# "TypeError: str expected, not NoneType" when a variable is missing, so check
# explicitly and report every missing name in a single, readable error.
_missing_google_env = [
    name
    for name in ("GOOGLE_CSE_API_KEY", "GOOGLE_CSE_CX")
    if os.getenv(name) is None
]
if _missing_google_env:
    raise RuntimeError(
        "Missing required environment variable(s) for Google CSE web search: "
        + ", ".join(_missing_google_env)
        + ". Define them in your shell or in a .env file before running this cell."
    )

# --- OpenAI client + Intergrax adapter (used by the websearch layer, reusable later) ---

openai_client = Client()
# Swap the model name for whichever model you prefer.
llm_adapter = OpenAIChatResponsesAdapter(model="gpt-5-mini", client=openai_client)

# --- Web search executor backed by a single Google CSE provider ---

websearch_executor = WebSearchExecutor(
    providers=[GoogleCSEProvider()],
    max_text_chars=2000,
    default_top_k=6,
    default_safe_search=True,
    default_language="en",
    default_locale="en-US",
    default_region="en-US",
)

# --- Context builder + optional web-only answerer ---

context_builder = WebSearchContextBuilder(
    source_label_prefix="Source",
    include_url=True,
    include_snippet=True,
    max_chars_per_doc=1500,
    max_docs=4,
)

# A `system_prompt=` argument (e.g. system_prompts.strict_web_rag) can be
# supplied here as well, if your project defines one.
websearch_answerer = WebSearchAnswerer(
    answer_language="en",
    context_builder=context_builder,
    executor=websearch_executor,
    adapter=llm_adapter,
)

print("Web search layer initialized (Google CSE + OpenAI).")

# --- Async helper for raw web documents (serialized) ---

async def websearch_fetch_serialized(
    question: str,
    top_k: int = 8,
) -> List[Dict[str, Any]]:
    """
    Fetch web search results for `question` in serialized form.

    Delegates to the notebook-level `websearch_executor`, calling
    `search_async` with `serialize=True`, and returns its result unchanged.

    Args:
        question: Text passed to the executor as the search query.
        top_k: Forwarded to the executor as `top_k` (defaults to 8).

    Returns:
        A list of plain dicts (serialized web documents) that can later be
        turned into RAG chunks or handed to WebSearchContextBuilder.
    """
    return await websearch_executor.search_async(
        serialize=True,
        top_k=top_k,
        query=question,
    )


Web search layer initialized (Google CSE + OpenAI).


## 3. Hybrid RAG state definition

To orchestrate the hybrid RAG flow with LangGraph, we will use a single shared state
object that flows through all nodes.

The state should contain:

- `question`: the original user question,
- `pdf_docs`: documents loaded from local PDF files (before splitting),
- `docx_docs`: documents loaded from local DOCX files (before splitting),
- `web_docs_serialized`: web search results in serialized form (plain dicts coming from `WebSearchExecutor.search_async(..., serialize=True)`),
- `split_docs`: the final list of **chunked** documents (LangChain `Document` objects) ready for embedding,
- `vectorstore_ready`: a simple flag indicating that the vectorstore has been built/updated,
- `answer`: the final answer text produced by the RAG pipeline,
- `debug_info`: diagnostic information useful for inspecting what happened
  (counts of docs/chunks, vectorstore collection name, etc.).

We will represent this state as a `TypedDict` so that LangGraph can use it as the
graph state type. This also makes the code easier to reason about and refactor.

In [5]:
from typing import TypedDict, List, Dict, Any, Optional

from langchain_core.documents import Document
from langgraph.graph import StateGraph, END


class HybridRagState(TypedDict, total=False):
    """
    Shared state for the hybrid multi-source RAG pipeline.

    This object flows through all LangGraph nodes and is gradually enriched with:
      - local PDF/DOCX documents,
      - web documents (serialized),
      - split/chunked documents,
      - vectorstore metadata,
      - final RAG answer and debug info.

    The class is declared with ``total=False``, so every key is optional: a
    node may receive a state in which later-stage keys have not been set yet
    and should read them defensively (e.g. ``state.get("debug_info", {})``).
    """

    # User input: the original natural-language question.
    question: str

    # Local documents before splitting (LangChain Document objects),
    # kept in separate lists per source type.
    pdf_docs: List[Document]
    docx_docs: List[Document]

    # Web documents as serialized dicts (from WebSearchExecutor.search_async(..., serialize=True))
    web_docs_serialized: List[Dict[str, Any]]

    # Final chunked documents ready for embedding / indexing
    split_docs: List[Document]

    # Vectorstore status / metadata:
    #   vectorstore_ready      - flag indicating the vectorstore has been built/updated,
    #   vectorstore_collection - collection name, or None.
    vectorstore_ready: bool
    vectorstore_collection: Optional[str]

    # Final answer text produced by the RAG pipeline
    answer: str

    # Misc debug information (counts, timings, etc.); nodes merge their own
    # entries into this dict rather than replacing it.
    debug_info: Dict[str, Any]


## 4. Local documents loading (PDF + DOCX)

In this step we load **local documents** that will form the first part of the hybrid RAG corpus.

We use the existing `IntergraxDocumentsLoader` to:

- Load PDF files from a dedicated directory (e.g. `PDF_DIR`),
- Load DOCX files from a dedicated directory (e.g. `DOCX_DIR`),
- Return them as LangChain `Document` objects.

Each document already carries metadata (such as `source_path`, `source_name`, etc.) that we will later
retain when splitting into chunks and inserting into the vectorstore.

The goals of this step:

- Provide small helper functions:
  - `load_pdf_docs(pdf_dir: Path) -> list[Document]`
  - `load_docx_docs(docx_dir: Path) -> list[Document]`
- Provide a convenience function that loads **both** PDF and DOCX documents and returns:
  - the documents,
  - basic debug information (counts per type),
- Prepare the structure we will later wrap into a LangGraph node that updates `HybridRagState`
  (`pdf_docs`, `docx_docs`, `debug_info`).


In [None]:
from langchain_core.documents import Document
from pathlib import Path
from typing import List, Dict, Any, Tuple


def load_pdf_docs(pdf_dir: Path) -> List[Document]:
    """
    Load the documents found in `pdf_dir` through the notebook-level `doc_loader`.

    The directory path is handed to `doc_loader.load_documents` as a string.
    This function does not filter by file extension itself, so it relies on
    `pdf_dir` being the dedicated PDF folder.

    Returns:
        The loaded documents, or an empty list (after printing a warning)
        when `pdf_dir` does not exist.
    """
    if pdf_dir.exists():
        # Shared loader instance; the whole directory is passed in one call.
        loaded: List[Document] = doc_loader.load_documents(str(pdf_dir))
        print(f"[LOCAL LOAD] PDF docs loaded: {len(loaded)} from {pdf_dir}")
        return loaded

    print(f"[WARN] PDF directory does not exist: {pdf_dir}")
    return []


def load_docx_docs(docx_dir: Path) -> List[Document]:
    """
    Load the documents found in `docx_dir` through the notebook-level `doc_loader`.

    `doc_loader` was created with `docx_mode='paragraphs'`, so Word files are
    loaded with finer-grained paragraph segmentation (before any RAG
    splitting is applied).

    Returns:
        The loaded documents, or an empty list (after printing a warning)
        when `docx_dir` does not exist.
    """
    if docx_dir.exists():
        loaded: List[Document] = doc_loader.load_documents(str(docx_dir))
        print(f"[LOCAL LOAD] DOCX docs loaded: {len(loaded)} from {docx_dir}")
        return loaded

    print(f"[WARN] DOCX directory does not exist: {docx_dir}")
    return []


def load_all_local_docs() -> Tuple[List[Document], List[Document], Dict[str, Any]]:
    """
    Load every local source in one call.

    Reads PDF documents from PDF_DIR and DOCX documents from DOCX_DIR, prints
    a one-line summary, and returns a 3-tuple:

      1. the PDF documents,
      2. the DOCX documents,
      3. a debug dict with the per-type counts and the two directory paths
         (as strings).
    """
    pdf_docs = load_pdf_docs(PDF_DIR)
    docx_docs = load_docx_docs(DOCX_DIR)

    print(
        f"[LOCAL LOAD] Total local docs -> "
        f"PDF: {len(pdf_docs)}, DOCX: {len(docx_docs)}"
    )

    # Counts first, then the directories they were read from.
    stats: Dict[str, Any] = dict(
        pdf_docs_count=len(pdf_docs),
        docx_docs_count=len(docx_docs),
        pdf_dir=str(PDF_DIR),
        docx_dir=str(DOCX_DIR),
    )
    return pdf_docs, docx_docs, stats


## 5. Source loading nodes (local PDF/DOCX + web)

Now that we have:

- a shared `HybridRagState`,
- helpers for loading local documents (`load_all_local_docs()`),
- a helper for fetching web documents as serialized dicts (`websearch_fetch_serialized()`),

we can expose them as **LangGraph nodes**.

We will create two nodes:

1. `load_local_docs_node(state: HybridRagState) -> HybridRagState`  
   - Loads PDF and DOCX documents from the configured directories,
   - Stores them in `state["pdf_docs"]` and `state["docx_docs"]`,
   - Updates `state["debug_info"]` with basic counts and directory paths.

2. `load_web_docs_node(state: HybridRagState) -> HybridRagState` (async)  
   - Uses the user `question` from the state,
   - Calls `websearch_fetch_serialized(question, top_k=...)`,
   - Stores serialized web documents in `state["web_docs_serialized"]`,
   - Updates `state["debug_info"]` with the number of web documents.

These nodes will be the **first steps** of the hybrid RAG pipeline in the LangGraph graph.
Later nodes will split documents, build the vectorstore, and generate the final answer.


In [None]:
from typing import Dict, Any


def load_local_docs_node(state: HybridRagState) -> HybridRagState:
    """
    LangGraph node that adds the local PDF + DOCX documents to the state.

    Returns a new state dict (the incoming one is not mutated) containing
    everything from `state` plus:
      - "pdf_docs" / "docx_docs": the freshly loaded documents,
      - "debug_info": the previous debug info, if any, updated with the
        local-loading stats.
    """
    pdf_docs, docx_docs, local_stats = load_all_local_docs()

    # Copy the existing debug info and layer the new stats on top of it.
    merged_debug = dict(state.get("debug_info", {}), **local_stats)

    return {
        **state,
        "pdf_docs": pdf_docs,
        "docx_docs": docx_docs,
        "debug_info": merged_debug,
    }


async def load_web_docs_node(state: HybridRagState) -> HybridRagState:
    """
    LangGraph node (async) that adds serialized web search results to the state.

    Behavior:
      - reads the question from the state,
      - when the question is missing, None, or blank, prints a notice and
        skips the web search (the result list is empty),
      - otherwise awaits `websearch_fetch_serialized(question, top_k=8)`,
      - returns a new state dict (the incoming one is not mutated) with
        "web_docs_serialized" set and "debug_info" extended by
        "web_docs_count".
    """
    # `state.get("question", "")` only covers a missing key; a key that is
    # present but None would make `.strip()` raise AttributeError, so
    # normalize any falsy value to an empty string first.
    question = (state.get("question") or "").strip()

    if question:
        web_docs_serialized: list[Dict[str, Any]] = await websearch_fetch_serialized(
            question=question,
            top_k=8,
        )
    else:
        print("[WEB LOAD] Empty question in state; skipping web search.")
        web_docs_serialized = []

    # Copy before updating so the caller's debug dict is left untouched.
    debug = dict(state.get("debug_info", {}))
    debug["web_docs_count"] = len(web_docs_serialized)

    new_state: HybridRagState = {
        **state,
        "web_docs_serialized": web_docs_serialized,
        "debug_info": debug,
    }
    return new_state