# © Artur Czarnecki. All rights reserved.
# Intergrax framework – proprietary and confidential.
# Use, modification, or distribution without written permission is prohibited.


In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..", "..")))

### Load files with metadata

In [None]:
from intergrax.rag.documents_loader import DocumentsLoader

import intergrax.logging

doc_loader = DocumentsLoader(verbose=True, docx_mode="paragraphs")
docs = doc_loader.load_documents("../documents/mooff-strategy",)
print(f"Loaded docs: {len(docs)}")

### Split documents into chunks

In [None]:
import sys, os

# Extend Python's import path to include the parent directory.
# This enables direct import of the Intergrax framework modules
# without requiring a separate installation.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.documents_splitter import DocumentsSplitter
from intergrax.rag.documents_loader import DocumentsLoader
from langchain_core.documents import Document
import intergrax.logging

# Initialize the document loader.
# The 'docx_mode="paragraphs"' option indicates that DOCX files will be parsed
# into separated paragraph-level units rather than full-document chunks.
doc_loader = DocumentsLoader(verbose=True, docx_mode="paragraphs")

# Load all documents from the given directory.
# Supported formats depend on the loader configuration
# (e.g., PDF, DOCX, TXT, Markdown).
docs = doc_loader.load_documents("../documents/mooff-strategy")

# Initialize the document splitter.
# This component breaks larger documents into smaller segments (chunks),
# which improves embedding granularity and retrieval efficiency in RAG pipelines.
splitter = DocumentsSplitter(verbose=True)

# Perform chunking on the loaded document set.
# Splitting boundaries depend on the configured chunk size and rules in the splitter.
split_docs = splitter.split_documents(documents=docs)

# Print a summary of the transformation process for validation and debugging.
print(f"Loaded docs: {len(docs)} - splitter produced {len(split_docs)} chunks")


# Embedding

In [None]:
import sys, os

# Ensure the parent directory is included in the Python path.
# This allows importing the Intergrax framework modules without requiring installation.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.documents_splitter import DocumentsSplitter
from intergrax.rag.documents_loader import DocumentsLoader
from intergrax.rag.embedding_manager import EmbeddingManager
import intergrax.logging

# Initialize the document loader.
# The loader is responsible for scanning the directory and normalizing files
# into a unified structured format used by the RAG pipeline.
doc_loader = DocumentsLoader(verbose=True)

# Load raw documents from the specified directory.
# Supported formats depend on the loader configuration (PDF, DOCX, TXT, etc.).
docs = doc_loader.load_documents("../documents/mooff-strategy/doc")

# Initialize the document splitter.
# This component divides documents into smaller chunks to ensure optimal
# embedding granularity for efficient vector search and context retrieval.
splitter = DocumentsSplitter(verbose=True)

# Perform the split operation on the loaded documents.
# Splitting typically occurs by paragraphs, tokens, or semantic boundaries.
split_docs = splitter.split_documents(documents=docs)

# Initialize the embedding manager.
# This component creates vector embeddings for each document chunk.
# Here we specify Ollama as the provider and use a model optimized for embeddings.
embed_manager = EmbeddingManager(
    verbose=True,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536  # Required if model metadata is not auto-detected.
)

# Generate embeddings for the chunked documents.
# The chunks produced by the splitter (not the raw, unsplit documents) are
# embedded so that vector granularity matches what retrieval operates on.
# The call is unpacked into the embedding vectors and the documents they
# belong to.
embeddings, documents = embed_manager.embed_documents(docs=split_docs)

# Print summary information for verification and debugging.
print(
    f"Embedding length: {len(embeddings)} for chunks: {len(split_docs)} "
    f"(from {len(docs)} documents)"
)


# Vectorstore

In [None]:
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.documents_splitter import DocumentsSplitter
from intergrax.rag.documents_loader import DocumentsLoader
from langchain_core.documents import Document
from collections import Counter

TENANT = "intergrax"
CORPUS = "intergrax-strategy"
VERSION = "v1"

# 0) Initialize the vector store at startup (without loading/splitting/embedding yet).
#    This gives us a handle to the underlying database and allows for a light-weight
#    presence check before performing any expensive ingestion work.
cfg = VSConfig(
    provider="chroma",
    collection_name="intergrax_docs",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
)
store = VectorstoreManager(config=cfg, verbose=True)

# 1) Create an embedding manager used only for a lightweight "probe" query
#    to check whether the target corpus is already present in the vector store.
#    This avoids embedding all chunks if they are already ingested.
embed_manager = EmbeddingManager(
    verbose=False,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536
)

def corpus_present(store: VectorstoreManager, embed_mgr: EmbeddingManager) -> bool:
    """
    Report whether the target corpus already seems to be stored in `store`.

    An empty store short-circuits to False. Otherwise a single probe string
    is embedded and used for a top-1 query restricted to records whose
    metadata matches the module-level TENANT, CORPUS and VERSION; any
    returned ID counts as evidence that the corpus is present.
    """
    # Nothing stored at all, so the corpus cannot be there.
    if store.count() == 0:
        return False

    # Metadata filter limiting matches to the current tenant/corpus/version.
    scope = (("tenant", TENANT), ("corpus", CORPUS), ("version", VERSION))
    metadata_filter = {"$and": [{field: {"$eq": value}} for field, value in scope]}

    # One probe embedding is enough; only the existence of a hit matters.
    probe_vector = embed_mgr.embed_one("probe")
    result = store.query(query_embeddings=probe_vector, top_k=1, where=metadata_filter)

    matched_ids = result["ids"]
    if not matched_ids:
        return False
    return bool(matched_ids[0])

# Determine whether ingestion is needed based on the above probe.
NEED_INGEST = not corpus_present(store, embed_manager)
print(f"Need ingest: {NEED_INGEST}")

if not NEED_INGEST:
    # Corpus already present: skip loading, splitting and embedding.
    # This prevents unnecessary and expensive re-ingestion work.
    print("[INGEST] Skipping — corpus already present.")
else:
    # 2) Load raw documents from the specified directory.
    #    At this stage we only bring documents into memory; no embeddings yet.
    doc_loader = DocumentsLoader(verbose=False)
    docs = doc_loader.load_documents("../documents/mooff-strategy/doc")

    # 3) Split documents into chunks and inject metadata (tenant/corpus/version).
    #    The metadata function is called for each chunk and merged into its metadata.
    def add_meta(chunk_doc: Document, idx: int, total: int):
        """
        Build the extra metadata attached to a chunk.

        The arguments are not used here (presumably they exist to match the
        callback signature the splitter calls — verify against the splitter):
        every chunk receives the same tenant/corpus/version tags.
        """
        # Extend this mapping to tag chunks with more fields (e.g. language, domain).
        corpus_tags = dict(tenant=TENANT, corpus=CORPUS, version=VERSION)
        return corpus_tags

    splitter = DocumentsSplitter(verbose=False)
    split_docs = splitter.split_documents(documents=docs, call_custom_metadata=add_meta)

    # 4) Embed only the chunked documents.
    #    This keeps the embedding granularity aligned with retrieval resolution.
    embeddings, documents = embed_manager.embed_documents(docs=split_docs)

    # 5) Build stable, deterministic IDs for each chunk.
    #    Prefer the chunk_id generated by the splitter; if missing, fall back
    #    to a deterministic pattern: parent identifier + chunk index.
    ids = []
    for d in documents:
        cid = d.metadata.get("chunk_id")
        if not cid:
            # Fallback: derive ID from parent document and chunk index.
            parent = (
                d.metadata.get("parent_id")
                or d.metadata.get("source_path")
                or d.metadata.get("source_name", "doc")
            )
            idx = int(d.metadata.get("chunk_index", 0))
            cid = f"{parent}#ch{idx:04d}"
        ids.append(cid)

    # 6) Ensure all IDs in this batch are unique.
    #    Some vector stores (like Chroma) require unique IDs per upsert call.
    #    We remove duplicates in-memory before ingestion.
    def dedup_batch(ids, docs, embs):
        """
        Drop repeated IDs from a batch, keeping the first occurrence of each.

        `docs` and `embs` are indexed in parallel with `ids`; the entries at
        the positions of the surviving IDs are kept, in their original order.

        Returns:
          new_ids, new_docs, new_embs
        """
        # Map each ID to the position where it was first seen. Dicts keep
        # insertion order, so the keys stay in first-occurrence order.
        first_position = {}
        for position, key in enumerate(ids):
            first_position.setdefault(key, position)

        kept = list(first_position.values())
        new_ids = list(first_position)
        new_docs = [docs[p] for p in kept]
        new_embs = [embs[p] for p in kept]
        return new_ids, new_docs, new_embs

    ids, documents, embeddings = dedup_batch(ids, documents, embeddings)

    # Optionally, warn if duplicates remain (should normally not happen
    # after a correct deduplication step).
    c = Counter(ids)
    dups = [k for k, v in c.items() if v > 1]
    if dups:
        print(f"[WARN] Duplicate IDs after dedup? {len(dups)}")

    # 7) Ingest the chunked documents into the vector store.
    #    base_metadata is applied to each record and can be used later for filtering.
    base_metadata = {"tenant": TENANT, "corpus": CORPUS, "version": VERSION}
    store.add_documents(
        documents=documents,
        embeddings=embeddings,
        ids=ids,
        batch_size=128,
        base_metadata=base_metadata
    )
    print("[INGEST] Vectorstore updated. Count:", store.count())


### RAG Retriever Test

In [None]:
import sys, os
# Extend Python path with the parent directory so Intergrax modules
# can be imported without installing the package globally.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.rag_retriever import RagRetriever
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig

# Configure access to the underlying vector store (Chroma in this case).
# The collection name and persist directory must match the ingestion script.
cfg = VSConfig(
    provider="chroma",
    collection_name="intergrax_docs",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
)

# High-level wrapper around the selected vector store implementation.
store = VectorstoreManager(config=cfg, verbose=False)

# Initialize the embedding manager used for turning user questions
# into embedding vectors compatible with the stored document embeddings.
embed_manager = EmbeddingManager(
    verbose=False,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536
)

# Create the RAG retriever that combines:
# - the vector store (for similarity search),
# - the embedding manager (for encoding queries),
# - and retrieval logic (MMR, parent limiting, filters, etc.).
retriever = RagRetriever(store, embed_manager, verbose=True)

# Example user question.
# Note: the question is in Polish; this is fine as long as the embedding model
# can handle multilingual input and the corpus is in a compatible language.
question = "Czym są wirtualne targi mooff ?"

# Perform retrieval using Maximal Marginal Relevance (MMR).
# Parameters:
#   - top_k: maximum number of chunks to return.
#   - score_threshold: minimum similarity score to keep a hit.
#   - where: metadata filter (tenant/corpus/version must be defined in scope).
#   - max_per_parent: limit number of chunks per parent document.
#   - use_mmr: enable MMR to reduce redundancy among retrieved chunks.
#   - include_embeddings: include stored embeddings in the result if needed downstream.
#   - prefetch_factor: how many candidates to consider before MMR filtering.
hits = retriever.retrieve(
    question=question,
    top_k=8,
    score_threshold=0.15,
    where={"tenant": TENANT, "corpus": CORPUS, "version": VERSION},
    max_per_parent=2,
    use_mmr=True,
    include_embeddings=True,
    prefetch_factor=5
)

# Print diagnostic output for each retrieved chunk:
# rank, similarity score, metadata and the actual content.
for h in hits:
    print(h["rank"], f"{h['similarity_score']:.3f}", h["metadata"])
    print(h["content"])
    print()


# ReRanker Test

In [None]:
import sys, os
# Extend Python path with the parent directory so that Intergrax modules
# can be imported directly without installing the package globally.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.rag_retriever import RagRetriever
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.re_ranker import ReRanker, ReRankerConfig

# Configure connection to the vector store (Chroma in this example).
# The collection name and persist directory must match what was used during ingestion.
cfg = VSConfig(
    provider="chroma",
    collection_name="intergrax_docs",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
)

# High-level manager for the underlying vector store implementation.
store = VectorstoreManager(config=cfg, verbose=False)

# Initialize the embedding manager used to:
#  - encode user questions into vectors,
#  - support similarity and re-ranking computations.
embed_manager = EmbeddingManager(
    verbose=False,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536
)

# Configure and create the re-ranker.
# This component takes initial retrieval results and refines their ordering
# based on combined similarity signals (score fusion).
reranker = ReRanker(
    embedding_manager=embed_manager,
    config=ReRankerConfig(
        use_score_fusion=True,  # Combine original similarity with re-ranker scores.
        fusion_alpha=0.4,       # Weight for blending scores (0–1; tune per use case).
        normalize="minmax",     # Normalization strategy for scores before fusion.
        doc_batch_size=256      # Batch size for processing documents during re-ranking.
    ),
    verbose=True
)

# Create the main RAG retriever, which:
#  - queries the vector store,
#  - applies metadata filters,
#  - optionally delegates to a re-ranker for improved ranking.
retriever = RagRetriever(store, embed_manager, verbose=True)

# Example user query.
# The language of the question should be compatible with the language
# of the corpus and supported by the embedding model.
question = "What is the mooff virtual fair?"

# Retrieve documents using the retriever with re-ranking enabled.
# Parameters:
#   - top_k: maximum number of chunks to return.
#   - score_threshold: minimum similarity score required to keep a hit.
#   - where: metadata filter; TENANT/CORPUS/VERSION must be defined in scope.
#   - max_per_parent: limit the number of chunks per source document.
#   - reranker: ReRanker instance used to refine result ordering.
hits = retriever.retrieve(
    question=question,
    top_k=8,
    score_threshold=0.15,
    where={"tenant": TENANT, "corpus": CORPUS, "version": VERSION},
    max_per_parent=2,
    reranker=reranker
)

# Print diagnostic information for each retrieved and re-ranked hit:
# rank, final similarity score, metadata and the chunk content.
for h in hits:
    print(h["rank"], f"{h['similarity_score']:.3f}", h["metadata"])
    print(h["content"])
    print()


# RAG with LLM Test

In [None]:
import sys, os
# Extend Python path with the parent directory so that Intergrax modules
# can be imported directly without requiring a global package installation.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.rag_retriever import RagRetriever
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.re_ranker import ReRanker, ReRankerConfig
from intergrax.rag.rag_answerer import AnswererConfig, RagAnswerer
from intergrax.llm_adapters import LLMAdapterRegistry
from langchain_ollama import ChatOllama
import intergrax.system_prompts as prompts

# Configure access to the vector store (Chroma in this case).
# The collection name and persist directory must match the ingestion step.
cfg = VSConfig(
    provider="chroma",
    collection_name="intergrax_docs",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
)

# High-level manager for the underlying vector store implementation.
store = VectorstoreManager(config=cfg, verbose=False)

# Initialize the embedding manager responsible for:
#  - encoding user questions,
#  - encoding documents during ingestion,
#  - supporting re-ranking and retrieval.
embed_manager = EmbeddingManager(
    verbose=False,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536
)

# Configure and initialize the re-ranker.
# This component refines the initial vector store results by combining
# similarity scores and additional re-ranking scores (score fusion).
reranker = ReRanker(
    embedding_manager=embed_manager,
    config=ReRankerConfig(
        use_score_fusion=True,   # Blend base similarity with re-ranker scores.
        fusion_alpha=0.4,        # Balance between original similarity and re-ranker score.
        normalize="minmax",      # Normalization strategy applied to scores before fusion.
        doc_batch_size=256       # Batch size for processing documents during re-ranking.
    ),
    verbose=True
)

# Create the RAG retriever that:
#  - performs similarity search in the vector store,
#  - applies metadata filters, limits per parent document, etc.
retriever = RagRetriever(store, embed_manager, verbose=False)

# Create answerer configuration:
#  - top_k: maximum number of chunks retrieved from the vector store.
#  - min_score: minimum similarity score required to accept a hit.
#  - re_rank_k: number of top documents to pass through the re-ranker stage.
#  - max_context_chars: maximum size of concatenated context passed to the LLM.
cfg = AnswererConfig(
    top_k=10,
    min_score=0.15,
    re_rank_k=5,
    max_context_chars=12000,
)

# System-level instructions used to steer the LLM's behavior in RAG mode.
cfg.system_instructions = prompts.default_rag_system_instruction()

# Template for injecting retrieved context into the final system message.
# The placeholder {context} will be replaced with concatenated retrieved chunks.
cfg.system_context_template = "Use the following context to answer the user's question: {context}"

# Create an LLM instance via the Intergrax adapter registry.
# Here we use an Ollama-backed ChatOllama model and wrap it in a unified interface.
llm = LLMAdapterRegistry.create(
    name="ollama",
    chat=ChatOllama(model="llama3.1:latest")
)

# Create the high-level RAG answerer abstraction that:
#  - retrieves relevant context (retriever),
#  - optionally re-ranks it (reranker),
#  - builds prompts and calls the LLM to produce the final answer.
answerer = RagAnswerer(
    retriever=retriever,
    llm=llm,
    reranker=reranker,   # Optional re-ranker injection for higher-quality context.
    config=cfg,
    verbose=False,
)

# Sample user questions to be answered using the configured RAG pipeline.
questions = [
    "What unique advantages does Mooff have over its competitors? Answer very carefully, using no less than 1,000 words.",
    "What are virtual fairs on the mooff platform?",
]

# Execute the RAG pipeline for each question and print the answers.
for question in questions:
    print("QUESTION: ", question)

    # Run the answerer:
    #  - stream=False: collect the full response before printing,
    #  - summarize=True: if supported, may produce a concise answer plus reasoning.
    res = answerer.run(
        question=question,
        stream=False,
        summarize=True,
    )

    # Print final answer returned by the pipeline.
    print("ANSWER: ", res["answer"])
    print()


# Minimal wiring: Dual-Index (TOC + Chunks) + map-reduce + citations

In [None]:
import sys, os
# Extend Python path with the parent directory so that Intergrax modules
# can be imported directly without installing the package globally.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.re_ranker import ReRanker, ReRankerConfig
from intergrax.rag.rag_answerer import AnswererConfig, RagAnswerer
from intergrax.memory.conversational_memory import ConversationalMemory
from intergrax.rag.dual_index_builder import build_dual_index
from intergrax.rag.dual_retriever import DualRetriever
from intergrax.rag.windowed_answerer import WindowedAnswerer

from intergrax.rag.documents_loader import DocumentsLoader
from intergrax.rag.documents_splitter import DocumentsSplitter

from intergrax.llm_adapters import LLMAdapterRegistry
from langchain_ollama import ChatOllama
import intergrax.system_prompts as prompts

# -----------------------------------------------------------
# 1) Vector stores: CHUNKS + TOC (shared persistence path)
#    - CHUNKS: fine-grained content chunks
#    - TOC: higher-level structural entries (headings, sections)
# -----------------------------------------------------------
vs_chunks = VectorstoreManager(VSConfig(
    provider="chroma",
    collection_name="intergrax_chunks",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
))
vs_toc = VectorstoreManager(VSConfig(
    provider="chroma",
    collection_name="intergrax_toc",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
))

# -----------------------------------------------------------
# 2) Embedding manager
#    Shared component for:
#      - encoding documents for ingestion,
#      - encoding queries for retrieval,
#      - supporting re-ranking and dual retrieval logic.
# -----------------------------------------------------------
embed_manager = EmbeddingManager(
    verbose=False,
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536,
)

# -----------------------------------------------------------
# 3) Document ingest and dual index build
#    This is executed only when the collections are empty.
#    - Loads raw docs
#    - Splits them into chunks
#    - Builds both CHUNKS and TOC indices in one pass
# -----------------------------------------------------------
def safe_count(vs_manager: "VectorstoreManager") -> int:
    """
    Return the number of items stored in `vs_manager`, never raising.

    Args:
        vs_manager: Object exposing a `count()` method.

    Returns:
        The count as an int. A falsy result (e.g. None) or any exception
        raised while counting yields 0, i.e. the store is treated as empty.
    """
    try:
        return int(vs_manager.count() or 0)
    except Exception as exc:
        # Best-effort: keep initialization going, but surface the failure,
        # because a reported 0 makes the caller run a full ingest.
        print(f"[WARN] Could not count vector store items ({exc!r}) — treating as empty.")
        return 0

chunks_count = safe_count(vs_chunks)
toc_count = safe_count(vs_toc)

if chunks_count == 0:
    # No chunks present: perform full initial ingest and build dual index.
    print(f"[INFO] Vectorstore CHUNKS empty (TOC={toc_count}) — performing initial ingest...")

    # Loader configuration:
    #  - docx_mode="paragraphs": DOCX files are split into logical paragraph units
    #  - pdf_enable_ocr / image_enable_ocr: OCR is used where needed
    loader = DocumentsLoader(
        verbose=True,
        docx_mode="paragraphs",
        pdf_enable_ocr=True,
        image_enable_ocr=True,
    )
    raw_docs = loader.load_documents("../documents/mooff-strategy/doc")

    # Split raw documents into semantic chunks for RAG.
    splitter = DocumentsSplitter(verbose=True)
    docs = splitter.split_documents(raw_docs)
    print(f"Loaded {len(docs)} documents after splitting")

    # Build the dual index:
    #  - vs_chunks: stores fine-grained chunks
    #  - vs_toc: stores higher-level TOC entries for structural guidance
    build_dual_index(
        docs=docs,
        embed_manager=embed_manager,
        vs_chunks=vs_chunks,
        vs_toc=vs_toc,
        batch_size=512,
        verbose=True,
    )
elif toc_count == 0:
    # Chunks exist but TOC is missing. We do not rebuild everything automatically
    # to avoid duplicated embeddings; this requires a targeted TOC rebuild later.
    print("[WARN] CHUNKS exist but TOC is empty — skipping full ingest to avoid duplicates.")
    print("       (Jeśli to niezamierzone, dobuduj TOC osobno lub upewnij się, że loader/splitter oznaczają nagłówki: doc_type='docx', is_heading=True).")
else:
    # Both indices contain data — skip ingest completely.
    print(f"[INFO] Vectorstore already populated — CHUNKS={chunks_count}, TOC={toc_count}. Skipping ingest.")

# -----------------------------------------------------------
# 4) Dual retriever + Re-ranker
#    Dual retriever:
#      - queries both CHUNKS and TOC indices
#      - blends structural and content-level evidence
# -----------------------------------------------------------
retriever = DualRetriever(
    vs_chunks=vs_chunks,
    vs_toc=vs_toc,
    embed_manager=embed_manager,
    k_chunks=40,  # how many chunk-level hits to consider
    k_toc=10,     # how many TOC-level hits to consider
    verbose=True,
)

# Re-ranker configuration:
#  - use_score_fusion: combine base similarity and re-ranker scores
#  - fusion_alpha: weighting between the two score types
#  - normalize: score normalization strategy
reranker = ReRanker(
    embedding_manager=embed_manager,
    config=ReRankerConfig(
        use_score_fusion=True,
        fusion_alpha=0.4,
        normalize="minmax",
        doc_batch_size=256,
    ),
    verbose=True,
)

# -----------------------------------------------------------
# 5) LLM + Answerer (with conversational memory)
#    - RAG answerer: retrieval + ranking + prompt construction + LLM call
#    - memory: stores dialogue history for follow-up questions
# -----------------------------------------------------------
cfg = AnswererConfig(
    top_k=20,           # max number of chunks fed into the answer
    min_score=None,     # no explicit global minimum score
    re_rank_k=8,        # number of items going through re-ranking
    max_context_chars=12000,  # context budget for the LLM
    temperature=0.0,    # deterministic behavior preferred for documentation Q&A
)

# System instructions controlling RAG behavior (STRICT RAG prompt, etc.).
cfg.system_instructions = prompts.default_rag_system_instruction()

# Template used to inject retrieved context into the system message
# before sending the prompt to the LLM.
cfg.system_context_template = "Use the following context to answer the user's question: {context}"

# Create LLM instance via the Intergrax adapter registry (Ollama backend).
llm = LLMAdapterRegistry.create(
    name="ollama",
    chat=ChatOllama(model="llama3.1:latest"),
)

# Conversational memory to keep track of previous Q&A pairs.
memory = ConversationalMemory()

# High-level RAG answerer that:
#  - runs the dual retriever,
#  - applies re-ranking,
#  - manages prompts,
#  - uses conversational memory,
#  - calls the LLM.
answerer = RagAnswerer(
    retriever=retriever,
    llm=llm,
    reranker=reranker,
    config=cfg,
    memory=memory,
    verbose=False,
)

# -----------------------------------------------------------
# 6) Windowed layer (map→reduce) for long-form answers
#    WindowedAnswerer:
#      - runs answerer over multiple windows of context
#      - summarizes intermediate results
#      - produces a coherent global answer for large corpora/questions
# -----------------------------------------------------------
windowed = WindowedAnswerer(
    answerer=answerer,
    retriever=retriever,
    verbose=True,
)

# -----------------------------------------------------------
# 7) Queries:
#    - "standard": classic single-shot RAG answer
#    - "windowed": multi-window map→reduce for very detailed answers
# -----------------------------------------------------------
questions = [
    ("standard", "What are virtual fairs on the Mooff platform?"),
    ("windowed", "What unique advantages does Mooff have over its competitors? Answer very carefully, using no less than 1,000 words."),
]

for mode, question in questions:
    print("\n==========================")
    print("QUESTION (", mode, "): ", question)

    if mode == "windowed":
        # Windowed mode:
        #   - top_k_total: total number of docs collected across all windows
        #   - window_size: number of docs per window
        #   - summarize_each: whether to summarize each window separately
        res = windowed.ask_windowed(
            question=question,
            top_k_total=60,
            window_size=12,
            summarize_each=True,
        )
    else:
        # Standard mode: single-shot answer using the answerer.
        res = answerer.run(
            question=question,
            stream=False,
            summarize=True,
        )

    print("\n=== ANSWER ===\n", res["answer"])
    if res.get("summary"):
        print("\n=== SUMMARY ===\n", res["summary"])

    # Print structured list of sources used in the answer (if provided).
    print("\n=== SOURCES ===")
    for s in res.get("sources", []):
        # s is expected to be an AnswerSource-like object with:
        #   source, page, score, preview
        pg = f"|{s.page}" if s.page is not None else ""
        print(f"- {s.source}{pg}  (score={s.score})")
    print()

# -----------------------------------------------------------
# 8) Memory test — ask a question about the conversation history
#    This checks whether ConversationalMemory is correctly
#    accumulating interaction data.
# -----------------------------------------------------------
res = answerer.run(
    question="Based on our conversation history, write what the user is interested in in the context of Mooff.",
    summarize=False,
    stream=False,
)
print("\n=== MEMORY TEST ===\n", res["answer"])

# Optional: inspect raw memory entries for debugging.
if memory is not None:
    print("\n=== MEMORY DUMP ===")
    for m in memory.get_all():
        print(m)


# LLM + RAG + Memory

In [None]:
import sys, os
# Extend Python path with the parent directory so that Intergrax modules
# can be imported directly without installing the package globally.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), "..")))

from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.rag_retriever import RagRetriever
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.re_ranker import ReRanker, ReRankerConfig
from intergrax.rag.rag_answerer import AnswererConfig, RagAnswerer
from intergrax.llm_adapters import LLMAdapterRegistry
from intergrax.memory.conversational_memory import ConversationalMemory
from langchain_ollama import ChatOllama
import intergrax.system_prompts as prompts

# -----------------------------------------------------------
# 1) Vector store configuration
#    - Uses Chroma as the backend
#    - 'intergrax_docs' must match the collection name used during ingestion
#    - Kept under its own name (vs_cfg): further down this cell binds
#      'cfg' to the AnswererConfig, and sharing one name between two
#      unrelated config objects makes partial re-runs error-prone.
# -----------------------------------------------------------
vs_cfg = VSConfig(
    provider="chroma",
    collection_name="intergrax_docs",
    chroma_persist_directory="chroma_db/intergrax_docs_v1",
)

# High-level manager for the underlying vector store implementation.
store = VectorstoreManager(config=vs_cfg, verbose=False)

# -----------------------------------------------------------
# 2) Embedding manager
#    - A single instance is handed to both the re-ranker and the
#      retriever created below
#    - Configured for an Ollama-served embedding model
# -----------------------------------------------------------
embed_manager = EmbeddingManager(
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    # NOTE(review): presumably the embedding vector size of the model above — confirm.
    assume_ollama_dim=1536,
    verbose=False,
)

# -----------------------------------------------------------
# 3) Re-ranker
#    - Second-pass ordering of the hits returned by the vector store
#    - Settings below enable score fusion and min-max normalization
# -----------------------------------------------------------
reranker = ReRanker(
    verbose=True,
    embedding_manager=embed_manager,
    config=ReRankerConfig(
        # Blend the base similarity score with the re-ranker score ...
        use_score_fusion=True,
        # ... using this weight for the blend.
        fusion_alpha=0.4,
        # Bring scores into a comparable range before combining them.
        normalize="minmax",
        # Number of documents handled per re-ranking batch.
        doc_batch_size=256,
    ),
)

# -----------------------------------------------------------
# 4) Retriever
#    - Wired to the vector store and the embedding manager built above
#    - Its job in the pipeline:
#      * run the similarity search against the vector store,
#      * apply metadata filters when they are supplied,
#      * hand back ranked context chunks
# -----------------------------------------------------------
retriever = RagRetriever(
    store,
    embed_manager,
    verbose=False,
)

# -----------------------------------------------------------
# 5) Answerer configuration
#    - Retrieval / re-ranking limits and the context budget for the LLM
#    - From here on 'cfg' refers to this AnswererConfig
# -----------------------------------------------------------
cfg = AnswererConfig(
    # Context budget, in characters, fed into the LLM.
    max_context_chars=12000,
    # How many of the top candidates go through the re-ranker.
    re_rank_k=5,
    # Hits scoring below this similarity threshold are not accepted.
    min_score=0.15,
    # Upper bound on chunks fetched from the store.
    top_k=10,
)

# Strict-RAG system instructions come from the shared prompts module.
cfg.system_instructions = prompts.default_rag_system_instruction()

# Template used to inject the retrieved context into the system message;
# '{context}' is the placeholder for that context.
cfg.system_context_template = "Use the following context to answer the user's question: {context}"

# -----------------------------------------------------------
# 6) LLM + Conversational Memory
#    - The chat model (ChatOllama) is wrapped through the Intergrax
#      adapter registry under the "ollama" adapter name
#    - The memory object keeps the conversation history so that
#      follow-up questions can build on earlier ones
# -----------------------------------------------------------
llm = LLMAdapterRegistry.create(name="ollama", chat=ChatOllama(model="llama3.1:latest"))

memory = ConversationalMemory()

# -----------------------------------------------------------
# 7) High-level RAG answerer
#    - Ties together the retriever, re-ranker, LLM adapter and config
#      assembled in the previous steps
#    - Receives the memory object so context carries over between questions
# -----------------------------------------------------------
answerer = RagAnswerer(
    config=cfg,
    llm=llm,
    retriever=retriever,
    reranker=reranker,
    memory=memory,
    verbose=False,
)

# -----------------------------------------------------------
# 8) Example questions
#    - First: long, detailed strategic question about Mooff
#    - Second: focused feature question about virtual fairs
# -----------------------------------------------------------
questions = [
    "What unique advantages does Mooff have over its competitors? Answer very carefully, using no less than 1,000 words.",
    "What are virtual fairs on the mooff platform?",
]

# Execute the RAG pipeline for each question.
for question in questions:
    print("QUESTION: ", question)

    # Run the answerer:
    #   - stream=False: gather full response before printing
    #   - summarize=True: ask the pipeline for a summary alongside the answer
    res = answerer.run(
        question=question,
        stream=False,
        summarize=True,
    )

    print("ANSWER: ", res["answer"])

    # A summary was requested above, so show it when the result carries one
    # instead of silently discarding it.
    if res.get("summary"):
        print("SUMMARY: ", res["summary"])
    print()

# -----------------------------------------------------------
# 9) Memory test
#    - Meta-question that is meant to be answered from the conversation
#      history rather than from the documents
#    - Exercises the ConversationalMemory instance passed to the answerer
# -----------------------------------------------------------
res = answerer.run(
    stream=False,
    summarize=False,
    question="Based on the conversation history, what is the user interested in when it comes to mooff?",
)
# Answer followed by one blank separator line.
print("MEMORY TEST: ", res["answer"], end="\n\n")

# Debugging aid: list the raw entries returned by the memory object.
# Skipped when no memory object is configured.
if memory is not None:
    print("MEMORY:")
    for m in memory.get_all():
        print(m)
