# © Artur Czarnecki. All rights reserved.
# Integrax framework - proprietary and confidential.
# Use, modification, or distribution without written permission is prohibited.


In [None]:
import os
import sys

# Prepend the directory two levels above the current working directory to
# sys.path (presumably the repository root, so the local `intergrax` package
# can be imported from this notebook — confirm against the repo layout).
sys.path.insert(
    0,
    os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir)),
)

# Load documents + split + embed + VectorStore

In [None]:
from intergrax.rag.documents_loader import DocumentsLoader
from intergrax.llm_adapters import LangChainOllamaAdapter
from langchain_ollama import ChatOllama
import os

# Paths to the test files. An empty string disables loading for that modality
# (see the conditional loads below); the commented-out values are ready-made
# alternatives.
vid_path = os.path.join("video", "video02", "vid_eur8dUO9mvE.mp4")
audio_path = ""  # os.path.join("video", "video02", "audio_eur8dUO9mvE.mp3")
image_path = ""  # os.path.join("images", "frame_37.jpg")

# Ollama chat model wrapped in the adapter that the loader receives as its
# image-caption LLM.
ollama_adapter = LangChainOllamaAdapter(
    chat=ChatOllama(model="llava-llama3:latest"),
)

loader = DocumentsLoader(
    image_caption_llm=ollama_adapter,
    image_text_mode="both",
    verbose=True,
)

# Load each modality only when its path is set; otherwise fall back to an
# empty list so the later concatenation and counts still work.
vid_docs = loader.load_document(file_path=vid_path) if vid_path else []
audio_docs = loader.load_document(file_path=audio_path) if audio_path else []
image_docs = loader.load_document(file_path=image_path) if image_path else []

print(f"Video docs: {len(vid_docs)}")
print(f"Audio docs: {len(audio_docs)}")
print(f"Image docs: {len(image_docs)}")

# Preview the first image document: the first 500 characters of its text and
# its metadata.
if image_docs:
    first_image = image_docs[0]
    print("\n--- IMAGE CAPTION OUTPUT ---")
    print(first_image.page_content[:500])
    print("\nMetadata:", first_image.metadata)



# Split and embed documents

In [None]:
from intergrax.rag.documents_splitter import DocumentsSplitter
from intergrax.rag.embedding_manager import EmbeddingManager

# Split every loaded document (video, audio and image) into chunks.
splitter = DocumentsSplitter(verbose=True)
chunks = splitter.split_documents(
    documents=vid_docs + audio_docs + image_docs,
)

# Embedding manager backed by an Ollama-served embedding model.
embed_manager = EmbeddingManager(
    provider="ollama",
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16:latest",
    assume_ollama_dim=1536,
    verbose=True,
)

# embed_documents returns the embeddings together with the documents they
# belong to; both are reused by the ingest cell.
embeddings, documents = embed_manager.embed_documents(docs=chunks)
print(f"Embeddings: {len(embeddings)}")
print(documents[:3])


In [None]:
from intergrax.rag.embedding_manager import EmbeddingManager
from intergrax.rag.vectorstore_manager import VectorstoreManager, VSConfig
from intergrax.rag.documents_splitter import DocumentsSplitter
from intergrax.rag.documents_loader import DocumentsLoader
from langchain_core.documents import Document
from collections import Counter

# Scope tags attached to ingested chunks and used to filter queries.
TENANT = "intergrax"
CORPUS = "intergrax-multimodal"
VERSION = "v1"

# 0) Open the vector store up front (no loading/splitting/embedding here).
cfg = VSConfig(
    collection_name="multimodal_docs",
    chroma_persist_directory="chroma_db/multimodal_docs_v1",
    provider="chroma",
)
store = VectorstoreManager(verbose=True, config=cfg)

def corpus_present(store: VectorstoreManager, embed_mgr: EmbeddingManager) -> bool:
    """Tell whether the store already holds data for the current scope.

    Args:
        store: Vector store to inspect.
        embed_mgr: Embedding manager used to embed the probe query.

    Returns:
        False when the store is empty or when a top-1 query filtered by
        TENANT, CORPUS and VERSION yields no IDs; True otherwise.
    """
    # An empty store cannot contain the corpus; skip the embedding call.
    if store.count() == 0:
        return False

    probe_vec = embed_mgr.embed_one("probe")

    # Every scope tag must match, expressed as an explicit $and of $eq terms.
    scope = {"tenant": TENANT, "corpus": CORPUS, "version": VERSION}
    scope_filter = {
        "$and": [{field: {"$eq": value}} for field, value in scope.items()],
    }

    res = store.query(query_embeddings=probe_vec, top_k=1, where=scope_filter)

    # Assumes res["ids"] holds one ID list per query — TODO confirm against
    # VectorstoreManager.query.
    found_ids = res["ids"]
    return bool(found_ids and found_ids[0])

# Ingest only when the current tenant/corpus/version is not yet in the store,
# so re-running this cell does not add the same chunks again.
NEED_INGEST = not corpus_present(store, embed_manager)
print(f"Need ingest: {NEED_INGEST}")

if not NEED_INGEST:
    print("[INGEST] Skipping — corpus already present.")
else:
    def chunk_id_for(doc: Document) -> str:
        """Return the chunk's own ID, or derive one from its parent and index.

        The derived form is "<parent>#ch<index:04d>", where parent is the
        first non-empty value of parent_id, source_path or source_name
        (defaulting to "doc") and index is chunk_index (0 when missing or
        None).
        """
        cid = doc.metadata.get("chunk_id")
        if cid:
            return cid
        parent = (
            doc.metadata.get("parent_id")
            or doc.metadata.get("source_path")
            or doc.metadata.get("source_name", "doc")
        )
        # `or 0` also covers a key that is present but set to None.
        idx = int(doc.metadata.get("chunk_index") or 0)
        return f"{parent}#ch{idx:04d}"

    def dedup_batch(ids, docs, embs):
        """Keep the first occurrence of each ID and drop later duplicates.

        Args:
            ids: Chunk IDs, parallel to docs and embs.
            docs: Documents, one per ID.
            embs: Embeddings, one per ID.

        Returns:
            A (ids, docs, embs) tuple of lists containing only the entries
            whose ID had not been seen earlier in the batch.
        """
        seen = set()
        new_ids, new_docs, new_embs = [], [], []
        for i, _id in enumerate(ids):
            if _id in seen:
                continue
            seen.add(_id)
            new_ids.append(_id)
            new_docs.append(docs[i])
            new_embs.append(embs[i])
        return new_ids, new_docs, new_embs

    ids = [chunk_id_for(d) for d in documents]

    # Report ID collisions BEFORE deduplicating: afterwards every ID is unique
    # by construction, so a post-dedup check could never fire and the dropped
    # chunks would go unnoticed.
    collisions = {cid: n for cid, n in Counter(ids).items() if n > 1}
    if collisions:
        dropped = sum(n - 1 for n in collisions.values())
        print(
            f"[WARN] {len(collisions)} duplicate chunk ID(s); "
            f"dropping {dropped} chunk(s) during dedup."
        )

    ids, documents, embeddings = dedup_batch(ids, documents, embeddings)

    # Scope tags passed to add_documents as base_metadata; they mirror the
    # fields corpus_present filters on.
    base_metadata = {"tenant": TENANT, "corpus": CORPUS, "version": VERSION}
    store.add_documents(
        documents=documents,
        embeddings=embeddings,
        ids=ids,
        batch_size=128,
        base_metadata=base_metadata
    )
    print("[INGEST] Vectorstore updated. Count:", store.count())

# Retriever test

In [None]:
from intergrax.rag.rag_retriever import RagRetriever
from intergrax.multimedia.ipynb_display import display_audio_at_data, display_image, display_video_jump
from IPython.display import display, Image, HTML
from pathlib import Path

retriever = RagRetriever(store, embed_manager, verbose=True)

question = "Men writting on the table 'mcp host'"

# MMR retrieval restricted to the current tenant/corpus/version.
hits = retriever.retrieve(
    question=question,
    top_k=8,
    score_threshold=0.15,
    where={"tenant": TENANT, "corpus": CORPUS, "version": VERSION},
    max_per_parent=2,
    use_mmr=True,
    include_embeddings=True,
    prefetch_factor=5
)

# Print every hit once, then render a player/preview matching its media type.
for h in hits:
    # `or {}` also covers a hit whose metadata is explicitly None.
    meta = h.get("metadata") or {}
    print(h["rank"], f"{h['similarity_score']:.3f}", meta)
    print(h["content"])

    # Normalise a missing/None path to "" so os.path.exists never gets None.
    source_path = meta.get("source_path") or ""

    # Audio hit: player positioned at the chunk start.
    if meta.get("source_type") == "audio" and os.path.exists(source_path):
        # Prefer start_s; otherwise convert start_ms; otherwise start at 0.
        start_s = meta.get("start_s")
        if start_s is None and meta.get("start_ms") is not None:
            start_s = float(meta["start_ms"]) / 1000.0
        start_s = float(start_s or 0.0)

        display_audio_at_data(
            path=source_path,
            start_s=start_s,
            autoplay=False,
            label=f"start @ {start_s:.2f}s"
        )
        print()
        continue

    # Video hit: player jumping to the segment midpoint (or its start).
    if meta.get("doc_type") == "video" and meta.get("video_path"):
        start_s = float((meta.get("mid_time_ms") or meta.get("start_ms") or 0) / 1000.0)
        display_video_jump(
            path=meta["video_path"],
            start_s=start_s,
            poster=meta.get("extracted_frame_path"),
            autoplay=False,
            muted=False,
            label=f"jump @ {start_s:.2f}s (segment #{meta.get('video_segment_id')})"
        )
        print()
        continue

    # Anything else is shown as an image, but only when the hit has a path.
    if source_path:
        display_image(source_path)

    print()