In [4]:
# créer un nouvel env conda à partir du terminal
# conda create --name pathrag python=3.10
# installer ollama: https://www.ollama.com/download

# installer les dépendences
%pip install -r requirements.txt
# intsaller le modème d'OllamaEmbeddings
!ollama pull nomic-embed-text
# créer une clé api sur openrouter pour utiliser des llm gratuitement

Collecting chromadb (from -r requirements.txt (line 46))
  Downloading chromadb-1.0.13-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting build>=1.0.3 (from chromadb->-r requirements.txt (line 46))
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pybase64>=1.4.1 (from chromadb->-r requirements.txt (line 46))
  Downloading pybase64-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb->-r requirements.txt (line 46))
  Downloading uvicorn-0.35.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb->-r requirements.txt (line 46))
  Downloading posthog-6.0.0-py3-none-any.whl.metadata (6.0 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb->-r requirements.txt (line 46))
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetr

In [5]:
from openai import OpenAI, AsyncOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain.schema.document import Document
from pathrag_retriever import create_graphdb, load_existing_graphdb, load_knowledgeGraph_vis
import time


In [13]:
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_community.retrievers import TFIDFRetriever
from langchain_unstructured import UnstructuredLoader

#========= choix du modèle d'embedding
"""
    Le modèle choisi impacte la qualité du retriever, mais aussi le temps de traitement
    Si le déploiement est prévu sur une VM limitée, un modèle plus petit est nécessaire
    Explorer les comparatifs: https://huggingface.co/spaces/mteb/leaderboard

"""
# Utiliser OllamaEmbeddings avec le modèle local "nomic-embed-text"
embeddings = OllamaEmbeddings(model="nomic-embed-text")



# chargement et fragmentation du doc
filename="PU_P01_PP01.docx"
doc_name="PP mahakam" # nom de doc significatif


# loader = UnstructuredFileLoader(filename)
loader = UnstructuredLoader(filename)

docx_docs = loader.load()
print(f"Loaded {len(docx_docs)} documents from {filename}")


#======== choix des paramètres de fragmentation
"""
    la taille du chunck_size est très important dans l'accès à une info précise
    une plus petite taille permet de cibler de courts passages contenant l'info nécessaire à des réponses précises:
        * lieu du projet
        * dates du projet
        * budget ...    
    l'envoi de passages plus courts au llm évite une dispertion de son attention
"""

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=200,
    length_function=len
)

docs = text_splitter.split_documents(docx_docs)

# Filter out complex metadata (e.g., lists, dicts)
docs = [Document(doc.page_content) for doc in docs]

print(len(docs))

# Conversion des docs en embeddings 
chroma_db = Chroma.from_documents(
    docs,
    embedding=embeddings,
    persist_directory=f'./storage/vector_scores/{doc_name.replace(" ","_")}',
    collection_name=doc_name.replace(" ","_")
)

retriever=chroma_db.as_retriever()

# ...existing code...
all_docs = chroma_db.get()
print(len(all_docs['documents']))  # This will print the total number of docs stored
# ...existing code...

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Loaded 533 documents from PU_P01_PP01.docx
590


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event CollectionGetEvent: capture() takes 1 positional argument but 3 were given


590


In [16]:
# appliquer nest_asyncio uniquement sur notebook pour corriger l'erreur de loop event
import nest_asyncio
nest_asyncio.apply()

# remetre à plat le text
filename="PU_P01_PP01.docx"
loader = UnstructuredLoader(filename)

docx_docs = loader.load()
text=""
for doc in docx_docs:
    text+=doc.page_content


r=input("Saisir 'C' pour créer un nouveau graphe, 'L' pour charger un graphe existant")

# créer un nouveau graphe
messages=None
if r=='C':
    doc_name=input('Saisir un nom unique pour votre graphe')
    print(f"Le nom de votre graphe est {doc_name}")
    messages= create_graphdb(
        text=text, 
        doc_name=doc_name, # il faut donner un nom unique permettant d'identifier et charger le graph les prochaines fois
    )
# charger un graphe existant
elif r=='L':
    doc_name=input('Saisir le nom du graphe à charger')
    print(f"Le nom de votre graphe est {doc_name}")

    messages=load_existing_graphdb(doc_name)
else:
    print('Option invalide')



if messages:
    pipeline_args={}
    for feedback in messages:
        if isinstance(feedback, str):
            print(feedback)
        elif isinstance(feedback, dict):
            pipeline_args[f"graphrag_pipeline_{doc_name}"]=feedback["pipeline_args"]
            


INFO:PathRAG:Logger initialized for working directory: /home/chougar/Documents/GitHub/Projet-portail-immo/docs-to-rag share/storage/graph_stores/39a6c98b9231f8b45540c3c6802c80d091e49aed11883eb887fc10dd366d605c
INFO:PathRAG:Load KV llm_response_cache with 0 data
INFO:PathRAG:Load KV full_docs with 1 data
INFO:PathRAG:Load KV text_chunks with 17 data
INFO:PathRAG:Loaded graph from /home/chougar/Documents/GitHub/Projet-portail-immo/docs-to-rag share/storage/graph_stores/39a6c98b9231f8b45540c3c6802c80d091e49aed11883eb887fc10dd366d605c/graph_chunk_entity_relation.graphml with 464 nodes, 206 edges
INFO:nano-vectordb:Load (447, 768) data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metric': 'cosine', 'storage_file': '/home/chougar/Documents/GitHub/Projet-portail-immo/docs-to-rag share/storage/graph_stores/39a6c98b9231f8b45540c3c6802c80d091e49aed11883eb887fc10dd366d605c/vdb_entities.json'} 447 data
INFO:nano-vectordb:Load (206, 768) data
INFO:nano-vectordb:Init {'embedding_dim': 768, 'metri

Le nom de votre graphe est PP mahakam

        ----------------
        #### Graph RAG retriever
        Chargement de la base Graph RAG
    
**✅ Graph RAG chargé**


In [18]:
from PathRAG import QueryParam
import asyncio


def stream_pathRAG_response(stream_resp):
    async def stream_response():        
        # Process the async generator
        async for chunk in stream_resp:
            print(chunk or "", end="")



    # Run in Streamlit's existing event loop
    loop = asyncio.get_event_loop()
    loop.run_until_complete(stream_response())


# question="résume ce texte dans sa langue source"
question = "Quels sont les principaux thèmes de ce texte et les questions qui peuvent être posées ?"

resp=pipeline_args[f"graphrag_pipeline_{doc_name}"]["rag"].query(query= question, param=QueryParam(mode="hybrid", stream=True))

stream_pathRAG_response(resp)

INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:PathRAG:kw_prompt result:
INFO:PathRAG:```json
{
  "high_level_keywords": ["Thèmes principaux", "Analyse de texte", "Compréhension de texte", "Questionnement"],
  "low_level_keywords": ["Idées clés", "Résumé", "Sujet", "Questions de discussion", "Points importants"]
}
```
INFO:PathRAG:Local query uses 40 entites, 9 relations, 3 text units
INFO:PathRAG:Global query uses 45 entites, 40 relations, 3 text units
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


response all ready

## Principaux thèmes et questions potentielles du texte

Le texte fourni présente un ensemble d'informations concernant un projet de conservation des mangroves, le "NEW MAHAKAM PROJECT", dans la région du Kalimantan (Bornéo, Indonésie). Voici les principaux thèmes qui ressortent, ainsi que des questions qui pourraient être posées à partir de ces données :

**Thèmes principaux:**

*   **Conservation des mangroves :** C'est le thème central, incluant la restauration, la plantation, la protection et la gestion durable des écosystèmes de mangroves.
*   **Collaboration multipartite :** Le projet s'appuie sur une collaboration importante entre diverses organisations, incluant Planète Urgence (PU), des ONG locales (YML, Pokja Pesisir), des agences gouvernementales, et des groupes communautaires.
*   **Développement durable et communautaire :** Le projet vise non seulement la restauration écologique, mais également l'amélioration des moyens de subsistance des communautés lo

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import TFIDFRetriever
from langchain.schema.document import Document
from openai import OpenAI, AsyncOpenAI
import asyncio
import json
import re

class RAG_hybrid():
    def __init__(self, model):
        self.model=model
        self.retrieved_docs=[]
        self.semantic_retriever_topK=10
        self.sparse_retriever_topK=10
        self.history=[]
        self.llm_client = AsyncOpenAI(
            base_url="https://openrouter.ai/api/v1",
            api_key="clé",
        )
        self.reranker_llm="mistralai/mistral-small-3.1-24b-instruct:free"
        self.reranker_score_thresh=5
        self.reranked_doc=[]

    def semanticRetriever(self):
        # 1. Semantic Retriever (Chroma + OllamaEmbeddings)
        embeddings = OllamaEmbeddings(model="nomic-embed-text")
        chroma_db = Chroma(
            persist_directory=f'./storage/vector_scores/{doc_name.replace(" ","_")}',
            collection_name=doc_name.replace(" ","_"),
            embedding_function=embeddings
        )

        semantic_retriever=chroma_db.as_retriever(search_type="mmr", k=self.semantic_retriever_topK)

        self.chroma_db=chroma_db
        self.semantic_retriever=semantic_retriever
    
    def sparseRetriever(self):
        # 2. Sparse Retriever (TF-IDF)

        # Récupérer TOUS les documents depuis Chroma
        all_data = self.chroma_db.get(include=["documents", "metadatas"])

        # Convertir en liste de `Document` objects pour LangChain
        docs = [
            Document(page_content=text, metadata=meta or {})  # <-- Si meta est None, on met {}
            for text, meta in zip(all_data["documents"], all_data["metadatas"])
        ]

        # Créer le retriever TF-IDF
        sparse_retriever = TFIDFRetriever.from_documents(
            documents=docs,
            k=self.sparse_retriever_topK,
            tfidf_params={"min_df": 1, "ngram_range": (1, 2)}
        )

        self.sparse_retriever= sparse_retriever
    
    def ensembleRetriever(self):
        # 3. Ensemble Retriever (Semantic + Sparse)
        ensemble_retriever = EnsembleRetriever(
            retrievers=[self.semantic_retriever, self.sparse_retriever],
            weights=[0.5, 0.5]
        )

        self.ensemble_retriever=ensemble_retriever

    async def reranker(self, results, query):


        async def llm_eval(doc, query):
            system_prompt="""
                You're an expert assistant in reranking documents against a question.
                Your role is to compare the question with a document and give a score from 0 to 10, where:
                0=document out of context, unable to answer the question
                10=highly relevant document, able to answer the question
                                
                The expected final output is the score in json format
                Example:
                ```json{"score": 5}```
                
                Always end your answer with this format                
            """            
            response = await self.llm_client.chat.completions.create(
                model=self.reranker_llm,
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": f"La question est: {query}\n Le document à évaluer est le suivant\n: {doc}" }
                ],
                temperature=0,
            )
            # Post-process to extract only the JSON part if extra text is present
            content = response.choices[0].message.content
            # Try to extract the JSON block if the model adds extra text
            match = re.search(r"\{.*?\}", content, re.DOTALL)
            if match:
                content = match.group(0)

            # extract score
            score=None
            try:
                score=content.replace("```json", "").replace("```", "")
                
                score= json.loads(score)
                score=score["score"]
            except Exception as e:
                print(e)                
            
            return {"content": doc, "score": score}


        tasks=[llm_eval(doc.page_content, query) for doc in results]
        scored_docs= await asyncio.gather(*tasks)
        i=1

        for doc in scored_docs:
          
            print(f'chunk {i} score: {doc["score"]}')
            i+=1

        filtred_docs=[d for d in scored_docs if d["score"]>=self.reranker_score_thresh]
        # print(f"scored docs; \n{scored_docs}")
        self.reranked_doc=filtred_docs

        return filtred_docs

    async def ask_llm(self, query):
        # 5. Final processing step with an LLM (e.g., OpenAI via OpenRouter)

        # init retrievers
        self.semanticRetriever()
        self.sparseRetriever()
        self.ensembleRetriever()

        # retrieve relevant docs
        results = self.ensemble_retriever.get_relevant_documents(query)
        print(f"Nb of retrieved docs: {len(results)}")

        # rerank
        scored_results=await self.reranker(results, query)
        
        # Concatenate retrieved documents for context
        context = "\n".join([f"Fragment: \n{doc['content']}\n" for doc in scored_results])

        print(f"Context lenght: {len(context.split(' '))} words")
        llm_prompt = f"""
            Answer the question based **only** on the provided context.  

            - If the context contains enough information to provide a complete or partial answer, use it to formulate a detailed and factual response.  
            - If the context lacks relevant information, respond with: "I don't know."  

            ### **Context:**  
            {context}  

            ### **Question:**  
            {query}  

            ### **Answer:**  
            Provide a clear, factual, and well-structured response based on the available context. Avoid speculation or adding external knowledge.  
        """

        llm_completion = await self.llm_client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an expert in document Q/A and document synthesis"},
                {"role": "user", "content": llm_prompt}
            ],
            temperature=0.2,
            stream=True
        )

        final_answer = ""
        print("Réponse:\n=========")
        async for chunk in llm_completion:
            if hasattr(chunk.choices[0].delta, "content") and chunk.choices[0].delta.content:
                final_answer += chunk.choices[0].delta.content
                print(chunk.choices[0].delta.content, end="", flush=True)
        
        self.history+=[
            {"role": "user", 'content': query},
            {"role": "assistant", "content": final_answer}
        ]
        
        return final_answer


rag_hybrid=RAG_hybrid(model="google/gemma-3-27b-it:free")
# 4. Ask a question
# question = "Quels sont les principaux conseils pour réussir dans l'entrepreneuriat ?"
# question="que faut il absolument éviter pour réussir sa startup ?"
# question="résume ce texte"
results = await rag_hybrid.ask_llm(question)




ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
ERROR:chromadb.telemetry.product.posthog:Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


Nb of retrieved docs: 14


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/c

chunk 1 score: 0
chunk 2 score: 3
chunk 3 score: 0
chunk 4 score: 2
chunk 5 score: 0
chunk 6 score: 0
chunk 7 score: 0
chunk 8 score: 0
chunk 9 score: 4
chunk 10 score: 1
chunk 11 score: 0
chunk 12 score: 2
chunk 13 score: 4
chunk 14 score: 0
Context lenght: 1 words


INFO:httpx:HTTP Request: POST https://openrouter.ai/api/v1/chat/completions "HTTP/1.1 200 OK"


Réponse:

I don't know. 

The provided context is empty. Therefore, I cannot identify the main themes of a text or formulate questions about it.



