# TEST

- Charger un ensemble de typiques de documents

#  Chargement des documents

In [62]:
#!pip install --quiet python-docx

In [1]:
import os
from docx import Document

def load_docx_text(filepath):
    """
    Lit un document (.docx) et renvoie une liste de paragraphes.
    """
    doc = Document(filepath)
    paragraphs = [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    return paragraphs

def collect_all_documents(folder_path):
    """
    Récupère tous les fichiers .docx dans un dossier et renvoie un dictionnaire {filename: [paragraphes]}.
    """
    all_docs = {}
    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".docx"):
            full_path = os.path.join(folder_path, filename)
            all_docs[filename] = load_docx_text(full_path)
    return all_docs

In [2]:

# Détermination du répertoire courant (pour Notebook ou script)
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

folder_path = os.path.join(script_dir, "..", "data", "docs_planete_urgence")
all_docs = collect_all_documents(folder_path)
print("Documents trouvés :", list(all_docs.keys()))
for doc_name, paras in all_docs.items():
    print(f"---> {doc_name} contient {len(paras)} paragraphes.")

Documents trouvés : ['PROJECT DOCUMENT MAHAKAM 2023-2025.docx', 'Dépôt sur site.docx', 'Fiche projet ATFY_2024.docx', 'Project_Proposal_Standard Grant (1).docx']
---> PROJECT DOCUMENT MAHAKAM 2023-2025.docx contient 474 paragraphes.
---> Dépôt sur site.docx contient 30 paragraphes.
---> Fiche projet ATFY_2024.docx contient 134 paragraphes.
---> Project_Proposal_Standard Grant (1).docx contient 65 paragraphes.


In [3]:
doc_name, paras

('Project_Proposal_Standard Grant (1).docx',
 ['* Please fill it out briefly with a total length less than 14 pages in total (font size 11 at A4 size).',
  'For Standard Grant',
  'Project objective\u3000(Explain which GBF goal project the applies to)',
  'Note: Please specify which target(s) your project contributes to the achievement of from the 23 targets listed as the 2030 action targets proposed in “Kunming-Montreal Global biodiversity framework (GBF) .',
  '2.  Project implementation plan',
  '(Please describe the projects by year and items.)',
  'Note; For project, please describe not only fiscal year for which this application is being submitted (“applying FY”, hereafter) but also other FY(s).',
  'Applying FY',
  'FY(s) before applying FY',
  'FY(s) after applying FY',
  '3．Expected concrete activity results',
  'Note; For project, please describe not only applying FYbut also other FY(s).',
  '（１）\tApplying FY',
  '（２）\tFY(s) before applying FY',
  '（３）\tFY(s) after applying F

# Chunking des doocuments

In [4]:
def chunk_text(paragraphs, max_tokens=200):
    """
    Découpe la liste de paragraphes en chunks d'environ max_tokens mots.
    Retourne une liste de strings (chunks).
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for para in paragraphs:
        words = para.split()
        if (current_length + len(words)) <= max_tokens:
            current_chunk.append(para)
            current_length += len(words)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [para]
            current_length = len(words)
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks


In [6]:
# all_chunks = []
# for doc_name, paras in all_docs.items():
#     doc_chunks = chunk_text(paras, max_tokens=200)
#     all_chunks.extend(doc_chunks)
# print(f"Nombre total de chunks : {len(all_chunks)}")


In [None]:
# Exemple de chunking sur un document précis
doc_name = "PROJECT DOCUMENT MAHAKAM 2023-2025.docx"
all_chunks = chunk_text(all_docs[doc_name], max_tokens=200)
print(f"Nombre total de chunks pour {doc_name} : {len(all_chunks)}")

Nombre total de chunks pour PROJECT DOCUMENT MAHAKAM 2023-2025.docx : 68


In [6]:
all_chunks

['Brief project description East Kalimantan Province in 2021/2022 received great attention nationally because of the moving of the state capital city (Jakarta) to a location near the city of Balikpapan and Penajam Paser Utara in East Kalimantan Province. The development of the new capital will start in 2022. Although the Indonesia President commit to develop the new capital as Forest and Smart City, the surrounding area particularly the coastal area such as Delta Mahakam and Adang Bay might get high pressure as the consequence of the new development and the movement of 1.5 million people to the new capital. Delta Mahakam, in the eastern part of East Kalimantan, is an area that is relatively close to the prospective center of the State capital (about 100 km). Mahakam Delta is naturally a mangrove habitat, but due to excessive land clearing for extensive aquaculture about 47.5 % of the mangrove ecosystem is degraded to be converted into aquaculture (2017). Despite various conservation ef

 # Calcul des embeddings et création de l'index FAISS

In [8]:
#!pip install sentence-transformers

In [22]:
# !pip install langchain
# !pip install -U langchain-community
# !pip install -U langchain-huggingface

In [7]:
# Modèle d'embedding (Hugging Face)
from langchain_huggingface import HuggingFaceEmbeddings

embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
hf_embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
hf_embeddings 

HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
#!pip install faiss-cpu

In [8]:
# "Vector store" en FAISS
from langchain.vectorstores import FAISS
docsearch = FAISS.from_texts(all_chunks, hf_embeddings)
print(f"{docsearch} Vector store FAISS initialisée.")

<langchain_community.vectorstores.faiss.FAISS object at 0x7187a6320a40> Vector store FAISS initialisée.


#  Appel d'un LLM via API


In [None]:
# from langchain.llms.base import LLM

# class MyMockLLM(LLM):
#     """
#     Un mock LLM pour illustrer l'API de LangChain, 
#     qui renvoie un texte simpliste.
#     """

#     @property
#     def _llm_type(self) -> str:
#         return "my_mock_llm"

#     def _call(self, prompt: str, stop=None):
#         return f"[Réponse fictive du LLM API] Prompt: {prompt[:100]}..."

#     @property
#     def _identifying_params(self):
#         return {"name_of_llm": "MyMockLLM"}



In [10]:
# mock_llm_api = MyMockLLM()

# # Exemple de RetrievalQA
# from langchain.chains import RetrievalQA
# qa_chain_api = RetrievalQA.from_chain_type(
#     llm=mock_llm_api,
#     chain_type="stuff",
#     retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})
# )

# query_api = "Quels sont les objectifs de ce projet selon les documents ?"
# print("QUESTION (API) :", query_api)
# answer_api = qa_chain_api.run(query_api)
# print("REPONSE API :\n", answer_api)


# Appel d'un modèle local

In [11]:
#!pip install -U langchain-huggingface

In [12]:
#!pip install ipywidgets

In [13]:
# !pip install "accelerate>=0.26.0"

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline
import torch


# Test plusieurs models
# model_gp2= "gpt2"
# model_gpt_neo = "EleutherAI/gpt-neo-125M" 
# Choisissez le modèle Llama2 Chat gated
#model_llama2 = "meta-llama/Llama-2-7b-chat-hf"

# Utiliser le modèle public "tiiuae/falcon-7b-instruct"
model_falcon7b = "tiiuae/falcon-7b-instruct"

# Passez le token lors du chargement du tokenizer et du modèle
tokenizer_local = AutoTokenizer.from_pretrained(model_falcon7b)
model_local = AutoModelForCausalLM.from_pretrained(
    model_falcon7b,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Création d'un pipeline de génération
local_pipeline = pipeline(
    "text-generation",
    model=model_local,
    tokenizer=tokenizer_local,
    #device=0,                   # Utilisation du GPU
    max_new_tokens=128,         # Nombre de tokens générés après le prompt
    truncation=True,             # Tronque le prompt s'il dépasse la fenêtre du modèle
    temperature=0.7,       # Paramètre pour plus de déterminisme
    top_p=0.9,
    top_k=50
)

# Intégration dans LangChain
local_llm = HuggingFacePipeline(pipeline=local_pipeline)

# Vous pouvez maintenant utiliser local_llm pour vos appels
print("Modèle Llama2 chargé avec succès.")



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.
Device set to use cuda:0


Modèle Llama2 chargé avec succès.


# Prompt engineering

In [12]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA, LLMChain

template_fr = """
Tu es un assistant expert dans la rédaction de projets ONG et environnementaux.
Réponds de manière claire et concise en 2 à 3 phrases maximum.
Si le contexte ne contient pas l'information, réponds "Je ne sais pas".

Contexte (environ {nb_mots} mots) :
{context}

Question :
{question}

Réponse :
"""

prompt_template = PromptTemplate(
    template=template_fr,
    input_variables=["context", "question", "nb_mots"]
)

## Fonction de requête RAG personnalisée

In [14]:


def custom_rag_query(question, top_k=1):
    """
    - Recherche les chunks pertinents via FAISS.
    - Construit le prompt via le template.
    - Appelle Falcon-7B Instruct via LangChain pour générer la réponse.
    """
    docs = docsearch.similarity_search(question, k=top_k)
    context_text = "\n".join([d.page_content for d in docs])
    nb_mots = len(context_text.split())
    final_prompt = prompt_template.format(
        context=context_text,
        question=question,
        nb_mots=nb_mots
    )
    return local_llm(final_prompt)


## Tests de la chaîne

In [15]:
question_1 = "Décris brièvement le projet :"
print("\n--- QUESTION (Falcon 7B Instruct) ---")
response_1 = custom_rag_query(question_1, top_k=1)
print("Réponse Falcon 7B Instruct :", response_1)

question_2 = "Quels sont les objectifs environnementaux du projet ?"
print("\n--- QUESTION (Falcon 7B Instruct Environnement) ---")
response_2 = custom_rag_query(question_2, top_k=1)
print("Réponse Falcon 7B Instruct :", response_2)

  return local_llm(final_prompt)
Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



--- QUESTION (Falcon 7B Instruct) ---


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Réponse Falcon 7B Instruct : 
Tu es un assistant expert dans la rédaction de projets ONG et environnementaux.
Réponds de manière claire et concise en 2 à 3 phrases maximum.
Si le contexte ne contient pas l'information, réponds "Je ne sais pas".

Contexte (environ 176 mots) :
Therefore, PU and its implementing partners will coordinate closely with gender working group at the provincial and district levels to gain more understanding the gender issue, challenge and possible solution in the coastal area. We also try to create gender friendly environment in order to ensure the women has active participation such as identifying the right time for women to participate, separate the women and men group if it will give chance for women to speak, always ask and give chance to women to speak in every meeting. In addition, this project aims to acknowledge the rights of local people in the project by implementing the Free, Prior and Informed Consent (FPIC) on the project. PU and implementing partne

### Comparaison avec une chaîne RetrievalQA standard

In [None]:
from langchain.chains import RetrievalQA
qa_chain_standard = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 1})
)

print("\n--- Comparaison RetrievalQA classique ---")
standard_response = qa_chain_standard.run(question_1)
print("Réponse standard :", standard_response)

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



--- Comparaison RetrievalQA classique ---
Réponse standard : Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

Therefore, PU and its implementing partners will coordinate closely with gender working group at the provincial and district levels to gain more understanding the gender issue, challenge and possible solution in the coastal area. We also try to create gender friendly environment in order to ensure the women has active participation such as identifying the right time for women to participate, separate the women and men group if it will give chance for women to speak, always ask and give chance to women to speak in every meeting. In addition, this project aims to acknowledge the rights of local people in the project by implementing the Free, Prior and Informed Consent (FPIC) on the project. PU and implementing partners will provide information of the project, the ri