In [1]:
# Step 1: install dependencies (run once from a shell, or via %pip so the kernel's environment is targeted)
# %pip install langchain langchain-community langchain-google-vertexai chromadb faiss-cpu

# Imports
from pathlib import Path
import json
from langchain_core.documents import Document
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Step 2: load the CGI and BOFiP corpora from their JSONL files
def load_jsonl(path: str) -> list:
    """Load a JSON Lines file into a list of ``Document`` objects.

    Every non-blank line is parsed as one JSON object; its ``page_content``
    and ``metadata`` entries are passed to ``Document``.

    Args:
        path: Location of the ``.jsonl`` file. It is read as UTF-8.

    Returns:
        One ``Document`` per record, in file order (empty list for an empty file).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``page_content`` or ``metadata`` key.
    """
    docs = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank or whitespace-only lines (e.g. a stray empty line at the
            # end of the file) are not records; json.loads would raise on them.
            if not line.strip():
                continue
            item = json.loads(line)
            docs.append(Document(page_content=item["page_content"], metadata=item["metadata"]))
    return docs

# Root of the data directory, relative to this notebook. Kept as a plain string
# with a trailing slash because file paths are built from it by concatenation.
BASE_PATH = "../../data/"

# Locations of the two JSONL corpora.
cgi_jsonl_path = BASE_PATH + "cgi/cgi_documents.jsonl"
bofip_jsonl_path = BASE_PATH + "bofip/bofip_documents.jsonl"

docs_cgi = load_jsonl(cgi_jsonl_path)
docs_bofip = load_jsonl(bofip_jsonl_path)

# Report how many records each corpus contains.
print(f"{len(docs_cgi)} articles chargés pour le CGI")
print(f"{len(docs_bofip)} articles chargés pour le BOFIP")

2500 articles chargés pour le CGI
42669 articles chargés pour le BOFIP


In [3]:
# Step 3: split every document into overlapping chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,     # upper bound on the length of a chunk
    chunk_overlap=100,  # length shared by two consecutive chunks
)

# The same splitter instance is applied to both corpora.
chunks_cgi, chunks_bofip = (
    splitter.split_documents(corpus) for corpus in (docs_cgi, docs_bofip)
)

print(f"{len(chunks_cgi)} chunks pour le CGI")
print(f"{len(chunks_bofip)} chunks pour le BOFIP")

8397 chunks pour le CGI
110814 chunks pour le BOFIP


In [7]:
# Step 4: build the FAISS vector stores (CGI & BOFiP) with Vertex AI embeddings
#
# NOTE: the recorded run of this cell failed with "403 SERVICE_DISABLED": the
# Vertex AI API must be enabled on the GCP project before embeddings can be
# computed.
from langchain_google_vertexai import VertexAIEmbeddings
from langchain_community.vectorstores import FAISS


embedding = VertexAIEmbeddings(
    model_name="textembedding-gecko@001"  # or the multilingual gecko variant
)

VECTOR_STORE_PATH = BASE_PATH + "vector_stores/"


def _build_and_save_faiss(chunks, embedding_model, store_dir):
    """Embed ``chunks`` into a FAISS index and persist it under ``store_dir``.

    Args:
        chunks: Documents to index.
        embedding_model: Embedding object handed to ``FAISS.from_documents``.
        store_dir: Directory passed to ``save_local``.

    Returns:
        The in-memory FAISS vector store that was just saved.
    """
    store = FAISS.from_documents(documents=chunks, embedding=embedding_model)
    store.save_local(store_dir)
    return store


# Vector store for the CGI corpus
cgi_store_dir = VECTOR_STORE_PATH + "cgi_faiss"
db_cgi = _build_and_save_faiss(chunks_cgi, embedding, cgi_store_dir)
# Print the directory that was actually written (the message previously lacked
# a space and showed a path without the "_faiss" suffix).
print(f"✅ Vector store CGI sauvegardé dans {cgi_store_dir}")

# Vector store for the BOFiP corpus
bofip_store_dir = VECTOR_STORE_PATH + "bofip_faiss"
db_bofip = _build_and_save_faiss(chunks_bofip, embedding, bofip_store_dir)
print(f"✅ Vector store Bofip sauvegardé dans {bofip_store_dir}")


Forbidden: 403 POST https://us-central1-aiplatform.googleapis.com/v1/projects/fiscalia/locations/us-central1/publishers/google/models/textembedding-gecko@001:predict?%24alt=json%3Benum-encoding%3Dint: Vertex AI API has not been used in project fiscalia before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/aiplatform.googleapis.com/overview?project=fiscalia then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry. [{'@type': 'type.googleapis.com/google.rpc.ErrorInfo', 'reason': 'SERVICE_DISABLED', 'domain': 'googleapis.com', 'metadata': {'service': 'aiplatform.googleapis.com', 'serviceTitle': 'Vertex AI API', 'activationUrl': 'https://console.developers.google.com/apis/api/aiplatform.googleapis.com/overview?project=fiscalia', 'consumer': 'projects/fiscalia', 'containerInfo': 'fiscalia'}}, {'@type': 'type.googleapis.com/google.rpc.LocalizedMessage', 'locale': 'en-US', 'message': 'Vertex AI API has not been used in project fiscalia before or it is disabled. Enable it by visiting https://console.developers.google.com/apis/api/aiplatform.googleapis.com/overview?project=fiscalia then retry. If you enabled this API recently, wait a few minutes for the action to propagate to our systems and retry.'}, {'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Google developers console API activation', 'url': 'https://console.developers.google.com/apis/api/aiplatform.googleapis.com/overview?project=fiscalia'}]}]