In [39]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
import pymupdf  
import re

In [40]:
# Sentence-embedding model loaded from a local checkpoint directory.
# Queries get the "query: " prefix here; the matching "passage: " prefix is
# added to each chunk's text in extract_and_split_pdf.
# normalize_embeddings=True is forwarded to the encoder via encode_kwargs.
embedding_model = HuggingFaceBgeEmbeddings(
    query_instruction="query: ",
    encode_kwargs=dict(normalize_embeddings=True),
    model_name="../multilingual-e5-large",
)

In [None]:
def clean_text(raw_text : str) -> str :
    """Normalise raw extracted text before it is split into chunks.

    The cleaning is deliberately conservative so that paragraph structure
    (blank lines) survives and can still be used as a split point:

    * Windows / old-Mac line endings are converted to ``\\n``.
    * Control characters other than newline and tab are removed.
    * Runs of spaces, tabs and non-breaking spaces collapse to one space.
    * Spaces around line breaks are dropped.
    * Three or more consecutive newlines collapse to a single blank line.
    * Leading and trailing whitespace is stripped.

    Args:
        raw_text: Text to clean. ``None`` or an empty string is accepted.

    Returns:
        The cleaned text; an empty string when there is nothing to clean.
    """
    if not raw_text:
        return ""

    # Unify line endings first so the later newline rules see only "\n".
    text = raw_text.replace("\r\n", "\n").replace("\r", "\n")
    # Strip control characters, keeping "\t" (\x09) and "\n" (\x0a).
    text = re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]", "", text)
    # Collapse horizontal whitespace, including non-breaking spaces.
    text = re.sub(r"[ \t\u00a0]+", " ", text)
    # Remove spaces hugging a line break.
    text = re.sub(r" *\n *", "\n", text)
    # Keep at most one blank line between paragraphs.
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def extract_and_split_pdf(file_path: str, chunk_size : int, chunk_overlap : int) -> list :
    """Read a PDF page by page and split each page's text into chunks.

    Every chunk is wrapped in a ``Document`` whose ``page_content`` is the
    chunk text prefixed with ``"passage: "`` and whose metadata records the
    source path and the 1-based page number the chunk came from. Splitting
    is done per page, so no chunk spans two pages.

    Args:
        file_path: Path of the PDF to open.
        chunk_size: ``chunk_size`` passed to the text splitter.
        chunk_overlap: ``chunk_overlap`` passed to the text splitter.

    Returns:
        A list of ``Document`` objects, in page order then chunk order.
    """
    # Build the splitter before opening the file: if construction fails
    # there is no open handle to clean up.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " "]
    )

    all_chunks = []
    doc = pymupdf.open(file_path)
    try:
        for page_num, page in enumerate(doc, start=1):
            for chunk in splitter.split_text(page.get_text()):
                all_chunks.append(Document(
                    page_content=f"passage: {chunk}",
                    metadata={
                        "source": file_path,
                        "page_number": page_num,
                    }
                ))
    finally:
        # Always release the PDF, even when extraction or splitting raises.
        doc.close()

    return all_chunks


def index_documents(chunks, embedding_model, index_path="../faiss_vectorstore/faiss_index_cv") :
    """Embed the given documents into a FAISS index and save it to disk.

    Args:
        chunks: Iterable of ``Document`` objects to index.
        embedding_model: Embeddings object handed to ``FAISS.from_documents``.
        index_path: Folder the index is written to with ``save_local``.
            Defaults to the location this notebook has always used.

    Raises:
        ValueError: If ``chunks`` is empty, since there is nothing to index.
    """
    # Materialise once so generators are supported and emptiness is checkable.
    chunks = list(chunks)
    if not chunks:
        raise ValueError("index_documents: 'chunks' is empty, nothing to index")

    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local(index_path)

In [46]:
# Ingest the CV PDF: split it into chunks, then build and save the FAISS index.
file_path = "../data/CV_Data_Science_Haboubacar_TB.pdf"
all_cv_chunks = extract_and_split_pdf(file_path, chunk_size=450, chunk_overlap=64)
# NOTE(review): only the first 10 chunks are indexed - confirm this cap is
# intentional and not a leftover from debugging.
chunks = all_cv_chunks[:10]
index_documents(chunks, embedding_model)

In [2]:
# Project-local logging helper (module `logger`).
from logger import get_logger
# Logger for this notebook, registered under the name "faiss_ingestion".
logger = get_logger("faiss_ingestion")

# Emit a start-of-run marker (the French message reads "Ingestion started...").
logger.info("Ingestion démarrée...")


2025-07-02 14:04:37,851 - INFO - Ingestion démarrée...
