In [19]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import FAISS
from langchain_community.document_loaders import YoutubeLoader
import os  
import re

In [3]:
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="multilingual-e5-large",
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="query: "
)

No sentence-transformers model found with name sentence-transformers/multilingual-e5-large. Creating a new one with mean pooling.


OSError: sentence-transformers/multilingual-e5-large is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
def clean_raw_text(text: str) -> str:
    # 1. Fusionner les lignes coupées
    text = re.sub(r'(?<![.\n])\n(?![\n0-9•\-])', ' ', text)
    
    # 2. Supprimer les pieds de page / mentions AMF
    text = re.sub(r"GROUPE RENAULT\s+I\s+DOCUMENT D.*?\d{4}", '', text, flags=re.IGNORECASE)
    text = re.sub(r"Ce document.*?AMF", "", text, flags=re.DOTALL | re.IGNORECASE)

    # 3. Supprimer les lignes parasites du sommaire
    lines = text.splitlines()
    clean_lines = []
    for line in lines:
        line = line.strip()
        if re.match(r'^\d{1,3}$', line): continue
        if re.match(r'^[A-Z\s\-]{5,}$', line): continue
        if re.match(r'^\d+(\.\d+)*\s+', line): continue
        clean_lines.append(line)
    text = "\n".join(clean_lines)

    # 4. Supprimer les numéros de page
    text = re.sub(r'^Page \d+.*$', '', text, flags=re.MULTILINE | re.IGNORECASE)
    text = re.sub(r'^\s*\d{1,4}\s*$', '', text, flags=re.MULTILINE)

    # 5. Nettoyage espaces
    text = re.sub(r'[ \t]{2,}', ' ', text)
    text = re.sub(r'\n{2,}', '\n', text)

    return text.strip()

def join_short_lines(text: str, min_len: int = 60) -> str:
    lines = text.splitlines()
    result = []
    buffer = ""
    for i, line in enumerate(lines):
        line = line.strip()
        # Ignore ligne vide
        if not line:
            if buffer:
                result.append(buffer.strip())
                buffer = ""
            result.append("")
            continue
        # Si ligne est courte et ne finit pas par ponctuation forte
        if len(line) < min_len and not re.search(r'[.!?:;»”]\s*$', line):
            buffer += " " + line  # On accumule
        else:
            buffer += " " + line
            result.append(buffer.strip())
            buffer = ""
    # Append dernier buffer si nécessaire
    if buffer:
        result.append(buffer.strip())

    return "\n".join(result)

def create_chunks(documents : Document, filename : str, chunk_size : int, chunk_overlap : int, activate_cleaning = True) -> list :
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    all_chunks = []
    chunk_count = 0
    for doc in documents:
        page_num = documents.index(doc)
        if activate_cleaning : 
            cleaned_text_page = join_short_lines(clean_raw_text(doc.page_content))
            chunks = splitter.split_text(cleaned_text_page)
        else :
            chunks = splitter.split_text(doc.page_content)
        for chunk in chunks:
            all_chunks.append(Document(
                page_content = chunk,
                metadata={
                    "source": filename,
                    "page_number": page_num,
                    "chunk_id": chunk_count
                }
            ))
            chunk_count += 1
    return all_chunks

def extract_and_split_pdf(file_path: str, chunk_size: int, chunk_overlap: int) -> list:
    # Chargement du PDF avec structure
    filename = os.path.basename(file_path)
    loader = UnstructuredPDFLoader(file_path, 
                                   strategy="auto")
    documents = loader.load()  # Chaque page est un Document avec metadata
    print(documents)
    all_pdf_chunks = create_chunks(documents, filename, chunk_size, chunk_overlap)
    
    return all_pdf_chunks


def index_documents(chunks, embedding_model) : 
    vectorstore = FAISS.from_documents(chunks, embedding_model)
    vectorstore.save_local("../faiss_vectorstore/faiss_index_cv")

In [36]:
file_path = "../../data/CV_Data_Science_Haboubacar_TB.pdf"
#chunks = extract_and_split_pdf(file_path, 450, 64)[0:10]
#index_documents(chunks, embedding_model)

In [75]:
print(extract_and_split_pdf(file_path, 450, 64)[0].page_content)

[Document(metadata={'source': '../../data/CV_Data_Science_Haboubacar_TB.pdf'}, page_content='Haboubacar Tidjani Boukari\n\nData Scientist (NLP - GenAI - LLM)\n\nSp´ecialis´e en NLP, IA g´en´erative et machine learning, avec une expertise av´er´ee dans le d´eveloppement d’applications IA et l’optimisation de mod`eles. Fort de plusieurs exp´eriences r´eussies en NLP, MLOps, moteurs de recherche, et en gestion de projets. Je nourris depuis tout jeune l’envie et le rˆeve de travailler un jour chez Total Energie.\n\nCOMPETENCES\n\nLangages : Python, Node.js, SQL\n\nNLP & IA Gen : LangChain, LangGraph, Chainlit, FAISS, ONNX Runtime, Ollama, RAG Unsloth, LLM\n\nBig Data & Cloud: Spark, Hive, MySQL, PostgreSQL, Redis, Cloud Azure, GCP\n\nD´eploiement : Git, Docker, FastAPI, Flask\n\nOutils analytiques : Databricks, Power BI, MLFLOW, Alteryx\n\nSoft Skills : Leadership, gestion de projet agile, proactivit´e\n\nEXPERIENCES\n\nMc2i Consultant confirm´e - Data & IA\n\nDepuis Oct 2022\n\nMission : 

In [None]:
 
def load_youtube_video(url: str, chunk_size=1000, chunk_overlap=200):
    loader = YoutubeLoader.from_youtube_url(
        url,
        add_video_info=False,   
        language=["fr", "en"]  
    )
    documents = loader.load()
    

    return chunks

In [17]:
load_youtube_video("https://www.youtube.com/watch?v=VfIeaIFSCQA")

1
