In [11]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from tqdm import tqdm
from pdf_ingestor import extract_and_split_pdf
from ytb_ingestor import extract_and_split_ytb
from vectorestore import index_documents, create_vectorstore
import os 
from logger import get_logger
logger = get_logger()

In [8]:
embedding_model = HuggingFaceBgeEmbeddings(
    model_name="../../multilingual-e5-large",
    encode_kwargs={"normalize_embeddings": True},
    query_instruction="query: "
)

In [18]:
create_vectorstore(embedding_model, "../../faiss_vectorestore")

[20:58:55.755] INFO - Création d'un index FAISS vide (dim = 1024)
[20:58:55.923] INFO - Sauvegarde de l'index vide dans: ../../faiss_vectorestore
[20:58:55.925] INFO - Index vide (cosinus) créé avec succès.


<langchain_community.vectorstores.faiss.FAISS at 0x1ea2f3396a0>

In [19]:
folder_path = "../../data"
pdf_files_list = ["Renault DEU 2020.pdf", "Renault DEU 2021.pdf", "Renault URD 2022.pdf", "Renault URD 2023.pdf", "Renault 2024 Rapport d’activité.pdf"]

dict_ytb_videos_urls = {
    "PLAN STRATEGIQUE RENAULUTION" : "https://www.youtube.com/watch?app=desktop&v=EtivAvmDr2Q&t=901s",
    "Conférence résultats financiers 2021 de Renault Group" : "https://www.youtube.com/watch?v=VfIeaIFSCQA",
    "Conférence résultats financiers 2022 de Renault Group" : "https://www.youtube.com/watch?v=UWHlyjVtwT8",
    "Conférence résultats financiers 2023 de Renault Group" : "https://www.youtube.com/watch?v=B57wephix-w",
    "Conférence résultats financiers 2024 de Renault Group" : "https://www.youtube.com/watch?v=BA5ZOtWfpY0",
}

In [20]:
pdf_chunks = []
ytb_chunks = []
CHUNK_SIZE = 450
OVERLAP = 64

for filename in (pdf_files_list) : 
    file_path = os.path.join(folder_path, filename)
    pdf_chunks.extend(extract_and_split_pdf(file_path, CHUNK_SIZE, OVERLAP))

"""for video_name, url in dict_ytb_videos_urls.items() : 
    ytb_chunks.extend(extract_and_split_ytb(url, video_name, CHUNK_SIZE, OVERLAP))

all_chunks = pdf_chunks + ytb_chunks"""

[20:58:59.715] INFO - Chargement du PDF : Renault DEU 2020.pdf
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
[21:03:47.726] INFO - PDF chargé avec succès : Renault DEU 2020.pdf (1 pages)
[21:03:48.002] INFO - Chunking terminé : 3423 chunks générés
[21:03:48.003] INFO - Chargement du PDF : Renault DEU 2021.pdf
[21:04:58.240] INFO - PDF chargé avec succès : Renault DEU 2021.pdf (1 pages)
[21:04:58.590] INFO - Chunking terminé : 3022 chunks générés
[21:04:58.591] INFO - Chargement du PDF : Renault URD 2022.pdf
Cannot set gray non-stroke color because /'P0' is an invalid float value
[21:05:39.757] INFO - PDF chargé avec succès : Renault URD 2022.pdf (1 pages)
[21:05:39.758] INFO - Chunking terminé : 0 chunks générés
[21:05:39.760] INFO - Chargement du PDF : Renault URD 2023.pdf
[21:07:28.045] INFO - PDF chargé avec succès : Renault URD 2023.pdf (1 pages)
[21:07:28.570] INFO - Chunking termin

'for video_name, url in dict_ytb_videos_urls.items() : \n    ytb_chunks.extend(extract_and_split_ytb(url, video_name, CHUNK_SIZE, OVERLAP))\n\nall_chunks = pdf_chunks + ytb_chunks'

In [24]:
ytb_chunks = []
CHUNK_SIZE = 450
OVERLAP = 64

for video_name, url in dict_ytb_videos_urls.items() : 
    ytb_chunks.extend(extract_and_split_ytb(url, video_name, CHUNK_SIZE, OVERLAP))

[00:00:55.260] INFO - Extraction des sous-titres pour la vidéo : PLAN STRATEGIQUE RENAULUTION (https://www.youtube.com/watch?app=desktop&v=EtivAvmDr2Q&t=901s)
[00:00:56.489] ERROR - Erreur lors de l'extraction ou du chunking de la vidéo 'PLAN STRATEGIQUE RENAULUTION': 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=EtivAvmDr2Q! This is most likely caused by:

Request to YouTube failed: 429 Client Error: Too Many Requests for url: https://www.youtube.com/api/timedtext?v=EtivAvmDr2Q&ei=lf1maKSiJYvlxN8PtoKzyQo&caps=asr&opi=112496729&xoaf=5&hl=en&ip=0.0.0.0&ipbits=0&expire=1751605253&sparams=ip,ipbits,expire,v,ei,caps,opi,xoaf&signature=3D58C2AA3C75C3059599410E31F275AF3AF01411.695190225552EBFEBECBA6E8CAADF459EA12A9DE&key=yt8&kind=asr&lang=fr

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which

In [23]:
index_documents(pdf_chunks, embedding_model, "../../faiss_vectorestore")

[21:07:59.148] INFO - Début de l’indexation de 9678 chunks en batches de 64...
[21:07:59.149] INFO - Calcul des embeddings et indexation par batch avec barre de progression...
Indexing:   0%|          | 0/152 [00:00<?, ?batch/s][21:07:59.153] INFO - Indexation du batch 1...
Indexing:   1%|          | 1/152 [00:49<2:05:06, 49.71s/batch][21:08:48.865] INFO - Indexation du batch 2...
Indexing:   1%|▏         | 2/152 [02:04<2:40:40, 64.27s/batch][21:10:03.322] INFO - Indexation du batch 3...
Indexing:   2%|▏         | 3/152 [03:10<2:42:11, 65.31s/batch][21:11:09.874] INFO - Indexation du batch 4...
Indexing:   3%|▎         | 4/152 [04:15<2:40:27, 65.05s/batch][21:12:14.530] INFO - Indexation du batch 5...
Indexing:   3%|▎         | 5/152 [05:49<3:04:38, 75.36s/batch][21:13:48.173] INFO - Indexation du batch 6...
Indexing:   4%|▍         | 6/152 [07:05<3:03:57, 75.60s/batch][21:15:04.241] INFO - Indexation du batch 7...
Indexing:   5%|▍         | 7/152 [08:05<2:51:02, 70.78s/batch][21:16:05

In [15]:
"""pdf_chunks = extract_and_split_pdf(file_path, 450, 64)
ytb_chunks = extract_and_split_ytb("https://www.youtube.com/watch?v=VfIeaIFSCQA", 450, 64)"""

'pdf_chunks = extract_and_split_pdf(file_path, 450, 64)\nytb_chunks = extract_and_split_ytb("https://www.youtube.com/watch?v=VfIeaIFSCQA", 450, 64)'

In [None]:
import pickle

with open("../../vectorstore/documents.pkl", "wb") as f:
    pickle.dump(pdf_chunks, f)