In [22]:
import os
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [23]:
# Relative path of the folder that is scanned for "*.pdf" files when the
# documents are loaded below.
DATA_PATH: str = "pdfs"


In [24]:
# Load every PDF that sits directly inside DATA_PATH.
# NOTE: PyPDFLoader yields one Document per PDF page, so the count printed
# below is a page count rather than a file count.
loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

# Stop here with a clear message when nothing was loaded; otherwise the
# failure only surfaces later, while the vector index is being built.
if not documents:
    raise ValueError(f"No PDF content could be loaded from {DATA_PATH!r}.")

print(f"Loaded {len(documents)} documents.")

Loaded 2 documents.


In [25]:
# Cut the loaded documents into chunks of at most 500 characters, with
# neighbouring chunks sharing 50 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)
print(f"Total Chunks Created: {len(chunks)}")

# Embedding backend used later to vectorise the chunks.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)
print("Embedding model loaded.")

Total Chunks Created: 3
Embedding model loaded.


In [30]:
# Folder that receives the persisted index; defined once so the save call
# and the verification below cannot drift apart.
VECTOR_STORE_DIR = "vector_store"

# Embed every chunk and build a FAISS index over the resulting vectors,
# then write the index to disk.
vector_store = FAISS.from_documents(chunks, embedding_model)
vector_store.save_local(VECTOR_STORE_DIR)

# Verify that the index file itself was written. Testing only for the
# directory would also succeed for an empty or unrelated folder.
# save_local() uses the default index name "index", i.e. "index.faiss".
index_file = os.path.join(VECTOR_STORE_DIR, "index.faiss")
if os.path.isfile(index_file):
    print("✅ FAISS vector store created successfully!")
else:
    print("❌ Error: FAISS vector store not found.")


# Print the metadata dict of each loaded document.
for doc in documents:
    print(doc.metadata)

✅ FAISS vector store created successfully!
{'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-02-16T16:26:17+00:00', 'source': 'pdfs\\Printing of SNS DT Playbook after Editing for SNS Students (2)_removed.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}
{'producer': 'Qt 4.8.7', 'creator': 'wkhtmltopdf 0.12.3', 'creationdate': '2024-01-02T17:48:54+05:30', 'title': '', 'source': 'pdfs\\R23-2407402.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1'}
