In [1]:
import os
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
import faiss

# --- Config ---
# Input: folder that is searched for PDF files.
PDF_FOLDER = "Data"
# Output: folder the finished vector store is written to.
SAVE_DIR = "Testing_Big_data"
# Embedding backend: model name and the server URL it is requested from.
MODEL_NAME = "nomic-embed-text"
BASE_URL = "http://localhost:11434"

# Create the output folder up front; an already-existing folder is fine.
os.makedirs(SAVE_DIR, exist_ok=True)

# --- Load PDFs ---
# Walk PDF_FOLDER recursively and collect the path of every PDF file.
#   * The extension test is case-insensitive so files such as "REPORT.PDF"
#     are not silently skipped.
#   * The result is sorted because os.walk yields entries in arbitrary order;
#     a stable file order keeps the downstream chunk order reproducible
#     from run to run.
pdfs = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk(PDF_FOLDER)
    for file in files
    if file.lower().endswith(".pdf")
)

print(f"Found PDFs: {pdfs}")
print(len(pdfs))

Found PDFs: ['Data\\1mb.pdf', 'Data\\2mb.pdf', 'Data\\3-mb-sample-pdf-file.pdf', 'Data\\4-mb-example-file.pdf', 'Data\\40mb.pdf', 'Data\\5-mb-example-file.pdf', 'Data\\6mb.pdf', 'Data\\7mb.pdf', 'Data\\8mb.pdf']
9


In [2]:

# Load documents with PyMuPDFLoader
from langchain.schema import Document

# Load each PDF as a list of page documents and stamp every page's metadata
# with the path of the file it came from before collecting it.
docs = []
for pdf_path in pdfs:
    pdf_pages = PyMuPDFLoader(pdf_path).load()
    for pdf_page in pdf_pages:
        pdf_page.metadata["source"] = pdf_path
        # Make sure a "page" key is always present; a value the loader
        # already provided is left untouched, otherwise it becomes None.
        pdf_page.metadata.setdefault("page", None)
    docs.extend(pdf_pages)


In [3]:

# --- Chunk documents ---
# Break the loaded pages into chunks of size 1000 with an overlap of 100
# between neighbouring chunks.
# NOTE(review): sizes are presumably measured in characters (the splitter's
# default length function) -- confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(docs)
print(f"Split into {len(chunks)} chunks")


Split into 52686 chunks


In [4]:

# --- Embedding model ---
# Embedding client for the model MODEL_NAME, reached through BASE_URL.
embedding_model = OllamaEmbeddings(
    base_url=BASE_URL,
    model=MODEL_NAME,
)

In [5]:
# Determine the embedding dimension empirically: embed a throwaway query once
# and take the length of the returned vector. `dim` is what the FAISS index
# is sized with, so it must come from the same model that embeds the chunks.
sample_embedding = embedding_model.embed_query("test")
dim = len(sample_embedding)
print(f"Embedding dimension: {dim}")

Embedding dimension: 768


In [6]:

# --- Create FAISS index ---
# Flat L2 index whose dimensionality matches the embedding vectors.
index = faiss.IndexFlatL2(dim)

# --- Create vector store ---
# The store starts out empty: a fresh in-memory docstore and an empty
# index-position -> docstore-id mapping, both filled as documents are added.
vector_store = FAISS(
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id=dict(),
    embedding_function=embedding_model,
)

In [None]:
# Add docs to vector store
# The chunks are added in fixed-size batches rather than through one giant
# add_documents call. With tens of thousands of chunks (52,686 in the recorded
# run) a single call has to embed everything before anything is indexed,
# shows no progress, and loses all work if the embedding server fails midway.
# Batching keeps each request bounded and lets progress be reported.
# An empty `chunks` list simply results in `ids == []`.
ADD_BATCH_SIZE = 500  # chunks per add_documents call; tune to the embedding server

ids = []
for batch_start in range(0, len(chunks), ADD_BATCH_SIZE):
    batch = chunks[batch_start:batch_start + ADD_BATCH_SIZE]
    # add_documents returns the ids assigned to the documents it stored.
    ids.extend(vector_store.add_documents(batch))
    # "\r" rewrites the same output line instead of flooding the cell output.
    print(f"Embedded {len(ids)}/{len(chunks)} chunks", end="\r", flush=True)

print(f"Added {len(ids)} documents to vector store")


In [None]:

# Persist the populated vector store into SAVE_DIR.
# NOTE(review): save_local is understood to store the docstore as a pickle
# next to the index -- confirm, and only reload saved stores from trusted
# locations.
vector_store.save_local(folder_path=SAVE_DIR)
print(f"Saved vector store to {SAVE_DIR}")