In [6]:
import pickle
from pathlib import Path

import faiss
import numpy as np
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer


In [7]:
# Load all PDF files from docs/ folder
pdf_dir = Path("../docs")  # adjust if your path is different

# Fail loudly here: globbing a missing directory yields nothing, which would
# otherwise surface later as a confusing error on an empty corpus.
if not pdf_dir.is_dir():
    raise FileNotFoundError(f"PDF directory not found: {pdf_dir.resolve()}")

all_texts = []

# glob() returns paths in arbitrary order; sorting keeps the document order
# (and therefore the position of every chunk/vector) stable across runs.
for pdf_path in sorted(pdf_dir.glob("*.pdf")):
    reader = PdfReader(pdf_path)
    # extract_text() may return None/empty for a page; treat that as "".
    # Pages are joined with a newline so the last word of one page is not
    # fused with the first word of the next.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    # Skip files from which no text could be extracted
    if text.strip():
        all_texts.append({"filename": pdf_path.name, "content": text})


In [8]:
# Split texts into smaller chunks
def split_text(text, chunk_size=500, overlap=50):
    """Split ``text`` into overlapping fixed-size character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared between consecutive chunks;
            must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunks in document order. Empty input yields ``[]``. The
        final chunk may be shorter than ``chunk_size``. No chunk is emitted
        that lies entirely inside the previous one.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # overlap >= chunk_size would make the step zero or negative, so the
    # window would never advance (an endless loop in the naive version).
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Once a window reaches the end of the text, any further window would
        # only repeat the overlap tail already contained in this chunk.
        if end >= text_length:
            break
    return chunks

# Flatten every loaded PDF into one record per chunk, each tagged with the
# name of the file it came from.
documents = [
    {"filename": item["filename"], "text": chunk}
    for item in all_texts
    for chunk in split_text(item["content"])
]


In [9]:
# Instantiate the sentence-transformer used to embed the chunks
model = SentenceTransformer("all-MiniLM-L6-v2")

# Collect the raw chunk strings in the same order as `documents`, so that
# row i of `embeddings` corresponds to documents[i]
texts = []
for doc in documents:
    texts.append(doc["text"])

# Encode all chunks in one call, showing a progress bar
embeddings = model.encode(texts, show_progress_bar=True)


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  2.13it/s]


In [10]:
# Create FAISS index
# FAISS expects a contiguous 2-D float32 matrix (one row per chunk).
vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

# Fail with a clear message instead of an IndexError when nothing was embedded.
if vectors.ndim != 2 or vectors.shape[0] == 0:
    raise ValueError(
        "No embeddings to index - check that the PDFs were loaded and chunked."
    )

dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(vectors)

# Row i of the index must map back to documents[i] at query time.
assert index.ntotal == len(documents), "index/documents length mismatch"

# Optional: Save index and documents
# NOTE(review): PDFs are read from "../docs" but artifacts are written to
# "docs/" relative to the current working directory - confirm this is intended.
index_path = Path("docs/lmu_index.faiss")
documents_path = Path("docs/lmu_documents.pkl")

# Create the output folder if needed so the writes below cannot fail on a
# missing directory.
index_path.parent.mkdir(parents=True, exist_ok=True)

faiss.write_index(index, str(index_path))

with open(documents_path, "wb") as f:
    pickle.dump(documents, f)
