First, we load the PDF documents and split them into overlapping text chunks.

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os

# Location of the source material. Despite the variable name, the configured
# value looks like a single PDF file, so both a directory of PDFs and a path
# to one PDF are accepted below.
pdf_folder = "data/degree_requirements.pdf"

# Resolve the PDF paths once, up front, so the loading loop and the summary
# message agree on exactly which files were processed.
if os.path.isdir(pdf_folder):
    # Sort for a deterministic load order: os.listdir() returns entries in
    # arbitrary order, which would otherwise change chunk order between runs.
    pdf_paths = sorted(
        os.path.join(pdf_folder, entry)
        for entry in os.listdir(pdf_folder)
        if entry.lower().endswith(".pdf")
    )
elif os.path.isfile(pdf_folder) and pdf_folder.lower().endswith(".pdf"):
    pdf_paths = [pdf_folder]
else:
    raise FileNotFoundError(f"No PDF file or folder found at {pdf_folder!r}")

all_docs = []

for pdf_path in pdf_paths:
    filename = os.path.basename(pdf_path)
    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    # Tag each loaded document with the bare file name (not the full path)
    # so the originating PDF can be identified later.
    for doc in docs:
        doc.metadata["source"] = filename

    all_docs.extend(docs)

# Report the number of PDFs actually loaded, not the number of directory
# entries (which may include non-PDF files).
print(f"Loaded {len(all_docs)} pages from {len(pdf_paths)} PDF files.")

# Split documents into smaller chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,   # size per chunk in characters
    chunk_overlap=200, # overlap between consecutive chunks
    length_function=len,
    # Separators are listed from coarsest to finest: paragraph, line,
    # sentence, word.
    separators=["\n\n", "\n", ".", " "]
)

chunks = splitter.split_documents(all_docs)
print(f"Total chunks created: {len(chunks)}")

Next, embed the chunks with OllamaEmbeddings, build a FAISS vector store from them, and save the index to disk.

In [None]:
from langchain_community.vectorstores import FAISS
# Import from langchain_community: the `langchain.embeddings` re-export of
# OllamaEmbeddings is deprecated.
from langchain_community.embeddings import OllamaEmbeddings  # or OpenAIEmbeddings, etc.

# Fail early with a clear message rather than an opaque error from index
# construction when the previous cell produced no chunks.
if not chunks:
    raise ValueError("No chunks to embed; check that the PDFs were loaded and split.")

# NOTE(review): presumably requires a reachable Ollama server with the
# "mxbai-embed-large" model available — confirm in the target environment.
embedding = OllamaEmbeddings(model="mxbai-embed-large")  # or any other embedding model

# Embed every chunk and build the FAISS index from the resulting vectors.
vectorstore = FAISS.from_documents(chunks, embedding)

# Save the index under "degree_requirements_index" for later reuse.
vectorstore.save_local("degree_requirements_index")