# Create Vector Database to be used for local RAG

This notebook shows how to create a simple vector database that stores text chunks extracted from a PDF file.

- PDF file: Lafayette High School Course Directory 2025-2026
- VectorDB: Chroma
- Embedding Model: HuggingFaceEmbeddings

In [None]:
from langchain_chroma import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from uuid import uuid4

## Load PDF file using a document loader

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the PDF to ingest; a relative path, so it is resolved against the
# notebook's current working directory.
file_path = "LHS_Course_Directory_25-26.pdf"

loader = PyPDFLoader(file_path)
# Collect every document the loader yields into a plain list so later cells
# can index into it and modify the entries in place.
pages = []
# NOTE: a top-level `async for` only works where top-level await is enabled
# (e.g. Jupyter/IPython autoawait); in a plain .py script it is a SyntaxError.
async for page in loader.alazy_load():
    pages.append(page)

## Clean blank lines from pages

In [None]:
# Normalise every page in place: trim each line and drop the lines left empty
for page in pages:
    # Lazily strip leading/trailing whitespace from each newline-separated line
    trimmed = (raw.strip() for raw in page.page_content.split('\n'))
    # Keep only the non-empty results and glue them back with single newlines
    page.page_content = '\n'.join(text for text in trimmed if text)

### (optional) Check a sample page

In [None]:
# Check the cleaned content of one sample page.
# The index is an arbitrary spot check; guard it so a document with fewer
# pages reports that fact instead of raising IndexError.
sample_index = 25
if sample_index < len(pages):
    sample_page = pages[sample_index]
    print("Cleaned page Example:")
    print(repr(sample_page.metadata))
    # Show only the first 500 chars; repr() makes the newlines visible
    print(repr(sample_page.page_content[:500]))
else:
    print(f"No page at index {sample_index}: only {len(pages)} pages were loaded.")

## Split each page into smaller chunks

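Tip: Splitting is often, but not always, necessary: smaller chunks make retrieval more precise and keep each chunk within the embedding model's input limit.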

In [None]:
# Configure the splitter that breaks each page into overlapping chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,       # upper bound on the size of each chunk
    chunk_overlap=100,     # amount shared between neighbouring chunks
    add_start_index=True,  # ask the splitter to record where each chunk starts
)

# Split every page; the result is a flat list of chunk documents
splits = text_splitter.split_documents(pages)

## Choose an embedding model, DB, and store the text chunks

In [None]:
# Embedding model: no model name is passed, so the library default is used
embedding_model = HuggingFaceEmbeddings()

# Vector store backed by a directory on disk so the data persists
database_loc = "./chroma_db_test1"
vector_store = Chroma(
    persist_directory=database_loc,
    embedding_function=embedding_model,
)

# One freshly generated random ID per text chunk
uuids = [str(uuid4()) for _ in splits]

# Add text chunks.
# NOTE: the IDs are random, so re-running this cell against the same
# directory adds the same chunks again under new IDs.
vector_store.add_documents(splits, ids=uuids)