In [3]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma

In [4]:
kb_folder = '../knowledge_base/'

all_docs = []

print("Loading documents from knowledge_base...")

for file_name in os.listdir(kb_folder):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(kb_folder, file_name)
        loader = PyPDFLoader(file_path)
        docs = loader.load_and_split() 

        all_docs.extend(docs)

print(f"Loaded {len(all_docs)} pages from {len(os.listdir(kb_folder))} PDF files.")

Loading documents from knowledge_base...
Loaded 31 pages from 5 PDF files.


In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200 
)

print("Splitting documents into chunks...")
chunks = text_splitter.split_documents(all_docs)

print(f"Total chunks created: {len(chunks)}")

Splitting documents into chunks...
Total chunks created: 115


In [9]:
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [10]:
db_path = '../models/chroma_db'
print("Creating and saving vector database.")
vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory=db_path
)

print(f"Vector database saved to: {db_path}")

Creating and saving vector database.
Vector database saved to: ../models/chroma_db


In [None]:
retriever = vector_db.as_retriever(search_kwargs={"k": 3})

query = "What can reduce the chances of getting a heart attack"

print(f"\nTesting database with query: '{query}'")

relevant_docs = retriever.invoke(query)

for i, doc in enumerate(relevant_docs):
    print(f"\n--- RELEVANT CHUNK {i+1} ---")
    print(doc.page_content)
    print(f"(Source: {doc.metadata['source']})")


Testing database with query: 'What can reduce the chances of getting a heart attack'

--- RELEVANT CHUNK 1 ---
` Reduces the risk of abdominal aortic aneurysm, with risk reduction increasing with time since cessation. 
 ` May reduce the risk of atrial fibrillation, sudden cardiac death, heart failure, venous thromboembolism, and peripheral arterial disease.
Benefits of Smoking Cessation for Patients With Coronary Heart Disease
 ` Reduces the risk of all-cause mortality.
 ` Reduces the risk of death due to cardiac causes and sudden death.
 ` Reduces the risk of new and recurrent cardiac events.
Smoking and Cardiovascular Disease 
What Healthcare Professionals Need to Know
Smoking cessation improves cardiovascular 
health. Healthcare professionals, particularly 
those in cardiovascular care, should treat 
patients’ tobacco use and dependence.
(Source: ../knowledge_base/smoking.pdf)

--- RELEVANT CHUNK 2 ---
` Reduces the risk of abdominal aortic aneurysm, with risk reduction increasing 