In [1]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import LanceDB
from langchain_community.embeddings import HuggingFaceEmbeddings
import lancedb
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI


In [2]:
PDF_PATH = "Kenya-ARV-Guidelines-2022-Final-1.pdf"

loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

print(f"Loaded {len(documents)} pages")


Loaded 286 pages


In [3]:
# Chunking

In [4]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", " "]
)

chunks = text_splitter.split_documents(documents)
print(f"Created {len(chunks)} chunks")


Created 1013 chunks


In [5]:
documents

[Document(page_content=' \n \n', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 1}),
 Document(page_content='  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nKenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition  \n ', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content=' \n  \n \n \n \n \n \n \n \n \n \n© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright la

In [6]:
chunks

[Document(page_content='Kenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content='© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright law.  \n \nKenya HIV Prevention and Treatment Guidelines, 2022  edition contain relevant information required by \nhealthcare providers in the use of ARVs as of the date of issue. All reasonable precautions have been taken \nby NASCOP to verify the information contained in this guideline document.', metadata={'source': '

In [7]:
# Chunking wit LLM

In [8]:
from dotenv import load_dotenv
load_dotenv()  # OPENAI_API_KEY from .env


In [9]:
from openai import OpenAI

openai_client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=messages
    )

    return response.output_text

In [10]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [11]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [12]:
documents

[Document(page_content=' \n \n', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 0}),
 Document(page_content='', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 1}),
 Document(page_content='  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nKenya HIV Prevention and Treatment Guidelines, 2022  \n \n2022 Edition  \n ', metadata={'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf', 'page': 2}),
 Document(page_content=' \n  \n \n \n \n \n \n \n \n \n \n© National AIDS & STI Control Program 2022  \n \nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health \nKenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any \nmeans, including photocopying or recording, without the prior written permission of the National AIDS and \nSTI Contro l Program (NASCOP), Ministry of Health Kenya, except for non -commercial uses permitted by \ncopyright la

In [13]:
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

guides_chunks = []

def process_doc(doc):
    """
    Splits a single document into sections and returns a list of section dicts.
    """
    doc_content = doc.page_content
    doc_metadata = doc.metadata
    sections = intelligent_chunking(doc_content)
    return [{**doc_metadata, 'section': section} for section in sections]

# Use ThreadPoolExecutor to process multiple docs in parallel
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(process_doc, doc) for doc in documents]
    
    # Create a tqdm progress bar with total=len(futures)
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing docs"):
        # future.result() blocks until the future is done
        guides_chunks.extend(future.result())

print(f"Total chunks created: {len(guides_chunks)}")



Processing docs:   0%|          | 0/286 [00:00<?, ?it/s]

Total chunks created: 1277


In [15]:
# Embedding

In [16]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)


In [17]:
guides_chunks

[{'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 0,
  'section': 'Sure! Please provide the document you would like me to split into logical sections.'},
 {'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 1,
  'section': 'Sure, please provide the document you would like me to split into logical sections for your Q&A system.'},
 {'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 3,
  'section': '## Copyright Information\n\nThis guideline document is a publication of the National AIDS & STI Control Program, Ministry of Health Kenya. No part of this publication may be reproduced, distributed, or transmitted in any form or by any means, including photocopying or recording, without the prior written permission of the National AIDS and STI Control Program (NASCOP), Ministry of Health Kenya, except for non-commercial uses permitted by copyright law.'},
 {'source': 'Kenya-ARV-Guidelines-2022-Final-1.pdf',
  'page': 3,
  'section': '## Purpose of Guidelines\n

In [19]:
from langchain_community.vectorstores import LanceDB
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
import lancedb
import pyarrow as pa


# LanceDB configuration
db_path = "./kenya_arv_guidelines_lancedb"
table_name = "kenya-arv-guidelines"

# Connect to LanceDB
db = lancedb.connect(db_path)

# Drop existing table if it exists (for fresh start)
if table_name in db.table_names():
    print(f"Dropping existing table '{table_name}'...")
    db.drop_table(table_name)

# Create vectorstore using add_texts method
print("Creating new vectorstore...")

# Extract texts and metadatas
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]

# Generate embeddings manually
print("Generating embeddings...")
vectors = embeddings.embed_documents(texts)

# Create the table manually with proper schema
data = []
for i, (text, metadata, vector) in enumerate(zip(texts, metadatas, vectors)):
    data.append({
        "text": text,
        "vector": vector,
        "id": str(i),
        "source": metadata.get("source", ""),
        "page": metadata.get("page", 0)
    })

# Create table
table = db.create_table(table_name, data=data, mode="overwrite")
print(f"✓ Created table with {len(data)} records")

# Now create the LanceDB vectorstore wrapper
vectorstore = LanceDB(
    connection=table,
    embedding=embeddings
)
print("✓ Vectorstore created successfully!")

# Search
query = "What are the first-line ART regimens?"
print(f"\n🔍 Searching for: '{query}'")
print("=" * 80)

results = vectorstore.similarity_search(query, k=4)

for i, doc in enumerate(results, 1):
    print(f"\n📄 Result {i}:")
    print(f"Source: {doc.metadata.get('source', 'Unknown')}")
    print(f"Page: {doc.metadata.get('page', 'N/A')}")
    print(f"\nContent:\n{doc.page_content[:300]}...")
    print("-" * 80)

Creating new vectorstore...
Generating embeddings...
✓ Created table with 286 records
✓ Vectorstore created successfully!

🔍 Searching for: 'What are the first-line ART regimens?'

📄 Result 1:
Source: Kenya-ARV-Guidelines-2022-Final-1.pdf
Page: 236

Content:
  
Annexes  
 
13 - 15 Annex 8: Cont.  
Section 13: Management plan  
• Which investigations will you have today  
- See Table 3.2 and Table 3.5 for recommended baseline and follow -up investigations 
respectively  
• Which medications will you start today  
- May include: ART; CPT; TPT; other  
• W...
--------------------------------------------------------------------------------

📄 Result 2:
Source: Kenya-ARV-Guidelines-2022-Final-1.pdf
Page: 54

Content:
  
Initial Evaluation and Follow up  
3 - 9  3.5.1 First 6 months after ART initiation  
After ART initiation, patients need to be monitored closely for development of adverse drug events, 
identify and address barriers to adherence, and development of IRIS. A reasonable follow

[2026-02-10T15:39:30Z WARN  lance::dataset] No existing dataset at /Users/itsmuriuki/Desktop/cdss-notebooks/kenya-hiv-cdss/kenya_arv_guidelines_lancedb/kenya-arv-guidelines.lance, it will be created
