In [1]:
import os 
import argparse
import shutil
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain.vectorstores.chroma import Chroma
from langchain_community.embeddings.ollama import OllamaEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.llamafile import Llamafile

In [12]:
chroma_path = "/Users/jps93/Desktop/AIPI_561_Project/chroma"
data_path = "/Users/jps93/Desktop/Retina"
#local_model_path = "/Users/jps93/Desktop/llamafiles/TinyLlama-1.1B-Chat-v1.0.F16.llamafile"


PROMPT_TEMPLATE = """
Answer the question based only on the following context:

You are an experienced vitreoretinal surgeon speaking with trainees. You can answer detailed questions about vitreoretinal surgery concisely and accurately. Keep response length short (within 100 works if possible)
You interpret the following abbreviations automatically: 

AC - anterior chamber, ACIOL – anterior chamber intraocular lens, AFX - air-fluid exchange,
AIDS - acquired immunodeficiency syndrome, AMD - age-related macular degeneration, AP - anterior-posterior, 
APD - afferent pupillary defect, BP - blood pressure, BRVO - branch retinal vein occlusion, 
BSS - balanced salt solution, cc - cubic centimeter, CME – cystoid macular edema, CMV - cytomegalovirus, 
CNV - choroidal neovascularization, cpm – cuts per minute, CSF- cerebrospinal fluid, 
CT - computed tomography, D5W - dextrose 5% in water, DD - disc diameter, DDX - differential diagnosis, 
EBV - Epstein-Barr virus, ERG – electroretinogram, ERM - epiretinal membrane, EUA - examination under anesthesia, 
FAX - fluid-air exchange, FB - foreign body, FDA – Food and Drug Administration,, FEVR - familial exudative vitreoretinopathy, 
FTMH – full-thickness macular hole, G – gauge, GA – geographic atrophy, GRT - giant retinal tear, 
HIV - human immunodeficiency virus, HOB - head of bed, HSV - Herpes simplex virus, HZV - Herpes zoster virus, 
ICG - indocyanine green, ILM - internal limiting membrane, IOFB - intraocular foreign body, 
IOL – intraocular lens, IOP – intraocular pressure, IRF- intraretinal fluid, IRH - intraretinal hemorrhage, 
IV - intravenous, LASIK - laser-assisted in situ keratomileusis, LP – light perception, mcg - microgram, 
MH - macular hole, MIVS - microincisional vitrectomy surgery, MRI - magnetic resonance imaging, ms - millisecond,
MVR - microvitreoretinal, mW - milliwatt, NCVH - non-clearing vitreous hemorrhage, NLP - no light perception, 
nm – nanometer, NSAID - nonsteroidal anti-inflammatory drug, NV - neovascularization, NVD – neovascularization of the disc, 
NVI – neovascularization of the iris, OCT - optical coherence tomography, OR - operating room, PCIOL – posterior chamber intraocular lens, 
PCR - polymerase chain reaction, PCV - polypoidal choroidal vasculopathy, PFCL - perfluorocarbon liquid, PI - peripheral iridotomy, 
PPV - pars plana vitrectomy, PRH - preretinal hemorrhage, PRK - photorefractive keratectomy, PRP - panretinal laser photocoagulation, 
PVD - posterior vitreous detachment, PVR - proliferative vitreoretinopathy, RAM - retinal arterial macroaneurysm, 
RD - retinal detachment, RRD - rhegmatogenous retinal detachment, ROP - retinopathy of prematurity, RP - retinitis pigmentosa,
RPE - retinal pigment epithelium, rtPA - recombinant tissue plasminogen activator, SB - scleral buckle, SRF - subretinal fluid, 
SRH - subretinal hemorrhage, TB - tuberculin, TRD - tractional retinal detachment, UGH – uveitis-glaucoma-hyphema, VA - visual acuity, 
VEGF - vascular endothelial growth factor, VH – vitreous hemorrhage, VMT - vitreomacular traction, VR - vitreoretinal, VZV - Varicella Zoster virus"

{context}

---

Answer the question based on the above context: {question}

"""

In [13]:
def load_documents():
    document_loader = PyPDFDirectoryLoader(data_path)
    return document_loader.load()

In [14]:
documents = load_documents()

In [15]:

def get_embedding_function():
    embeddings = OllamaEmbeddings(model = "nomic-embed-text")
    return embeddings

In [16]:
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap= 200,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)


In [17]:
documents = load_documents()
chunks = split_documents(documents)

In [18]:
def add_to_chroma(chunks: list[Document]):
    # Load the existing database.
    db = Chroma(
        persist_directory=chroma_path, embedding_function=get_embedding_function()
    )

    # Calculate Page IDs.
    chunks_with_ids = calculate_chunk_ids(chunks)

    # Add or Update the documents.
    existing_items = db.get(include=[])  # IDs are always included by default
    existing_ids = set(existing_items["ids"])
    print(f"Number of existing documents in DB: {len(existing_ids)}")

    # Only add documents that don't exist in the DB.
    new_chunks = []
    for chunk in chunks_with_ids:
        if chunk.metadata["id"] not in existing_ids:
            new_chunks.append(chunk)

    if len(new_chunks):
        print(f"👉 Adding new documents: {len(new_chunks)}")
        new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
        db.add_documents(new_chunks, ids=new_chunk_ids)
        db.persist()
    else:
        print("✅ No new documents to add")

In [19]:
def calculate_chunk_ids(chunks):

    # This will create IDs like "data/monopoly.pdf:6:2"
    # Page Source : Page Number : Chunk Index

    last_page_id = None
    current_chunk_index = 0

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        # If the page ID is the same as the last one, increment the index.
        if current_page_id == last_page_id:
            current_chunk_index += 1
        else:
            current_chunk_index = 0

        # Calculate the chunk ID.
        chunk_id = f"{current_page_id}:{current_chunk_index}"
        last_page_id = current_page_id

        # Add it to the page meta-data.
        chunk.metadata["id"] = chunk_id

    return chunks

In [20]:
add_to_chroma(chunks)

Number of existing documents in DB: 748
👉 Adding new documents: 694


In [21]:
def query_rag(query_text: str):
    # Prepare the DB.
    embedding_function = get_embedding_function()

    db = Chroma(persist_directory=chroma_path, embedding_function=embedding_function)

    # Search the DB.
    results = db.similarity_search_with_score(query_text, k=5)

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)

    # Instantiate and use the Llamafile to get the response
    llamafile = Llamafile()
    response_text = llamafile.invoke(prompt)

    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)
    return response_text

In [74]:
#query_text = "When using a DORC EVA machine, what is the purpose of using vacuum mode vs. flow mode?"

#query_rag(query_text)

In [75]:
#query_text = "How do I localize the position of breaks in retinal detachment?"

#query_rag(query_text)

In [76]:
#query_text = "What is BSS Plus infusion fluid and what are its properties?"

#query_rag(query_text)

In [77]:
#query_text = "What do I do when the hyaloid is particularly difficult to lift?"

#query_rag(query_text)

In [78]:
#query_text = "What are risks associated with using Endolaser?"

#query_rag(query_text)

In [79]:
#query_text = "What are the different probe types for Endolaser probes?"

#query_rag(query_text)

In [24]:
query_text = "What are the settings for repeat vs continuous mode for Endolaser?"

query_rag(query_text)

Response: Repeat Mode:  i. Initial settings: Power 200 mw, Duration 0.1 - 0.2 sec. Adjust power and distance of probe tip to retina to get desired burn size and intensity. Set the aiming beam brightness so it does not interfere with determining spot intensity, which should be gray-white, not bone-white, popping, or smoking.

Continuous Mode:

---

The settings for continuous mode are the same as repeat mode. However, the primary difference is that continuous mode allows the laser to operate indefinitely without interruption by turning it off and on again. In contrast, repeat mode requires continuous power (200 mW) to maintain adequate burn size and intensity.</s>
Sources: ['/Users/jps93/Desktop/RAG/Duke_Manual_PDFs/Vitreoretinal_Surgery/CHAPTER 11 ENDOLASER TYPES AND SETTINGS-need help with formatting (1).pdf:1:0', '/Users/jps93/Desktop/RAG/Duke_Manual_PDFs/Vitreoretinal_Surgery/CHAPTER 11 ENDOLASER TYPES AND SETTINGS-need help with formatting (1).pdf:1:1', '/Users/jps93/Desktop/RAG/Du

'Repeat Mode:  i. Initial settings: Power 200 mw, Duration 0.1 - 0.2 sec. Adjust power and distance of probe tip to retina to get desired burn size and intensity. Set the aiming beam brightness so it does not interfere with determining spot intensity, which should be gray-white, not bone-white, popping, or smoking.\n\nContinuous Mode:\n\n---\n\nThe settings for continuous mode are the same as repeat mode. However, the primary difference is that continuous mode allows the laser to operate indefinitely without interruption by turning it off and on again. In contrast, repeat mode requires continuous power (200 mW) to maintain adequate burn size and intensity.</s>'

In [23]:
query_text = "What are the risk factors for age related macular degeneration?"

query_rag(query_text)

Response: Risk factors for age-related macular degeneration (AMD) are older age, genetic risk, smoking, and low dietary intake of antioxidants. The epidemiology of AMD is predominantly driven by a combination of these factors. Risk factors have been shown to increase the likelihood of developing AMD over time, with some studies suggesting that early risk factors may be modifiable through lifestyle changes such as smoking cessation and regular exercise.

Examples of foods rich in antioxidants include berries (blueberries, strawberries), leafy green vegetables (spinach, kale, collard greens), cruciferous vegetables (broccoli, Brussels sprouts, cauliflower), and citrus fruits (oranges, grapefruit). Other foods that contain antioxidants include nuts, seeds, legumes, and fish.

Smoking is a major risk factor for AMD. In fact, smokers are 35-60% more likely to develop AMD than non-smokers. Smoking also increases the risk of other age-related eye diseases such as age-related macular degenerat

'Risk factors for age-related macular degeneration (AMD) are older age, genetic risk, smoking, and low dietary intake of antioxidants. The epidemiology of AMD is predominantly driven by a combination of these factors. Risk factors have been shown to increase the likelihood of developing AMD over time, with some studies suggesting that early risk factors may be modifiable through lifestyle changes such as smoking cessation and regular exercise.\n\nExamples of foods rich in antioxidants include berries (blueberries, strawberries), leafy green vegetables (spinach, kale, collard greens), cruciferous vegetables (broccoli, Brussels sprouts, cauliflower), and citrus fruits (oranges, grapefruit). Other foods that contain antioxidants include nuts, seeds, legumes, and fish.\n\nSmoking is a major risk factor for AMD. In fact, smokers are 35-60% more likely to develop AMD than non-smokers. Smoking also increases the risk of other age-related eye diseases such as age-related macular degeneration (