In [18]:
import os
import openai
import nest_asyncio
import json
import numpy as np

from dotenv import load_dotenv
from neo4j import GraphDatabase
from langchain.vectorstores.neo4j_vector import Neo4jVector
from llama_cloud_services import LlamaParse
from numpy.linalg import norm

# loading variables from .env file
load_dotenv() # pass a path if it's not a .env in the current working directory 

# Required for async execution in Jupyter
nest_asyncio.apply()

In [19]:
# Define folder path
folder_path = "/Users/lavonda/Documents/RAG/fulldata"

# List all PDF files in the folder
pdf_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith(".pdf")]

# Print the list of PDF files to verify they are loaded correctly
print(f"Total PDFs found: {len(pdf_files)}")
print("First 5 PDF files (for verification):")
for file in pdf_files[:5]:  # Print only the first 5 files to avoid clutter
    print(file)

# Uncomment below line to print all files if needed
# print("\n".join(pdf_files))

Total PDFs found: 33
First 5 PDF files (for verification):
/Users/lavonda/Documents/RAG/fulldata/Aorta-follow-up.pdf
/Users/lavonda/Documents/RAG/fulldata/HCM.pdf
/Users/lavonda/Documents/RAG/fulldata/Tricuspid-Regurgitation-Focused-Imaging.pdf
/Users/lavonda/Documents/RAG/fulldata/Pericardiocentesis-follow-up.pdf
/Users/lavonda/Documents/RAG/fulldata/AV-Optimization.pdf


In [20]:
# Initialize LlamaParse with JSON output
LLAMA_API_KEY = os.getenv("LLAMA_API_KEY")
parser = LlamaParse(
    api_key=LLAMA_API_KEY,  # Set API key
    result_type="json",  # Change to JSON format
    output_raw_json=True,  # Ensures JSON structure is preserved
    num_workers=4,  # Optimize for batch processing
    verbose=True,
    premium_mode=True,  
    language="en"  # Set language (default="en")
)

# Load all PDFs synchronously
json_data = parser.get_json_result(pdf_files)

# Save as JSON file
with open("output_data.json", "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=4)  # Pretty print JSON

print("Extracted JSON data saved to output_data.json")

# Print a preview of the first document's JSON
print("Extracted JSON preview:")
print(json.dumps(json_data[0], indent=4)[:500])  # Show first 500 characters

Parsing files:   0%|          | 0/33 [00:00<?, ?it/s]

Started parsing the file under job_id 886f2376-645a-40c1-939d-8b6e2695ab31
Started parsing the file under job_id fbe6f7f5-2f41-4f48-8435-424a1cd2741c
Started parsing the file under job_id 2fb696ea-f0a7-43cd-8e1f-00325977cb58
Started parsing the file under job_id ff425306-988e-4017-a576-ca7e3e1f2829


Parsing files:   3%|▎         | 1/33 [00:12<06:52, 12.89s/it]

Started parsing the file under job_id 08e9393a-5a07-4476-81bd-ad6577c2d8e3


Parsing files:   9%|▉         | 3/33 [00:14<01:41,  3.39s/it]

Started parsing the file under job_id 68a47ae2-4156-49fb-8e2d-d1d5b05ac76c
Started parsing the file under job_id 314c7a3c-6a9b-43e7-9e11-f070d45eb5b2


Parsing files:  12%|█▏        | 4/33 [00:16<01:27,  3.00s/it]

Started parsing the file under job_id 98f4de6e-480d-4f5b-971b-0c3555d9c59a


Parsing files:  15%|█▌        | 5/33 [00:24<02:15,  4.83s/it]

Started parsing the file under job_id 5ccdf9ec-29f5-44f6-81c3-db1bc4616b25


Parsing files:  21%|██        | 7/33 [00:25<01:02,  2.41s/it]

Started parsing the file under job_id 778e779e-bdf8-43fa-ba95-7b424ac2f72b
Started parsing the file under job_id 0eeb4da4-dbb9-43e5-97a5-01e57398ead3


Parsing files:  24%|██▍       | 8/33 [00:31<01:26,  3.46s/it]

Started parsing the file under job_id 678a0934-7992-45a2-ad29-b339996473d7


Parsing files:  30%|███       | 10/33 [00:39<01:20,  3.48s/it]

Started parsing the file under job_id 03c7dac8-f5d7-4524-9554-e93e06a21d37
Started parsing the file under job_id d01c1d49-d61c-4af6-b2eb-97a756760172


Parsing files:  33%|███▎      | 11/33 [00:43<01:17,  3.54s/it]

Started parsing the file under job_id b51d63ad-5409-4aac-ae54-f981286b0843


Parsing files:  39%|███▉      | 13/33 [00:52<01:15,  3.78s/it]

Started parsing the file under job_id ca443ef2-e41e-4f6d-80fb-4ac73396c741
Started parsing the file under job_id d38cf8f9-51fb-42e9-9680-c2a30ba592ab


Parsing files:  45%|████▌     | 15/33 [00:56<00:45,  2.55s/it]

Started parsing the file under job_id b058e2b6-0e93-4408-9d5f-e6b8d1d1c545
Started parsing the file under job_id f4922c41-6fe3-4a7d-9f3e-e774d2cbf0c8


Parsing files:  48%|████▊     | 16/33 [01:05<01:20,  4.71s/it]

Started parsing the file under job_id c2518a80-08ec-4e7d-a853-d9bea6a985a7


Parsing files:  52%|█████▏    | 17/33 [01:07<01:01,  3.87s/it]

Started parsing the file under job_id c13ea788-e6fa-4d7e-b28c-88b3feca76db


Parsing files:  55%|█████▍    | 18/33 [01:10<00:54,  3.62s/it]

Started parsing the file under job_id 4ddecb4c-53e1-4512-be7c-48b377d1e614


Parsing files:  58%|█████▊    | 19/33 [01:15<00:52,  3.76s/it]

Started parsing the file under job_id b6fd414d-8778-4cf9-9ae1-71dea52cf5be


Parsing files:  61%|██████    | 20/33 [01:21<00:58,  4.48s/it]

Started parsing the file under job_id 3c48d564-90dc-4421-90d1-046688b20ffa


Parsing files:  64%|██████▎   | 21/33 [01:23<00:46,  3.84s/it]

Started parsing the file under job_id 0d92828b-80ea-42cd-87a7-b9f8efb3a1c2


Parsing files:  67%|██████▋   | 22/33 [01:24<00:33,  3.07s/it]

Started parsing the file under job_id 710ade95-9ab1-45b7-a752-b5d069348aa2


Parsing files:  70%|██████▉   | 23/33 [01:32<00:45,  4.54s/it]

Started parsing the file under job_id df3f28f9-4adc-4394-a336-7ea66c05d877


Parsing files:  73%|███████▎  | 24/33 [01:34<00:33,  3.71s/it]

Started parsing the file under job_id 0df00f15-a6a9-4570-901f-8ad25f257391


Parsing files:  76%|███████▌  | 25/33 [01:37<00:27,  3.38s/it]

Started parsing the file under job_id 157f3867-b06e-47a5-8e1f-33700d23c8b2


Parsing files:  79%|███████▉  | 26/33 [01:40<00:23,  3.40s/it]

Started parsing the file under job_id 8f884773-0e5e-4dd3-b4d1-f4b5297a47b9


Parsing files:  82%|████████▏ | 27/33 [01:45<00:23,  3.91s/it]

Started parsing the file under job_id a0fc2611-b45e-452b-b636-f5f08ad1b120


Parsing files:  85%|████████▍ | 28/33 [01:48<00:18,  3.63s/it]

Started parsing the file under job_id 915b0c44-ce90-4d88-ab63-6c0e1bd63f09


Parsing files:  88%|████████▊ | 29/33 [01:49<00:11,  2.90s/it]

Started parsing the file under job_id 34be69d2-0a06-4e29-853e-37afe442b7f6


Parsing files: 100%|██████████| 33/33 [02:02<00:00,  3.71s/it]

Extracted JSON data saved to output_data.json
Extracted JSON preview:
{
    "pages": [
        {
            "page": 1,
            "text": "PROTOCOL: AORTA FOLLOW-UP\n\nInclusion Criteria\n    \u2022   Adult echocardiogram ordered to assess aortic size\n    \u2022   Full echocardiogram performed within two years\nEIMS Data\nProcedure Components: 2-D, Color Flow Doppler, Doppler\nSerial Studies: General, Thoracic Aorta\nFirst Impression: Echocardiogram performed per aorta follow-up protocol.\nBilling Diagnosis: Aneurysm Thoracic Aortic without (or with) Rupture (H





In [24]:
# ✅ Use your Neo4j connection details
NEO4J_URI = os.getenv('NEO4J_URI')
NEO4J_USER = os.getenv('NEO4J_USER')
NEO4J_PASSWORD = os.getenv('NEO4J_PASSWORD')


# ✅ Initialize the connection
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# # ✅ Load JSON Data
with open("output_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# ✅ Function to insert documents into Neo4j
def insert_data(tx, doc):
    query = """
    MERGE (d:Document {file_path: $file_path})
    SET d.num_pages = size($pages)
    WITH d
    UNWIND $pages AS page
    MERGE (p:Page {document: d.file_path, page_number: page.page})
    SET p.text = page.text, p.md = page.md
    MERGE (d)-[:HAS_PAGE]->(p)
    """
    tx.run(query, file_path=doc["file_path"], pages=doc["pages"])

# ✅ Insert Data into Neo4j
with driver.session() as session:
    for doc in data:
        session.execute_write(insert_data, doc)  # Use execute_write instead of write_transaction

print("✅ JSON data successfully imported into Neo4j!")

# ✅ Close connection
driver.close()

✅ JSON data successfully imported into Neo4j!


In [41]:
# 🔹 Load environment variables
NEO4J_URI = os.getenv("NEO4J_URI")
NEO4J_USER = os.getenv("NEO4J_USER")
NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # Ensure OpenAI key is set

# 🔹 Connect to Neo4j
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
openai.api_key = OPENAI_API_KEY

# # 🔹 Function to generate embeddings
def generate_embedding(text):
    response = openai.embeddings.create(
        input=text,
        model="text-embedding-3-small"
    )
    return response.data[0].embedding

# 🔹 Function to store embeddings in Neo4j
def store_embeddings():
    with driver.session() as session:
        query = """
        MATCH (p:Page)
        WHERE p.embedding IS NULL
        RETURN p.page_number, p.text, ID(p) AS node_id
        """
        results = session.run(query)
        
        for record in results:
            text = record["p.text"]
            node_id = record["node_id"]
            
            embedding = generate_embedding(text)  # Convert text to embedding
            embedding_str = str(embedding)  # Convert to string to store in Neo4j
            
            session.run("""
            MATCH (p)
            WHERE ID(p) = $node_id
            SET p.embedding = $embedding
            """, node_id=node_id, embedding=embedding_str)
    
    print("✅ Embeddings stored in Neo4j!")

# 🔹 Run the function
store_embeddings()



✅ Embeddings stored in Neo4j!


In [42]:
# ✅ Ensure OpenAI API key is set
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
openai.api_key = OPENAI_API_KEY
print(f"OPENAI_API_KEY: {OPENAI_API_KEY}")

# ✅ Correct function using the updated OpenAI API
def generate_embedding(text):
    response = openai.embeddings.create(
        input=text,  # OpenAI now requires a list
        model="text-embedding-3-small"
    )
    return response.data[0].embedding  # Return the first embedding

OPENAI_API_KEY: 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


In [51]:
# 🔹 Function to search Neo4j using similarity
def search_similar_documents(query_text: str, top_k: int=5):
    query_embedding = generate_embedding(query_text)  # Convert query to embedding

    with driver.session() as session:
        query = """
        MATCH (p:Page)
        WHERE p.embedding IS NOT NULL
        RETURN p.document, p.page_number, p.text, p.embedding
        """
        results = session.run(query)
        
        similarities = []
        for record in results:
            page_embedding = np.array(eval(record["p.embedding"]))  # Convert stored string to array
            similarity = np.dot(query_embedding, page_embedding) / (norm(query_embedding) * norm(page_embedding))  # Cosine similarity
            
            similarities.append({
                "document": record["p.document"],
                "page_number": record["p.page_number"],
                "text": record["p.text"],
                "similarity": similarity
            })

        # Sort by highest similarity
        similarities = sorted(similarities, key=lambda x: x["similarity"], reverse=True)

        return similarities[:top_k]

# 🔹 Example: Search for "blocked artery treatment"
search_results = search_similar_documents("heart attack treatment", top_k=5)

# 🔹 Print top 5 results
for result in search_results:
    print(f"📄 File: {result['document']}, Page: {result['page_number']}")
    print(f"📝 Content: {result['text'][:500]}")
    print(f"🔥 Similarity: {result['similarity']:.4f}\n")

📄 File: /Users/lavonda/Documents/RAG/fulldata/MC-Echo-Lab-Adult-TEE-Protocol.pdf, Page: 3
📝 Content:                            initial heart rhythm and any significant changes.
                        •  Level of consciousness (RASS) and pain assessment should
                           be monitored as appropriate for sedation medication
                           administration and/or procedure. Minimum documentation of
                           every 15-20 minutes.
                     •  Supplemental oxygen should be used for patients undergoing
                        moderate and deep se
🔥 Similarity: 0.3652

📄 File: /Users/lavonda/Documents/RAG/fulldata/HCM-Mavacamten.pdf, Page: 1
📝 Content: P                     : HCM MAVACAMTEN
                                      M
    ROTOCOL ACH                            AVACMMIEN

         The prescription drug, mavacamten (CAMZYOS), is indicated for the treatment of adults with
    symptomatic NYHA class II-III obstructive hypertrophic