In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Access the environment variables
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
langchain_tracing_v2 = os.getenv("LANGCHAIN_TRACING_V2")
langchain_endpoint = os.getenv("LANGCHAIN_ENDPOINT")
langchain_project = os.getenv("LANGCHAIN_PROJECT")
openai_api_key = os.getenv("OPENAI_API_KEY")
tavily_api_key = os.getenv("TAVILY_API_KEY")
llama_parse_api_key = os.getenv("LLAMA_PARSE_API_KEY")


# Print the loaded environment variables (optional, for verification)
print(f"LANGCHAIN_API_KEY: {langchain_api_key[:12]}")
print(f"LANGCHAIN_TRACING_V2: {langchain_tracing_v2}")
print(f"LANGCHAIN_ENDPOINT: {langchain_endpoint}")
print(f"LANGCHAIN_PROJECT: {langchain_project}")
print(f"OPENAI_API_KEY: {openai_api_key[:12]}")
print(f"TAVILY_API_KEY: {tavily_api_key[:12]}")
print(f"LLAMA_PARSE_API_KEY: {llama_parse_api_key[:6]}")

LANGCHAIN_API_KEY: lsv2_pt_b7c2
LANGCHAIN_TRACING_V2: true
LANGCHAIN_ENDPOINT: https://api.smith.langchain.com
LANGCHAIN_PROJECT: langgraph-academy
OPENAI_API_KEY: sk-proj-0Mp6
TAVILY_API_KEY: tvly-r6bPwLx
LLAMA_PARSE_API_KEY: llx-ZN


In [2]:
import nest_asyncio

nest_asyncio.apply()

In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o")

Settings.llm = llm
Settings.embed_model = embed_model

In [4]:
from llama_parse import LlamaParse

In [5]:
LlamaParse


llama_parse.base.LlamaParse

In [6]:
# bring in deps
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader

# set up parser
parser = LlamaParse(
    result_type="markdown",  # "markdown" and "text" are available
    api_key = llama_parse_api_key,
     num_workers=5,
    premium_mode=True,
 #  use_vendor_multimodal_model=True,
 #  vendor_multimodal_model_name="gpt-4o",
 # gpt4o_api_key= openai_api_key,
    verbose=True

)


In [7]:
parser

LlamaParse(is_remote=False, api_key='llx-ZNp4FVlgWs7vPny3pxTouSueUlPorIphe8uSGzhRkMo005ie', base_url='https://api.cloud.llamaindex.ai', result_type=<ResultType.MD: 'markdown'>, num_workers=5, check_interval=1, max_timeout=2000, verbose=True, show_progress=True, language=<Language.ENGLISH: 'en'>, parsing_instruction='', skip_diagonal_text=False, invalidate_cache=False, do_not_cache=False, fast_mode=False, premium_mode=True, continuous_mode=False, do_not_unroll_columns=False, page_separator=None, page_prefix=None, page_suffix=None, gpt4o_mode=False, gpt4o_api_key=None, guess_xlsx_sheet_names=False, bounding_box=None, target_pages=None, ignore_errors=True, split_by_page=True, vendor_multimodal_api_key=None, use_vendor_multimodal_model=False, vendor_multimodal_model_name=None, take_screenshot=False, custom_client=None, disable_ocr=False, is_formatting_instruction=True, annotate_links=False, webhook_url=None, azure_openai_deployment_name=None, azure_openai_endpoint=None, azure_openai_api_ve

In [8]:
# use SimpleDirectoryReader to parse our file
file_extractor = {".pdf": parser}
documents = SimpleDirectoryReader(input_files=["data/OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf"], file_extractor=file_extractor).load_data()
print(documents)

Started parsing the file under job_id 39874b10-0fe8-4cd3-82ec-377e65c1e117
........................................................

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [15]:
type(documents)

list

In [17]:
len(documents)

911

In [16]:
documents[90]

Document(id_='1f97829c-ad4b-4065-84c7-7c6f5684ee10', embedding=None, metadata={'file_path': 'data\\OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_name': 'OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_type': 'application/pdf', 'file_size': 33003268, 'creation_date': '2024-11-17', 'last_modified_date': '2024-11-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="## Nodules and contractures\n\n- Dupuytren's contracture (see fig 2.26, p60) fibrosis and contracture of palmar fascia, p698) is seen in liver disease, trauma, epilepsy, and ageing.\n- Look for Heberden's (DIP) fig 2.45 and Bouchard's (PIP) 'nodes'—osteophytes (bone over-growth at a joint) seen with osteoarthritis.\n\nFig 2.45 Heberden's (DIP).\nRe

In [11]:
documents[0]

Document(id_='23b97b61-da3d-41bb-8e0f-9022144a9005', embedding=None, metadata={'file_path': 'data\\OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_name': 'OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_type': 'application/pdf', 'file_size': 33003268, 'creation_date': '2024-11-17', 'last_modified_date': '2024-11-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text="BECOME THE DOCTOR YOU WANT TO BE\n\n# OXFORD HANDBOOK OF\n\n## CLINICAL MEDICINE\n\nIan B. Wilkinson | Tim Raine | Kate Wiles\nAnna Goodhart | Catriona Hall | Harriet O'Neill\n\nTENTH EDITION", mimetype='text/plain', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_sep

In [32]:
print(documents[0].text.split("\n---\n")[0:4])

["BECOME THE DOCTOR YOU WANT TO BE\n\n# OXFORD HANDBOOK OF\n\n## CLINICAL MEDICINE\n\nIan B. Wilkinson | Tim Raine | Kate Wiles\nAnna Goodhart | Catriona Hall | Harriet O'Neill\n\nTENTH EDITION"]


In [33]:
print(documents[100].text.split("\n---\n")[0:4])

["# Assessing higher mental function: a practical guide\n\nStart by reassuring the patient 'I know this may be difficult...' and try to engage in conversation; asking questions that need to phrase to answer (ie not just yes/no). This tests fluency and reception, understanding, and allows assessment of articulation, eg 'How did you travel here today?', 'I came by bus'. Then assess dysphasia by asking: 'What is this' eg pen (tests for nominal dysphasia), repeat 'British Constitution' (tests for conduction dysphasia and dysarthria). Then ask patient to follow one-, two-, and three-step commands ensuring these 'cross the midline', eg make a fist with your right hand then extend your right index finger and touch your left ear.\n\n## Problems with classifying dysphasias\n\nThe classical model of language comprehension occurring in Wernicke's area and language expression in Broca's area is too simple. Functional MRI studies show old ideas that processing of abstract words is confined to the l

In [34]:
# Save the entire document as a single text file
with open("output_document_medical_OxfordHandbookofClinical_Medicine_LLAMA_PARSE.txt", "w", encoding="utf-8") as f:
    for doc in documents:
        f.write(doc.text + "\n\n")  # Adding some space between pages

In [35]:
import os

def save_documents_separately(documents, output_dir="output_pages", file_prefix="OxfordHandbook_ClinicalMedicine_Page_", file_extension=".txt"):
    """
    Saves each page of the document as a separate file with a descriptive name.

    Parameters:
    - documents: List of Document objects.
    - output_dir: Directory where the files will be saved.
    - file_prefix: Prefix for the file names.
    - file_extension: File extension for the saved files (e.g., ".txt" or ".md").
    """
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Iterate through the documents and save each page separately
    for idx, doc in enumerate(documents):
        # Create a descriptive file name
        file_name = f"{file_prefix}{idx+1}{file_extension}"
        file_path = os.path.join(output_dir, file_name)
        
        # Write the text of the document to the file
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(doc.text)
        
        print(f"Saved: {file_path}")

# Example usage:
save_documents_separately(documents, output_dir="medical_pages", 
                          file_prefix="OxfordHandbook_ClinicalMedicine_Page_", file_extension=".txt")

Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_1.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_2.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_3.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_4.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_5.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_6.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_7.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_8.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_9.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_10.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_11.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_12.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_13.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_14.txt
Saved: medical_pages\OxfordHandbook_ClinicalMedicine_Page_15.txt
Saved: medical_pages\OxfordHandboo

### Markdown Element Node Parser
##### The markdown element node parser works well for parsing the markdown output of LlamaParse into a set of table and text nodes.

In [37]:
from llama_index.core.node_parser import MarkdownElementNodeParser

node_parser = MarkdownElementNodeParser(
    llm=llm, num_workers=8
)

In [51]:
nodes = node_parser.get_nodes_from_documents(documents)

In [52]:
nodes[0]

TextNode(id_='13d78438-f519-4820-b9ca-6f0f3c22408f', embedding=None, metadata={'file_path': 'data\\OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_name': 'OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_type': 'application/pdf', 'file_size': 33003268, 'creation_date': '2024-11-17', 'last_modified_date': '2024-11-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='23b97b61-da3d-41bb-8e0f-9022144a9005', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': 'data\\OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_name': 'OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf', 'file_type': 'application/pdf', 'file_size': 330032

In [40]:
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

recursive_index = VectorStoreIndex(nodes=base_nodes + objects)

In [55]:
recursive_index.storage_context.persist(persist_dir="index-llamaindex")

In [56]:
# # Persist index to disk
# index.storage_context.persist("naval_index")

# from llama_index import StorageContext, load_index_from_storage

# # Rebuild storage context
# storage_context = StorageContext.from_defaults(persist_dir="naval_index")

# # Load index from the storage context
# new_index = load_index_from_storage(storage_context)

# new_query_engine = new_index.as_query_engine()
# response = new_query_engine.query("who is this text about?")
# print(response)

In [42]:
recursive_index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x2078bb7bd40>

In [43]:
query_engine = recursive_index.as_query_engine(similarity_top_k=25)

In [45]:
query_engine

<llama_index.core.query_engine.retriever_query_engine.RetrieverQueryEngine at 0x2078dfb4980>

In [44]:

query_1 = "what is crhon disease"

response_1 = query_engine.query(query_1)
print(str(response_1))

Crohn's disease is a chronic inflammatory condition characterized by transmural granulomatous inflammation that can affect any part of the gastrointestinal tract from the mouth to the anus, with a particular prevalence in the terminal ileum. It is marked by the presence of skip lesions, where unaffected bowel segments are found between areas of active disease. The disease is thought to result from an inappropriate immune response to gut flora in genetically susceptible individuals. Common symptoms include diarrhea, abdominal pain, and weight loss, along with systemic symptoms such as fatigue and fever. Complications can include bowel obstruction, fistulae, and malnutrition. Treatment involves managing symptoms, optimizing nutrition, and using medications such as steroids and biologics, with surgery being necessary in some cases.


In [46]:

query_1 = "Cerebral blood supply"

response_1 = query_engine.query(query_1)
print(str(response_1))

The cerebral blood supply is primarily provided by the internal carotid arteries and the vertebrobasilar system. The internal carotid arteries supply the anterior two-thirds of the cerebral hemispheres and the basal ganglia. The circle of Willis, an anastomotic ring at the base of the brain, is formed by the internal carotid arteries and the basilar artery, which is fed by the vertebral arteries. This arrangement can help compensate for occlusions in the blood supply. Three pairs of cerebral arteries—anterior, middle, and posterior—branch from the circle of Willis to supply different regions of the brain. The anterior cerebral artery supplies the frontal and medial parts of the cerebrum, the middle cerebral artery supplies the lateral parts of the hemispheres, and the posterior cerebral artery supplies the occipital lobe. The vertebrobasilar circulation supplies the cerebellum, brainstem, and occipital lobes.


In [47]:

query_1 = "Osteoarthritis"

response_1 = query_engine.query(query_1)
print(str(response_1))

Osteoarthritis is the most common joint condition globally, significantly affecting over 10% of individuals aged over 60. It can be primary or secondary to other conditions like obesity or joint disease. Symptoms include pain and crepitus during movement, joint stiffness after rest, and instability. Diagnosis often involves radiographs showing joint space loss and osteophytes. Management includes exercise, weight loss, analgesics like paracetamol, and possibly surgery for severe cases. Non-pharmacological treatments involve physiotherapy and occupational therapy.


In [50]:

query_1 = "editions of the book how many, which editions is this?"

response_1 = query_engine.query(query_1)
print(str(response_1))

The book has ten editions, with the first edition published in 1985 and the tenth edition in 2017.


In [12]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page node, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes

In [21]:
page_nodes = get_page_nodes(documents)

In [23]:
type(page_nodes)

list

In [24]:
len(page_nodes)

911

In [None]:

print(page_nodes[31].get_content())

In [36]:
print(page_nodes[31].get_content(metadata_mode="all"))

file_path: data\OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf
file_name: OxfordHandbookofClinical Medicine0th2017Edition_SamanSarKo.pdf
file_type: application/pdf
file_size: 33003268
creation_date: 2024-11-17
last_modified_date: 2024-11-10

# Epidemiology

'The work of epidemiology is related to unanswered questions, but also to un-questioned answers.' Patricia Buffler, North American Congress of Epidemiology, 2011.

## Who, what, when, where, why, and how?

Epidemiology is the study of the distribution of clinical phenomena in populations. It analyses disease in terms of host, agent, and environment (the 'epidemiologist's triad'). It elucidates risks and mechanisms for the development of disease, and reveals potential targets for disease prevention and treatment. Epidemiology does not look at the individual patient, but examines a defined population. How applicable its findings are depend upon how well the sample population mirrors the study population, which must, in

In [18]:

print(page_nodes[31].get_content())

# Epidemiology

'The work of epidemiology is related to unanswered questions, but also to un-questioned answers.' Patricia Buffler, North American Congress of Epidemiology, 2011.

## Who, what, when, where, why, and how?

Epidemiology is the study of the distribution of clinical phenomena in populations. It analyses disease in terms of host, agent, and environment (the 'epidemiologist's triad'). It elucidates risks and mechanisms for the development of disease, and reveals potential targets for disease prevention and treatment. Epidemiology does not look at the individual patient, but examines a defined population. How applicable its findings are depend upon how well the sample population mirrors the study population, which must, in turn, mirror the target population. Does your patient fit in this 'target'? If 'yes', then the epidemiological findings may be applicable.

## Measures of disease frequency

**Incidence proportion** is the number of new cases of disease as a proportion of t

In [None]:
##### save files output  ##### 

In [None]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n---\n"):
    """Split each document into page node, by separator."""
    nodes = []
    for doc in docs:
        doc_chunks = doc.text.split(separator)
        for doc_chunk in doc_chunks:
            node = TextNode(
                text=doc_chunk,
                metadata=deepcopy(doc.metadata),
            )
            nodes.append(node)

    return nodes