In [1]:
# os/sys are imported first because they are needed to extend the import path
# before the project modules below can be imported.
import os, sys
# Optional: uncomment to restrict which CUDA device(s) the process can see.
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Add ../src (resolved against the current working directory) to sys.path.
# Presumably this is where the rag_* / llm_factory / grammar_llm_utils modules
# live -- the notebook must therefore be started from its own directory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from tqdm import tqdm
# NOTE(review): the wildcard imports below supply names used later in this
# notebook (e.g. LLM, EMBED_MODEL, init_llm_service_context,
# init_kg_storage_context, init_retriever), but it is not visible from here
# which module defines which name. Consider explicit imports once confirmed.
from rag_prompt_template import *
from rag_util import *
from rag_moduler import *
from llm_factory import *
from rag_extraction import *
from grammar_llm_utils import *
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Short keys used to look up model locations in the LLM / EMBED_MODEL tables
# (both tables come from the project's wildcard imports).
using_llm = "mistralsmall"
using_embed = "hitsnomed"

# Central run configuration for this notebook.
PARAMETERS = {
    "llm_model_name": LLM[using_llm],
    "tokenizer_name": LLM[using_llm],  # tokenizer is taken from the same entry as the model
    "embed_model_name": EMBED_MODEL[using_embed],
    # Plain string literals: the paths contain no placeholders, so no f-string is needed.
    "storage_dir": "../index/snomed_dataset_nodoc_commandr_hitsnomed",  # partial KG index, for testing
    # "storage_dir": "index/snomed_all_dataset_nodoc_hitsnomed",  # full KG index, for testing
    "context_window": 32768,
    "max_new_tokens": 1024,
    "case_num": 50,
    "verbose": True,
    "similarity_top_k": 30,
    "graph_store_query_depth": 5,
    "retriever_mode": "hybrid",
}

In [3]:
# Initialise the LLM service context from the shared PARAMETERS config.
# Each key listed here is forwarded as a keyword argument of the same name.
_llm_context_keys = (
    "llm_model_name",
    "tokenizer_name",
    "embed_model_name",
    "context_window",
    "max_new_tokens",
    # "quantization_config" is deliberately not forwarded (was: quantization_config=None)
)
_llm_context_kwargs = {key: PARAMETERS[key] for key in _llm_context_keys}
hf_llm = init_llm_service_context(**_llm_context_kwargs)

Loading checkpoint shards: 100%|██████████| 9/9 [00:28<00:00,  3.19s/it]


LLM loaded: ../llm/Mistral-Small-Instruct-2409
embed_model loaded: ../llm/embedder/HiT-MiniLM-L12-SnomedCT
Settings loaded.


In [4]:
# Load the knowledge-graph index from the configured storage directory,
# wiring in the LLM created above and the configured embedding model.
kg_index = init_kg_storage_context(
    storage_dir=PARAMETERS["storage_dir"],
    llm=hf_llm,
    embed_model_name=PARAMETERS["embed_model_name"],
)

Global embed_model set to: ../llm/embedder/HiT-MiniLM-L12-SnomedCT


Default retriever
------

In [5]:
# Build the default retriever on top of the loaded KG index.
# similarity_top_k is read from PARAMETERS so the config cell stays the single
# source of truth for it.
# NOTE(review): graph_store_query_depth (2) and verbose (False) differ from the
# values stored in PARAMETERS (5 and True). They are kept as literals here to
# preserve the behaviour the recorded outputs were produced with -- confirm
# whether the override is intentional.
retriever = init_retriever(
    kg_index=kg_index,
    similarity_top_k=PARAMETERS["similarity_top_k"],
    graph_store_query_depth=2,
    verbose=False,
)

Retriever created, retriever: <class 'llama_index.core.indices.knowledge_graph.retrievers.KGTableRetriever'>, retriever_mode: hybrid


In [6]:
# Sample clinical passage used as the retrieval query.
input_text = """
Results: Clinical data from our institution reveals that Leak of cranial cerebrospinal fluid due to and following procedure on central nervous system shows pathological morphology typical of Cerebrospinal fluid leakage, establishing a critical pathophysiological relationship. Moreover, Cerebrospinal fluid leakage belongs to the category of Morphologically Abnormal Structure. Clinical manifestations develop through well-defined pathological processes affecting specific organ systems and cellular functions. Accurate diagnosis depends on recognition of characteristic clinical patterns, appropriate use of diagnostic testing, and careful interpretation of results within the clinical context. Clinical management requires systematic approach including accurate diagnosis, appropriate treatment selection, and ongoing monitoring of therapeutic response. Treatment protocols emphasize patient safety, efficacy optimization, and quality of life considerations. Comprehensive care includes patient education, support services, and coordination with healthcare team members. Long-term prognosis is generally positive with early intervention and appropriate ongoing management. Success depends on multifactorial considerations including patient characteristics, disease severity, treatment response, and adherence to recommended care protocols. Systematic follow-up ensures continued treatment effectiveness.
"""

retrieved_results = retriever.retrieve(input_text)

# The relation triples are stored under the 'kg_rel_texts' metadata key of a
# retrieved node. Select that node by key rather than by a fixed list position
# (previously index 1), which raises IndexError/KeyError as soon as the number
# or order of retrieved nodes changes.
relation_results = [
    result
    for result in retrieved_results
    if "kg_rel_texts" in (result.node.metadata or {})
]
if not relation_results:
    print("No retrieved node carries 'kg_rel_texts' metadata.")
for relation_result in relation_results:
    for triple in relation_result.node.metadata["kg_rel_texts"]:
        print(triple)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


('International neuroblastoma pathology classification: Favorable histology group, patient of any age with ganglioneuroma (Schwannian stroma-dominant) maturing, or mature (finding)', 'associated morphology', 'Lesion (morphologic abnormality)')
('Intracranial hemorrhage following injury with prolonged loss of consciousness AND return to pre-existing conscious level (disorder)', 'type', 'Disorder')
('Compression of brain co-occurrent and due to spontaneous cerebral hemorrhage (disorder)', 'associated morphology', 'Damage (morphologic abnormality)')
('Cerebrovascular accident due to thrombus of left middle cerebral artery (disorder)', 'due to', 'Thrombosis of left middle cerebral artery (disorder)')
('International neuroblastoma pathology classification cannot be determined (finding)', 'associated morphology', 'Lesion (morphologic abnormality)')
('Intracranial hemorrhage co-occurrent and due to closed skull fracture (disorder)', 'associated morphology', 'Fracture (morphologic abnormality)

Custom retriever (adds extra features such as filtering; still under improvement)
------

In [7]:
# Rebuild the retriever, this time with the project's custom retriever enabled.
# similarity_top_k is read from PARAMETERS so the config cell stays the single
# source of truth for it.
# NOTE(review): graph_store_query_depth (2) and verbose (False) differ from the
# values stored in PARAMETERS (5 and True). They are kept as literals here to
# preserve the behaviour the recorded outputs were produced with -- confirm
# whether the override is intentional.
retriever = init_retriever(
    kg_index=kg_index,
    similarity_top_k=PARAMETERS["similarity_top_k"],
    graph_store_query_depth=2,
    verbose=False,
    custom_retriever=True,  # set to True to use custom retriever with additional features (under improvement)
)

Retriever created, retriever: <class 'rag_moduler.CustomKGTableRetriever'>, retriever_mode: hybrid


In [8]:
# Same sample clinical passage as used for the default retriever, so the two
# retrievers can be compared on identical input.
input_text = """
Results: Clinical data from our institution reveals that Leak of cranial cerebrospinal fluid due to and following procedure on central nervous system shows pathological morphology typical of Cerebrospinal fluid leakage, establishing a critical pathophysiological relationship. Moreover, Cerebrospinal fluid leakage belongs to the category of Morphologically Abnormal Structure. Clinical manifestations develop through well-defined pathological processes affecting specific organ systems and cellular functions. Accurate diagnosis depends on recognition of characteristic clinical patterns, appropriate use of diagnostic testing, and careful interpretation of results within the clinical context. Clinical management requires systematic approach including accurate diagnosis, appropriate treatment selection, and ongoing monitoring of therapeutic response. Treatment protocols emphasize patient safety, efficacy optimization, and quality of life considerations. Comprehensive care includes patient education, support services, and coordination with healthcare team members. Long-term prognosis is generally positive with early intervention and appropriate ongoing management. Success depends on multifactorial considerations including patient characteristics, disease severity, treatment response, and adherence to recommended care protocols. Systematic follow-up ensures continued treatment effectiveness.
"""

# Run the query through the custom retriever and print every returned item,
# one per line, exactly as the retriever hands it back.
retrieved_results = retriever.retrieve(input_text)
for retrieved_item in retrieved_results:
    print(retrieved_item)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[DEBUG] Starting hybrid scoring retrieval (Ollama version)...
[DEBUG] Parent retriever returned 1 NodeWithScore objects
[DEBUG] Starting hybrid scoring computation (Ollama)...
[DEBUG] Node 0 contains 25 keywords
[DEBUG] Query keywords: ['central nervous system', 'clinical management', 'pathological morphology', 'pathological', 'Cerebrospinal', 'system', 'Cerebrospinal fluid leakage', 'treatment', 'diagnosis', 'management']
[DEBUG] Node 0 contains 30 triples
[DEBUG] Triple 0: associated morphology
Semantic score: 0.768, Keyword score: 0.000, Hybrid score: 0.537
[DEBUG] Triple 1: type
Semantic score: 0.675, Keyword score: 0.000, Hybrid score: 0.472
[DEBUG] Triple 2: associated morphology
Semantic score: 0.829, Keyword score: 0.000, Hybrid score: 0.580
[DEBUG] Triple 3: due to
Semantic score: 0.791, Keyword score: 0.000, Hybrid score: 0.553
[DEBUG] Triple 4: associated morphology
Semantic score: 0.740, Keyword score: 0.000, Hybrid score: 0.518
[DEBUG] Ollama embedding computation failed: 