In [2]:
import os, sys
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../src")))

from tqdm import tqdm
from rag_prompt_template import *
from rag_util import *
from rag_moduler import *
from llm_factory import *
from rag_extraction import *
import json

  from .autonotebook import tqdm as notebook_tqdm


Initialise RAG pipeline
------

In [3]:
# Run selection: keys into the LLM / EMBED_MODEL lookup tables plus the labels
# that are combined into PARAMETERS["test_id"].
using_llm = "mistralsmall"
# using_llm = "mistral-ft-multitask"
using_embed = "hitsnomed"
task = "entityextraction"
eval_dataset = "mimicivchunk"
using_extractor = "None"
using_generator = "None"

# Settings shared by the cells below, collected in one mapping.
PARAMETERS = dict(
    llm_model_name=LLM[using_llm],
    tokenizer_name=LLM[using_llm],
    embed_model_name=EMBED_MODEL[using_embed],
    # Partial KG index, used for testing.
    storage_dir="../index/snomed_dataset_nodoc_commandr_hitsnomed",
    # Full KG index alternative:
    # storage_dir="index/snomed_all_dataset_nodoc_hitsnomed",
    input_text_dir="../data/humandx_data/humandx_findings.json",
    context_window=32768,
    max_new_tokens=512,
    case_num=50,
    verbose=True,
    similarity_top_k=30,
    graph_store_query_depth=5,
    retriever_mode="hybrid",
    test_id=f"_test_{task}_{eval_dataset}_{using_generator}_extractor_{using_extractor}",
)

In [4]:
# Open the persisted KG index from the configured storage directory, using the
# configured embedding model.
kg_index = init_kg_storage_context(
    storage_dir=PARAMETERS["storage_dir"],
    embed_model_name=PARAMETERS["embed_model_name"],
)



Global embed_model set to: ../llm/embedder/HiT-MiniLM-L12-SnomedCT


In [5]:
# Forward the model-related entries of PARAMETERS as keyword arguments, in the
# same order as they were previously spelled out.
_llm_kwargs = {
    key: PARAMETERS[key]
    for key in (
        "llm_model_name",
        "tokenizer_name",
        "embed_model_name",
        "context_window",
        "max_new_tokens",
    )
}
# Optional extra argument, currently disabled: quantization_config=None
hf_llm = init_llm_service_context(**_llm_kwargs)

Loading checkpoint shards: 100%|██████████| 9/9 [00:28<00:00,  3.20s/it]


LLM loaded: ../llm/Mistral-Small-Instruct-2409
embed_model loaded: ../llm/embedder/HiT-MiniLM-L12-SnomedCT
Settings loaded.


In [6]:
# Build the KG query engine on top of the loaded index and LLM.
# Settings that PARAMETERS already declares are read from it instead of being
# repeated as literals, so the config cell stays the single source of truth.
# NOTE(review): PARAMETERS["graph_store_query_depth"] is 5, but this engine has
# always been built with depth 2 (the recorded KG context below reports
# "max depth 2"). The literal is kept to preserve that behaviour; reconcile the
# two values before relying on the config entry.
query_engine = build_kg_query_engine(
    kg_index,
    llm=hf_llm,
    retriever_mode=PARAMETERS["retriever_mode"],
    embedding_mode="hybrid",
    similarity_top_k=PARAMETERS["similarity_top_k"],
    graph_store_query_depth=2,
    verbose=PARAMETERS["verbose"],
)

In [9]:
# Swap LLM of the existing KG query engine and rebuild it without re-loading the KG index
# !NOT FINISHED YET!

# query_engine = swap_llm_and_rebuild_engine(
#     kg_index,
#     new_llm="new_llm_name",
#     retriever_mode="hybrid",
#     embedding_mode="hybrid",
#     similarity_top_k=30,
#     graph_store_query_depth=5,
#     verbose=False,
# )

Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.99s/it]


Simple Question-Answer example
------

In [7]:
# Single-question smoke test of the query engine; only the answer text
# (response.response) is rendered, in bold.
question = "What is the most possible type of the concept 'Desloratadine'? Only answer with the type name."
response = query_engine.query(question)
display(Markdown(f"<b>{response.response}</b>"))

[1;3;32mExtracted keywords: ['type', 'possible', 'Desloratadine', 'concept']
[0m[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('2,4,5-trichlorophenoxyacetic acid (substance)', 'type', 'Substance')
('Chloramphenicol sodium succinate (substance)', 'type', 'Substance')
('Dichlorodiphenyltrichloroethane (substance)', 'type', 'Substance')
('Benzquinamide hydrochloride (substance)', 'type', 'Substance')
('Melarsomine dihydrochloride (substance)', 'type', 'Substance')
('Nitrogen mustard derivative (substance)', 'type', 'Substance')
('Trimethoprim hydrochloride (substance)', 'type', 'Substance')
('Tropatepine hydrochloride (substance)', 'type', 'Substance')
('Lurasidone hydrochloride (substance)', 'type', 'Substance')
('Butyrophenone derivative (substance)', 'type', 'Substance')
('Fluorothymidine (18-F) (substance)', 'type', 'Substance')
('Phenylurea c

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b> Substance</b>

Concept/Entity Extraction example
------

In [7]:
# Alternative sample text (disabled):
# text = "___ is a ___ man who had severe biliary  pancreatitis resulting in pancreatic necrosis for which he was  treated with nasojejunal feedings and pancreatic rest.  He had  initially had multisystem organ failure, which improved. Mr.  ___ has a large postnecrotic pseudocyst, which has been  drained through a minimally invasive approach into his GI tract.   He has some debris, but this is not currently infected. The  patient was followed by Dr. ___ in his ___  clinic to discuss cholecystectomy. "

# Sample discharge-note text fed to the extraction prompt.
text = "After discussion of all  risks, benefits and possible outcomes, patient was scheduled for  elective cholecystectomy on ___.   Past Medical History: Hiatal Hernia   ___ esophagus   Esophageal ulcer   anxiety   Back pain    Social History: ___ Family History: Mother passed of metastatic lung cancer. Father alive, had CVA  and MI.  No  history of pancreatic malignancy      Physical Exam: Prior Discharge: VS: 98.3, 83, 137/69, 16, 98% RA GEN: NAD, "

# Prompt template; the {text} placeholder is filled with str.format below.
entity_extraction_prompt = """\
Extract the mentioned SNOMED CT concepts from the given discharge note.

Here is the desired types of the concepts: [finding, disorder, procedure, regimen/therapy, morphologic abnormality, body structure, cell structure]

Here is the discharge note: {text}.

"""

entity_query = entity_extraction_prompt.format(text=text)
response = query_engine.query(entity_query)

# Show the answer twice: rendered as bold markdown, then as plain text.
display(Markdown(f"<b>{response}</b>"))
print("========================================")
# Disabled de-duplication of comma-separated items:
# response = ", ".join(list(dict.fromkeys(response.split(","))))
print(f"response: {response}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>1. Hiatal Hernia (disorder)
2. Esophageal ulcer (disorder)
3. Back pain (disorder)
4. Metastatic lung cancer (disorder)
5. CVA (disorder)
6. MI (disorder)
7. Elective cholecystectomy (procedure)</b>

response: 1. Hiatal Hernia (disorder)
2. Esophageal ulcer (disorder)
3. Back pain (disorder)
4. Metastatic lung cancer (disorder)
5. CVA (disorder)
6. MI (disorder)
7. Elective cholecystectomy (procedure)


Entity-type Pair Extraction example
------

In [8]:
# Sample abstract used as the context for entity-type pair extraction.
text = "Antihypertensive drugs and depression: a reappraisal. Eighty-nine new referral hypertensive out-patients and 46 new referral non-hypertensive chronically physically ill out-patients completed a mood rating scale at regular intervals for one year. The results showed a high prevalence of depression in both groups of patients, with no preponderance in the hypertensive group. Hypertensive patients with psychiatric histories had a higher prevalence of depression than the comparison patients. This was accounted for by a significant number of depressions occurring in methyl dopa treated patients with psychiatric histories."

# Prompt template; {text} is filled with str.format below.
# Inside a (non-raw) string literal a trailing backslash joins the next line
# onto the current one. The backslashes that fused two non-empty lines
# ("follows:1.", "Pairs: (...)Answer End:::", "Requirements:You MUST",
# "Substance].ONLY") have been removed so the answer template and the
# requirements each stay on their own line. The remaining backslashes only
# precede a blank line, where they collapse the gap to a single newline.
pair_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the entity-type pairs from the given context with the format of (entity ; type).\

Here is the type list: [Disorder, Substance].\

The steps are as follows:
1. extract the entity from the given context abstract, using the retrieved sub-graph.
2. select ONE most likely type from the list for the extracted entity.
3. output the pairs in the format of (entity ; type) strictly.
4. repeat the step 1 to step 3.\
\

Provide your answer as follows:

Answer:::
Pairs: (All extracted pairs)
Answer End:::\

Requirements:
You MUST provide values for 'Pairs:' in your answer.
ONLY use the type in the type list: [Disorder, Substance].
ONLY output valid entity-type pairs without any reasoning.

"""

response = query_engine.query(pair_extraction_prompt.format(text=text))
display(Markdown(f"<b>{response}</b>"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b> Pairs: (Hypertensive patients with psychiatric histories; Disorder), (depression; Disorder), (methyl dopa treated patients with psychiatric histories; Disorder)</b>

Triple Extraction example
------

In [11]:
# Sample abstract used as the context for triple extraction.
text = "Evaluation of Preference and Utility Measures for Transoral Thyroidectomy. Traditional, trans-cervical thyroidectomy results in the presence of a neck scar, which has been shown to correlate with lower quality of life and lower patient satisfaction. Transoral thyroid surgery (TOTS) has been utilized as an alternative approach to avoid a cutaneous incision and scar by accessing the neck and thyroid through the oral cavity. This study was designed to evaluate patient preference through health-state utility scores for TOTS as compared to conventional trans-cervical thyroidectomy."

# Unlike the other examples this prompt is an f-string: `text` is interpolated
# right here, so the variable already holds the final prompt.
triple_extraction_prompt = f"""
Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].

Here is the context: {text}.

Extract the SNOMED CT triples from the given context.

"""

response = query_engine.query(triple_extraction_prompt)
triple_markdown = Markdown(f"<b>{response}</b>")
display(triple_markdown)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>1. ('Transoral thyroid surgery (procedure)', 'is modification of', 'Thyroid surgery (procedure)')
2. ('Traditional, trans-cervical thyroidectomy (procedure)', 'has definitional manifestation', 'Presence of neck scar (finding)')
3. ('Presence of neck scar (finding)', 'associated with', 'Lower quality of life (finding)')
4. ('Presence of neck scar (finding)', 'associated with', 'Lower patient satisfaction (finding)')</b>

Medical Diagnostics example
------

In [15]:
# Structured case description inserted into the diagnosis prompt.
case_vignette = """
40 year old female presenting with chest pain
 Symptom: Worsening chest pain
 • Onset: 2 weeks ago
 • Associated with: Cough, dyspnea, fever
 • Complicated by: Fatigue
 Social history
 • Recent construction in Ohio
 Physical exam
 • Lungs: Wheezing
 Diagnostic: X-ray
 • Interpretation: Normal
"""

# Prompt template; {case_vignette} is filled with str.format below.
# NOTE(review): the trailing backslashes near the end of the literal are line
# continuations, so the last two instructions are emitted as one line
# ("...in your answer.Do not provide...") — confirm this is intended.
medical_diagnosis_prompt = """
Case vignette: {case_vignette}

According the given case vignette, provide only the most probable differential diagnosis, no explanation, no recapitulation of the case information or task. 
Give a maximum of 5 answers, sorted by probability of being the correct diagnosis, most probable first, remove list numbering, 
and respond with each answer on a new line. Be as concise as possible, no need to be polite.

Provide your answer as follows:

Answer:::
Diagnosis: (the 5 most probable diagnoses, most probable first)
1. 
2. 
...
Answer End:::\

You MUST provide values for 'Diagnosis' in your answer.\
Do not provide any other information in your response.\

"""

diagnosis_query = medical_diagnosis_prompt.format(case_vignette=case_vignette)
response = query_engine.query(diagnosis_query)
display(Markdown(f"<b>{response}</b>"))
# Disabled post-processing of the answer:
# print(f"Results:\n{extract_triple(str(response), notebook=True, split_str1='Diagnosis:')}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<b>
Myocarditis due to scarlet fever
Acute on chronic combined systolic and diastolic heart failure
Tuberculosis of lung, confirmed by culture only
Aortic orifice posterior left with respect to pulmonary orifice
Acute ST segment elevation myocardial infarction of anterior wall involving right ventricle</b>

In [None]:
# snomed concepts extraction
# Template asking for triples in the form (concept 1 ; relation ; concept 2);
# {text} is the placeholder for the input context.
# This template is not referenced by the other visible cells.
# NOTE(review): a trailing backslash inside the literal joins the next line onto
# the current one, so "The steps are as follows:" runs straight into "1. ..."
# and "Triples: (The extracted triples)" runs straight into "Answer End:::" —
# confirm whether that is intended.
snomed_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the SNOMED CT triples from the given context with the format of (concept 1 ; relation ; concept 2).\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

The steps are as follows:\
1. extract the concept 1 and concept 2 from the given context sentence, using the retrieved sub-graph.
2. select ONE most likely relation from the list for the extracted concepts.
3. output the triplets in the format of (concept 1 ; relation ; concept 2) strictly.\
\

Provide your answer as follows:

Answer:::
Triples: (The extracted triples)\
Answer End:::\

You MUST provide values for 'Triples:' in your answer.\

"""

# Template asking for (CONCEPT ; relation ; EXPRESSION) descriptions; {text} is
# the placeholder for the input context. This is the template wrapped by
# `prompt_tmpl` further down.
# NOTE(review): the few-shot example uses the relation "has interpretation",
# which does not appear in the relation list given in this same prompt, and the
# requirement sentence says 'Description' while the answer field is
# "Descriptions:" — confirm.
snomed_description_generation_prompt = """\
Here is the context: {text}.\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

Task: Generate the SNOMED CT descriptions for the given concept.

The steps are as follows:
1. extract a CONCEPT from the given context sentence, using the retrieved sub-graph.
2. generate an EXPRESSION in human-readable phrase that can describe the CONCEPT.
3. select one most likely relation from the list between the CONCEPT and the EXPRESSION.
4. generate descriptions in the format of (CONCEPT ; relation ; EXPRESSION). Each CONCEPT may have multiple descriptions.
5. repeat the step 1 to step 4.

Provide your answer as follows:

Answer:::
Concept: 
Descriptions: (The generated descriptions)
Answer End:::\

You MUST provide values for 'Concept' and 'Description' in your answer.\

Few-shot examples:
Answer:::
Concept: apnea
Descriptions: (apnea ; interprets ; respiration observable) (apnea ; has interpretation ; absent) (apnea ; finding site ; structure of respiratory system)
Answer End:::

"""

# Variable mapping handed to PromptTemplate: "text" maps to "text".
snomed_extraction_prompt_var_mappings = dict(text="text")

# Despite the mapping's name, the wrapped template is the
# description-generation prompt, not snomed_extraction_prompt.
prompt_tmpl = PromptTemplate(
    snomed_description_generation_prompt,
    template_var_mappings=snomed_extraction_prompt_var_mappings,
)

def query_and_generate_rel(test_id, query_engine, cases=427, input_path=""):
    """Query the engine once per input line and save the extracted results.

    The first ``cases`` lines of ``input_path`` are each formatted into the
    module-level ``prompt_tmpl`` and sent to ``query_engine.query``. Every reply
    is passed through ``extract_triple(str(response), notebook=True)`` and the
    results are written, one per line, to ``results/rel.hyps_{test_id}``.

    Args:
        test_id: Label used in the log messages and as the output file suffix.
        query_engine: Object exposing ``query(prompt)``.
        cases: Maximum number of lines to read from the top of the input file.
        input_path: Path of the text file holding one sentence per line. The
            default is empty because the original hard-coded path was empty; a
            real path must be supplied for the function to run.

    Returns:
        The raw response of the last query, or ``None`` if the input file
        yielded no lines.

    Raises:
        FileNotFoundError: If ``input_path`` is empty or cannot be found.
    """
    logging.info(f"Query Engine: {query_engine}")

    if not input_path:
        # open("") raises FileNotFoundError with an unhelpful "[Errno 2] ... ''";
        # keep the exception type but say what is actually missing.
        raise FileNotFoundError(
            "query_and_generate_rel: no input file given; pass input_path=<sentence file>"
        )

    # Read the sentences up front so the file is not kept open while the
    # (potentially long-running) queries execute.
    with open(input_path) as f:
        sentences = f.readlines()[:cases]

    logging.info(f"Experiment ID: {test_id}")
    print(f"Number of sentences: {len(sentences)}; Number of cases for test: {cases}")

    results = []
    response = None  # stays None when there is nothing to process
    # total= lets tqdm show progress and an ETA; enumerate() alone has no length.
    for sentence_id, text in tqdm(enumerate(sentences), total=len(sentences)):
        print(f"Processing sentence {sentence_id} / {len(sentences)}")
        print(f"Text: {text}")

        fmt_prompt = prompt_tmpl.format(
            text=text,
        )
        response = query_engine.query(fmt_prompt)

        # Extract once and reuse the value for both the saved line and the echo.
        extracted = extract_triple(str(response), notebook=True)
        results.append(extracted + "\n")
        print(f"Results: {extracted}")

    output_path = f"results/rel.hyps_{test_id}"
    # Make sure the target directory exists so a long run is not lost at the
    # very last step.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w') as f:
        f.writelines(results)

    logging.info(f"Results saved to {output_path}")
    return response

# Send log records to logs/<test_id>.log at INFO level.
logging_setup(log_file=f"logs/{PARAMETERS['test_id']}.log", log_level=logging.INFO)

# Run the batch. The number of cases comes from the config cell
# (PARAMETERS["case_num"], currently 50) instead of a repeated literal, so the
# two cannot drift apart.
response = query_and_generate_rel(
    test_id=PARAMETERS["test_id"],
    query_engine=query_engine,
    cases=PARAMETERS["case_num"],
)

50it [28:31, 34.22s/it]
