In [None]:
from tqdm import tqdm
from rag_prompt_template import *
from rag_util import *
from rag_moduler import *
from rag_extraction import *
import json
os.environ["CUDA_VISIBLE_DEVICES"] = "0"






Initialise RAG pipeline
------

In [None]:
using_llm = "mistralsmall"
using_embed = "hitsnomed"
task = "entityextraction"
eval_dataset = "mimicivchunk"
using_extractor = "None"
using_generator = "None"
using_parser = "nuparser"

PARAMETERS = {
    "llm_model_name": LLM[using_llm],
    "tokenizer_name": LLM[using_llm],
    "embed_model_name": EMBED_MODEL[using_embed],
    "storage_dir": f"index/snomed_dataset_nodoc_commandr_hitsnomed", # this is a partial KG indices for testing
    "input_text_dir": f"data/humandx_data/humandx_findings.json",
    "context_window": 32768,
    "max_new_tokens": 2048,
    "case_num":50,
    "verbose": True,
    "similarity_top_k": 30,
    "graph_store_query_depth": 5,
    "retriever_mode": "hybrid",
    "test_id": f"_test_{task}_{eval_dataset}_{using_generator}_extractor_{using_extractor}"
}

llm = init_llm_service_context(llm_model_name=PARAMETERS["llm_model_name"], 
                                    tokenizer_name=PARAMETERS["tokenizer_name"], 
                                    embed_model_name=PARAMETERS["embed_model_name"],
                                    context_window=PARAMETERS["context_window"],
                                    max_new_tokens=PARAMETERS["max_new_tokens"],
                                    quantization_config=quantization_config,
                                )

if using_parser is not None:
    pipe_parser = init_llm_pipeline(LLM[using_parser], quantization_config=None)


Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

LLM pipeline built: llm/NuExtract-1.5-tiny


In [3]:
kg_index = init_kg_storage_context(llm, storage_dir=PARAMETERS["storage_dir"])

In [4]:
query_engine = init_rag_pipeline(kg_index, 
                                 similarity_top_k=10, 
                                 graph_store_query_depth=5, 
                                 include_text=False, 
                                 retriever_mode="embedding", 
                                 verbose=PARAMETERS["verbose"])

Simple Question-Ansuwer example
------

In [5]:
response = query_engine.query("what is the type of Gastroenteritis caused by influenza?")
display(Markdown(f"<b>{response}</b>"))

print(llm_parser(response, pipe_parser, target_results="classifications"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['caused', 'Gastroenteritis', 'type', 'influenza']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Pneumonia caused by Influenza A virus (disorder)', 'causative agent', 'Influenza A virus (organism)')
('Influenza with encephalopathy (disorder)', 'associated morphology', 'Inflammatory morphology (morphologic abnormality)')
('Influenza with gastrointestinal tract involvement (disorder)', 'pathological process', 'Infectious disease (disorder)')
('Influenza caused by pandemic influenza virus (disorder)', 'type', 'Disorder')
('Influenza with gastrointestinal tract involvement (disorder)', 'type', 'Disorder')
('Pneumonia caused by influenza (disorder)', 'type', 'Disorder')
('Haemophilus influenzae pneumonia (disorder)', 'associated morphology', 'Consolidation (morphologic abnormality)')
('Influenza caused by Influenza A virus subtype H5 (disorder)', 'type', 'Disorder')
('Pneumonia caused by Influenza A

<b> Infectious disease</b>

 {
    "classifications": [
        "Infectious disease"
    ]
}
  


Concept/Entity Extraction example
------

In [6]:
text = "Evaluation of Preference and Utility Measures for Transoral Thyroidectomy. Traditional, trans-cervical thyroidectomy results in the presence of a neck scar, which has been shown to correlate with lower quality of life and lower patient satisfaction. Transoral thyroid surgery (TOTS) has been utilized as an alternative approach to avoid a cutaneous incision and scar by accessing the neck and thyroid through the oral cavity. This study was designed to evaluate patient preference through health-state utility scores for TOTS as compared to conventional trans-cervical thyroidectomy."

entity_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the SNOMED CT concepts from the given context.\

The steps are as follows:\
1. extract the concepts from the given context sentence, using the retrieved triplets.
2. there may be abbreviations or acronyms in the context, extract them as concepts as well if they are related to the concepts.
3. output the concepts in a list [] strictly, each concept is separated by a comma.\
\

Provide your answer as follows:

Answer:::
Concepts: [] \
Answer End:::\

Requirements:\
You MUST provide values for 'Concepts:' in your answer. \
ONLY extract concepts, DO NOT include the type of the concept, reasoning, or any other information. \
DO NOT include mark numbers or ordinal numbers in your answer. \
Extract as many unique concepts as possible from the given context. \

"""

response = query_engine.query(entity_extraction_prompt.format(text=text))
display(Markdown(f"<b>{response}</b>"))
print("========================================")
print(llm_parser(response, pipe_parser))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['---------------------\n\nKEYWORDS: transoral thyroid surgery', 'health', 'utility', 'patient', 'preference', 'health-state utility scores', 'cervical', 'transoral', 'state', 'trans-cervical thyroidectomy', 'trans', 'scores', 'thyroid', 'KEYWORDS', 'surgery', 'patient preference', 'thyroidectomy']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Allergy to chlortalidone (finding)', 'type', 'Clinical Finding')
('Allergy to deoxyribonuclease (finding)', 'type', 'Clinical Finding')
('Allergy to alpha-tocopherol (finding)', 'type', 'Clinical Finding')
('Allergy to goserelin (finding)', 'type', 'Clinical Finding')
('Allergy to bendroflumethiazide (finding)', 'type', 'Clinical Finding')
('Mass of bilateral parotid glands (finding)', 'type', 'Clinical Finding')
('Allergy to fenticonazole (finding)', 'type', 'Clinical Finding')
('Allergy to doxepin (finding)', 'type', 'Clinical Finding')
('Allergy to cetylpyridinium (finding)', 'type', 'Clinical Finding')
("Graves' disease with acropachy AND with thyrotoxic crisis (disorder)", 'type', 'Disorder')
[0m

<b> Concepts: ["thyroidectomy", "neck scar", "quality of life", "patient satisfaction", "transoral thyroid surgery", "trans-cervical thyroidectomy", "transoral thyroidectomy", "health-state utility scores"]</b>

 {
    "Concepts": [
        "thyroidectomy",
        "neck scar",
        "quality of life",
        "patient satisfaction",
        "transoral thyroid surgery",
        "trans-cervical thyroidectomy",
        "transoral thyroidectomy",
        "health-state utility scores"
    ]
}
  


Entity-type Pair Extraction example
------

In [8]:
text = "Antihypertensive drugs and depression: a reappraisal. Eighty-nine new referral hypertensive out-patients and 46 new referral non-hypertensive chronically physically ill out-patients completed a mood rating scale at regular intervals for one year. The results showed a high prevalence of depression in both groups of patients, with no preponderance in the hypertensive group. Hypertensive patients with psychiatric histories had a higher prevalence of depression than the comparison patients. This was accounted for by a significant number of depressions occurring in methyl dopa treated patients with psychiatric histories."

pair_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the entity-type pairs from the given context with the format of (entity ; type).\

Here is the type list: [Disorder, Substance].\

The steps are as follows:\
1. extract the entity from the given context abstract, using the retrieved sub-graph.
2. select ONE most likely type from the list for the extracted entity.
3. output the pairs in the format of (entity ; type) strictly.
4. repeat the step 1 to step 3.\
\

Provide your answer as follows:

Answer:::
Pairs: (All extracted pairs)\
Answer End:::\

Requirements:\
You MUST provide values for 'Pairs:' in your answer. \
ONLY use the type in the type list: [Disorder, Substance].\
ONLY output valid entity-type pairs without any reasoning.

"""

response = query_engine.query(pair_extraction_prompt.format(text=text))
display(Markdown(f"<b>{response}</b>"))
print("========================================")
parsed_response = llm_parser(response, pipe_parser, "nuparser", target_results="Pairs")
print(parsed_response)
print("========================================")
print(clean_structural_list(parsed_response, "Pairs"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['dopa', 'methyl dopa', 'psychiatric', 'drugs', 'depression', 'histories', 'psychiatric histories', 'patients', 'ill', 'out-patients', 'methyl', 'antihypertensive', 'physically', 'hypertensive', 'physically ill']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Primary degenerative dementia of the Alzheimer type, senile onset, with depression (disorder)', 'type', 'Disorder')
('Primary degenerative dementia of the Alzheimer type, senile onset, with depression (disorder)', 'has definitional manifestation', 'Dementia (disorder)')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'has definitional manifestation', 'Panic (finding)')
('Panic disorder with agoraphobia, mild agoraphobic avoidance AND panic attacks in full remission (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, agoraphobic avoidance in full remission AND panic attacks in ful

<b> Pairs: (depression ; Disorder), (methyl dopa ; Substance)</b>

 {
    "Pairs": [
        [
            "depression",
            "Disorder"
        ],
        [
            "methyl dopa",
            "Substance"
        ]
    ]
}
  
[('depression', 'Disorder'), ('methyl dopa', 'Substance')]


Triple Extraction example
------

In [10]:
text = "Evaluation of Preference and Utility Measures for Transoral Thyroidectomy. Traditional, trans-cervical thyroidectomy results in the presence of a neck scar, which has been shown to correlate with lower quality of life and lower patient satisfaction. Transoral thyroid surgery (TOTS) has been utilized as an alternative approach to avoid a cutaneous incision and scar by accessing the neck and thyroid through the oral cavity. This study was designed to evaluate patient preference through health-state utility scores for TOTS as compared to conventional trans-cervical thyroidectomy."

triple_extraction_prompt = f"""\
Here is the context: {text}.\

Task: Extract the SNOMED CT triples from the given context with the format of (entity1; relation; entity2).\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

The steps are as follows:\
1. extract the entity1 and entity2 from the given context sentence, using the retrieved triples.
2. select ONE most likely relation from the list for the extracted entities.
3. output the triples in the format of (entity1; relation; entity2) strictly.
4. repeat the process for all the possible triples extracted from the context.
\

Provide your answer as follows:

Answer:::
Triples: (entity1; relation; entity2) (entity1; relation; entity2) ...
Answer End:::\

You MUST provide values for 'Triples:' in your answer.\

"""
response = query_engine.query(triple_extraction_prompt)
display(Markdown(f"<b>{response}</b>"))
print("====================================")
parsed_response = llm_parser(response, pipe_parser, target_results="Triples")
print(parsed_response)
print("====================================")
print(clean_structural_list(parsed_response, "Triples"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Allergy to chlortalidone (finding)', 'type', 'Clinical Finding')
('Allergy to deoxyribonuclease (finding)', 'type', 'Clinical Finding')
('Allergy to alpha-tocopherol (finding)', 'type', 'Clinical Finding')
('Allergy to goserelin (finding)', 'type', 'Clinical Finding')
('Allergy to bendroflumethiazide (finding)', 'type', 'Clinical Finding')
('Mass of bilateral parotid glands (finding)', 'type', 'Clinical Finding')
('Allergy to fenticonazole (finding)', 'type', 'Clinical Finding')
('Allergy to doxepin (finding)', 'type', 'Clinical Finding')
('Allergy to cetylpyridinium (finding)', 'type', 'Clinical Finding')
("Graves' disease with acropachy AND with thyrotoxic crisis (disorder)", 'type', 'Disorder')
[0m

<b>1. (Evaluation of Preference and Utility Measures for Transoral Thyroidectomy; has realization; Traditional, trans-cervical thyroidectomy)
2. (Evaluation of Preference and Utility Measures for Transoral Thyroidectomy; has realization; transoral thyroid surgery (TOTS))
3. (Traditional, trans-cervical thyroidectomy; associated finding; presence of a neck scar)
4. (presence of a neck scar; associated with; lower quality of life)
5. (presence of a neck scar; associated with; lower patient satisfaction)
6. (transoral thyroid surgery (TOTS); associated finding; avoid a cutaneous incision and scar)
7. (transoral thyroid surgery (TOTS); associated finding; accessing the neck and thyroid through the oral cavity)
8. (transoral thyroid surgery (TOTS); associated finding; patient preference)
9. (transoral thyroid surgery (TOTS); associated finding; health-state utility scores)
10. (transoral thyroid surgery (TOTS); associated finding; conventional trans-cervical thyroidectomy)</b>

{
    "Triples": [
        [
            "Evaluation of Preference and Utility Measures for Transoral Thyroidectomy",
            "has realization",
            "Traditional, trans-cervical thyroidectomy"
        ],
        [
            "Evaluation of Preference and Utility Measures for Transoral Thyroidectomy",
            "has realization",
            "transoral thyroid surgery (TOTS)"
        ],
        [
            "Traditional, trans-cervical thyroidectomy",
            "associated finding",
            "presence of a neck scar"
        ],
        [
            "presence of a neck scar",
            "associated with",
            "lower quality of life"
        ],
        [
            "presence of a neck scar",
            "associated with",
            "lower patient satisfaction"
        ],
        [
            "transoral thyroid surgery (TOTS)",
            "associated finding",
            "avoid a cutaneous incision and scar"
        ],
        [
            "transoral t

Medical Diagnostics example
------

In [None]:
case_vignette = """
40 year old female presenting with chest pain
 Symptom: Worsening chest pain
 • Onset: 2 weeks ago
 • Associated with: Cough, dyspnea, fever
 • Complicated by: Fatigue
 Social history
 • Recent construction in Ohio
 Physical exam
 • Lungs: Wheezing
 Diagnostic: X-ray
 • Interpretation: Normal
"""

medical_diagnosis_prompt = """
Case vignette: {case_vignette}

According the given case vignette, provide only the most probable differential diagnosis, no explanation, no recapitulation of the case information or task. 
Give a maximum of 5 answers, sorted by probability of being the correct diagnosis, most probable first, remove list numbering, 
and respond with each answer on a new line. Be as concise as possible, no need to be polite.

Provide your answer as follows:

Answer:::
Diagnosis: (the 5 most probable diagnoses, most probable first)
1. 
2. 
...
Answer End:::\

You MUST provide values for 'Diagnosis' in your answer.\
Do not provide any other information in your response.\

"""

response = query_engine.query(medical_diagnosis_prompt.format(case_vignette=case_vignette))
print(f"Results:\n{extract_triple(str(response), notebook=True, split_str1='Diagnosis:')}")
print("========================================")
print(llm_parser(response, pipe_parser, target_results="Diagnosis"))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['Asthma', 'female', 'old', 'End', 'pain', 'Diagnosis', 'Myocardial', 'year', 'chest pain', 'Infarction', 'chest', 'normal\n\n---------------------\nAnswer:::\nDiagnosis:\nCovid-19\nPneumonia\nAsthma\nMyocardial Infarction\nPulmonary Embolism\nAnswer End:::', 'cough', 'X', 'normal', 'dyspnea', 'fatigue', 'construction', '40 year old female', 'Pulmonary', 'Answer', '19', 'fever', 'wheezing', 'X-ray', 'Embolism', 'recent construction', 'recent', 'Ohio', 'Pneumonia', 'Covid', '40', 'ray']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'has definitional manifestation', 'Panic (finding)')
('Acute on chronic combined systolic and diastolic heart failure (disorder)', 'has definitional manifestation', 'Diastolic dysfunction (finding)')
('B-cell lymphoma unclassifiable with features intermediate between classical Hodgkin lymphoma and diffuse large B-cell lymphoma (disorder)', 'type', 'Disorder')
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Mendelian susceptibility to mycobacterial disease due to partial 

In [10]:
# snomed concepts extraction
snomed_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the SNOMED CT triples from the given context with the format of (concept 1 ; relation ; concept 2).\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

The steps are as follows:\
1. extract the concept 1 and concept 2 from the given context sentence, using the retrieved sub-graph.
2. select ONE most likely relation from the list for the extracted concepts.
3. output the triplets in the format of (concept 1 ; relation ; concept 2) strictly.\
\

Provide your answer as follows:

Answer:::
Triples: (The extracted triples)\
Answer End:::\

You MUST provide values for 'Triples:' in your answer.\

"""

snomed_description_generation_prompt = """\
Here is the context: {text}.\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

Task: Generate the SNOMED CT descriptions for the given concept.

The steps are as follows:
1. extract a CONCEPT from the given context sentence, using the retrieved sub-graph.
2. generate an EXPRESSION in human-readable phrase that can describe the CONCEPT.
3. select one most likely relation from the list between the CONCEPT and the EXPRESSION.
4. generate descriptions in the format of (CONCEPT ; relation ; EXPRESSION). Each CONCEPT may have multiple descriptions.
5. repeat the step 1 to step 4.

Provide your answer as follows:

Answer:::
Concept: 
Descriptions: (The generated descriptions)
Answer End:::\

You MUST provide values for 'Concept' and 'Description' in your answer.\

Few-shot examples:
Answer:::
Concept: apnea
Descriptions: (apnea ; interprets ; respiration observable) (apnea ; has interpretation ; absent) (apnea ; finding site ; structure of respiratory system)
Answer End:::

"""

snomed_extraction_prompt_var_mappings = {"text": "text"}

prompt_tmpl = PromptTemplate(
    snomed_description_generation_prompt, template_var_mappings=snomed_extraction_prompt_var_mappings
)

def query_and_generate_rel(test_id, query_engine, cases=427):
    logging.info(f"Query Engine: {query_engine}")
    with open("data/ade1/test.source") as f:
        results = []
        sentences = f.readlines()[:cases]
        logging.info(f"Experiment ID: {test_id}")
        print(f"Number of sentences: {len(sentences)}; Number of cases for test: {cases}")
        for sentence_id, text in tqdm(enumerate(sentences)):
            print(f"Processing sentence {sentence_id} / {len(sentences)}")
            print(f"Text: {text}")
            retry_count = 0

            fmt_prompt = prompt_tmpl.format(
                text=text,
            )
            # print(fmt_prompt)
            response = query_engine.query(fmt_prompt)
            # display(Markdown(f"<b>{response}</b>"))
            # results.append(clean_response(str(response)) + "\n")
            results.append(extract_triple(str(response), notebook=True) + "\n")
            print(f"Results: {extract_triple(str(response), notebook=True)}")

    with open(f"results/rel.hyps_{test_id}", 'w') as f:
        f.writelines(results)
    
    logging.info(f"Results saved to results/rel.hyps_{test_id}")
    return response

logging_setup(log_file=f"logs/{PARAMETERS['test_id']}.log", log_level=logging.INFO)

response = query_and_generate_rel(test_id = PARAMETERS["test_id"], query_engine = query_engine, cases=50)

50it [28:31, 34.22s/it]


Batch Test - concept/entity extraction
------

In [None]:
using_extractor = None
pipe_extractor = None
input_text_dir = "data/mimiciv/eval/notes_concepts_chunked.json"
snomed_extraction_prompt_var_mappings = {"text": "text"}

MIMICIV_entity_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the SNOMED CT concepts from the given context.\

The steps are as follows:\
1. extract the concepts from the given context sentence, using the retrieved triplets.
2. there may be abbreviations or acronyms in the context, extract them as concepts as well if they are related to the concepts.
3. output the concepts in a list [] strictly, each concept is separated by a comma.\
\

Provide your answer as follows:

Answer:::
Concepts: [] \
Answer End:::\

Requirements:\
You MUST provide values for 'Concepts:' in your answer. \
ONLY extract concepts, DO NOT include the type of the concept, reasoning, or any other information. \
DO NOT include mark numbers or ordinal numbers in your answer. \
Extract as many unique concepts as possible from the given context. \

"""

prompt_tmpl = PromptTemplate(
    MIMICIV_entity_extraction_prompt, template_var_mappings=snomed_extraction_prompt_var_mappings
)

rag_start(test_id="_test_entityextraction_mimicivchunk",
            input_text_dir=input_text_dir,
            prompt_tmpl=prompt_tmpl,
            target_results="Concepts",
            query_engine=query_engine,
            pipe_parser=pipe_parser,
            cases=3)


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['nasojejunal feedings', 'pseudocyst', 'elective', 'multisystem organ failure', 'pancreatic', 'necrosis', 'multisystem', 'Biliary pancreatitis', 'invasive', 'minimally invasive approach', 'approach', 'Biliary', 'postnecrotic pseudocyst', 'pancreatic rest', 'pancreatic necrosis', 'GI tract', 'Dr.', 'failure', 'pancreatitis', 'organ', 'cholecystectomy', 'nasojejunal', 'GI', 'minimally', 'feedings', 'postnecrotic', 'tract', 'rest', 'elective cholecystectomy']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Mendelian susceptibility to mycobacterial disease due to partial signal transducer and activator of transcription 1 deficiency (disorder)', 'type', 'Disorder')
('International neuroblastoma pathology classification: Favorable histology group, patient of any age with ganglioneuroma (Schwannian stroma-dominant) maturing, or mature (finding)', 'type', 'Clinical Finding')
('Hematoma of kidney without rupture of capsule AND with open wound into abdominal cavity (disorder)', 'type', 'Disorder')
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Hematoma of spleen without rupture of capsule AND without open wound into abdominal cavity (disorder)', 'type', 'Disorder')
('Influenza caused by Influenza A virus subtype 

1it [00:16, 16.91s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['Back', 'Hernia', 'laparoscopic', 'metastatic lung cancer', 'esophagus', 'cancer', 'Esophageal', 'CVA', 'cholecystectomy', 'anxiety', 'Esophageal ulcer', 'Hiatal', 'MI', 'gallstone pancreatitis', 'gallstone', 'pain', 'ulcer', 'laparoscopic cholecystectomy', 'Hiatal Hernia', 'pancreatitis', 'metastatic', 'Back pain', 'lung']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Aberrant course of left anterior descending coronary artery from right coronary artery crossing right ventricular outflow tract (disorder)', 'type', 'Disorder')
('Obstetric pyemic and septic pulmonary embolism with postnatal complication (disorder)', 'type', 'Disorder')
('Hepatic coma due to acute hepatitis B with delta agent (disorder)', 'due to', 'Acute hepatitis B with hepatitis D (disorder)')
('Vascular ring with right aortic arch and right patent arterial duct with absent left pulmonary artery (disorder)', 'type', 'Disorder')
('Hypert

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['Back', 'KEYWORDS', 'Hernia', 'heparin', 'Esophageal ulcer', 'venodyne', 'Hiatal', 'MI', 'laparoscopic', 'gallstone pancreatitis', 'gallstone', 'pain', '---------------------\nKEYWORDS: Hiatal Hernia', 'metastatic lung cancer', 'esophagus', 'cancer', 'Esophageal', 'venodyne boots', 'ulcer', 'CVA', 'laparoscopic cholecystectomy', 'Dilaudid PCA', 'pancreatitis', 'cholecystectomy', 'Dilaudid', 'metastatic', 'Back pain', 'PCA', 'lung', 'boots', 'anxiety']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Aberrant course of left anterior descending coronary artery from right coronary artery crossing right ventricular outflow tract (disorder)', 'type', 'Disorder')
('Obstetric pyemic and septic pulmonary embolism with postnatal complication (disorder)', 'type', 'Disorder')
('Hepatic coma due to acute hepatitis B with delta agent (disorder)', 'due to', 'Acute hepatitis B with hepatitis D (disorder)')
('Vascular ring with right aortic arch and right patent arterial duct with absent left pulmonary artery (disorder)', 'type', 'Disorder')
('Hypert

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['Back', 'Hernia', 'Pain', 'Hiatal', 'MI', 'Laparoscopic', 'Gallstone', 'Esophageal', 'CVA', 'Cholecystectomy', 'Esophagus', 'Anxiety', 'Hiatal Hernia', 'Cancer', 'Metastatic Lung Cancer', 'Back Pain', 'Gallstone Pancreatitis', 'Ulcer', 'Pancreatitis', 'Lung', 'Esophageal Ulcer', 'Laparoscopic Cholecystectomy', 'Metastatic']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Aberrant course of left anterior descending coronary artery from right coronary artery crossing right ventricular outflow tract (disorder)', 'type', 'Disorder')
('Obstetric pyemic and septic pulmonary embolism with postnatal complication (disorder)', 'type', 'Disorder')
('Hepatic coma due to acute hepatitis B with delta agent (disorder)', 'due to', 'Acute hepatitis B with hepatitis D (disorder)')
('Vascular ring with right aortic arch and right patent arterial duct with absent left pulmonary artery (disorder)', 'type', 'Disorder')
('Hypert

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['Back', 'Hernia', 'KEYWORDS', 'heparin', 'Esophageal ulcer', 'venodyne', 'Hiatal', 'MI', 'laparoscopic', 'gallstone pancreatitis', 'gallstone', 'pain', 'venodyne boots\n\nAnswer:::\nKEYWORDS: Hiatal Hernia', 'venodyne boots\nAnswer End:::', 'metastatic lung cancer', 'esophagus', 'cancer', 'End', 'Esophageal', 'ulcer', 'Answer', 'CVA', 'laparoscopic cholecystectomy', 'Hiatal Hernia', 'pancreatitis', 'cholecystectomy', 'metastatic', 'Back pain', 'lung', 'boots', 'anxiety']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Major systemic to pulmonary collateral artery with absent pulmonary arteries proximal to hilar bifurcation (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Aberrant course of left anterior descending coronary artery from right coronary artery crossing right ventricular outflow tract (disorder)', 'type', 'Disorder')
('Obstetric pyemic and septic pulmonary embolism with postnatal complication (disorder)', 'type', 'Disorder')
('Hepatic coma due to acute hepatitis B with delta agent (disorder)', 'due to', 'Acute hepatitis B with hepatitis D (disorder)')
('Vascular ring with right aortic arch and right patent arterial duct with absent left pulmonary artery (disorder)', 'type', 'Disorder')
('Hypert

2it [01:09, 38.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['instructions', 'KEYWORDS', 'instructions\nAnswer End:::', 'vital signs', 'biliary', 'medications', 'pain', 'activity', 'diet', 'signs', 'ambulating', 'mental status', 'biliary pancreatitis', 'level', 'consciousness', 'End', 'mental', 'disposition', 'Answer', 'pain control', 'pancreatitis', 'activity status', 'voiding', 'control', 'level of consciousness', 'vital', 'instructions\n\nAnswer:::\nKEYWORDS: discharge', 'status', 'discharge', 'diagnosis']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Acute ST segment elevation myocardial infarction of anterior wall involving right ventricle (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'type', 'Disorder')
('Subsequent ST segment elevation myocardial infarction of anterior wall (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, mild agoraphobic avoidance AND panic attacks in full remission (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
('Single right coronary artery supplying all of heart with usual distribution of left coronary artery derived from distal right coronary artery (disorder)', 'type', 'Disorder')
('Intentio

3it [01:54, 38.22s/it]






Batch test - (entity ; type) extraction
------

In [None]:
using_extractor = None
pipe_extractor = None
input_text_dir = "data/BioNEL_datasets/BC5CDR_gold_all.json"
snomed_extraction_prompt_var_mappings = {"text": "text"}

BC5CDR_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the entity-type pairs from the given context with the format of (entity ; type).\

Here is the type list: [Disorder, Substance].\

The steps are as follows:\
1. extract the entity from the given context abstract, using the retrieved sub-graph.
2. select ONE most likely type from the list for the extracted entity.
3. output the pairs in the format of (entity ; type) strictly.
4. repeat the step 1 to step 3.\
\

Provide your answer as follows:

Answer:::
Pairs: (All extracted pairs)\
Answer End:::\

Requirements:\
You MUST provide values for 'Pairs:' in your answer. \
ONLY use the type in the type list: [Disorder, Substance].\
ONLY output valid entity-type pairs without any reasoning.

"""

prompt_tmpl = PromptTemplate(
    BC5CDR_extraction_prompt, template_var_mappings=snomed_extraction_prompt_var_mappings
)

rag_start(test_id=f"_test_pairextraction_BC5CDR",
            input_text_dir=input_text_dir,
            prompt_tmpl=prompt_tmpl,
            target_results="Pairs",
            query_engine=query_engine,
            pipe_parser=pipe_parser,
            cases=3)


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['methyl dopa', 'patients', 'methyl', 'depression', 'out-patients', 'hypertensive', 'history', 'rating', 'psychiatric', 'hypertension', 'hypertensive patients', 'scale', 'physically ill patients', 'physically', 'mood', 'ill', 'dopa', 'psychiatric history', 'mood rating scale']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Primary degenerative dementia of the Alzheimer type, senile onset, with depression (disorder)', 'type', 'Disorder')
('Primary degenerative dementia of the Alzheimer type, senile onset, with depression (disorder)', 'has definitional manifestation', 'Dementia (disorder)')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, moderate agoraphobic avoidance AND panic attacks in partial remission (disorder)', 'has definitional manifestation', 'Panic (finding)')
('Panic disorder with agoraphobia, mild agoraphobic avoidance AND panic attacks in full remission (disorder)', 'type', 'Disorder')
('Panic disorder with agoraphobia, agoraphobic avoidance in full remission AND panic attacks in ful

1it [00:13, 13.46s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['sinoatrial nodal pacemakers', 'nodal', 'asystole', 'bradyarrhythmias', 'atrioventricular', 'pacemakers', 'lidocaine', 'cardiac asystole', 'atrioventricular nodal pacemakers', 'idiosyncrasy', 'sinoatrial', 'cardiac']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Ventricular aneurysm due to and following acute myocardial infarction (disorder)', 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'type', 'Disorder')
("Ebstein's anomaly of tricuspid valve with atrialization of right ventricular chamber (disorder)", 'type', 'Disorder')
('Left ventricular failure with normal ejection fraction due to coronary arteriosclerosis (disorder)', 'due to', 'Coronary arteriosclerosis (disorder)')
('Mural thrombus of right ventricle following acute myocardial infarction (disorder)', 'type', 'Disorder')
('International neuroblastoma pathology classification: Favorable histology group, patient of any age with ganglioneuroma (Schwannian stroma-dominant) maturing, or mature (finding)', 'type', 'Clinical Fin

2it [00:23, 11.45s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['children', 'beta', 'reabsorption', 'vesicoprostatic tumor', "Fanconi's syndrome", 'phosphate tubular reabsorption', 'MMT', 'tumor', '2', 'cisplatin', 'ifosfamide', '84', 'malignant', 'tubular', 'vesicoprostatic', 'beta 2 microglobulinuria', 'renal', 'phosphate', 'Fanconi', 'SIOP', 'renal toxicity', 'malignant mesenchymal tumors', 'SIOP MMT 84', 'mesenchymal', 'syndrome', 'tumors', 'toxicity', 'microglobulinuria']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Acute myeloid leukemia and myelodysplastic syndrome related to topoisomerase type 2 inhibitor (disorder)', 'causative agent', 'Substance with topoisomerase inhibitor mechanism of action (substance)')
('Acute myeloid leukemia in complete remission (disorder)', 'type', 'Disorder')
('Intergroup rhabdomyosarcoma study post-surgical clinical group IIC: Locally extensive tumor (spread to regional lymph nodes), gross total resection, but microscopic residual disease (finding)', 'type', 'Clinical Finding')
('Hepatic coma due to acute hepatitis B with delta agent (disorder)', 'due to', 'Acute hepatitis B with hepatitis D (disorder)')
('Accidental mefenamic acid overdose (disorder)', 'type', 'Disorder')
('Hepatic ascites co-occurrent with chronic active hepatitis due to toxic liver disease (disorder)', 'type'

3it [00:46, 15.42s/it]






Batch test - triple extraction
------

In [None]:
using_extractor = None
pipe_extractor = None
input_text_dir = "data/pubmed_eval_datasets/2023_selected_limit_100.json"
snomed_extraction_prompt_var_mappings = {"text": "text"}

Pubmed_snomed_triple_extraction_prompt = """\
Here is the context: {text}.\

Task: Extract the SNOMED CT triples from the given context with the format of (concept 1 ; relation ; concept 2).\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation,
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course,
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

The steps are as follows:\
1. extract the concept 1 and concept 2 from the given context sentence, using the retrieved sub-graph.
2. select ONE most likely relation from the list for the extracted concepts.
3. output the triples in the format of (concept 1 ; relation ; concept 2) strictly.\
\

Provide your answer as follows:

Answer:::
Triples: (The extracted triples)\
Answer End:::\

Requirements:\
You MUST provide values for 'Triples:' in your answer.\
ONLY output the triples without any other information.\
"""

prompt_tmpl = PromptTemplate(
    Pubmed_snomed_triple_extraction_prompt, template_var_mappings=snomed_extraction_prompt_var_mappings
)

rag_start(test_id=f"_test_tripleextraction_pubmed",
            input_text_dir=input_text_dir,
            prompt_tmpl=prompt_tmpl,
            target_results="Triples",
            query_engine=query_engine,
            pipe_parser=pipe_parser,
            cases=3)


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['skin sensitizers', 'methodologies', 'Guidelines', 'skin', 'domain', 'OECD Test Guidelines', 'new approach methodologies', 'sensitizers', 'Test', 'acting', 'applicability domain', 'haptens', 'NAMs', 'hydrophobic substances', 'indirectly acting haptens', 'indirectly', 'LLNA', 'applicability', 'new', 'substances', 'assay', 'OECD', 'UVCBs', 'GARD skin assay', 'GARD', 'approach', 'hydrophobic']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('High risk proliferative retinopathy not amenable to photocoagulation due to diabetes mellitus (disorder)', 'type', 'Disorder')
('Allergy to cefpodoxime (finding)', 'type', 'Clinical Finding')
('High risk proliferative retinopathy not amenable to photocoagulation due to diabetes mellitus (disorder)', 'associated with', 'Diabetes mellitus (disorder)')
('Monoclonal gammopathy (disorder)', 'has definitional manifestation', 'Serum gamma globulin above reference range (finding)')
('Allergy to alpha-tocopherol (finding)', 'type', 'Clinical Finding')
('Allergy to phthalylsulfathiazole (finding)', 'type', 'Clinical Finding')
('Allergy to cefpirome (finding)', 'type', 'Clinical Finding')
('Non-healing of skin donor site (finding)', 'associated morphology', 'Wound (morphologic abnormality)')
('Allergy to fluvo

1it [00:27, 27.21s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['colon cancer', 'non-obese', 'robotic surgery', 'cancer', 'open surgery', 'resection', 'laparoscopic', 'non', 'surgery', 'laparoscopic surgery', 'patients', 'colon', 'open', 'obese', 'robotic']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('International neuroblastoma pathology classification: Favorable histology group, patient of any age with ganglioneuroma (Schwannian stroma-dominant) maturing, or mature (finding)', 'type', 'Clinical Finding')
('B-cell lymphoma unclassifiable with features intermediate between classical Hodgkin lymphoma and diffuse large B-cell lymphoma (disorder)', 'type', 'Disorder')
('International neuroblastoma staging system stage 4 (finding)', 'type', 'Clinical Finding')
('Tumor metastasis to non-regional lymph nodes cannot be assessed (finding)', 'type', 'Clinical Finding')
('G2 grade (finding)', 'type', 'Clinical Finding')
('Left iliac fossa mass (finding)', 'type', 'Clinical Finding')
("Non-Hodgkin's lymphoma of uterine cervix (disorder)", 'type', 'Disorder')
('Lymphatic (small vessel) extramural invasion by

2it [01:03, 32.35s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['natural history', 'natural', 'solid', 'Apolipoprotein', 'outcomes', 'mutations', 'organ', 'APOA1', 'history', 'finding', 'Apolipoprotein A-I', 'course', 'gene', 'site', 'amyloidosis', 'finding site', 'solid organ transplantation', 'A', 'I', 'gene mutations', 'transplantation']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 5 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
("Non-familial Alzheimer's disease of late onset (disorder)", 'type', 'Disorder')
("Non-familial Alzheimer's disease of late onset (disorder)", 'has definitional manifestation', 'Dementia (disorder)')
('International neuroblastoma pathology classification: Favorable histology group, patient of any age with ganglioneuroma (Schwannian stroma-dominant) maturing, or mature (finding)', 'type', 'Clinical Finding')
("Familial Alzheimer's disease of late onset (disorder)", 'type', 'Disorder')
("Progressive aphasia in Alzheimer's disease (disorder)", 'has definitional manifestation', 'Dementia (disorder)')
('Mendelian susceptibility to mycobacterial disease due to partial signal transducer and activator of transcription 1 deficiency (disorder)', 'type', 'Disorder')
("Familial Alzheimer's disease of late onset 

3it [01:16, 25.44s/it]




