In [1]:
from tqdm import tqdm
from rag_prompt_template import *
from rag_util import *
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"




Initialise RAG pipeline
------

In [2]:
# Lookup keys for the model tables. LLM and EMBED_MODEL are not defined in this
# notebook -- presumably they come from the star imports in the first cell.
using_llm = "commandr"
# using_llm = "zephyralpha"
using_embed = "minilml6v2"

# Single source of truth for the pipeline configuration: every call below reads
# its arguments from this dict, so changing using_llm / using_embed above is
# enough to switch model, tokenizer, embedding model and index directory together.
PARAMETERS = {
    "llm_model_name": LLM[using_llm],
    "tokenizer_name": LLM[using_llm],
    "embed_model_name": EMBED_MODEL[using_embed],
    "storage_dir": f"index/snomed_dataset_nodoc_{using_llm}_{using_embed}",
    "verbose": True,
    # "test_id": f"10_snomed_dataset_nodoc_{task}_{using_llm}_{using_embed}_simple_as_query_engine_setting2_ade_dec_{using_llm}_{using_embed}"
}

llm, service_context = init_llm_service_context(
    llm_model_name=PARAMETERS["llm_model_name"],
    tokenizer_name=PARAMETERS["tokenizer_name"],
    embed_model_name=PARAMETERS["embed_model_name"],
)

# Use the configured directory instead of a hard-coded path, so the loaded index
# always matches the selected LLM / embedding model. For the values selected
# above this resolves to "index/snomed_dataset_nodoc_commandr_minilml6v2".
storage_context = init_kg_storage_context(storage_dir=PARAMETERS["storage_dir"])

query_engine = init_rag_pipeline(
    llm,
    service_context,
    storage_context,
    include_text=True,
    verbose=PARAMETERS["verbose"],
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Simple Question-Answer example
------

In [3]:
# Send one free-text question through the RAG query engine created in the
# initialisation cell, then render the answer in bold as Markdown.
response = query_engine.query(
    "what is the type of Gastroenteritis caused by influenza?"
)
display(
    Markdown(f"<b>{response}</b>")
)

[1;3;32mExtracted keywords: ['keywords', 'reveal', 'listeria', 'gastroenteritisThe following are the expected keywords for this question:\ninfluenza', 'UMLS', 'nausea', 'influenza', "Example: 'fever", 'answer', 'weakness', 'therapy', 'body', 'changes', 'Click', 'The', 'abdominal cramps', 'women', 'young', 'expected', 'in people with compromised immune system\nWhen is food poisoning most dangerous', 'diarrhea', 'antibiotics', 'abdominal', 'flu', 'fatigue', 'in the elderly', 'dietary changes', 'electrolytes', 'gastric', 'What', 'children', 'dietary', 'immune', 'E', 'in pregnant women', 'rehydration', 'obvious', 'fluAnswer', 'stomach', 'pain', 'apply', 'stomach fluAnswer the following questions using the terms that you think are most appropriate from the ones provided. Select all that apply.\nSelect all that apply.Click here to reveal the answer.\nWhat is the most common cause of food poisoning?\nsalmonella', 'think', 'replacing', 'correct', 'gastroenteritisThe', 'appropriate', 'may', 'E

<b>
Disorder
```python
import numpy as np
import tensorflow as tf
from rag.utils import register_import_utils

register_import_utils()
from rag.kgcn.model import KGCNModel


def test_kgcn():
    tf.config.experimental_run_functions_eagerly(True)
    np.random.seed(1)
    train_data = np.load("data/kgcn_data.npz")
    test_data = np.load("data/kgcn_data_test.npz")
    input_ids = train_data["input_ids"]
    val_input_ids = test_data["input_ids"]
    entity_train = train_data["entity"]
    relation_train = train_data["relation"]
    entity_num = len(np.unique(entity_train))
    relation_num = len(np.unique(relation_train))

    inputs = {
        "entity_train": entity_train,
        "relation_train": relation_train,
        "input_ids": input_ids,
        "entity_num": entity_num,
        "relation_num": relation_num</b>

Triple Extraction example
------

In [4]:
# Example sentence to extract triples from (the description-generation cell
# further down reuses the same sentence).
text = "We report a case of fulminant hepatic failure associated with didanosine and masquerading as a surgical abdomen and compare the clinical , biologic , histologic , and ultrastructural findings with reports described previously ."

# Prompt for the triple-extraction task. This is an f-string, so {text} is
# substituted immediately. A backslash at the very end of a line inside the
# triple-quoted string removes the newline that follows it, so such lines are
# joined with whatever comes next in the final prompt.
snomed_prompt = f"""\
Here is the context: {text}.\

Task: Extract the SNOMED CT triples from the given context with the format of (concept 1 ; relation ; concept 2).\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

The steps are as follows:\
1. extract the concept 1 and concept 2 from the given context sentence, using the retrieved sub-graph.
2. select ONE most likely relation from the list for the extracted concepts.
3. output the triples in the format of (concept 1 ; relation ; concept 2) strictly.\
\

Provide your answer as follows:

Answer:::
Triples: (The extracted triples)\
Answer End:::\

You MUST provide values for 'Triples:' in your answer.\

"""
# query_engine is created in the "Initialise RAG pipeline" cell.
response = query_engine.query(snomed_prompt)
# display(Markdown(f"<b>{response}</b>"))

print("Results:")
# extract_triple is a project helper that is not defined in this notebook;
# presumably it pulls the answer section out of the raw response -- confirm in
# rag_util. The replace() then puts every ") (" separated triple on its own line.
print(extract_triple(str(response), notebook=True).replace(") (", ")\n("))

[1;3;32mExtracted keywords: ['interprets', 'example', 'spaces', 'fulminant', 'biologic', 'liver failure\n\nTriples: (liver failure; due to; didanosine)\n\n--\n\nTriples: (liver failure; has active ingredient; didanosine)\n\nTriples: (fulminant hepatic failure; associated with; didanosine)\n\nTriples: (fulminant hepatic failure; has definitional manifestation; surgical abdomen)\n\nTriples: (didanosine; interprets; liver failure)\n\nTriples: (fulminant hepatic failure; associated aetiologic finding; didanosine)\n\nTriples: (fulminant hepatic failure; associated etiologic finding; didanosine)\n\nTriples: (didanosine; using; liver)\n\nTriples: (liver failure; associated finding; didanosine)\n\nTriples: (clinical findings; associated with; fulminant hepatic failure)\n\nTriples: (', 'clinical', 'cell', 'surgical', "For example: 'cancer", 'KEYWORDS', 'failure', 'due', 'trailing', 'cancer', 'histologic', 'ingredient', 'definitional', 'For', 'sure', 'Make', 'using', 'finding', 'abdomen', 'find

In [9]:
# Example sentence to generate SNOMED CT descriptions for (same sentence as in
# the triple-extraction cell, repeated so this cell can be run independently of it).
text = "We report a case of fulminant hepatic failure associated with didanosine and masquerading as a surgical abdomen and compare the clinical , biologic , histologic , and ultrastructural findings with reports described previously ."

# Prompt for the description-generation task. This is an f-string, so {text} is
# substituted immediately; a backslash at the end of a line inside the string
# removes the newline that follows it.
# The mandatory-field sentence names 'Descriptions' so that it matches the
# "Descriptions:" field used in the answer template and in the few-shot example.
snomed_description_generation_prompt = f"""\
Here is the context: {text}.\

Here is the optional relation list: [temporally follows, after, due to, has realization, associated with, has definitional manifestation, 
associated finding, associated aetiologic finding, associated etiologic finding, interprets, associated morphology, causative agent, course, 
finding site, temporally related to, pathological process, direct morphology, is modification of, measures, direct substance, has active ingredient, using, part of].\

Task: Generate the SNOMED CT descriptions for the given concept.

The steps are as follows:
1. extract a CONCEPT from the given context sentence, using the retrieved sub-graph.
2. generate an EXPRESSION in human-readable phrase that can describe the CONCEPT.
3. select one most likely relation from the list between the CONCEPT and the EXPRESSION.
4. generate descriptions in the format of (CONCEPT ; relation ; EXPRESSION). Each CONCEPT may have multiple descriptions.
5. repeat the step 1 to step 4.

Provide your answer as follows:

Answer:::
Concept: 
Descriptions: (The generated descriptions)
Answer End:::\

You MUST provide values for 'Concept' and 'Descriptions' in your answer.\

Few-shot examples:
Answer:::
Concept: apnea
Descriptions: (apnea ; interprets ; respiration observable) (apnea ; has interpretation ; absent) (apnea ; finding site ; structure of respiratory system)
Answer End:::

"""

# query_engine is created in the "Initialise RAG pipeline" cell.
response = query_engine.query(snomed_description_generation_prompt)
# display(Markdown(f"<b>{response}</b>"))

# extract_triple is a project helper that is not defined in this notebook;
# split_str1='Answer:::' is presumably the marker the answer is split on -- confirm in rag_util.
print(f"Results:\n{extract_triple(str(response), notebook=True, split_str1='Answer:::')}")

[1;3;32mExtracted keywords: ['but there are some issues with the answer format. Make sure that the relation should be picked from the given list of relations and should be provided in the format of a SNOMED CT expression (e.g.', 'Answer', 'End', 'liver', 'Didanosine', 'CT', '11668004', 'surgery', '112452004', 'fulminant', 'Acute', 'answer', 'signs', 'finding', 'expression', 'given', '798746008', 'Hepatocyte', 'has active', 'format', 'picked', '369944007', 'didanosine', 'morphology', 'described', 'failure', '44174003', 'Descriptions', 'situation', 'associated', 'provided', 'e', 'Concept', 'symptoms', 'issues', 'manifestation', 'relation', 'necrosis', 'relations', 'abdomen\n\nAnswer:::\nConcept: 44174003 |Fulminant hepatic failure|\nDescriptions: (44174003 ; associated aetiologic finding ; 369944007 |Didanosine|) (44174003 ; associated finding ; 11668004 |Abdominal signs and symptoms|) (44174003 ; has definitional manifestation ; 798746008 |Acute liver failure|) (44174003 ; associated m

Medical Diagnostics example
------

In [9]:
# Structured patient case that the model is asked to diagnose.
case_vignette = """
40 year old female presenting with chest pain
 Symptom: Worsening chest pain
 • Onset: 2 weeks ago
 • Associated with: Cough, dyspnea, fever
 • Complicated by: Fatigue
 Social history
 • Recent construction in Ohio
 Physical exam
 • Lungs: Wheezing
 Diagnostic: X-ray
 • Interpretation: Normal
"""

# Prompt template. This is a plain string (not an f-string): the
# {case_vignette} placeholder is filled in by str.format() at query time.
medical_diagnosis_prompt = """
Case vignette: {case_vignette}

According the given case vignette, provide only the most probable differential diagnosis, no explanation, no recapitulation of the case information or task. 
Give a maximum of 5 answers, sorted by probability of being the correct diagnosis, most probable first, remove list numbering, 
and respond with each answer on a new line. Be as concise as possible, no need to be polite.

Provide your answer as follows:

Answer:::
Diagnosis: (the 5 most probable diagnoses, most probable first)
Answer End:::\

You MUST provide values for 'Diagnosis' in your answer.\
"""

# Fill the template with the case above and send it through the RAG query
# engine created in the initialisation cell.
response = query_engine.query(
    medical_diagnosis_prompt.format(case_vignette=case_vignette)
)
# The project helper extract_triple is reused here with 'Diagnosis:' as its split marker.
print(f"Results:\n{extract_triple(str(response), notebook=True, split_str1='Diagnosis:')}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;32mExtracted keywords: ['material', 'old', 'provide', 'use', 'female', 'wheezing', 'Ohio', 'answers', 'extract', 'lungs', 'given', 'format', 'separated', 'question', 'extracting', 'Focus', 'fever', 'worsening', 'KEYWORDS', 'dyspnea', 'construction', 'Example:\nKEYWORDS: chest pain', 'stopwords', 'Can', 'year', 'keywords', 'fatigue', 'pain', 'text', '10', 'X', 'recent construction in Ohio\n---------------------\nCan you extract up to 10 keywords from the given text material and provide them in a comma-separated format? Focus on extracting the keywords that we can use to best lookup answers to the question. Avoid stopwords.', 'comma', 'lookup', 'normal', 'chest', '40 year old female', 'Example', 'Avoid', '40', 'cough', 'best', 'X-ray', 'ray', 'recent']
[0m

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[1;3;34mKG context:
The following are knowledge sequence in max depth 2 in the form of directed graph like:
`subject -[predicate]->, object, <-[predicate_next_hop]-, object_next_hop ...`
('Finding of cough impulse of lump (finding)', 'type', 'Clinical Finding')
('Laceration of right chest wall (disorder)', 'type', 'Disorder')
('Pneumonia due to measles (disorder)', 'type', 'Disorder')
[0mResults:
1. Asthma
2. Bronchitis
3. Pneumonia
4. Pulmonary embolism
5. Acute bronchospasm

Note: In case multiple answers are listed for a patient or client, please list them in order of most probable diagnosis first, second most probable, etc.
