### Load data

In [None]:
import json

# Path to the test split. The file is read as JSON Lines: one JSON object per line.
json_file_path = './dataset/BioASQ/test.json'

# Parallel lists: position i in every list belongs to the same sample.
test_id_list = []
test_question_list = []
test_context_list = []
test_label_list = []

with open(json_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # A blank line is not a JSON document; json.loads('') would raise
        # JSONDecodeError and abort the whole load, so skip it instead.
        if not line:
            continue

        sample = json.loads(line)

        # .get() stores None when a field is missing instead of raising KeyError.
        test_id_list.append(sample.get('id'))
        test_question_list.append(sample.get('sentence1'))
        test_context_list.append(sample.get('sentence2'))
        test_label_list.append(sample.get('label'))

### Getting results from LLMs

In [None]:
import openai
import os
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from google.colab import userdata
from vertexai.generative_models import GenerationConfig

from transformers import AutoTokenizer, T5ForConditionalGeneration
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import pipeline


def _openai_chat(model, prompt):
    """Send `prompt` as a single user message to an OpenAI chat model and return the reply text."""
    # Read the key from the environment instead of hard-coding it in the notebook;
    # falls back to the empty string the notebook used as a placeholder.
    openai.api_key = os.environ.get('OPENAI_API_KEY', '')
    message = openai.ChatCompletion.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    return message['choices'][0]['message']['content']


def get_output(prompt, llm):
    """Run `prompt` through the backend selected by `llm` and return the generated text.

    Supported values of `llm`:
        3.5               -> OpenAI chat model 'gpt-3.5-turbo-1106'
        4                 -> OpenAI chat model 'gpt-4'
        'instruct'        -> OpenAI completion model 'gpt-3.5-turbo-instruct'
        'gemini-1.0-pro'  -> Google Generative AI model 'gemini-pro'
        'flan-ul2'        -> Hugging Face 'google/flan-ul2'
        'med-alpaca'      -> Hugging Face 'medalpaca/medalpaca-7b'
        'pmc-llama'       -> Hugging Face 'axiong/PMC_LLaMA_13B'

    The result is printed before it is returned.

    Raises:
        ValueError: if `llm` is not one of the values above.
    """
    if llm == 3.5:
        result = _openai_chat('gpt-3.5-turbo-1106', prompt)

    elif llm == 4:
        result = _openai_chat('gpt-4', prompt)

    elif llm == 'instruct':
        openai.api_key = os.environ.get('OPENAI_API_KEY', '')
        message = openai.Completion.create(
            model="gpt-3.5-turbo-instruct",
            prompt=prompt,
            temperature=0
        )
        result = message['choices'][0]['text']

    elif llm == 'gemini-1.0-pro':
        GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
        genai.configure(api_key=GOOGLE_API_KEY)
        model = genai.GenerativeModel('gemini-pro')

        # Every harm category is set to BLOCK_NONE.
        safety_settings = [
            {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
            {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}
        ]

        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                candidate_count=1,
                temperature=0),
            safety_settings=safety_settings)

        result = response.candidates[0].content.parts[0].text

    elif llm == 'flan-ul2':
        model = T5ForConditionalGeneration.from_pretrained("google/flan-ul2", device_map="auto", load_in_8bit=True)
        tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2")
        inputs = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
        outputs = model.generate(inputs, max_length=200)
        # NOTE(review): decode() is called without skip_special_tokens, so any special
        # tokens in the generated ids stay in the text - confirm the evaluation expects that.
        result = tokenizer.decode(outputs[0])

    elif llm == 'med-alpaca':
        # Must not be named `pipeline`: assigning to that name anywhere in this function
        # makes it a local variable, and the call on the right-hand side then fails with
        # UnboundLocalError before the transformers factory is ever reached.
        generator = pipeline("text-generation", model="medalpaca/medalpaca-7b", tokenizer="medalpaca/medalpaca-7b")
        result = generator(prompt, max_length=200)[0]['generated_text']

    elif llm == 'pmc-llama':
        tokenizer = LlamaTokenizer.from_pretrained('axiong/PMC_LLaMA_13B')
        model = LlamaForCausalLM.from_pretrained('axiong/PMC_LLaMA_13B')
        encoded_input = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
        output = model.generate(**encoded_input)
        result = tokenizer.decode(output[0])

    else:
        raise ValueError('Invalid LLM')

    print(result)
    return result

In [None]:
# Few-shot demonstrations as (question, reference text, answer), in the order they are
# numbered in the prompt. The '1shot', '3shot' and '5shot' prompts use the first 1, 3
# and 5 entries. Reference texts are kept verbatim, including irregular spacing and the
# embedded line breaks in examples 3 and 4.
_FEW_SHOT_EXAMPLES = (
    (
        "Is the protein Papilin secreted?",
        "Using expression analysis, we identify three genes that are transcriptionally regulated by HLH-2: the protocadherin cdh-3, and two genes encoding secreted extracellular matrix proteins, mig-6/papilin and him-4/hemicentin.  We found that mig-6 encodes long (MIG-6L) and short (MIG-6S) isoforms of the extracellular matrix protein papilin, each required for distinct aspects of DTC migration. Both MIG-6 isoforms have a predicted N-terminal papilin cassette apilins are homologous, secreted extracellular matrix proteins which share a common order of protein domains.  The TSR superfamily is a diverse family of extracellular matrix and transmembrane proteins, many of which have functions related to regulating matrix organization, cell-cell interactions and cell guidance. This review samples some of the contemporary literature regarding TSR superfamily members (e.g. F-spondin, UNC-5, ADAMTS, papilin, and TRAP) where specific functions are assigned to the TSR domains. Papilins are extracellular matrix proteins  Papilin is an extracellular matrix glycoprotein   Collagen IV, laminin, glutactin, papilin, and other extracellular matrix proteins were made primarily by hemocytes and were secreted into the medium.  A sulfated glycoprotein was isolated from the culture media of Drosophila Kc cells and named papilin.",
        "yes",
    ),
    (
        "Does metformin interfere thyroxine absorption?",
        "LT4 absorption is unchanged by concomitant metformin ingestion. It has been hypothesized that metformin may suppress serum thyrotropin (TSH) concentrations by enhancing LT4 absorption or by directly affecting the hypothalamic-pituitary axis.",
        "no",
    ),
    (
        "Do mutations of AKT1 occur in meningiomas?",
        "The recent identification of somatic mutations in components of the SHH-GLI1 and AKT1-MTOR signaling pathways indicates the potential for cross talk of these pathways in the development of meningiomas. A mutation in PIK3CA or AKT1 was found in around 9 % of the cases. AKT1E17K mutations cluster with meningothelial and transitional meningiomas and can be detected by SFRP1 immunohistochemistry. AKT1E17K mutations were exclusively seen in meningiomas and occurred in 65 of 958 of these tumors. A strong preponderance was seen in the variant of meningothelial meningioma WHO grade I of basal and spinal localization. In contrast, AKT1E17K mutations were rare in WHO grade II and absent in WHO grade III meningiomas.  We observed strong up-regulation of SFRP1 expression in all meningiomas with AKT1E17K mutation and in HEK293 cells after transfection with mutant AKT1E17K, but not in meningiomas and HEK293 cells lacking this mutation. SMO and AKT1 mutations occur in non-NF2 meningiomas. Recurrent mutations in SMO and AKT1 are mutually exclusive with NF2 loss in meningioma. Genomic sequencing of meningiomas identifies oncogenic SMO and AKT1 mutations.  A subset of meningiomas lacking NF2 alterations harbored recurrent oncogenic mutations in AKT1 (p.Glu17Lys) and SMO (p.Trp535Leu) and exhibited immunohistochemical evidence of activation of these pathways. Genomic analysis of non-NF2 meningiomas reveals mutations in TRAF7, KLF4, AKT1, and SMO. A subset of meningiomas lacking NF2 alterations harbored recurrent oncogenic mutations in AKT1 (p.Glu17Lys) and SMO (p.Trp535Leu) and exhibited immunohistochemical evidence of activation of these pathways. \n"
        "SMO and AKT1 mutations occur in non-NF2 meningiomas The recent identification of somatic mutations in components of the SHH-GLI1 and AKT1-MTOR signaling pathways indicates the potential for cross talk of these pathways in the development of meningiomas A subset of meningiomas lacking NF2 alterations harbored recurrent oncogenic mutations in AKT1 (p.Glu17Lys) and SMO (p.Trp535Leu) and exhibited immunohistochemical evidence of activation of these pathways Genomic analysis of non-NF2 meningiomas reveals mutations in TRAF7, KLF4, AKT1, and SMO Genomic sequencing of meningiomas identifies oncogenic SMO and AKT1 mutations Recurrent mutations in SMO and AKT1 are mutually exclusive with NF2 loss in meningioma A subset of meningiomas lacking NF2 alterations harbored recurrent oncogenic mutations in AKT1 (p.Glu17Lys) and SMO (p.Trp535Leu) and exhibited immunohistochemical evidence of activation of these pathways. These mutations were present in therapeutically challenging tumors of the skull base and higher grade.  A subset of meningiomas lacking NF2 alterations harbored recurrent oncogenic mutations in AKT1 (p.Glu17Lys) and SMO (p.Trp535Leu) and exhibited immunohistochemical evidence of activation of these pathways.",
        "yes",
    ),
    (
        "Is enzastaurin effective treatment of glioblastoma?",
        "RESULTS: fourteen randomized clinical trials were identified (7 with bevacizumab, 2 cilengitide, 1 enzastaurin, 1 dasatinib, 1 vandetanib, 1 temsirolimus, 1 cediranib) including 4330 patients. Antiangiogenic drugs showed no improvement in overall survival with a pooled HR of 1.00, a trend for an inferior outcome, in terms of overall survival, was observed in the group of patients receiving antiangiogenic drug alone compared to cytotoxic drug alone (HR=1.24, p=0.056).  Enzastaurin (LY317615) in combination with bevacizumab for recurrent malignant gliomas is well-tolerated, with response and progression-free survival similar to bevacizumab monotherapy. So far, inhibition of angiogenesis by compounds such as bevacizumab, cediranib, enzastaurin or cilengitide as well as alternative dosing schedules of temozolomide did not prolong survival, neither at primary diagnosis nor at recurrent disease. Despite promising phase II clinical trial results and patient benefit in terms of clinical improvement and longer progression-free survival, an overall survival benefit has not been demonstrated in four randomized phase III trials of bevacizumab or cilengitide in newly diagnosed glioblastoma or cediranib or enzastaurin in recurrent glioblastoma. CONCLUSIONS: PFS-6 missed the primary planned outcome of 55%.  OS (median, 74 weeks) and PFS (median, 36 weeks) results from the current trial were comparable to those from a prior phase II study using erlotinib and were significantly better than those from 2 other previous studies that used thalidomide or cis-retinoic acid, all in combination with temozolomide plus RT.  More recently, antiangiogenic agents including enzastaurin, cediranib, bevacizumab, and others that target mainly the VEGF pathway, have been evaluated in this highly angiogenic disease. Among them, only bevacizumab has been associated with clear anti-tumor activity, although the lack of control studies limits the impact of the results to date. \n"
        "Enzastaurin has anti-glioma activity in patients with recurrent high-grade glioma, but does not appear to have enough single-agent activity to be useful as monotherapy. Several signal transduction inhibitors have been examined in preclinical and clinical malignant glioma trials, including antiangiogenic agents (bevacizumab, enzastaurin), and inhibitors of epidermal growth factor receptor tyrosine kinase (gefitinib and erlotinib), mammalian target of rapamycin (temsirolimus, everolimus) and integrin (cilengitide). Although preliminary clinical results of the use of targeted agents have not translated into significantly better survival, more recent phase II trials are exploring the combination of multitargeted drugs with cytotoxic chemotherapy and radiotherapy in order to overcome the resistance of tumors to single-agent targeted therapies. Several drugs have been tested, including epidermal growth factor receptor (EGFR) tyrosine kinase inhibitors (gefitinib and erlotinib), mammalian target of rapamycin (mTOR) inhibitors (temsirolimus and everolimus), and vascular endothelial growth factor receptor (VEGFR), protein kinase C-beta, and other angiogenesis pathways inhibitors (vatalanib, bevacizumab, and enzastaurin). Although preliminary efficacy results of most trials in recurrent disease have fallen short on expectations, substantial advances have been achieved by associated translational research.  So far, inhibition of angiogenesis by compounds such as bevacizumab, cediranib, enzastaurin or cilengitide as well as alternative dosing schedules of temozolomide did not prolong survival, neither at primary diagnosis nor at recurrent disease.",
        "no",
    ),
    (
        "Does echinacea increase anaphylaxis risk?",
        "Chicoric acid (dicaffeoyl-tartaric acid), is a natural phenolic compound found in a number of plants, such as chicory (Cichorium intybus) and Echinacea (Echinacea purpurea), which possesses antioxidant, anti-inflammatory, antiviral, and analgesic activities. Although these biological effects of chicoric acid have been investigated, there are no reports of its antiallergic-related anti-inflammatory effects in human mast cells (HMC)-1 or anaphylactic activity in a mouse model. BACKGROUND: Fifty percent of Australians use complementary and alternative medicines (other than vitamins) in any 12-month period, of which echinacea-containing products are increasingly popular. Recent reports have highlighted the risk of allergic reactions to complementary medicines in atopic patients. Two patients suffered anaphylaxis and a third had an acute asthma attack 10 minutes after their first ever dose of echinacea. Fifty-one Australian adverse drug reports implicating echinacea were also reviewed. There were 26 cases suggestive of possible immunoglobulin E-mediated hypersensitivity (4 anaphylaxis, 12 acute asthma, 10 urticaria/angioedema).  Echinacea-associated anaphylaxis. A woman with atopy experienced anaphylaxis after taking, among other dietary supplements, a commercial extract of echinacea. Risk of anaphylaxis in complementary and alternative medicine. Several culprits identified including Andrographis paniculata, Echinacea species, bee products, Ginkgo biloba and Ginseng are discussed here.SUMMARY: Knowing the factors that increase the risk of anaphylaxis allows reactions to be recognized, reported and further investigated.",
        "yes",
    ),
)

# Number of demonstrations inserted for each few-shot prompt type.
_FEW_SHOT_COUNTS = {'1shot': 1, '3shot': 3, '5shot': 5}


def _render_examples(n_examples):
    """Render the first `n_examples` demonstrations as the '### Examples' prompt section."""
    rendered = [
        f"Example Question {number}: {example_question}\n"
        f"Example Reference Text {number}: {example_reference}\n"
        f"Example Output Text {number}: {example_answer}"
        for number, (example_question, example_reference, example_answer)
        in enumerate(_FEW_SHOT_EXAMPLES[:n_examples], start=1)
    ]
    # The leading ' \n' reproduces the space + newline that opened the original literals.
    return " \n### Examples\n" + "\n\n".join(rendered) + "\n"


def create_prompt(question, reference, prompt_type='base'):
    """Build the yes/no prompt for one sample.

    The prompt has up to three sections, in this order:
      1. '### Task' - the instruction, with `question` quoted inside it.
      2. '### Examples' - only for '1shot', '3shot' and '5shot'.
      3. '### Reference Text' / '### Output Text' - `reference` followed by an empty
         answer slot.

    Args:
        question: Statement or question placed inside the task instruction.
        reference: Reference text the answer should be based on.
        prompt_type: 'guide' asks for yes/no plus a brief explanation and adds no
            examples. '1shot', '3shot' and '5shot' use the plain yes/no instruction plus
            that many examples. Any other value, including the default 'base', produces
            the plain yes/no instruction without examples.

    Returns:
        The assembled prompt string.
    """
    if prompt_type == 'guide':
        prompt = f'''### Task
You are a skilled medical expert. Considering the information from a biomedical study provided in the reference text, is it correct to conclude that '{question}'? Please first respond with 'yes' or 'no', followed by a brief explanation of your reasoning process, ensuring that your explanation aligns with the study's findings.
'''
    else:
        prompt = f'''### Task
You are a skilled medical expert. Considering the information from a biomedical study provided in the reference text, is it correct to conclude that '{question}'? Please respond with 'yes' or 'no'.
'''

    n_examples = _FEW_SHOT_COUNTS.get(prompt_type, 0)
    if n_examples:
        prompt += _render_examples(n_examples)

    # Add input text to be annotated
    prompt += f'''
### Reference Text: {reference}
### Output Text:
'''

    return prompt

In [None]:
def run(llm, prompt_type, ids, questions, contexts, max_retries=None):
    """Query `llm` for every sample and write the answers to a JSON Lines file.

    Output goes to './output/BioASQ/{llm}/test_{prompt_type}.json'. An existing file at
    that path is deleted first; each finished sample is then appended as one line of the
    form {"id": ..., "answer": ...}, with runs of whitespace in the answer collapsed to
    single spaces.

    Args:
        llm: Backend identifier passed through to `get_output`.
        prompt_type: Prompt variant passed through to `create_prompt`.
        ids, questions, contexts: Parallel iterables; iteration stops at the shortest.
        max_retries: How many times a failed sample is retried before giving up.
            None (the default) retries indefinitely, as the notebook always did.

    Raises:
        RuntimeError: if `max_retries` is set and a sample still fails after that many
            retries. The last underlying exception is attached as the cause.
    """
    output_path = f'./output/BioASQ/{llm}/'
    os.makedirs(output_path, exist_ok=True)

    # Start from a clean file: results are appended one sample at a time below.
    output_file_path = os.path.join(output_path, f'test_{prompt_type}.json')
    if os.path.exists(output_file_path):
        os.remove(output_file_path)

    for sample_id, question, context in zip(ids, questions, contexts):
        # Prompt construction is deterministic, so it sits outside the retry loop:
        # retrying could never fix a failure here.
        prompt = create_prompt(question, context, prompt_type)

        failures = 0
        while True:
            try:
                output = get_output(prompt, llm)

                # todo: model may not generate proper answer??
                cleaned_output = ' '.join(output.split())

                json_string = json.dumps({"id": sample_id, "answer": cleaned_output})

                # Append immediately so finished samples survive a later failure.
                with open(output_file_path, 'a', encoding='utf-8') as f_write:
                    f_write.write(json_string + '\n')
                break

            except Exception as e:
                print(e)
                failures += 1
                # Without a bound, a permanent error (e.g. an invalid `llm` value)
                # would spin here forever.
                if max_retries is not None and failures > max_retries:
                    raise RuntimeError(
                        f'Giving up on sample {sample_id!r} after {failures} failed attempts'
                    ) from e
In [None]:
# Experiment to run: which backend to query and which prompt variant to send.
llm, prompt_type = 4, 'base'

# Generate an answer for every test sample; results are written by run().
run(
    llm=llm,
    prompt_type=prompt_type,
    ids=test_id_list,
    questions=test_question_list,
    contexts=test_context_list,
)

### Evaluation (accuracy)

In [None]:
import re

# Select which stored run to evaluate.
llm = 'gemini'
prompt_type = '3shot'
output_file_path = f'./output/BioASQ/{llm}/test_{prompt_type}.json'

# A leading yes/no answer, case-insensitive. The trailing \b stops the pattern from
# matching the first letters of a longer word ("Nothing", "Not", "Yesterday").
_YES_NO_PATTERN = re.compile(r"^\s*(Yes|No)\b", re.IGNORECASE)

test_id2_list = []
test_answer_list = []

# The output file holds one JSON object per line.
with open(output_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        sample = json.loads(line.strip())

        # Extract sample id
        sample_id = sample.get('id')
        test_id2_list.append(sample_id)

        sample_answer = sample.get('answer')
        extracted_answer_match = _YES_NO_PATTERN.search(sample_answer)
        # group(1) is the yes/no token alone; group(0) would also include any leading
        # whitespace consumed by \s*, which then fails to compare equal to the labels.
        extracted_answer = extracted_answer_match.group(1) if extracted_answer_match else "Answer not found!"
        test_answer_list.append(extracted_answer)

# Predictions must be in the same order as the loaded test set.
assert test_id_list == test_id2_list

print(test_answer_list)
print(test_label_list)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

print("Evaluating: ", llm, prompt_type)

# Normalise case and surrounding whitespace so 'Yes', ' yes' and 'YES' compare equal.
labels = [label.strip().lower() for label in test_label_list]
predictions = [prediction.strip().lower() for prediction in test_answer_list]

# Encode the gold labels as integers. The encoder only knows classes seen in `labels`.
le = LabelEncoder()
labels_binary = le.fit_transform(labels)
class_to_index = {name: index for index, name in enumerate(le.classes_)}

# le.transform() raises ValueError on any value it has not seen, so one unparseable
# answer (the "answer not found!" placeholder) used to abort the whole evaluation.
# Encode predictions by hand instead and score unparseable answers as incorrect.
predictions_binary = []
unparsed_count = 0
for gold_index, prediction in zip(labels_binary, predictions):
    if prediction in class_to_index:
        predictions_binary.append(class_to_index[prediction])
    else:
        unparsed_count += 1
        # 1 - gold is always different from gold, so this sample counts as wrong.
        predictions_binary.append(1 - gold_index)

if unparsed_count:
    print(f"Warning: {unparsed_count} prediction(s) had no yes/no answer and are counted as incorrect")

# Calculate accuracy
accuracy = accuracy_score(labels_binary, predictions_binary)
print("Accuracy:", accuracy)

# Calculate AUC score (may not get correct result with few samples)
try:
    auc_score = roc_auc_score(labels_binary, predictions_binary)
    print("AUC Score:", auc_score)
except ValueError as exc:
    # Report why AUC is unavailable instead of silently swallowing every error.
    print("AUC Score: not available -", exc)