## RAG PubmedQA
This notebook demonstrates a workflow for generating and evaluating short, fact-based answers to PubMedQA medical questions using Retrieval-Augmented Generation (RAG) and large language models (LLMs). The process includes prompt engineering, model inference, result cleaning, and evaluation against human-annotated answers.

## Workflow Overview

1. **Data Loading**: Load PubMedQA questions and retrieved contexts.
2. **Prompt Construction**: Build prompts for LLMs to generate concise answers ("yes", "no", or "maybe") based strictly on provided context.
3. **Model Inference**: Use IBM WatsonX.ai foundation models (Mistral, Llama3, FlanT5) to generate answers.
4. **Post-processing**: Clean and normalize model outputs.
5. **Evaluation**: Compare model-generated answers to human annotations using confusion matrix and classification metrics.

In [5]:
from datasets import load_dataset
import pandas as pd

In [12]:
import os
os.getcwd()

'/home/magma/Abhilasha/MetaGen Blended RAG/code/5. RAG-Evaluation'

In [14]:
dotenv_path

NameError: name 'dotenv_path' is not defined

In [6]:
from dotenv import load_dotenv
import os

# Path to your .env file
env_path = "../.env"  # Change path if needed

# Load environment variables from .env
load_dotenv(dotenv_path=env_path)
# Access the environment variables
watsonx_url = os.getenv("watsonx_url")
watsonx_apikey = os.getenv("watsonx_apikey")
watsonx_projectID = os.getenv("watsonx_projectID")

print(watsonx_url)

https://us-south.ml.cloud.ibm.com


## Generation

In [7]:
from ibm_watson_machine_learning.foundation_models import Model
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
project_id=''
# another cred
wml_credentials = {
    "url": watsonx_url,
    "apikey": watsonx_apikey,
    "project_id": watsonx_projectID,
}

In [8]:
def build_prompt_short_answers_from_context(user_query,context,model_id="MIXTRAL"):
    
    formatted_prompt=""

    SYSTEM_PROMPT = """ 
    You are helping a user generate a factually correct medical decision based only on the provided context.
    """
    USER_PROMPT = """

    You are a medical AI tasked with answering a PubMedQA question. Given the medical question {user_query} and context {context}, follow these instructions:

    Final Decision: Determine the correct answer as either 'yes,' 'no,' or 'maybe' based strictly on the provided medical evidence. Respond with 'yes,' 'no,' or 'maybe' only.
    Do not include any interpretation beyond the given context.
    Question: {user_query}
    Context: {context}

    """

    LLAMA3_PROMPT= """
    <|begin_of_text|><|start_header_id|>system<|end_header_id|>
    {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>
    {user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
    Answer Based on the Provided Context:
    """

    MIXTRAL_PROMPT = """[INST]
    [ROLE]
    {system_prompt}
    [/ROLE]
    [USER_INSTRUCTIONS]
    {user_prompt}
    [/USER_INSTRUCTIONS]

    Answer Based on the Provided Context:
    [/INST]"""

    FLANT5_PROMPT ="""
    {user_prompt}
    
    Answer Based on the Provided Context:
    """

    user_prompt = USER_PROMPT.format(user_query=user_query,context=context)
    print(len(user_prompt))
    if  model_id == "MIXTRAL":
        formatted_prompt = MIXTRAL_PROMPT.format(system_prompt=SYSTEM_PROMPT,user_prompt=user_prompt)
    elif model_id == "LLAMA3":
        formatted_prompt = LLAMA3_PROMPT.format(system_prompt=SYSTEM_PROMPT,user_prompt=user_prompt)
    elif model_id == "FLANT5":
        formatted_prompt = FLANT5_PROMPT.format(system_prompt=SYSTEM_PROMPT,user_prompt=user_prompt)

    return formatted_prompt
      

def send_to_watsonxai(prompts,
                    model_id="MIXTRAL",
                    decoding_method="greedy",
                    max_new_tokens=5,
                    min_new_tokens=1,
                    temperature=1.0,
                    repetition_penalty=1.0
                    ):
    if  model_id == "MIXTRAL":
         model_name = "mistralai/mistral-large"
    elif model_id == "FLANT5":
        model_name="google/flan-t5-xxl"
    elif model_id == "LLAMA3":
         model_name="meta-llama/llama-3-1-70b-instruct"
        #  model_name="meta-llama/llama-3-3-70b-instruct"
    model_params = {
        GenParams.DECODING_METHOD: decoding_method,
        GenParams.MIN_NEW_TOKENS: min_new_tokens,
        GenParams.MAX_NEW_TOKENS: max_new_tokens,
        #GenParams.RANDOM_SEED: 42,
        GenParams.TEMPERATURE: temperature,
        GenParams.REPETITION_PENALTY: repetition_penalty,
    }
    # Instantiate a model proxy object to send your requests
    model = Model(
        model_id=model_name,
        params=model_params,
        credentials=wml_credentials,
        project_id='project--id')

    # print("Model used ---",model.model_id)
    response=model.generate_text(prompts)

    
    # print(response)
    return response

In [9]:
query_json = pd.read_excel('../../output/pubmedqa/RAG_Mistral_Large_K_10_top5_Rerank_RAGResults_Prompt_v1.xlsx')
query_json.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,pubid,question,context,long_answer,final_decision,answer_length,R_Context,R_Context_reranked,Mistral final_decision,Mistral long_answer
0,0,0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,106,['Programmed cell death (PCD) is the regulated...,['Programmed cell death (PCD) is the regulated...,yes,Mitochondria play a significant role in the re...
1,1,1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,42,['Differences between Landolt C acuity (LR) an...,['Assessment of visual acuity depends on the o...,yes,The study found small differences between Land...
2,2,2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,14,['Eight infants aged 2 to 15 months were admit...,['Apparent life-threatening events in infants ...,maybe,The context describes infants experiencing syn...
3,3,3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,69,['The transanal endorectal pull-through (TERPT...,['The transanal endorectal pull-through (TERPT...,maybe,The long-term results of the transanal pull-th...
4,4,4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,123,"[""Message tailoring, based on individual needs...","[""Message tailoring, based on individual needs...",yes,"Tailored interventions, such as tailored print..."


## Get Final decision from retrieved context

In [10]:
import ast  # add this import at the top

for index , row in query_json.iterrows():
    print("Processing -----",index)
    question=row['question']
    print("Question----",question)
    context = row["R_Context_reranked"]

    try:
        model_name="LLAMA3"
        llm_input=build_prompt_short_answers_from_context(question,context,model_name)
        llm_response=send_to_watsonxai(llm_input,model_name)
        print("--------------------------LLM RESPONSE: ", llm_response)
        query_json.at[index, 'FINAL_DECISION_FROM_LONG_ANSWER'] = llm_response
    except Exception as e:
       print(e)

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a vali

Processing ----- 0
Question---- Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
15471
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 1
Question---- Landolt C and snellen e acuity: differences in strabismus amblyopia?
13357
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 2
Question---- Syncope during bathing in infants, a pediatric form of water-induced urticaria?
13973
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 3
Question---- Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?
15875
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 4
Question---- Can tailored interventions increase mammography use among HMO women?
13707
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 5
Question---- Double balloon enteroscopy: is it efficaci

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a vali

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 357
Question---- Mitral replacement or repair for functional mitral regurgitation in dilated and ischemic cardiomyopathy: is it really the same?
17405
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 358
Question---- Do nontriploid partial hydatidiform moles exist?
12507
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 359
Question---- Is cholecystectomy really an indication for concomitant splenectomy in mild hereditary spherocytosis?
14643
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 360
Question---- Does surgery or radiation therapy impact survival for patients with extrapulmonary small cell cancers?
15061
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 361
Question---- MR Diagnosis of Bone Metastases at 1.5 T and 3 T: Can STIR Imaging Be Omitted?
17253
`apikey` value cann

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a vali

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 654
Question---- Are complex coronary lesions more frequent in patients with diabetes mellitus?
18229
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 655
Question---- Are variations in the use of carotid endarterectomy explained by population Need?
13869
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 656
Question---- Are there effects of intrauterine cocaine exposure on delinquency during early adolescence?
12771
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 657
Question---- Is expert breast pathology assessment necessary for the management of ductal carcinoma in situ ?
13527
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 658
Question---- Thoracic and thoracoabdominal aneurysm repair: is reimplantation of spinal cord arteries a waste of time?
15809
`apikey` value cannot be

`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
`apikey` value cannot be ''. Pass a valid apikey for IAM token.


`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 987
Question---- Does prior benign prostate biopsy predict outcome for patients treated with radical perineal prostatectomy?
15555
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 988
Question---- Does patient position during liver surgery influence the risk of venous air embolism?
15691
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 989
Question---- Do Indigenous Australians age prematurely?
13137
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 990
Question---- Are tuberculosis patients adherent to prescribed treatments in China?
15441
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 991
Question---- Ultrasound in squamous cell carcinoma of the penis; a useful addition to clinical staging?
11309
`apikey` value cannot be ''. Pass a valid apikey for IAM token.
Processing ----- 9

In [11]:
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("Yes", "yes", inplace=True)
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("No", "no", inplace=True)
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("Maybe", "maybe", inplace=True)
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("yes.", "yes", inplace=True)
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("no.", "no", inplace=True)
query_json['FINAL_DECISION_FROM_LONG_ANSWER'].replace("maybe.", "maybe", inplace=True)

KeyError: 'FINAL_DECISION_FROM_LONG_ANSWER'

In [None]:
query_json.head()

## Clean Final Decision output

In [None]:
def clean_decision(text):
    text = str(text).lower().split('.')[0]  # Remove everything after the first '.' and convert to lowercase
    if 'yes' in text:
        return 'yes'
    elif 'no' in text:
        return 'no'
    elif 'maybe' in text:
        return 'maybe'
    return 'unknown'  # For any text not containing yes/no/maybe

In [None]:
query_json['CLEANED_FINAL_DECISION'] = query_json['FINAL_DECISION_FROM_LONG_ANSWER'].apply(clean_decision)
query_json = query_json[query_json['CLEANED_FINAL_DECISION'] != 'unknown']  # Remove rows with unknown decision
print(query_json.shape)
query_json.head()

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
print("Accuracy: ", accuracy_score(query_json['final_decision'], query_json['CLEANED_FINAL_DECISION']))
print("Recall: ", recall_score(query_json['final_decision'], query_json['CLEANED_FINAL_DECISION'], average='weighted'))
print("Precision: ", precision_score(query_json['final_decision'], query_json['CLEANED_FINAL_DECISION'], average='weighted'))
print("F1-score: ", f1_score(query_json['final_decision'], query_json['CLEANED_FINAL_DECISION'], average='weighted'))


In [None]:
query_json[query_json['CLEANED_FINAL_DECISION'] == 'maybe']

In [None]:
query_json.to_excel('../../output/pubmedqa/Best-RAG-results.xlsx')