In [1]:
from datasets import load_dataset
from openai import OpenAI
from dotenv import load_dotenv
from joblib import Memory
import os
import json
import pandas as pd


  from pandas.core import (


## Load datasets from Hugging Face

In [2]:
# Fetch the two BiomixQA configurations used below: multiple-choice and true/false.
mcq_data = load_dataset(path="kg-rag/BiomixQA", name="mcq")
tf_data = load_dataset(path="kg-rag/BiomixQA", name="true_false")


## MCQ data first sample

In [3]:
# Show the first record of the MCQ "train" split to inspect its fields.
print(mcq_data["train"][0])


{'text': 'Out of the given list, which Gene is associated with head and neck cancer and uveal melanoma. Given list is:  ABO, CACNA2D1,  PSCA, TERT,  SULT1B1', 'option_A': 'ABO', 'option_B': 'CACNA2D1', 'option_C': 'PSCA', 'option_D': 'TERT', 'option_E': 'SULT1B1', 'correct_answer': 'CACNA2D1'}


## True/False data first sample

In [4]:
# Show the first record of the True/False "train" split to inspect its fields.
print(tf_data["train"][0])


{'text': 'enhanced S-cone syndrome is not a vitreoretinal degeneration', 'label': False}


## Configure OpenAI client

In [5]:
# Read environment variables from the dotfile in the user's home directory,
# then build the OpenAI client from the API_KEY variable (None when it is unset).
load_dotenv(os.path.expanduser(os.path.join('~', '.gpt_biomixQA.env')))

client = OpenAI(api_key=os.getenv('API_KEY'))


## Design system prompts for the MCQ and True/False datasets

In [6]:
# System prompt for multiple-choice questions: requests a JSON object with a
# single "answer" key holding the chosen answer.
MCQ_QUESTION_SYSTEM_PROMPT = '''
    You are an expert biomedical researcher. 
    Please provide your answer in the following JSON format for the Question asked:
    {"answer": <correct answer>}
'''

# System prompt for true/false statements: requests a JSON object whose
# "answer" value is the string "True" or "False".
TRUE_FALSE_QUESTION_SYSTEM_PROMPT = '''
    You are an expert biomedical researcher. 
    Please provide your answer in the following JSON format for the Question asked:
    {"answer": "True"}
    OR
    {"answer": "False"}
'''

## Selecting a GPT model

In [7]:
# Model identifier passed to call_GPT for every request in the evaluation cells.
CHAT_MODEL = "gpt-4o"

# Sampling temperature passed to call_GPT for every request.
# NOTE(review): non-zero, so fresh (uncached) replies are presumably not
# reproducible run to run — confirm this is intended for the reported accuracy.
TEMPERATURE = 0.3


## Setting up an on-disk cache for GPT calls

In [8]:
# joblib memoization store located at "cachegpt" (verbose=0 keeps it quiet);
# used as the @memory.cache decorator on call_GPT.
memory = Memory("cachegpt", verbose=0)

## Custom function to call GPT model

In [9]:
@memory.cache
def call_GPT(instruction, system_prompt, chat_model_id, temperature):
    """Send one system + user chat exchange to the model and return the reply text.

    The function is wrapped by ``memory.cache``, so the result is memoized on
    the four arguments below. The module-level ``client`` is not an argument
    and therefore is not part of the cache key.

    Args:
        instruction: Text sent as the ``user`` message.
        system_prompt: Text sent as the ``system`` message.
        chat_model_id: Model name passed as ``model``.
        temperature: Sampling temperature passed to the API.

    Returns:
        The text content of the first choice, or the string
        ``'Unexpected response'`` when the response has no choices or the
        first choice carries no text content.
    """
    response = client.chat.completions.create(
        temperature=temperature,
        model=chat_model_id,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": instruction},
        ],
    )
    if not response.choices:
        return 'Unexpected response'

    content = response.choices[0].message.content
    # The message content is optional in the API schema. Returning None would
    # make a caller's json.loads() raise TypeError instead of JSONDecodeError,
    # so map it to the same sentinel used for an empty choices list.
    if content is None:
        return 'Unexpected response'
    return content
        

## Evaluating GPT model on Biomix MCQ data

In [12]:

# Query the model for every MCQ item and collect (question, gold answer, model answer).
mcq_predictions = []
mcq_unparsed = 0  # replies that were not a JSON object containing an "answer" key
for item in mcq_data["train"]:
    raw_reply = call_GPT(item["text"], MCQ_QUESTION_SYSTEM_PROMPT, CHAT_MODEL, TEMPERATURE)
    try:
        answer = json.loads(raw_reply)["answer"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed JSON, a JSON object without "answer", or a non-object JSON
        # value (list/string/number): skip the item instead of aborting the run.
        mcq_unparsed += 1
        continue
    mcq_predictions.append((item["text"], item["correct_answer"], answer))

mcq_prediction_df = pd.DataFrame(mcq_predictions, columns=["text", "correct_answer", "prediction"])
mcq_prediction_df_correct = mcq_prediction_df[mcq_prediction_df["correct_answer"] == mcq_prediction_df["prediction"]]
# The denominator is the full split, so skipped replies count as incorrect.
mcq_accuracy = 100*mcq_prediction_df_correct.shape[0]/len(mcq_data["train"])
if mcq_unparsed:
    print(f"Skipped {mcq_unparsed} unparseable MCQ responses (counted as incorrect)")
print(f"Performance accuracy of {CHAT_MODEL} on Biomix MCQ data is {round(mcq_accuracy, 2)}%")


Performance accuracy of gpt-4o on Biomix MCQ data is 68.3%


## Evaluating GPT model on Biomix True/False data

In [13]:

# Query the model for every True/False item and collect (statement, gold label, model answer).
tf_predictions = []
tf_unparsed = 0  # replies that were not a JSON object containing an "answer" key
for item in tf_data["train"]:
    raw_reply = call_GPT(item["text"], TRUE_FALSE_QUESTION_SYSTEM_PROMPT, CHAT_MODEL, TEMPERATURE)
    try:
        answer = json.loads(raw_reply)["answer"]
    except (json.JSONDecodeError, KeyError, TypeError):
        # Malformed JSON, a JSON object without "answer", or a non-object JSON
        # value (list/string/number): skip the item instead of aborting the run.
        tf_unparsed += 1
        continue
    tf_predictions.append((item["text"], item["label"], answer))

# Gold labels are booleans while the prompt asks for "True"/"False" strings,
# so both columns are cast to str before they are compared.
tf_predictions_df = pd.DataFrame(
    tf_predictions, columns=["text", "correct_answer", "prediction"]
).astype({"correct_answer": str, "prediction": str})
tf_predictions_df_correct = tf_predictions_df[tf_predictions_df["correct_answer"] == tf_predictions_df["prediction"]]
# The denominator is the full split, so skipped replies count as incorrect.
tf_accuracy = 100*tf_predictions_df_correct.shape[0]/len(tf_data["train"])
if tf_unparsed:
    print(f"Skipped {tf_unparsed} unparseable True/False responses (counted as incorrect)")
print(f"Performance accuracy of {CHAT_MODEL} on Biomix True/False data is {round(tf_accuracy, 2)}%")


Performance accuracy of gpt-4o on Biomix True/False data is 89.39%
