In [1]:
import json
import os
import matplotlib.pyplot as plt
import glob
from openai import OpenAI
import requests
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import tqdm 



In [7]:
# Read the OpenAI API key from the environment so the secret is never stored
# in the notebook; falls back to an empty string when the variable is unset.
open_api_key = os.environ.get("OPENAI_API_KEY", "")

In [8]:
def prompting(payload, timeout=60):
    """Send a chat-completion request to the OpenAI API and return the reply text.

    Args:
        payload: JSON-serialisable request body (model, messages, temperature, ...).
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        The ``content`` string of the first choice's message, or ``None`` when
        the request fails or the response does not have the expected structure.
        A message describing the failure is printed in that case.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {open_api_key}"
    }

    try:
        resp = requests.post("https://api.openai.com/v1/chat/completions",
                             headers=headers, json=payload, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTPError handler below is unreachable and API errors show up only
        # as a misleading KeyError on 'choices'.
        resp.raise_for_status()
        body = resp.json()
        return body['choices'][0]['message']['content']

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred: {err}")
    except (KeyError, IndexError, TypeError) as key_err:
        # Covers a missing key, an empty 'choices' list, or a non-dict body.
        print(f"Key error: {key_err} - the structure of the response might not be as expected.")
    except ValueError as json_err:
        # Older requests versions raise a plain ValueError for non-JSON bodies.
        print(f"Invalid JSON in response: {json_err}")

    return None


### Load data

In [9]:
# Load the test questions and their ground-truth labels; `labels` is later
# indexed with the keys of `data`. Context managers close the file handles,
# and the encoding is pinned so the result does not depend on the platform default.
with open('pubmedqa-master/data/test_set.json', encoding='utf-8') as f:
    data = json.load(f)
with open('pubmedqa-master/data/test_ground_truth.json', encoding='utf-8') as f:
    labels = json.load(f)
len(data), len(labels)

(500, 500)

In [13]:
# Ask gpt-4-0125-preview every question and record predicted vs. true class ids.
predictions, gt = {}, {}

# 'none' is the bucket for replies that contain no recognisable answer word.
class_dict = {'no': 0, 'yes': 1, 'maybe': 2, 'none': 3}

for i, q in tqdm.tqdm(data.items()):
    question = str(q['QUESTION'])
    text_prompt = "Answer question {}. Answer can only be one word and it should be either 'yes', 'no', or 'maybe'.".format(
        question)

    prompt_data = {
        "model": 'gpt-4-0125-preview',
        "temperature": 0,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_prompt},
                ],
            }
        ],
    }
    resp = prompting(prompt_data)

    # prompting() returns None when the request fails; treat that as an
    # unparseable answer instead of crashing part-way through the run.
    answer = (resp or '').lower()
    # Match whole words only: a substring test would label replies such as
    # "unknown", "not sure" or "I don't know" as 'no'.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in answer).split())

    r = 'none'
    if 'no' in words:
        r = 'no'
    elif 'yes' in words:
        r = 'yes'
    elif 'maybe' in words or 'perhaps' in words:
        r = 'maybe'

    predictions[i] = class_dict[r]
    gt[i] = class_dict[labels[i]]

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [04:54<00:00,  1.70it/s]


In [14]:
# Sanity check: one prediction and one ground-truth entry per question.
tuple(map(len, (predictions, gt)))

(500, 500)

In [15]:
# Both dicts were filled in the same loop with the same keys, so their value
# orders line up pairwise.
accuracy_score([*gt.values()], [*predictions.values()])

0.552

In [17]:
# NOTE(review): if any prediction fell into class 3 ('none'), that class is
# part of the macro average as well - confirm this is intended.
f1_score(y_true=[*gt.values()], y_pred=[*predictions.values()], average='macro')

0.3875320857544917

### GPT-4 Turbo version - gpt-4-turbo-2024-04-09

In [25]:
# Ask gpt-4-turbo-2024-04-09 every question and record predicted vs. true class ids.
predictions, gt = {}, {}

# 'none' is the bucket for replies that contain no recognisable answer word.
class_dict = {'no': 0, 'yes': 1, 'maybe': 2, 'none': 3}

for i, q in tqdm.tqdm(data.items()):
    question = str(q['QUESTION'])
    text_prompt = "Answer question {}. Answer can only be one word and it should be either 'yes', 'no', or 'maybe'.".format(
        question)

    prompt_data = {
        "model": 'gpt-4-turbo-2024-04-09',
        "temperature": 0,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_prompt},
                ],
            }
        ],
    }
    resp = prompting(prompt_data)

    # prompting() returns None when the request fails; treat that as an
    # unparseable answer instead of crashing part-way through the run.
    answer = (resp or '').lower()
    # Match whole words only: a substring test would label replies such as
    # "unknown", "not sure" or "I don't know" as 'no'.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in answer).split())

    r = 'none'
    if 'no' in words:
        r = 'no'
    elif 'yes' in words:
        r = 'yes'
    elif 'maybe' in words or 'perhaps' in words:
        r = 'maybe'

    predictions[i] = class_dict[r]
    gt[i] = class_dict[labels[i]]

# Value orders of the two dicts line up because both were filled in the same loop.
y_true, y_pred = list(gt.values()), list(predictions.values())
print(accuracy_score(y_true, y_pred))
print(f1_score(y_true, y_pred, average='macro'))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 500/500 [06:29<00:00,  1.28it/s]

0.478
0.3725259100954017





### GPT-3.5 version - gpt-3.5-turbo

In [20]:
# Ask gpt-3.5-turbo every question and record predicted vs. true class ids.
predictions_gpt, gt_gpt = {}, {}

class_dict = {'no': 0, 'yes': 1, 'maybe': 2}

for i, q in data.items():
    question = str(q['QUESTION'])
    text_prompt = "Answer question: {} . Answer can only be one word and it should be either 'yes', 'no', or 'maybe'.".format(
        question)

    prompt_data = {
        "model": 'gpt-3.5-turbo',
        "temperature": 0,
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful medical assistant."
            },
            {
                "role": "user",
                "content": text_prompt
            }
        ]
    }
    resp = prompting(prompt_data)

    # prompting() returns None when the request fails; fall through to the
    # default label instead of crashing part-way through the run.
    answer = (resp or '').lower()
    # Match whole words only: a substring test would label replies such as
    # "unknown", "not sure" or "I don't know" as 'no'.
    words = set(''.join(ch if ch.isalpha() else ' ' for ch in answer).split())

    if 'no' in words:
        r = 'no'
    elif 'yes' in words:
        r = 'yes'
    else:
        # This cell has no 'none' class: anything that is not a clear
        # yes/no (including a failed request) is counted as 'maybe'.
        r = 'maybe'

    predictions_gpt[i] = class_dict[r]
    gt_gpt[i] = class_dict[labels[i]]


In [21]:
# Sanity check: one prediction and one ground-truth entry per question.
tuple(map(len, (predictions_gpt, gt_gpt)))

(500, 500)

In [22]:
# Both dicts were filled in the same loop with the same keys, so their value
# orders line up pairwise.
accuracy_score([*gt_gpt.values()], [*predictions_gpt.values()])

0.348

In [24]:
# Macro-averaged F1 over the class ids present in the labels and predictions.
f1_score(y_true=[*gt_gpt.values()], y_pred=[*predictions_gpt.values()], average='macro')

0.2898096122180815

In [90]:
# predictions_gpt