In [1]:
import json
import os
import matplotlib.pyplot as plt
import glob
from openai import OpenAI
import requests
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
import tqdm 



In [2]:
open_api_key = ""

In [3]:
def prompting(payload):
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {open_api_key}"
    }

    try:
        resp = requests.post("https://api.openai.com/v1/chat/completions", 
                             headers=headers, json=payload)
        data = resp.json()
        resp_dict = data['choices'][0]['message']['content']
        return resp_dict
    
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.RequestException as err:
        print(f"An error occurred: {err}")
    except KeyError as key_err:
        print(f"Key error: {key_err} - the structure of the response might not be as expected.")
    
    return None


### Load data

In [4]:
data = json.load(open('pubmedqa-master/data/test_set.json'))
labels = json.load(open('pubmedqa-master/data/test_ground_truth.json'))
len(data), len(labels)

(500, 500)

In [133]:
predictions, gt = {}, {}

class_dict = { 'no': 0, 'yes': 1, 'maybe': 2, 'none':3}

for i, q in tqdm.tqdm(data.items()):
    question = str(q['QUESTION'])
    context = str(q['CONTEXTS'])  
    inp = {'question': question, 'context': context}
    text_prompt = "Answer question given context: {} . Answer can only be one word and it should be either 'yes', 'no', or 'maybe'.".format(
        question + " " + context )

    prompt_data = {
        "model": 'gpt-4-vision-preview', 
        "temperature": 0,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": text_prompt},
                ],
            }
        ],
        "max_tokens": f"{30}",
    }
    resp = prompting(prompt_data)

    r = 'none'
    if 'no' in resp.lower():
        r = 'no'
    elif 'yes' in resp.lower():
        r = 'yes'
    elif 'maybe' in resp.lower() or 'perhaps' in resp.lower():
        r = 'maybe'

    predictions.update({i: class_dict[r]})
    gt.update({i: class_dict[labels[i]]})

100%|██████████████████████████████████████████████████████████████████████| 500/500 [11:16<00:00,  1.35s/it]


In [134]:
len(predictions), len(gt)

(500, 500)

In [135]:
accuracy_score(list(gt.values()), list(predictions.values() ))

0.748

In [136]:
f1_score(list(gt.values()), list(predictions.values()), average='macro')

0.5837309982081961

### GPT3.5

In [85]:
predictions_gpt, gt_gpt = {}, {}

class_dict = { 'no': 0, 'yes': 1, 'maybe': 2}

for i, q in data.items():

    question = str(q['QUESTION'])
    context = str(q['CONTEXTS'])    
    text_prompt = "Answer question given context: {} . Answer can only be one word and it should be either 'yes', 'no', or 'maybe'.".format(
        question + ' ' + context)

    prompt_data = {
        "model": 'gpt-3.5-turbo', 
        "temperature": 0,
        "messages": [
            {
        "role": "system",
        "content": "You are a helpful medical assistant."
      },
            {
                "role": "user",
                "content": text_prompt
            }
        ]
    }
    resp = prompt(prompt_data)

    r = ''
    if 'no' in resp.lower():
        r = 'no'
    elif 'yes' in resp.lower():
        r = 'yes'
    else:
        r = 'maybe'

    predictions_gpt.update({i: class_dict[r]})
    gt_gpt.update({i: class_dict[labels[i]]})


In [86]:
len(predictions_gpt), len(gt_gpt)

(500, 500)

In [87]:
accuracy_score(list(gt_gpt.values()), list(predictions_gpt.values() ))

0.686

In [88]:
f1_score(list(gt_gpt.values()), list(predictions_gpt.values()), average='macro')

0.5468129486658541

In [90]:
# predictions_gpt