# GPT-4 connection and prompting via the OpenAI API

In [1]:
import base64
import json
import requests
import pandas as pd
import os
from dotenv import load_dotenv
# Load environment variables through python-dotenv before reading the key
load_dotenv()

# OpenAI API key taken from the environment (None when OPENAI_API_KEY is not set)
openai_api_key = os.environ.get('OPENAI_API_KEY')

<h2> Focusing on Image-Question-Answering first using the TQA dataset </h2>

In [2]:
# Load dataset containing images and associated questions.
#
# The prediction loop in this notebook checkpoints its progress to GPT_4_preds.csv
# after every answer. When that file exists we resume from it, so that re-running
# this cell (e.g. after a kernel restart) does not discard answers that were already
# retrieved and cause the next loop run to overwrite the checkpoint.
# Delete GPT_4_preds.csv to start a fresh run.
if os.path.exists("GPT_4_preds.csv"):
    image_qa_df = pd.read_csv("GPT_4_preds.csv")
else:
    # "Unnamed: 0" is presumably the index column left by an earlier to_csv call;
    # errors="ignore" keeps the cell runnable if the file does not contain it.
    image_qa_df = pd.read_csv("../Dataset/test/DiagramQuestionsData.csv").drop(
        columns=["Unnamed: 0"], errors="ignore"
    )

if "gpt4_generated_answers" not in image_qa_df.columns:
    image_qa_df["gpt4_generated_answers"] = None

# Keep the answer column as object dtype so string answers can be written into it
# even when every value loaded from the checkpoint is still missing.
image_qa_df["gpt4_generated_answers"] = image_qa_df["gpt4_generated_answers"].astype(object)
image_qa_df.head()

Unnamed: 0,lesson_name,question_name,answer_choice_1,answer_choice_2,answer_choice_3,answer_choice_4,correct_answer,image_path,image_has_labels_to_guess,caption,gpt4_generated_answers
0,climate and its causes,Which label refers to rains?,V,T,U,E,E,../Dataset/test/abc_question_images/rain_shado...,Yes,a diagram of the water cycle,
1,climate and its causes,How does water from the clouds reach the land ...,ICE FROM THE MOUNTAIN PEAK,AS RAIN,WIND,GRASS,AS RAIN,../Dataset/test/abc_question_images/rain_shado...,Yes,a diagram of the water cycle,
2,climate and its causes,What letter represents the condensation process?,W,J,H,T,T,../Dataset/test/abc_question_images/rain_shado...,Yes,a diagram of the water cycle,
3,climate and its causes,Where can you find moist air?,W,A,H,J,J,../Dataset/test/abc_question_images/rain_shado...,Yes,a diagram of the water cycle,
4,climate and its causes,Where is condensation?,T,H,A,W,T,../Dataset/test/abc_question_images/rain_shado...,Yes,a diagram of the water cycle,


In [2]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 text string.

    The file is read in binary mode, base64-encoded, and the encoded bytes are
    decoded as UTF-8 so the result is a ``str``.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

# Prompt GPT for VQA task
def prompt_gpt(text, base64_image, model="gpt-4-vision-preview", max_tokens=300, timeout=60):
    """Send a text prompt together with one image to the chat-completions endpoint.

    Args:
        text: Prompt text placed in the user message.
        base64_image: Base64-encoded image bytes as a string.
        model: Model name sent in the payload.
        max_tokens: Value of the ``max_tokens`` field sent in the payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response as a dict. If the request itself fails
        (timeout, connection problem, body that is not JSON), a dict of the form
        ``{"error": {...}}`` is returned instead of raising, so callers that test
        ``"error" in result`` treat transport failures like API errors.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}"
    }

    # The data URL should declare the real image format. Recognise the format from
    # the base64 form of the file signature; fall back to JPEG when it is unknown.
    signatures = (
        ("iVBORw0KGgo", "image/png"),
        ("/9j/", "image/jpeg"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    )
    mime_type = next(
        (mime for prefix, mime in signatures if base64_image.startswith(prefix)),
        "image/jpeg",
    )

    payload = {
        "model": model,
        "messages": [
          {
            "role": "user",
            "content": [
              {
                "type": "text",
                "text": text
              },
              {
                "type": "image_url",
                "image_url": {
                  "url": f"data:{mime_type};base64,{base64_image}"
                }
              }
            ]
          }
        ],
        "max_tokens": max_tokens
    }

    try:
        # Without a timeout a stalled connection would block the calling loop forever.
        response = requests.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older versions of requests.
        return {
            "error": {
                "message": str(exc),
                "type": "request_failed",
                "param": None,
                "code": None,
            }
        }

In [4]:
# Retrieve results from GPT-4 by looping through all questions.
# Rows that already hold an answer are skipped, so this cell can be re-run to resume
# after it stops on an API error.
# NOTE: idx from enumerate is used as a .loc label, which assumes the frame has the
# default 0..n-1 index produced by read_csv.
for idx, question in enumerate(image_qa_df["question_name"]):
    if pd.isna(image_qa_df.loc[idx, "gpt4_generated_answers"]):
        # Build answer choice string (one choice per line)
        answer_choice_1 = image_qa_df.loc[idx, "answer_choice_1"]
        answer_choice_2 = image_qa_df.loc[idx, "answer_choice_2"]
        answer_choice_3 = image_qa_df.loc[idx, "answer_choice_3"]
        answer_choice_4 = image_qa_df.loc[idx, "answer_choice_4"]
        answer_string = f"{answer_choice_1}\n{answer_choice_2}\n{answer_choice_3}\n{answer_choice_4}"

        # Get base64 string version of the image
        base64_image = encode_image(image_qa_df.loc[idx, "image_path"])

        # Build prompt
        prompt = f"Choose only one option below as the answer for the following question. An explanation is not needed.\nQuestion: {question}\n\n{answer_string}"
        gpt_json_result = prompt_gpt(prompt, base64_image)

        if "error" in gpt_json_result:
            print(gpt_json_result)
            # Rate limits can be reached, hence this cell may have to be run several
            # times to answer every question. Other failures (bad API key, invalid
            # request, ...) are reported as such, because waiting and re-running
            # does not fix them.
            error_details = gpt_json_result["error"]
            error_code = error_details.get("code") if isinstance(error_details, dict) else None
            if error_code == "rate_limit_exceeded":
                print("Rate limit exceed. Exiting loop...")
            else:
                print("API request failed for a reason other than the rate limit. Exiting loop...")
            break
        image_qa_df.loc[idx, "gpt4_generated_answers"] = gpt_json_result["choices"][0]["message"]["content"]
        # Checkpoint after every answer so progress survives an interruption
        image_qa_df.to_csv("GPT_4_preds.csv", index=False)

../Dataset/test/question_images/food_chains_webs_6059.png
{'error': {'message': 'Rate limit reached for gpt-4-vision-preview in organization org-lKFR4bNO94Tw5RDncWUcSIje on requests per day (RPD): Limit 100, Used 100, Requested 1. Please try again in 14m24s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}
Rate limit exceed. Exiting loop...


In [8]:
# Score the saved GPT-4 predictions: a prediction counts as correct when it equals
# the expected answer after lower-casing both sides.
image_qa_df = pd.read_csv("GPT_4_preds.csv")
expected_lower = image_qa_df["correct_answer"].str.lower()
predicted_lower = image_qa_df["gpt4_generated_answers"].str.lower()
correct_preds = image_qa_df[expected_lower == predicted_lower]
n_correct, n_total = len(correct_preds), len(image_qa_df)
print(f"Number of correct predictions: {n_correct} out of a total of {n_total} questions.")
print(f"Accuracy: {n_correct/n_total * 100}")

Number of correct predictions: 2520 out of a total of 3285 questions.
Accuracy: 76.71232876712328


<h2> Focusing on True/False questions: </h2>

In [9]:
# Function created using ChatGPT. Prompt used was: Write a python function for sending post requests to GPT-4's 
# completions API so that we can ask questions to it.
def create_completion(prompt, model="gpt-4", max_tokens=150, temperature=0.5, timeout=60):
    """Send a text-only prompt to the chat-completions endpoint.

    Args:
        prompt: Text placed in the user message.
        model: Model name sent in the payload.
        max_tokens: Value of the ``max_tokens`` field sent in the payload.
        temperature: Value of the ``temperature`` field sent in the payload.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The decoded JSON response as a dict. If the request itself fails
        (timeout, connection problem, body that is not JSON), a dict of the form
        ``{"error": {...}}`` is returned instead of raising, so callers that test
        ``"error" in result`` treat transport failures like API errors.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {openai_api_key}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": 1
    }
    try:
        # json= lets requests serialise the payload, matching prompt_gpt.
        # Without a timeout a stalled connection would block the calling loop forever.
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older versions of requests.
        return {
            "error": {
                "message": str(exc),
                "type": "request_failed",
                "param": None,
                "code": None,
            }
        }

In [43]:
# Load the non-diagram True/False questions and attach an empty (all-None) column
# that the prediction loop fills in with GPT-4's answers.
true_false_df = (
    pd.read_csv("../Dataset/test/NonDiagram_True_False_QuestionsData.csv")
    .assign(gpt4_generated_answers=None)
)
true_false_df.head()

Unnamed: 0.1,Unnamed: 0,lesson_name,question_name,answer_choice_1,answer_choice_2,correct_answer,gpt_generated_answers
0,0,the nature of science,The scientific method is used to answer any qu...,True,False,False,
1,1,the nature of science,Scientific models are an organized step-by-ste...,True,False,False,
2,2,the nature of science,The dependent variable in an experiment is dir...,True,False,True,
3,3,the nature of science,"Even if there is information we dont know, a m...",True,False,True,
4,4,the nature of science,A theory will still remain even if conflicting...,True,False,False,


In [82]:
# Retrieve results from GPT-4 by looping through all questions.
# Rows that already hold an answer are skipped, so this cell can be re-run to resume
# after it stops on an API error.
# NOTE: idx from enumerate is used as a .loc label, which assumes the frame has the
# default 0..n-1 index produced by read_csv.
for idx, question in enumerate(true_false_df["question_name"]):
    if pd.isna(true_false_df.loc[idx, "gpt4_generated_answers"]):
        prompt = f"Is this statement True or False: \"{question}\". Only tell me if its True or False. An explanation is not required."
        gpt_json_result = create_completion(prompt)
        if "error" in gpt_json_result:
            print(gpt_json_result)
            # Rate limits can be reached, hence this cell may have to be run several
            # times to answer every true/false question. Other failures (bad API key,
            # invalid request, ...) are reported as such, because waiting and
            # re-running does not fix them.
            error_details = gpt_json_result["error"]
            error_code = error_details.get("code") if isinstance(error_details, dict) else None
            if error_code == "rate_limit_exceeded":
                print("Rate limit exceed. Exiting loop...")
            else:
                print("API request failed for a reason other than the rate limit. Exiting loop...")
            break
        true_false_df.loc[idx, "gpt4_generated_answers"] = gpt_json_result["choices"][0]["message"]["content"]

In [111]:
# Persist the answered True/False frame without its index. This is the same path the
# questions were loaded from, so the source CSV is overwritten.
true_false_df.to_csv(path_or_buf="../Dataset/test/NonDiagram_True_False_QuestionsData.csv", index=False)

In [112]:
# Accuracy on the True/False questions: a prediction is correct only when GPT-4's
# answer text is exactly the string form of the expected value (case-sensitive).
answer_pairs = zip(true_false_df["correct_answer"], true_false_df["gpt4_generated_answers"])
num_correct_guesses = sum(1 for expected, actual in answer_pairs if actual == str(expected))

print(f"Accuracy = {(num_correct_guesses / len(true_false_df)) * 100:.2f}%")

Accuracy = 88.71%


<h2> Now we focus on non-diagram multiple-choice questions: </h2>

In [10]:
# Load the non-diagram multiple-choice questions.
non_diagram_qa_df = pd.read_csv("../Dataset/test/NonDiagram_MCQ_QuestionsData.csv")
# errors="ignore": this notebook later writes the frame back to the same CSV with
# index=False, after which the file no longer has an "Unnamed: 0" column. A plain
# drop would then raise KeyError when the notebook is run again.
non_diagram_qa_df = non_diagram_qa_df.drop(columns=["Unnamed: 0"], errors="ignore")
# Empty (all-None) column that the prediction loop fills in with GPT-4's answers
non_diagram_qa_df["gpt4_generated_answers"] = None
non_diagram_qa_df.head()

Unnamed: 0,lesson_name,question_name,answer_choice_1,answer_choice_2,answer_choice_3,answer_choice_4,correct_answer,gpt4_generated_answers
0,the nature of science,Steps of the scientific method include all of ...,doing background research.,constructing a hypothesis.,asking a question.,proving a theory.,proving a theory.,
1,the nature of science,Why do scientists call the Big Bang a theory?,It is probably unlikely and therefore not a fact.,A very well respected scientist proved it to b...,Many scientists have agreed upon this explanat...,All possible answers to a scientific idea are ...,Many scientists have agreed upon this explanat...,
2,the nature of science,The data collected in an experiment should alw...,labeled.,recorded.,reported.,all of the above,all of the above,
3,the nature of science,Which of the following is not a scientific model?,A cross section of an apple that mimics the la...,A chart with nutritional information about foo...,A computer simulation that can show what will ...,An explanation for the extinction of the dinos...,A chart with nutritional information about foo...,
4,the nature of science,If the results of an experiment disprove a hyp...,results should not be reported.,hypothesis is just a theory.,data must contain errors.,none of the above,none of the above,


In [19]:
# Retrieve results from GPT-4 by looping through all questions.
# Rows that already hold an answer are skipped, so this cell can be re-run to resume
# after it stops on an API error.
# NOTE: idx from enumerate is used as a .loc label, which assumes the frame has the
# default 0..n-1 index produced by read_csv.
for idx, question in enumerate(non_diagram_qa_df["question_name"]):
    if pd.isna(non_diagram_qa_df.loc[idx, "gpt4_generated_answers"]):
        # Build answer choice string (one choice per line)
        answer_choice_1 = non_diagram_qa_df.loc[idx, "answer_choice_1"]
        answer_choice_2 = non_diagram_qa_df.loc[idx, "answer_choice_2"]
        answer_choice_3 = non_diagram_qa_df.loc[idx, "answer_choice_3"]
        answer_choice_4 = non_diagram_qa_df.loc[idx, "answer_choice_4"]
        answer_string = f"{answer_choice_1}\n{answer_choice_2}\n{answer_choice_3}\n{answer_choice_4}"

        # Build prompt
        prompt = f"Choose only one option below as the answer for the following question. An explanation is not needed.\nQuestion: {question}\n\n{answer_string}"
        gpt_json_result = create_completion(prompt)
        if "error" in gpt_json_result:
            print(gpt_json_result)
            # Rate limits can be reached, hence this cell may have to be run several
            # times to answer every MCQ question. Other failures (bad API key,
            # invalid request, ...) are reported as such, because waiting and
            # re-running does not fix them.
            error_details = gpt_json_result["error"]
            error_code = error_details.get("code") if isinstance(error_details, dict) else None
            if error_code == "rate_limit_exceeded":
                print("Rate limit exceed. Exiting loop...")
            else:
                print("API request failed for a reason other than the rate limit. Exiting loop...")
            break
        non_diagram_qa_df.loc[idx, "gpt4_generated_answers"] = gpt_json_result["choices"][0]["message"]["content"]

In [24]:
# Persist the answered multiple-choice frame without its index. This is the same path
# the questions were loaded from, so the source CSV is overwritten.
non_diagram_qa_df.to_csv(path_or_buf="../Dataset/test/NonDiagram_MCQ_QuestionsData.csv", index=False)

In [23]:
# Accuracy on the non-diagram multiple-choice questions: GPT-4's answer text must be
# exactly equal to the expected answer to count as correct.
answer_pairs = zip(non_diagram_qa_df["correct_answer"], non_diagram_qa_df["gpt4_generated_answers"])
num_correct_guesses = sum(1 for expected, actual in answer_pairs if actual == expected)

print(f"Accuracy = {(num_correct_guesses / len(non_diagram_qa_df)) * 100:.2f}%")

Accuracy = 83.98%
