# We assess GPT-4 on our test set, to see how it performs compared to our fine-tuned Llama-2 models

In [1]:
# Install the OpenAI client used by the cells below. python-dotenv is installed
# as well, although no visible cell imports it.
%pip install openai python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import pandas as pd
from openai import OpenAI

# SECURITY: API keys must never be committed inside a notebook. The key is read
# from the OPENAI_API_KEY environment variable; a KeyError on this line means
# the variable is not set in the kernel's environment.
# NOTE(review): the key that used to be hardcoded in this cell has been exposed
# and should be revoked.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])


In [3]:
def create_input(claim, evidence):
  """Build the chat messages (system + user) for one claim/evidence pair.

  The system message carries the task instructions followed by three worked
  examples; the user message carries the claim and evidence to classify.
  Returns a list of two ``{"role": ..., "content": ...}`` dicts.
  """
  # Instruction text with few-shot examples. It is runtime data sent to the
  # model, so it is kept exactly as originally written.
  system_prompt = """Given a ”claim” sentence and an ”evidence” sentence, determine if the evidence supports or refutes the claim, or if there is not enough information.\n
                                    Return the answer as the corresponding label ”SUPPORTS” or ”REFUTES” or ”NOT_ENOUGH_INFO”. \n
                                    Only include the label in the your response. \n
     
     Here are some examples:\n

Claim: Ice berg melts, ocean level remains the same. \n
Evidence: The melting of floating ice raises the ocean level. \n
Class: REFUTES \n

Claim: Global warming is driving polar bears toward extinction.
Evidence: Environmental impacts include the extinction or relocation of many species as their ecosystems change, most immediately the environments of coral reefs, mountains, and the Arctic. \n
Class: SUPPORTS \n

Claim: There is no statistical evidence that global warming is intensifying floods. \n
Evidence: Such events will continue to occur more often and with greater intensity. \n
Class: NOT_ENOUGH_INFO"""
  user_prompt = f"Claim: ’{claim}’ \n Evidence: ’{evidence}’"

  system_message = {"role": "system", "content": system_prompt}
  user_message = {"role": "user", "content": user_prompt}
  return [system_message, user_message]

In [4]:
from tqdm import tqdm
# Read the labelled test CSV; the columns used below are claim, evidence and label.
dataset_name = "../climate_fever/labelled-test-data.csv"
test_data_df = pd.read_csv(dataset_name)

# Gold labels, kept as a pandas Series at this point.
true_labels = test_data_df["label"]

# One chat prompt per (claim, evidence) row, in file order.
input_data = list(map(create_input, test_data_df["claim"], test_data_df["evidence"]))

# Container for the model outputs; starts empty.
predictions = []

print(len(input_data))




475


In [5]:

# Querying GPT-4 for the whole test set costs about a fiver, so the API is only
# called when explicitly requested. By default the predictions saved by a
# previous run are loaded from disk, which keeps `predictions` populated for the
# evaluation cells after "Restart Kernel -> Run All".
from pathlib import Path

PREDICTIONS_PATH = Path("predictions_gpt4_fewshot.txt")
RERUN_GPT4_INFERENCE = False  # set to True to query the (paid) API again

if RERUN_GPT4_INFERENCE:
    predictions = []
    # Write and flush each prediction as it arrives so a crash keeps everything
    # obtained so far, without rewriting the whole file on every iteration.
    with PREDICTIONS_PATH.open("w") as f:
        for messages in tqdm(input_data, desc="Predictions"):
            response = client.chat.completions.create(
                model="gpt-4-turbo-preview",
                messages=messages,
            )
            prediction = response.choices[0].message.content
            predictions.append(prediction)
            f.write(prediction + "\n")
            f.flush()
else:
    with PREDICTIONS_PATH.open("r") as f:
        predictions = [line.strip() for line in f]

# The evaluation below pairs predictions with gold labels by position.
assert len(predictions) == len(input_data), (
    f"expected {len(input_data)} predictions, found {len(predictions)}"
)

Predictions: 100%|██████████| 475/475 [05:59<00:00,  1.32it/s]


In [6]:
from sklearn.metrics import classification_report
# Per-row report: gold labels vs. the model's predicted labels.
print(classification_report(y_true=true_labels, y_pred=predictions))


                 precision    recall  f1-score   support

NOT_ENOUGH_INFO       0.81      0.76      0.79       305
        REFUTES       0.44      0.60      0.50        45
       SUPPORTS       0.66      0.66      0.66       125

       accuracy                           0.72       475
      macro avg       0.63      0.67      0.65       475
   weighted avg       0.73      0.72      0.73       475



In [1]:
def get_macro_label(micro_labels):
    """Collapse a group of evidence-level labels into a single label.

    Returns "DISPUTED" when both "REFUTES" and "SUPPORTS" are present, the one
    that is present when only one of them is, and "NOT_ENOUGH_INFO" otherwise.
    """
    has_refutes = "REFUTES" in micro_labels
    has_supports = "SUPPORTS" in micro_labels

    if has_refutes and has_supports:
        return "DISPUTED"
    if has_refutes:
        return "REFUTES"
    if has_supports:
        return "SUPPORTS"
    return "NOT_ENOUGH_INFO"

def get_macro_labels_from_micro(micro_labels, evidences_per_claim=5):
  """Aggregate a flat list of evidence-level labels into one label per group.

  The input is cut into consecutive groups of ``evidences_per_claim`` labels
  (default 5, the value that used to be hardcoded) and each group is reduced
  with ``get_macro_label``. A shorter trailing group, if any, is still
  aggregated rather than dropped.

  Args:
    micro_labels: sliceable sequence of evidence-level labels.
    evidences_per_claim: number of consecutive labels forming one group.

  Returns:
    A list with one aggregated label per group, in input order.
  """
  return [
    get_macro_label(micro_labels[start:start + evidences_per_claim])
    for start in range(0, len(micro_labels), evidences_per_claim)
  ]

In [9]:
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
import numpy as np

def evaluate(y_true, y_pred, macro=False, labels=None):
    """Print accuracy, per-label accuracy, a classification report and a confusion matrix.

    Args:
        y_true: iterable of gold labels.
        y_pred: iterable of predicted labels, aligned with ``y_true`` by position.
        macro: unused; kept so existing calls keep working.
        labels: row/column order for the confusion matrix. Defaults to the
            sorted labels that occur in ``y_true`` or ``y_pred`` (``[0, 1, 2]``
            for the integer encoding used in this notebook), so string labels
            work as well.

    Returns:
        None; everything is printed.
    """
    # Plain lists make positional pairing independent of any pandas index.
    y_true = list(y_true)
    y_pred = list(y_pred)
    if labels is None:
        labels = sorted(set(y_true) | set(y_pred))

    # Overall accuracy
    overall_accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {overall_accuracy:.3f}')

    # Accuracy restricted to the samples of each gold label.
    # Sorted so the print order is deterministic.
    for label in sorted(set(y_true)):
        pairs = [(gold, pred) for gold, pred in zip(y_true, y_pred) if gold == label]
        label_y_true = [gold for gold, _ in pairs]
        label_y_pred = [pred for _, pred in pairs]
        label_accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {label_accuracy:.3f}')

    # Classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix (rows: gold labels, columns: predictions, in `labels` order)
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [8]:
# Aggregate evidence-level labels into one label per group and compare them.
# list(...) accepts both a pandas Series and a plain list, so this cell still
# runs after a later cell has rebound `true_labels` to a list.
gold_macro_labels = get_macro_labels_from_micro(list(true_labels))
predicted_macro_labels = get_macro_labels_from_micro(list(predictions))

print(classification_report(gold_macro_labels, predicted_macro_labels))


                 precision    recall  f1-score   support

       DISPUTED       0.27      0.38      0.32         8
NOT_ENOUGH_INFO       0.65      0.65      0.65        26
        REFUTES       0.60      0.80      0.69        15
       SUPPORTS       0.87      0.72      0.79        46

       accuracy                           0.68        95
      macro avg       0.60      0.64      0.61        95
   weighted avg       0.72      0.68      0.69        95



In [10]:
import pandas as pd
# Reload gold labels and the saved GPT-4 predictions, encode both as integers
# and print the full evaluation.
dataset_name = "../climate_fever/labelled-test-data.csv"
test_data_df = pd.read_csv(dataset_name)
true_labels = test_data_df["label"].tolist()

prediction_txt_file = "predictions_gpt4_fewshot.txt"
with open(prediction_txt_file, "r") as f:
    predictions = [line.strip() for line in f]

# Predictions are matched to gold labels by position.
if len(predictions) != len(true_labels):
    raise ValueError(
        f"{len(predictions)} predictions but {len(true_labels)} gold labels"
    )

# Explicit encoding. Anything outside this mapping is an error instead of being
# silently counted as SUPPORTS (2).
label_to_id = {"NOT_ENOUGH_INFO": 0, "REFUTES": 1, "SUPPORTS": 2}
unknown_labels = {
    label for label in [*true_labels, *predictions] if label not in label_to_id
}
if unknown_labels:
    raise ValueError(f"Unexpected labels: {sorted(map(repr, unknown_labels))}")

predictions_encoded = [label_to_id[label] for label in predictions]
true_labels_encoded = [label_to_id[label] for label in true_labels]

evaluate(true_labels_encoded, predictions_encoded, macro=False)


Accuracy: 0.720
Accuracy for label 0: 0.764
Accuracy for label 1: 0.600
Accuracy for label 2: 0.656

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.76      0.79       305
           1       0.44      0.60      0.50        45
           2       0.66      0.66      0.66       125

    accuracy                           0.72       475
   macro avg       0.63      0.67      0.65       475
weighted avg       0.73      0.72      0.73       475


Confusion Matrix:
[[233  34  38]
 [ 13  27   5]
 [ 42   1  82]]
