# Validate GPT-4 Question Classifier

In [1]:
import os

import openai
from sklearn.metrics import precision_recall_fscore_support, classification_report
import pandas as pd

In [2]:
from info_loss import gpt4_classify_questions

In [3]:
import dotenv

# Load variables from a local .env file (if one exists) into the process
# environment so the API key never has to be written into the notebook.
dotenv.load_dotenv()
# Indexing os.environ raises KeyError right here if OPENAI_API_KEY is not set.
openai.api_key = os.environ["OPENAI_API_KEY"]

## Load Data

A manually labeled validation set of 50 questions; the `label` column holds the gold question type.

In [4]:
# Manually labeled questions used as ground truth for the classifier.
questions_csv = "../data/raw/info-loss-question-types-50.csv"
df_qs = pd.read_csv(questions_csv)
df_qs.head(5)

Unnamed: 0,question_id,question,answer,label
0,2774638-keziah-concept-4,What questionnaire was used for this study?,The 46-item calcium-focused food frequency que...,procedural
1,5442667-kathryn-omission-2,How reliable are the results about improvement...,There was a meaningful imbalance in the clinic...,extent
2,2699714-kathryn-concept-6,How does etanercept help children with newly d...,The study suggests that etanercept helps with ...,consequence
3,4069047-keziah-concept-1,What kind of melatonin is being analyzed in th...,"This study looks at endogenous melatonin, mean...",concept
4,4555141-keziah-concept-5,What tests did girls generally score higher on...,The girls displayed higher scores on the Wechs...,comparison


## Run classifier

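Run the classifier 5 times on the same questions, one cache directory per run; the scores are aggregated over the 5 runs in the next cell.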

In [5]:
# Classify the same questions five times, in order run1..run5. Each run is
# pointed at its own cache directory.
# NOTE(review): presumably predict_batched reuses whatever is already stored in
# cache_dir when the cell is re-executed — confirm in
# info_loss.gpt4_classify_questions.
y_preds = [
    gpt4_classify_questions.predict_batched(
        df_qs["question"],
        batch_size=32,
        cache_dir=f"../output/question-clf-validation/run{run_idx}/",
    )
    for run_idx in range(1, 6)
]

# Keep the per-run names that the scoring and report cells reference.
y_pred1, y_pred2, y_pred3, y_pred4, y_pred5 = y_preds

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Gold labels from the validation set.
y_true = df_qs["label"]

# One row of macro-averaged (precision, recall, f1, support) per run.
run_predictions = [y_pred1, y_pred2, y_pred3, y_pred4, y_pred5]
df_scores = pd.DataFrame(
    [
        precision_recall_fscore_support(y_true, y_pred, average="macro")
        for y_pred in run_predictions
    ],
    columns=["precision", "recall", "f1", "support"],
).drop(columns="support")  # support is None whenever an average is requested

print("5 runs:")
display(df_scores.round(3))

print("Average over 5 runs:")
display(df_scores.mean().round(2))

5 runs:


Unnamed: 0,precision,recall,f1
0,0.845,0.871,0.846
1,0.885,0.871,0.871
2,0.898,0.891,0.887
3,0.898,0.891,0.887
4,0.885,0.871,0.871


Average over 5 runs:


precision    0.88
recall       0.88
f1           0.87
dtype: float64

In [8]:
# Per-class breakdown for the fifth run only (y_pred5).
run5_report = classification_report(y_true, y_pred5)
print(run5_report)

              precision    recall  f1-score   support

       cause       1.00      1.00      1.00         2
  comparison       1.00      0.75      0.86         4
     concept       0.70      1.00      0.82         7
 consequence       0.71      0.71      0.71         7
     example       1.00      1.00      1.00         1
      extent       0.90      0.75      0.82        12
  procedural       0.88      0.88      0.88        17

    accuracy                           0.84        50
   macro avg       0.89      0.87      0.87        50
weighted avg       0.85      0.84      0.84        50

