In [2]:
import os

import wandb
from solvers import InContextGPT
from structs import DataSet

In [3]:
# Ask wandb to stay quiet in the notebook; set before the run is created below.
os.environ.update({"WANDB_SILENT": "true"})

# Values recorded with the run: the solver's name and the fine-tuned model id
# for each puzzle type (keys "model_sp" and "model_wp").
run_config = dict(
    solver="InContextGPT",
    model_sp="ft:gpt-3.5-turbo-0613:ncodex::8QhGOWvR",
    model_wp="ft:gpt-3.5-turbo-0613:ncodex::8R1R0Vi0",
)

wandb.init(project="brainteasers", config=run_config)

# Sentence Puzzle

In [4]:
# Relative path to the stored Sentence Puzzle evaluation set.
sp_eval_path = "../data/SP-eval.pkl"
sp_eval = DataSet.from_file(sp_eval_path)

In [5]:
# Read the model id from the run config passed to wandb.init() instead of
# repeating the literal here, so the model that is evaluated is always the one
# recorded with the run.
sp_solver = InContextGPT(model_name=wandb.config["model_sp"], context=InContextGPT.Context.SENTENCE)

# Presumably one answer per evaluation instance, in dataset order; verify against InContextGPT.solve.
sp_answers = sp_solver.solve(sp_eval)
# Pair each instance with its answer and record whether the chosen option is correct.
sp_are_answers_correct = [instance.is_choice_correct(answer) for instance, answer in zip(sp_eval, sp_answers)]
# Fraction of correct answers (assumes is_choice_correct returns a bool, so sum() counts the hits).
sp_accuracy = sum(sp_are_answers_correct) / len(sp_are_answers_correct)

print(f"Accuracy on the Sentence Puzzle dataset: {sp_accuracy: .4f}")

  0%|          | 0/119 [00:00<?, ?it/s]

Accuracy on the Sentence Puzzle dataset:  0.8992


# Word Puzzle

In [6]:
# Relative path to the stored Word Puzzle evaluation set.
wp_eval_path = "../data/WP-eval.pkl"
wp_eval = DataSet.from_file(wp_eval_path)

In [7]:
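# NOTE(review): disabled fine-tuning steps. `FineTunedGPT` and `wp_train` are not
# imported or defined anywhere in the visible notebook, so these lines cannot be
# re-enabled as-is. The evaluation below uses an already fine-tuned model id.
# wp_solver = FineTunedGPT()
# wp_solver.fit(wp_train)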

In [8]:
# Read the model id from the run config passed to wandb.init() instead of
# repeating the literal here, so the model that is evaluated is always the one
# recorded with the run.
wp_solver = InContextGPT(model_name=wandb.config["model_wp"], context=InContextGPT.Context.WORD)

# Presumably one answer per evaluation instance, in dataset order; verify against InContextGPT.solve.
wp_answers = wp_solver.solve(wp_eval)
# Pair each instance with its answer and record whether the chosen option is correct.
wp_are_answers_correct = [instance.is_choice_correct(answer) for instance, answer in zip(wp_eval, wp_answers)]
# Fraction of correct answers (assumes is_choice_correct returns a bool, so sum() counts the hits).
wp_accuracy = sum(wp_are_answers_correct) / len(wp_are_answers_correct)

print(f"Accuracy on the Word Puzzle dataset: {wp_accuracy: .4f}")

  0%|          | 0/120 [00:00<?, ?it/s]

Accuracy on the Word Puzzle dataset:  0.6333


In [9]:
# Overall accuracy is the per-dataset accuracies averaged with weights
# proportional to how many answers each dataset contributed.
sp_count = len(sp_answers)
wp_count = len(wp_answers)
total_cardinality = sp_count + wp_count

sp_weight = sp_count / total_cardinality
wp_weight = wp_count / total_cardinality
total_accuracy = sp_accuracy * sp_weight + wp_accuracy * wp_weight

In [10]:
# Metrics sent to the run: the weighted overall accuracy plus each dataset's own score.
accuracy_metrics = {}
accuracy_metrics["accuracy/overall"] = total_accuracy
accuracy_metrics["accuracy/sp"] = sp_accuracy
accuracy_metrics["accuracy/wp"] = wp_accuracy
wandb.log(accuracy_metrics)

# End the run; quiet=True asks wandb to keep its closing output minimal.
wandb.finish(quiet=True)