In [1]:
import fnmatch
import getpass
import json
import os
import pathlib
from warnings import warn

import datasets
import openai
import torch
import transformers
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser

from eval.args import RunnerArguments, HFArguments, OAIArguments, GenerationArguments
from eval.evaluator import HFEvaluator, OAIEvaluator
from eval.tasks import ALL_TASKS

# Keep cell output readable: limit both libraries' logging to the ERROR level.
transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()


  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# A notebook kernel defines no __file__, so point it at runner.py inside the
# current working directory; later cells resolve output paths relative to it.
# Built in a single pure-Python step (no shell call), so re-running this cell
# always produces the same value instead of indexing into an already-built string.
__file__ = str(pathlib.Path.cwd() / 'runner.py')

In [30]:
# Parse an empty argument list so nothing is read from the kernel's own argv;
# every option therefore starts at its declared default.
args = HfArgumentParser(
    [RunnerArguments, HFArguments, OAIArguments, GenerationArguments]
).parse_args([])

# Anchor the output directory next to runner.py, then resolve every save path
# inside it and make sure its parent directory exists.
args.output_dir = pathlib.Path(__file__).parent / args.output_dir
for path_attr in (
    "save_generations_raw_path",
    "save_generations_prc_path",
    "save_references_path",
    "save_results_path",
):
    resolved_path = args.output_dir / getattr(args, path_attr)
    resolved_path.parent.mkdir(parents=True, exist_ok=True)
    setattr(args, path_attr, resolved_path)

if args.tasks is None:
    # No filter given: run everything.
    task_names = ALL_TASKS
else:
    # args.tasks is a comma-separated list of glob patterns matched against
    # ALL_TASKS. Dict keys deduplicate while keeping first-match order, so the
    # selection is identical on every run (a set would reorder it arbitrarily).
    matched_tasks = {}
    for pattern in args.tasks.split(","):
        matched_tasks.update(dict.fromkeys(fnmatch.filter(ALL_TASKS, pattern)))
    task_names = list(matched_tasks)

accelerator = Accelerator()
# Print once, from the main process only.
if accelerator.is_main_process:
    print(f"Selected Tasks: {task_names}")

# Filled in per task by later cells.
results = {}

Selected Tasks: ['folio-baseline-16shot', 'folio-baseline-1shot', 'folio-baseline-2shot', 'folio-baseline-4shot', 'folio-baseline-8shot', 'folio-cot-16shot', 'folio-cot-1shot', 'folio-cot-2shot', 'folio-cot-4shot', 'folio-cot-8shot', 'folio-neurosymbolic-16shot', 'folio-neurosymbolic-1shot', 'folio-neurosymbolic-2shot', 'folio-neurosymbolic-4shot', 'folio-neurosymbolic-8shot', 'folio-scratchpad-16shot', 'folio-scratchpad-1shot', 'folio-scratchpad-2shot', 'folio-scratchpad-4shot', 'folio-scratchpad-8shot', 'proofwriter-baseline-16shot', 'proofwriter-baseline-1shot', 'proofwriter-baseline-2shot', 'proofwriter-baseline-4shot', 'proofwriter-baseline-8shot', 'proofwriter-cot-16shot', 'proofwriter-cot-1shot', 'proofwriter-cot-2shot', 'proofwriter-cot-4shot', 'proofwriter-cot-8shot', 'proofwriter-neurosymbolic-16shot', 'proofwriter-neurosymbolic-1shot', 'proofwriter-neurosymbolic-2shot', 'proofwriter-neurosymbolic-4shot', 'proofwriter-neurosymbolic-8shot', 'proofwriter-scratchpad-16shot', 'pr

In [32]:
# Session-specific overrides of the parsed defaults.
args.allow_code_execution = True
args.model = 'code-davinci-002'
# Names of the environment variables that hold OpenAI API keys.
args.openai_api_env_keys = ['OPENAI_API_KEY']
# Never store a key in the notebook: reuse the one already exported in the
# environment, and only prompt (without echoing) when none is set.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API key: ')

In [33]:
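# NOTE: everything in this cell is commented out, so nothing here executes.
# It is the full runner pipeline (evaluation-only mode, OpenAI/HF model
# selection, generation and result saving); the cells below drive a single
# task by hand instead.
# if args.generations_path: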
#     if accelerator.is_main_process:
#         print("Evaluation only mode")
#     evaluator = HFEvaluator(accelerator, None, None, args)
#     for task in task_names:
#         results[task] = evaluator.evaluate(task)
# else:
#     evaluator = None
#     if args.openai_api_env_keys:
#         env_key = args.openai_api_env_keys[0]  # use any key to get list of models
#         openai.api_key = os.environ[env_key]
#         comp_models = {
#             "code-davinci-002",
#             "text-davinci-003",
#             "text-davinci-002",
#             "text-curie-001",
#             "text-babbage-001",
#             "text-ada-001",
#         }
#         chat_models = {
#             "gpt-4",
#             "gpt-4-0613",
#             "gpt-4-32k",
#             "gpt-4-32k-0613",
#             "gpt-3.5-turbo",
#             "gpt-3.5-turbo-16k",
#             "gpt-3.5-turbo-0613",
#             "gpt-3.5-turbo-16k-0613",
#         }
#         if any(model == args.model for model in comp_models):
#             print(f"Using OpenAI Completion API for model {args.model}")
#             evaluator = OAIEvaluator(args)
#         elif any(model == args.model for model in chat_models):
#             print(f"Using OpenAI Chat API for model {args.model}")
#             evaluator = OAIEvaluator(args, chat=True)
#         else:
#             print(
#                 f"Model {args.model} not found in OpenAI API. Assuming HuggingFace locally."
#             )
#     else:
#         warn(
#             "No OpenAI API key provided. Will attempt to use HuggingFace locally regardless of which model name was given."
#         )

#     if evaluator is None:
#         dict_precisions = {
#             "fp32": torch.float32,
#             "fp16": torch.float16,
#             "bf16": torch.bfloat16,
#         }
#         if args.precision not in dict_precisions:
#             raise ValueError(
#                 f"Non valid precision {args.precision}, choose from: fp16, fp32, bf16"
#             )
#         print(f"Loading the model and tokenizer from HF (in {args.precision})")
#         model = AutoModelForCausalLM.from_pretrained(
#             args.model,
#             revision=args.revision,
#             torch_dtype=dict_precisions[args.precision],
#             trust_remote_code=args.trust_remote_code,
#             use_auth_token=args.use_auth_token,
#         )
#         tokenizer = AutoTokenizer.from_pretrained(
#             args.model,
#             revision=args.revision,
#             use_auth_token=args.use_auth_token,
#             truncation_side="left",
#         )
#         if not tokenizer.eos_token:
#             if tokenizer.bos_token:
#                 tokenizer.eos_token = tokenizer.bos_token
#                 print("bos_token used as eos_token")
#             else:
#                 raise ValueError("No eos_token or bos_token found")
#         tokenizer.pad_token = tokenizer.eos_token
#         evaluator = HFEvaluator(accelerator, model, tokenizer, args)

#     for task in task_names:
#         if args.generation_only:
#             if accelerator.is_main_process:
#                 print("Generation mode only")
#             generations_prc, generations_raw, references = evaluator.generate_text(
#                 task
#             )
#             if accelerator.is_main_process:
#                 if args.save_generations_raw:
#                     with open(args.save_generations_raw_path, "w") as fp:
#                         json.dump(generations_raw, fp)
#                         print("raw generations were saved")
#                 if args.save_generations_prc:
#                     with open(args.save_generations_prc_path, "w") as fp:
#                         json.dump(generations_prc, fp)
#                         print("processed generations were saved")
#                 if args.save_references:
#                     with open(args.save_references_path, "w") as fp:
#                         json.dump(references, fp)
#                         print("references were saved")
#         else:
#             results[task] = evaluator.evaluate(task)

# results["config"] = {"model": args.model}
# if not args.generation_only:
#     dumped = json.dumps(results, indent=2, sort_keys=True)
#     if accelerator.is_main_process:
#         print(dumped)

#     if args.save_results:
#         with open(args.save_results_path, "w") as f:
#             f.write(dumped)

In [35]:
import sys

# Extend the import path with the parent directory before importing from
# `eval` — presumably for running the notebook from a subfolder; TODO confirm
# it is still needed, since `eval` is already imported in the first cell.
sys.path.append('..')

from eval import tasks

task_name = 'folio-baseline-16shot'
task = tasks.get_task(task_name)

# Choose the API flavour from the model name instead of hard-coding chat=True.
# Per the runner logic kept (commented out) earlier in this notebook, only the
# models listed here go through the chat API; completion models such as
# 'code-davinci-002' are constructed without the chat flag.
chat_models = {
    "gpt-4",
    "gpt-4-0613",
    "gpt-4-32k",
    "gpt-4-32k-0613",
    "gpt-3.5-turbo",
    "gpt-3.5-turbo-16k",
    "gpt-3.5-turbo-0613",
    "gpt-3.5-turbo-16k-0613",
}
evaluator = OAIEvaluator(args, chat=args.model in chat_models)

# Last expression: the tuple returned by generate_text is shown as the cell output.
evaluator.generate_text(task_name)

([], [], [])

## Just a random experiment

In [85]:
# Dataset the task hands out via get_dataset().
# NOTE(review): the recorded output shows num_rows: 0 — presumably why
# generate_text returned empty lists above; confirm.
task.get_dataset()

Dataset({
    features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
    num_rows: 0
})

In [86]:
# Few-shot examples for this task, returned as one string of
# <PREMISES>/<CONCLUSION>/<EVALUATE> blocks.
task.fewshot_examples()

"<PREMISES>\nAll dispensable things are environment-friendly.\nAll woodware is dispensable.\nAll paper is woodware.\nNo good things are bad.\nAll environment-friendly things are good.\nA worksheet is either paper or is environment-friendly.\n</PREMISES>\n<CONCLUSION>\nA worksheet is not dispensable.\n</CONCLUSION>\n<EVALUATE>\nUncertain\n</EVALUATE>\n\n<PREMISES>\nA La Liga soccer team ranks higher than another if it receives more points.\nIf two La Liga soccer teams recieve the same points, the team which recieves more points from the games between the two teams ranks higher.\nReal Madrid and Barcelona are both La Liga soccer teams.\nIn La Liga 2021-2022, Real Madrid recieves 86 points and Barcelon recieves 73 points.\nIn La Liga 2021-2022, Real Madrid and Barcelona both recieve 3 points from the games between them.\n</PREMISES>\n<CONCLUSION>\nIn La Liga 2021-2022, Real Madrid ranks higher than Barcelona.\n</CONCLUSION>\n<EVALUATE>\nTrue\n</EVALUATE>\n\n<PREMISES>\nAll athletes are go

In [88]:
# The DatasetDict held by the task; the recorded output shows a single
# `validation` split with 204 rows.
task.dataset

DatasetDict({
    validation: Dataset({
        features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
        num_rows: 204
    })
})

In [97]:
# Preview the first two examples only: the full column holds 204 premise lists
# and floods the cell output.
task.dataset['validation']['premises'][:2]

[['If people perform in school talent shows often, then they attend and are very engaged with school events.',
  'People either perform in school talent shows often or are inactive and disinterested members of their community.',
  'If people chaperone high school dances, then they are not students who attend the school.',
  'All people who are inactive and disinterested members of their community chaperone high school dances.',
  'All young children and teenagers who wish to further their academic careers and educational opportunities are students who attend the school.',
  'Bonnie either both attends and is very engaged with school events and is a student who attends the school, or she neither attends and is very engaged with school events nor is a student who attends the school. '],
 ['If people perform in school talent shows often, then they attend and are very engaged with school events.',
  'People either perform in school talent shows often or are inactive and disinterested membe

In [127]:
# Reference answer for the second validation example ('True' in the recorded run).
task.get_reference(task.dataset['validation'][1])

'True'

In [116]:
# Repeat of the earlier get_dataset() check; the recorded output still shows 0 rows.
task.get_dataset()

Dataset({
    features: ['premises', 'premises-FOL', 'conclusion', 'conclusion-FOL', 'label'],
    num_rows: 0
})