In [1]:
import json
import os
import pathlib
import random
import warnings
from concurrent.futures.thread import ThreadPoolExecutor

import torch
import openai
import datasets
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
from eval.args import RunnerArguments, HFArguments, OAIArguments, GenerationArguments
from eval.evaluator import HFEvaluator, OAIEvaluator, _WARNING
from eval.tasks import ALL_TASKS, get_task

from dotenv import load_dotenv

load_dotenv()  # loads the api keys in .env

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

args = HfArgumentParser([RunnerArguments(), HFArguments(), OAIArguments(), GenerationArguments()]).parse_args("")

task_name = "proofwriter-neurosymbolic-2shot"

args.task_name = task_name

args.max_length_generation = 3000
args.temperature = 0.8
args.openai_api_env_keys = ['OPENAI_API_KEY', 'OPENAI_API_KEY2', 'OPENAI_API_KEY3', 'OPENAI_API_KEY4', 'OPENAI_API_KEY5', 'OPENAI_API_KEY6']
args.model = 'gpt-3.5-turbo-16k-0613'
args.allow_code_execution = True

args.n_samples = 1

evaluator = OAIEvaluator(args, chat=True)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def get_extra_context_prompt(task, generation):
    instruction_and_train = get_instruction_and_extra_context_examples(task)
    test = format_extra_context_example(task, generation)
    return "\n".join([instruction_and_train, test])

def format_extra_context_example(self, generation):
    # todo maybe we should also include the doc itself (similar to the format_example?)
    return f"""
<INPUT>
<PREMISES>
{generation}
</PREMISES>
</INPUT>
<OUTPUT>
"""

def get_instruction_and_extra_context_examples(task):
    return f"""
You will be given the premises for a first-order logic (FOL) problem.
The problem is to identify additional premises that are implicitly common sense from the ones given.
The premises are given in the form of a set of first-order logic sentences.
The task is to generate new common sense premises, text and FOL pairs, that would be common sense to someone reading the original premises.
These new common sense premises should reflect the nature of synonyms and antonyms, categorize proper names, and identify implicit characteristics from the ones provided.
Do not limit the amount of new premises generated in the output.
Expressions should be adhere to the format of the Python NLTK package logic module. Here are a couple examples:
<INPUT>
<PREMISES>
Premise: When a person reads a book, that person
gains knowledge.
FOL: all x. all y. (Person(x) & Reads(x, y) &
Book(y) -> Gains(x, Knowledge))
Premise: Harry read the book "Walden" by Henry
Thoreau.
FOL: Reads(Harry, Walden)
</PREMISES>
</INPUT>

<OUTPUT>
Premise: Harry is a human.
FOL: Person(Harry)
</OUTPUT>

<INPUT>
<PREMISES>
Premise: Heinrich Schmidt was a Nazi German
politician.
FOL: NaziGermanPolitician (HeinrichSchmidt)
</PREMISES>
</INPUT>

<OUTPUT>
Premise: Heinrich Schmidt was a Nazi
FOL: Nazi(HeinrichSchmidt)
Premise: Heinrich Schmidt was a German
FOL: German(HeinrichSchmidt)
Premise: Heinrich Schmidt was a Politician
FOL: Politician(HeinrichSchmidt)
</OUTPUT>

<INPUT>
<PREMISES>
Premise: Famine is bad
FOL: Bad(Famine)
</PREMISES>
</INPUT>
<OUTPUT>
Premise: Bad is not good.
FOL: Bad -> -Good
Premise: Cold is not warm
FOL: Cold -> -Warm
</OUTPUT>

After generating this output pick the top 5 premises that are the most relevant to the provided premises, making sure to prune any that contradict each other or the given input. Return these with the tags <CONTEXT> and </CONTEXT> in the same format as the output.
"""


def generate_text(self, task):
    dataset = task.get_dataset()
    n_tasks = self.args.limit if self.args.limit else len(dataset)
    indices = list(range(n_tasks))
    prompts = [task.get_prompt(dataset[i]) for i in indices]
    stops = [task.stop_words for _ in range(n_tasks)]

    with ThreadPoolExecutor() as executor:
        generations_raw = list(executor.map(self.get_completion, prompts, stops))

    # todo change n_sample of here to 1
    context_prompt = [task.get_extra_context_prompt(random.choice(generation)) for generation in generations_raw]

    with ThreadPoolExecutor() as executor:
        contexts = list(executor.map(self.get_completion, context_prompt, stops))

    # todo bad code. doesn't work for sample bigger than 1

    def extract_context_from_raw(text):
        # this is for the new context Ate added
        try:
            return text.split('<CONTEXT>')[1].split('</CONTEXT>')[0]
        except Exception as e:
            print("failed to get the <CONTEXT> part in", text)
            print('returning all of the generated thing')
            return text

    generations_prc = [
        [
            task.postprocess_generation(
                extract_context_from_raw(contexts[i][j]) + '\n' + generations_raw[i][j], i, completion_only=True
            )
            for j in range(self.args.n_samples)
        ]
        for i in range(n_tasks)
    ]
    references = [task.get_reference(dataset[i]) for i in range(n_tasks)]
    return indices, generations_prc, generations_raw, contexts, references


In [18]:
from eval.tasks import get_task

task = get_task(task_name)
dataset = task.get_dataset()
idx = 5
doc = dataset[idx]

stop_words = task.stop_words + ['</OUTPUT>']

In [7]:
prompt = task.get_prompt(doc)

print("prompting: ")
print(prompt)

generate_raw = evaluator.get_completion(prompt=prompt, stop=stop_words)[0]

prompting: 
The following is a first-order logic (FOL) problem.
The problem is to determine whether the conclusion follows from the premises.
The premises are given in the form of a set of first-order logic sentences.
The conclusion is given in the form of a single first-order logic sentence.
The task is to translate each of the premises and conclusions into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.
Expressions should be adhere to the format of the Python NLTK package logic module.


<PREMISES>
All dispensable things are environment-friendly.
All woodware is dispensable.
All paper is woodware.
No good things are bad.
All environment-friendly things are good.
A worksheet is either paper or is environment-friendly.
</PREMISES>
<CONCLUSION>
A worksheet is not dispensable.
</CONCLUSION>
<EVALUATE>
TEXT:	All dispensable things are environment-friendly.
FOL:	all x. (Dispensable(x) -> Environmen

In [8]:
print(generate_raw)

TEXT: The bald eagle does not need the mouse.
FOL: -Needs(BaldEagle, Mouse)

TEXT: The bear does not eat the bald eagle.
FOL: -Eats(Bear, BaldEagle)

TEXT: The bear eats the tiger.
FOL: Eats(Bear, Tiger)

TEXT: The mouse chases the bald eagle.
FOL: Chases(Mouse, BaldEagle)

TEXT: The mouse eats the bear.
FOL: Eats(Mouse, Bear)

TEXT: The mouse is round.
FOL: Round(Mouse)

TEXT: The tiger eats the bear.
FOL: Eats(Tiger, Bear)

TEXT: If something is round then it chases the bear.
FOL: all x. (Round(x) -> Chases(x, Bear))

TEXT: If something eats the bear and it does not need the bald eagle then it chases the bear.
FOL: all x. all y. ((Eats(x, Bear) & -Needs(x, BaldEagle)) -> Chases(x, Bear))

TEXT: If something chases the bear then it needs the tiger.
FOL: all x. (Chases(x, Bear) -> Needs(x, Tiger))

TEXT: If something chases the mouse and the mouse is red then it does not eat the bald eagle.
FOL: all x. (Chases(x, Mouse) & Red(Mouse) -> -Eats(x, BaldEagle))

TEXT: If something is red th

In [15]:
context_prompt = get_extra_context_prompt(task, generate_raw)

print("prompting: ", context_prompt)

context = evaluator.get_completion(prompt=context_prompt, stop=stop_words)[0]

prompting:  
You will be given the premises for a first-order logic (FOL) problem.
The problem is to identify additional premises that are implicitly common sense from the ones given.
The premises are given in the form of a set of first-order logic sentences.
The task is to generate new common sense premises, text and FOL pairs, that would be common sense to someone reading the original premises.
These new common sense premises should reflect the nature of synonyms and antonyms, categorize proper names, and identify implicit characteristics from the ones provided.
Do not limit the amount of new premises generated in the output.
Expressions should be adhere to the format of the Python NLTK package logic module. Here are a couple examples:
<INPUT>
<PREMISES>
Premise: When a person reads a book, that person
gains knowledge.
FOL: all x. all y. (Person(x) & Reads(x, y) &
Book(y) -> Gains(x, Knowledge))
Premise: Harry read the book "Walden" by Henry
Thoreau.
FOL: Reads(Harry, Walden)
</PREMI

In [16]:
print(context)

Premise: The bald eagle is a bird.
FOL: Bird(BaldEagle)

Premise: The bear is an animal.
FOL: Animal(Bear)

Premise: The tiger is an animal.
FOL: Animal(Tiger)

Premise: The mouse is an animal.
FOL: Animal(Mouse)

Premise: The bald eagle is not a mouse.
FOL: -Mouse(BaldEagle)

Premise: The bear does not chase the tiger.
FOL: -Chases(Bear, Tiger)

Premise: The tiger does not eat the tiger.
FOL: -Eats(Tiger, Tiger)

Premise: If something eats the mouse, then it is not round.
FOL: all x. (Eats(x, Mouse) -> -Round(x))

Premise: If something chases the bear, then it does not eat the mouse.
FOL: all x. (Chases(x, Bear) -> -Eats(x, Mouse))

Premise: If something eats the bald eagle, then it is not round.
FOL: all x. (Eats(x, BaldEagle) -> -Round(x))

<CONTEXT>
Premise: The bald eagle is a bird.
FOL: Bird(BaldEagle)

Premise: The bear is an animal.
FOL: Animal(Bear)

Premise: The tiger is an animal.
FOL: Animal(Tiger)

Premise: The mouse is an animal.
FOL: Animal(Mouse)

Premise: The bald eagl

In [19]:
task.postprocess_generation(
    context + '\n' + generate_raw, idx, completion_only=True
)

generation is:  Premise: The bald eagle is a bird.
FOL: Bird(BaldEagle)

Premise: The bear is an animal.
FOL: Animal(Bear)

Premise: The tiger is an animal.
FOL: Animal(Tiger)

Premise: The mouse is an animal.
FOL: Animal(Mouse)

Premise: The bald eagle is not a mouse.
FOL: -Mouse(BaldEagle)

Premise: The bear does not chase the tiger.
FOL: -Chases(Bear, Tiger)

Premise: The tiger does not eat the tiger.
FOL: -Eats(Tiger, Tiger)

Premise: If something eats the mouse, then it is not round.
FOL: all x. (Eats(x, Mouse) -> -Round(x))

Premise: If something chases the bear, then it does not eat the mouse.
FOL: all x. (Chases(x, Bear) -> -Eats(x, Mouse))

Premise: If something eats the bald eagle, then it is not round.
FOL: all x. (Eats(x, BaldEagle) -> -Round(x))

<CONTEXT>
Premise: The bald eagle is a bird.
FOL: Bird(BaldEagle)

Premise: The bear is an animal.
FOL: Animal(Bear)

Premise: The tiger is an animal.
FOL: Animal(Tiger)

Premise: The mouse is an animal.
FOL: Animal(Mouse)

Premis

'Error'