In [273]:
import json
import os
import pathlib
import random
import warnings
from concurrent.futures.thread import ThreadPoolExecutor

import torch
import openai
import datasets
import pandas as pd
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
from eval.args import RunnerArguments, HFArguments, OAIArguments, GenerationArguments
from eval.evaluator import HFEvaluator, OAIEvaluator, _WARNING
from eval.tasks import ALL_TASKS, get_task

# from dotenv import load_dotenv

# load_dotenv()  # loads the api keys in .env

transformers.logging.set_verbosity_error()
datasets.logging.set_verbosity_error()

args = HfArgumentParser([RunnerArguments(), HFArguments(), OAIArguments(), GenerationArguments()]).parse_args("")

task_name = "proofwriter-neurosymbolic-2shot"

args.task_name = task_name
args.max_length_generation = 3000
args.temperature = 0.8
args.openai_api_env_keys = ["OPENAI_API_KEY", "OPENAI_API_KEY2", 'OPENAI_API_KEY3']
args.model = 'gpt-3.5-turbo-16k-0613'
args.allow_code_execution = True

args.n_samples = 2

evaluator = OAIEvaluator(args, chat=True)

In [None]:
def get_extra_context_prompt(task, generation):
    instruction_and_train = get_instruction_and_extra_context_examples(task)
    test = format_extra_context_example(task, generation)
    return "\n".join([instruction_and_train, test])

def format_extra_context_example(self, generation):
    # todo maybe we should also include the doc itself (similar to the format_example?)
    sentences = ""
    
    lines = generation.split('\n')
    lines = [line for line in lines if line.strip()]
    #TEXT:
    for i in range(0,len(lines)-2,2):
        
        text = lines[i].strip()
        text = text.removeprefix("TEXT:").strip()
        premise = "PREMISE: " + text

        fol = lines[1+i].strip()
        fol = fol.removeprefix("FOL:").strip()
        fol = "FOL: " + fol
                
        sentences+=f"""
        <SENTENCE>
        {premise}
        {fol}
        </SENTENCE>
        """
    
    return f"""
<INPUT>
{sentences}
</INPUT>
<OUTPUT>
"""

def get_instruction_and_extra_context_examples(task):
    return new_prompt

def generate_text(self, task):
    dataset = task.get_dataset()
    n_tasks = self.args.limit if self.args.limit else len(dataset)
    indices = list(range(n_tasks))   
    prompts = [task.get_prompt(dataset[i]) for i in indices]
    stops = [task.stop_words for _ in range(n_tasks)]

    with ThreadPoolExecutor() as executor:
        generations_raw = list(executor.map(self.get_completion, prompts, stops))

    # todo change n_sample of here to 1
    context_prompt = [task.get_extra_context_prompt(random.choice(generation)) for generation in generations_raw]

    with ThreadPoolExecutor() as executor:
        contexts = list(executor.map(self.get_completion, context_prompt, stops))

    # todo bad code. doesn't work for sample bigger than 1

    def extract_context_from_raw(text):
        # this is for the new context Ate added
        try:
            return text.split('<CONTEXT>')[1].split('</CONTEXT>')[0]
        except Exception as e:
            print("failed to get the <CONTEXT> part in", text)
            print('returning all of the generated thing')
            return text

    generations_prc = [
        [
            task.postprocess_generation(
                extract_context_from_raw(contexts[i][j]) + '\n' + generations_raw[i][j], i, completion_only=True
            )
            for j in range(self.args.n_samples)
        ]
        for i in range(n_tasks)
    ]
    references = [task.get_reference(dataset[i]) for i in range(n_tasks)]
    return indices, generations_prc, generations_raw, contexts, references


In [None]:
new_prompt =f"""You will be given the premises for a first-order logic (FOL) problem.
The problem is to identify additional premises that are implicitly common sense from the ones given, and label them.
The original premises are given in the form of a set of premises and first-order logic sentences.
The task is to generate new common sense premises, text and FOL pairs, that would be common sense to someone reading the original premises. Include this new context in the sentence that it describes, with the correct label of CONTEXT, and provide brief justification.
These new common sense, context-based premises should reflect context: the nature of synonyms and antonyms, categorize proper names, and identify implicit characteristics from the ones provided.
For the new context, provide CONTEXT in text, FOL in the format of the Python NLTK, and a brief justification. 
Do not create new sentences for context, instead, add the context to the sentence it fits best in. 
Only add context to a sentence based on the premise and real-world common-sense, and do not make any inferences using other sentences. 
Do not limit the amount of new premises generated in the output.
Expressions should be adhere to the format of the Python NLTK package logic module. Here are a couple examples:

<INPUT>
    <SENTENCE>
        PREMISE: When a person reads a book, that person gains knowledge.
        FOL: all x. all y. (Person(x) & Reads(x, y) & Book(y) -> Gains(x, Knowledge))
    </SENTENCE>
    <SENTENCE>
        PREMISE:  Harry read the book "Walden" by Henry.
        FOL: Reads(Harry, Walden)
    </SENTENCE>
</INPUT>

<OUTPUT>
    <SENTENCE>
        PREMISE: When a person reads a book, that person gains knowledge.
        FOL: all x. all y. (Person(x) & Reads(x, y) & Book(y) -> Gains(x, Knowledge))
    </SENTENCE>
    <SENTENCE>
        PREMISE: Harry read the book "Walden" by Henry.
        FOL: Reads(Harry, Walden)
        CONTEXT: Harry is a person. 
        FOL: Person(Harry)
        JUSTIFICATION: Harry is a person's name - this categorizes proper nouns. 
        CONTEXT: Walden is a book. 
        FOL: Book(Walden)
        JUSTIFICATION:The text says 'the book "Walden"', so Walden is a book's name - this categorizes proper nouns. 
    </SENTENCE>
</OUTPUT>

<INPUT>
    <SENTENCE>
        PREMISE: Heinrich Schmidt was a Nazi German politician.
        FOL: NaziGermanPolitician(HeinrichSchmidt)
    </SENTENCE>
</INPUT>

<OUTPUT>
    <SENTENCE>
        PREMISE: Heinrich Schmidt was a Nazi German politician.
        FOL: NaziGermanPolitician(HeinrichSchmidt)
        CONTEXT: Heinrich Schmidt was a Nazi.
        FOL: Nazi(HeinrichSchmidt)
        JUSTIFICATION: Because Heinrich Schmidt was a Nazi German politician, he must have been Nazi. 
        CONTEXT: Heinrich Schmidt was a German.
        FOL: German(HeinrichSchmidt)
        JUSTIFICATION: Because Heinrich Schmidt was a Nazi German politician, he must have been German. 
        CONTEXT: Heinrich Schmidt was a Politician.
        FOL: Politician(HeinrichSchmidt)
        JUSTIFICATION: Because Heinrich Schmidt was a Nazi German politician, he must have been a politician. 
    </SENTENCE>
</OUTPUT>

<INPUT>
    <SENTENCE>
        PREMISE: Famine is bad.
        FOL: Bad(Famine)
    </SENTENCE>
</INPUT>
<OUTPUT>
    <SENTENCE>
        PREMISE: Famine is bad.
        FOL: Bad(Famine)
        CONTEXT: Bad is not good.
        FOL: all x. Bad(x) -> -Good(x)
        JUSTIFICATION: The adds context that describes the nature of antonyms.
    </SENTENCE>
</OUTPUT>
"""

In [None]:
from eval.tasks import get_task

task = get_task(task_name)
dataset = task.get_dataset()
idx = 2
doc = dataset[idx]

stop_words = task.stop_words + ['</OUTPUT>']
prompt = task.get_prompt(doc)

print("prompting: ")
print(prompt)

generate_raw = evaluator.get_completion(prompt=prompt, stop=stop_words)[0]
context_prompt = get_extra_context_prompt(task, generate_raw)
#print("prompting: ", context_prompt)
context = evaluator.get_completion(prompt=context_prompt, stop=stop_words)[0];

In [151]:
from eval.tasks import get_task

task = get_task(task_name)
dataset = task.get_dataset()
idx = 2
doc = dataset[idx]

stop_words = task.stop_words + ['</OUTPUT>']

In [202]:
prompt = task.get_prompt(doc)

print("prompting: ")
print(prompt)

generate_raw = evaluator.get_completion(prompt=prompt, stop=stop_words)[0]

prompting: 
The following is a first-order logic (FOL) problem.
The problem is to determine whether the conclusion follows from the premises.
The premises are given in the form of a set of first-order logic sentences.
The conclusion is given in the form of a single first-order logic sentence.
The task is to translate each of the premises and conclusions into FOL expressions, so that the expressions can be evaluated by a theorem solver to determine whether the conclusion follows from the premises.
Expressions should be adhere to the format of the Python NLTK package logic module.


<PREMISES>
All dispensable things are environment-friendly.
All woodware is dispensable.
All paper is woodware.
No good things are bad.
All environment-friendly things are good.
A worksheet is either paper or is environment-friendly.
</PREMISES>
<CONCLUSION>
A worksheet is not dispensable.
</CONCLUSION>
<EVALUATE>
TEXT:	All dispensable things are environment-friendly.
FOL:	all x. (Dispensable(x) -> Environmen

In [203]:
print(generate_raw)

TEXT:	Charlie is cold.
FOL:	Cold(Charlie)
TEXT:	Charlie is furry.
FOL:	Furry(Charlie)
TEXT:	Charlie is kind.
FOL:	Kind(Charlie)
TEXT:	Charlie is nice.
FOL:	Nice(Charlie)
TEXT:	Charlie is red.
FOL:	Red(Charlie)
TEXT:	Charlie is rough.
FOL:	Rough(Charlie)
TEXT:	Dave is red.
FOL:	Red(Dave)
TEXT:	Dave is rough.
FOL:	Rough(Dave)
TEXT:	Fiona is rough.
FOL:	Rough(Fiona)
TEXT:	Harry is kind.
FOL:	Kind(Harry)
TEXT:	Harry is rough.
FOL:	Rough(Harry)
TEXT:	Red things are nice.
FOL:	all x. (Red(x) -> Nice(x))
TEXT:	All nice things are cold.
FOL:	all x. (Nice(x) -> Cold(x))
TEXT:	Furry things are kind.
FOL:	all x. (Furry(x) -> Kind(x))
TEXT:	If something is cold and rough then it is white.
FOL:	all x. (Cold(x) & Rough(x) -> White(x))
TEXT:	If Fiona is furry then Fiona is kind.
FOL:	Furry(Fiona) -> Kind(Fiona)
TEXT:	Rough, kind things are furry.
FOL:	all x. (Rough(x) & Kind(x) -> Furry(x))
TEXT:	White things are kind.
FOL:	all x. (White(x) -> Kind(x))
TEXT:	Fiona is white.
FOL:	White(Fiona)



In [204]:
context_prompt = get_extra_context_prompt(task, generate_raw)
#print("prompting: ", context_prompt)
context = evaluator.get_completion(prompt=context_prompt, stop=stop_words)[0];

In [205]:
print(context)

<SENTENCE>
    PREMISE: Charlie is cold.
    FOL: Cold(Charlie)
    CONTEXT: Cold is not hot.
    FOL: all x. Cold(x) -> -Hot(x)
    JUSTIFICATION: This adds context that describes the nature of antonyms.
</SENTENCE>

<SENTENCE>
    PREMISE: Charlie is furry.
    FOL: Furry(Charlie)
    CONTEXT: Furry is not bald.
    FOL: all x. Furry(x) -> -Bald(x)
    JUSTIFICATION: This adds context that describes the nature of antonyms.
    CONTEXT: Furry things are warm.
    FOL: all x. (Furry(x) -> Warm(x))
    JUSTIFICATION: This adds context that describes the nature of characteristics of furry things.
</SENTENCE>

<SENTENCE>
    PREMISE: Charlie is kind.
    FOL: Kind(Charlie)
    CONTEXT: Kind things are nice.
    FOL: all x. (Kind(x) -> Nice(x))
    JUSTIFICATION: This adds context that describes the nature of characteristics of kind things.
    CONTEXT: Kind things are not mean.
    FOL: all x. (Kind(x) -> -Mean(x))
    JUSTIFICATION: This adds context that describes the nature of antonyms

In [222]:
def strip_prem(context, generate_raw):
    context_prem = set()
    raw_prem = set()
    context_lines = context.split('\n')
    for line in context_lines: 
        line = line.strip()
        if line[:4] == "PREM" or line[:4] == "CONT":
            line = line.replace("PREMISE:", "")
            line = line.replace("CONTEXT:", "")
            line = line.strip()
            context_prem.add(line)

    raw_lines = generate_raw.split('\n')
    for line in raw_lines: 
        line = line.strip()
        if line[:4] == "TEXT":
            line = line.replace("TEXT:", "")
            line = line.strip()
            raw_prem.add(line)   
    return context_prem, raw_prem

In [215]:
print(context)

<SENTENCE>
    PREMISE: Charlie is cold.
    FOL: Cold(Charlie)
    CONTEXT: Cold is not hot.
    FOL: all x. Cold(x) -> -Hot(x)
    JUSTIFICATION: This adds context that describes the nature of antonyms.
</SENTENCE>

<SENTENCE>
    PREMISE: Charlie is furry.
    FOL: Furry(Charlie)
    CONTEXT: Furry is not bald.
    FOL: all x. Furry(x) -> -Bald(x)
    JUSTIFICATION: This adds context that describes the nature of antonyms.
    CONTEXT: Furry things are warm.
    FOL: all x. (Furry(x) -> Warm(x))
    JUSTIFICATION: This adds context that describes the nature of characteristics of furry things.
</SENTENCE>

<SENTENCE>
    PREMISE: Charlie is kind.
    FOL: Kind(Charlie)
    CONTEXT: Kind things are nice.
    FOL: all x. (Kind(x) -> Nice(x))
    JUSTIFICATION: This adds context that describes the nature of characteristics of kind things.
    CONTEXT: Kind things are not mean.
    FOL: all x. (Kind(x) -> -Mean(x))
    JUSTIFICATION: This adds context that describes the nature of antonyms

In [232]:
context.count("PREMISE:")+context.count("CONTEXT:")

31

In [235]:
len(context_prem)

19

In [207]:
context_prem, raw_prem = strip_prem(context, generate_raw)

In [209]:
context_prem

{'All nice things are cold.',
 'Charlie is cold.',
 'Charlie is furry.',
 'Charlie is kind.',
 'Charlie is nice.',
 'Charlie is red.',
 'Charlie is rough.',
 'Cold is not hot.',
 'Dave is red.',
 'Dave is rough.',
 'Fiona is rough.',
 'Furry is not bald.',
 'Furry things are kind.',
 'Furry things are warm.',
 'Harry is kind.',
 'Harry is rough.',
 'If Fiona is furry then Fiona is kind.',
 'If something is cold and rough then it is white.',
 'Kind things are helpful.',
 'Kind things are nice.',
 'Kind things are not mean.',
 'Nice things are helpful.',
 'Nice things are kind.',
 'Nice things are not mean.',
 'Red things are colorful.',
 'Red things are nice.',
 'Red things are not green.',
 'Rough things are not smooth.',
 'Rough things are not soft.',
 'Rough, kind things are furry.',
 'White things are kind.'}

In [208]:
raw_prem

{'All nice things are cold.',
 'Charlie is cold.',
 'Charlie is furry.',
 'Charlie is kind.',
 'Charlie is nice.',
 'Charlie is red.',
 'Charlie is rough.',
 'Dave is red.',
 'Dave is rough.',
 'Fiona is rough.',
 'Fiona is white.',
 'Furry things are kind.',
 'Harry is kind.',
 'Harry is rough.',
 'If Fiona is furry then Fiona is kind.',
 'If something is cold and rough then it is white.',
 'Red things are nice.',
 'Rough, kind things are furry.',
 'White things are kind.'}

In [217]:
raw_prem.issubset(context_prem)

False

In [218]:
raw_prem - context_prem

{'Fiona is white.'}

In [219]:
context_prem - raw_prem

{'Cold is not hot.',
 'Furry is not bald.',
 'Furry things are warm.',
 'Kind things are helpful.',
 'Kind things are nice.',
 'Kind things are not mean.',
 'Nice things are helpful.',
 'Nice things are kind.',
 'Nice things are not mean.',
 'Red things are colorful.',
 'Red things are not green.',
 'Rough things are not smooth.',
 'Rough things are not soft.'}

In [220]:
task.postprocess_generation(
    context + generate_raw, idx, completion_only=True
)

'Uncertain'

In [221]:
task.get_reference(doc)

'Uncertain'

In [274]:
def test(idx):
    data = []
    dataset = task.get_dataset()
    doc = dataset[idx]
    stop_words = task.stop_words + ['</OUTPUT>']
    prompt = task.get_prompt(doc)    
    generate_raw = evaluator.get_completion(prompt=prompt, stop=stop_words)[0]
    context_prompt = get_extra_context_prompt(task, generate_raw)
    output = []
    for i in range(args.n_samples): 
        context = evaluator.get_completion(prompt=context_prompt, stop=stop_words)[i]
        our_result = task.postprocess_generation(context + generate_raw, idx, completion_only=True)
        ref_result = task.get_reference(doc)

        temp_dict = {'our_result': our_result, 'ref_result':ref_result, 'context': context, 'generate_raw': generate_raw}
        
        data.append(temp_dict)
    return pd.DataFrame(data)

In [None]:
test(2)

Rate limit error; trying again with a different API key.
Rate limit error; trying again with a different API key.


In [255]:
our_result, ref_result, context, generate_raw = _

In [257]:
print(context)

<SENTENCE>
    PREMISE: Charlie is blue.
    FOL: Blue(Charlie)
    CONTEXT: Blue is a color.
    FOL: Color(Blue)
    JUSTIFICATION: Blue is mentioned as a color, so it can be categorized as a color.
</SENTENCE>

<SENTENCE>
    PREMISE: Charlie is kind.
    FOL: Kind(Charlie)
    CONTEXT: Kindness is a positive trait.
    FOL: PositiveTrait(Kindness)
    JUSTIFICATION: Kind is mentioned as a positive trait, so it can be categorized as a positive trait.
</SENTENCE>

<SENTENCE>
    PREMISE: Dave is round.
    FOL: Round(Dave)
    CONTEXT: Round is a shape.
    FOL: Shape(Round)
    JUSTIFICATION: Round is mentioned as a shape, so it can be categorized as a shape.
</SENTENCE>

<SENTENCE>
    PREMISE: Fiona is big.
    FOL: Big(Fiona)
    CONTEXT: Big is a size.
    FOL: Size(Big)
    JUSTIFICATION: Big is mentioned as a size, so it can be categorized as a size.
</SENTENCE>

<SENTENCE>
    PREMISE: Fiona is blue.
    FOL: Blue(Fiona)
    CONTEXT: Blue is a color.
    FOL: Color(Blue)
    

In [242]:
raw_prem, context_prem = strip_prem(context, generate_raw)

In [243]:
context_prem - raw_prem

{'Fiona is white.'}

In [244]:
raw_prem - context_prem

{'Charlie is a thing.',
 'Charlie is not hot.',
 'Dave is a thing.',
 'Fiona is a thing.',
 'Furry things are warm.',
 'Harry is a thing.'}