In [17]:
pip install openai datasets pygments rouge_score evaluate

Note: you may need to restart the kernel to use updated packages.


In [18]:
from datasets import load_dataset
from openai import OpenAI
from tqdm import tqdm
import evaluate
import json
import random

random.seed(42)

In [19]:
train_dataset = load_dataset("Jise/ruletaker", split="train")
test_dataset = load_dataset("Jise/ruletaker", split="test")
ood_dataset = load_dataset("Jise/ruletaker", split="ood_test")
print(train_dataset)

rouge = evaluate.load("rouge")

model = "gpt-4o"

Dataset({
    features: ['context', 'statement', 'reasoning', 'depth', 'flag'],
    num_rows: 1000
})


Zero-shot CoT

In [20]:
prompt = """
    {}
    Assertion: {}
"""

instruction = """
    Based on the facts and rules, give simple reasoning steps citing the rules, and output whether the assertion is true.
    You must output in json format: {"reason": "Because {rule1}, {rule2}, ..., {conclusion}", "answer": 1/0}. where rules are copies from the Rule, and conclusion should
    be either the assertion or its negation, do not add other texts.
"""

client = OpenAI()

In [21]:
test_buffer = []
ood_buffer = []

for n, i in enumerate(tqdm(test_dataset)):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": instruction},
            {
                "role": "user",
                "content": prompt.format(i["context"], i["statement"])
            }
        ]
    )
    test_buffer.append(completion.choices[0].message.content)

for n, i in enumerate(ood_dataset):
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": instruction},
            {
                "role": "user",
                "content": prompt.format(i["context"], i["statement"])
            }
        ]
    )
    ood_buffer.append(completion.choices[0].message.content)

100%|██████████| 250/250 [04:45<00:00,  1.14s/it]


In [22]:
acc = 0
count = 0
err_count = 0
mapping = {"False": 0, "True": 1}
test_output_reasons = []
ood_output_reasons = []
list_to_save = {"test": [], "ood": []}

for i, (o, t) in enumerate(zip(test_buffer, test_dataset)):
    try:
        output = eval(o.strip("```json").strip("```"))
    except:
        err_count += 1
        test_output_reasons.append(o)  # If there is output format error, using the raw output as reason
        continue
    list_to_save["test"].append({"input": t, "output": o})
    count += 1
    if int(output["answer"]) == mapping[t["flag"]]:
        acc += 1
    test_output_reasons.append(output["reason"])

print(f"{model} zeroshot Test Accuracy: {acc/count}")
print(f"{model} zeroshot Output Error: {err_count}")
print(f"{model} zeroshot Test Reason Rouge: {rouge.compute(predictions=test_output_reasons, references=[i["reasoning"] for i in test_dataset], use_stemmer=True)}")

acc = 0
count = 0
err_count = 0

for i, (o, t) in enumerate(zip(ood_buffer, ood_dataset)):
    try:    
        output = eval(o.strip("```json").strip("```"))
    except:
        err_count += 1
        ood_output_reasons.append(o)
        continue
    if int(output["answer"]) == mapping[t["flag"]]:
        acc += 1
    list_to_save["ood"].append({"input": t, "output": o})
    count += 1
    ood_output_reasons.append(output["reason"])
print(f"{model} zeroshot OOD Accuracy: {acc/count}")
print(f"{model} zeroshot Error count: {err_count}")
print(f"{model} zeroshot Reason Rouge: {rouge.compute(predictions=ood_output_reasons, references=[i["reasoning"] for i in ood_dataset], use_stemmer=True)}")
with open(f"{model}-zero-shot.json", "w") as f:
    json.dump(list_to_save, f)

gpt-4o zeroshot Test Accuracy: 0.8951612903225806
gpt-4o zeroshot Output Error: 2
gpt-4o zeroshot Test Reason Rouge: {'rouge1': np.float64(0.4744683576298864), 'rouge2': np.float64(0.35779002476575317), 'rougeL': np.float64(0.41083381112160444), 'rougeLsum': np.float64(0.44453047027650283)}
gpt-4o zeroshot OOD Accuracy: 0.8506666666666667
gpt-4o zeroshot Error count: 0
gpt-4o zeroshot Reason Rouge: {'rouge1': np.float64(0.35752862730132395), 'rouge2': np.float64(0.263847634179622), 'rougeL': np.float64(0.2903490586727546), 'rougeLsum': np.float64(0.3376695137594531)}


Few-shot CoT

In [23]:
prompt = """
    {}
    Assertion: {}
"""

instruction = """
    Based on the facts and rules, give simple reasoning steps citing the rules, and output whether the assertion is true.
    You must output in json format: {{"reason": "Because {{rule1}}, {{rule2}}, ..., {{conclusion}}", "answer": 1/0}}. where rules are copies from the Rule, and conclusion should
    be either the assertion or its negation, do not add other texts.
    Some examples: {}
"""

example_template = """
    {context}
    Assertion: {statement}
    Output: {{"reason": "{reasoning}", "answer": {flag}}}
"""

In [24]:
test_buffer = []
ood_buffer = []

for n, i in enumerate(tqdm(test_dataset)):
    example = []
    for j in random.sample(range(len(train_dataset)), 3):
        example.append(example_template.format(**train_dataset[j]))
    examples = "\n".join(example)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": instruction.format(examples)},
            {
                "role": "user",
                "content": prompt.format(i["context"], i["statement"])
            }
        ]
    )
    test_buffer.append({"output": completion.choices[0].message.content, "example": example})

for n, i in enumerate(ood_dataset):
    example = []
    for j in random.sample(range(len(train_dataset)), 3):
        example.append(example_template.format(**train_dataset[j]))
    examples = "\n".join(example)
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": instruction.format(examples)},
            {
                "role": "user",
                "content": prompt.format(i["context"], i["statement"])
            }
        ]
    )
    ood_buffer.append({"output": completion.choices[0].message.content, "example": example})

100%|██████████| 250/250 [07:24<00:00,  1.78s/it]


In [30]:
acc = 0
count = 0
err_count = 0
mapping = {"False": 0, "True": 1}
test_output_reasons = []
ood_output_reasons = []
list_to_save = {"test": [], "ood": []}

for i, (o, t) in enumerate(zip(test_buffer, test_dataset)):
    try:
        output = eval(o["output"].strip("```json").strip("```"))
    except:
        err_count += 1
        print("test", i)
        test_output_reasons.append(o["output"])  # If there is output format error, using the raw output as reason
        continue
    list_to_save["test"].append({"input": t, "output": o})
    count += 1
    if int(output["answer"]) == mapping[t["flag"]]:
        acc += 1
    test_output_reasons.append(output["reason"])

print(f"{model} fewshot Test Accuracy: {acc/count}")
print(f"{model} fewshot Test Output Error: {err_count}")
print(f"{model} fewshot Test Reason Rouge: {rouge.compute(predictions=test_output_reasons, references=[i["reasoning"] for i in test_dataset], use_stemmer=True)}")

acc = 0
count = 0
err_count = 0

for i, (o, t) in enumerate(zip(ood_buffer, ood_dataset)):
    try:    
        output = eval(o["output"].strip("```json").strip("```"))
        answer = output["answer"]
    except:
        err_count += 1
        print("ood", i)
        ood_output_reasons.append(o["output"])
        continue
    if int(output["answer"]) == mapping[t["flag"]]:
        acc += 1
    list_to_save["ood"].append({"input": t, "output": o})
    count += 1
    ood_output_reasons.append(output["reason"])
print(f"{model} fewshot OOD Accuracy: {acc/count}")
print(f"{model} fewshot OOD Error count: {err_count}")
print(f"{model} fewshot OOD Reason Rouge: {rouge.compute(predictions=ood_output_reasons, references=[i["reasoning"] for i in ood_dataset], use_stemmer=True)}")
with open(f"{model}-few-shot.json", "w") as f:
    json.dump(list_to_save, f)

test 5
test 26
test 74
test 96
test 105
test 131
gpt-4o fewshot Test Accuracy: 0.930327868852459
gpt-4o fewshot Test Output Error: 6
gpt-4o fewshot Test Reason Rouge: {'rouge1': np.float64(0.5308854623248525), 'rouge2': np.float64(0.4205108911262426), 'rougeL': np.float64(0.4567073292090176), 'rougeLsum': np.float64(0.4984785261028818)}
ood 343
gpt-4o fewshot OOD Accuracy: 0.946524064171123
gpt-4o fewshot OOD Error count: 1
gpt-4o fewshot OOD Reason Rouge: {'rouge1': np.float64(0.4182177225354021), 'rouge2': np.float64(0.3316091253072725), 'rougeL': np.float64(0.33733374379685566), 'rougeLsum': np.float64(0.3946646865691559)}
