# Setup

In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by the
# LLM clients imported further down — confirm which variables are required.
load_dotenv()

True

In [2]:
import requests
import wikienv
import wrappers

# Build the environment inside-out: the base WikiEnv is wrapped by the
# FEVER wrapper (dev split), which in turn is wrapped by the logging wrapper.
env = wrappers.LoggingWrapper(
    wrappers.FeverWrapper(wikienv.WikiEnv(), split="dev")
)

def step(env, action, max_attempts=10):
    """Call ``env.step(action)``, retrying when the request times out.

    Args:
        env: Object exposing a ``step(action)`` method.
        action: Action passed through unchanged to ``env.step``.
        max_attempts: Maximum number of times ``env.step`` is called before
            giving up. Must be at least 1.

    Returns:
        Whatever ``env.step(action)`` returns on the first attempt that does
        not raise ``requests.exceptions.Timeout``.

    Raises:
        ValueError: If ``max_attempts`` is smaller than 1.
        requests.exceptions.Timeout: If every attempt timed out. The last
            timeout is re-raised instead of silently returning ``None``,
            which previously surfaced in callers as an unrelated
            "cannot unpack NoneType" error.
    """
    if max_attempts < 1:
        raise ValueError("max_attempts must be at least 1")

    last_timeout = None
    for _ in range(max_attempts):
        try:
            return env.step(action)
        except requests.exceptions.Timeout as exc:
            # Keep a reference: the `as` name is cleared when the handler exits.
            last_timeout = exc
    raise last_timeout

# ReAct

In [3]:
import json

# Location of the JSON file holding the prompt collection.
folder = './prompts/'
prompt_file = 'fever.json'
# Decode explicitly as UTF-8 so the prompts load identically on every
# platform instead of depending on the locale's default encoding.
with open(folder + prompt_file, 'r', encoding='utf-8') as f:
    prompt_dict = json.load(f)

# Prompt prefix used as the default `prompt` argument of `webthink`.
webthink_prompt = prompt_dict['webthink_simple3']

def webthink(llm_func, idx=None, prompt=webthink_prompt, to_print=True, max_steps=7):
    """Run one Thought/Action/Observation episode against the global ``env``.

    Each round asks ``llm_func`` for a thought and an action, executes the
    action through ``step``, and appends the resulting observation to the
    prompt. If the episode is not done after ``max_steps`` rounds, an empty
    ``finish[]`` action is issued to close it.

    Args:
        llm_func: Callable invoked as ``llm_func(text, stop=[...])`` and
            expected to return the completion as a string.
        idx: Index forwarded to ``env.reset(idx=idx)``.
        prompt: Prompt prefix; the text returned by ``env.reset`` and every
            executed step are appended to it.
        to_print: When true, print the question, each step and the final info.
        max_steps: Maximum number of Thought/Action rounds before the episode
            is force-finished.

    Returns:
        A ``(r, info)`` tuple taken from the last ``step`` call. ``info`` is
        updated in place with ``n_calls`` (LLM calls made), ``n_badcalls``
        (completions that could not be split into thought and action) and
        ``traj`` (the full prompt including all steps).
    """
    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    done = False  # stays False when max_steps < 1, so finish[] is still issued
    for i in range(1, max_steps + 1):
        n_calls += 1
        thought_action = llm_func(prompt + f"Thought {i}:", stop=[f"\nObservation {i}:"])
        try:
            thought, action = thought_action.strip().split(f"\nAction {i}: ")
        except ValueError:
            # The completion did not contain exactly one "Action i: " marker.
            # Keep its first line as the thought and ask again for the action.
            print('ohh...', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = llm_func(prompt + f"Thought {i}: {thought}\nAction {i}:", stop=["\n"]).strip()
        # Lower-case only the first character; slicing keeps an empty action
        # from raising IndexError.
        obs, r, done, info = step(env, action[:1].lower() + action[1:])
        # Drop literal backslash-n sequences from the observation text.
        obs = obs.replace('\\n', '')
        step_str = f"Thought {i}: {thought}\nAction {i}: {action}\nObservation {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        # Step budget exhausted: close the episode with an empty answer.
        obs, r, done, info = step(env, "finish[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [4]:
import random
from tqdm import tqdm
from llms import (GPT3, GPT4, Claude3Opus)

# Evaluation driver: run `webthink` on a fixed, shuffled sample of indices
# for every configured LLM and show the running accuracy on the progress bar.

DEBUG = False  # forwarded to webthink as `to_print`
num_claims = 100  # number of shuffled indices evaluated per LLM
llms = {"GPT-4": GPT4}  # display name -> callable passed to webthink as `llm_func`

# NOTE(review): 7405 is hard-coded; presumably it is meant to be the size of
# the dataset split being sampled — confirm it matches the FEVER dev split.
idxs = list(range(7405))
# Fixed seed so the same sample of indices is evaluated on every run.
random.Random(0).shuffle(idxs)
for llm_name, llm_func in llms.items():
    # NOTE(review): these lists are re-created for each LLM, so after the loop
    # only the results of the last entry in `llms` remain available.
    rewards = []
    infos = []
    pbar = tqdm(idxs[:num_claims])
    for i in pbar:
        r, info = webthink(llm_func, i, to_print=DEBUG)
        # The score is read from info['em']; the returned `r` is not used here.
        rewards.append(info['em'])
        infos.append(info)
        running_acc = sum(rewards)/len(rewards)
        desc_str = f"LLM: {llm_name}, Accuracy: {running_acc:.3f}"
        pbar.set_description(desc_str)

  logger.warn(
  logger.warn(
LLM: GPT-4, Accuracy: 0.500:   4%|▍         | 2/50 [00:10<04:12,  5.25s/it]

ohh... It seems there was an issue with retrieving specific information about the TV show "Lost" from the search. Without direct information about the show's genre or description, it's difficult to confirm if it is categorized specifically as an American drama based on the provided observations. Therefore, the appropriate action would be to conclude there is NOT ENOUGH INFORMATION to determine if the claim that "Lost is an American drama" is supported or refuted based on the observations provided.


LLM: GPT-4, Accuracy: 0.500:  12%|█▏        | 6/50 [00:34<03:42,  5.05s/it]

ohh... Action 1: Search[Simón Bolívar]


LLM: GPT-4, Accuracy: 0.850:  40%|████      | 20/50 [01:48<02:24,  4.80s/it]

ohh... The observation does not provide information about a song titled "Girl" or its artist(s), so there is NOT ENOUGH INFORMATION to determine if the claim that "Girl is only by an African singer" is supported or refuted.


LLM: GPT-4, Accuracy: 0.792:  48%|████▊     | 24/50 [02:13<02:30,  5.77s/it]

ohh... Given the lack of direct information about David Spade being fired from Grown Ups 2 and the inability to confirm his involvement or lack thereof in the movie from the provided search results, there is NOT ENOUGH INFORMATION to support or refute the claim that David Spade was fired from being in Grown Ups 2.


LLM: GPT-4, Accuracy: 0.724:  58%|█████▊    | 29/50 [02:47<02:11,  6.27s/it]

ohh... Action 4: Finish[REFUTES]


LLM: GPT-4, Accuracy: 0.710:  62%|██████▏   | 31/50 [03:04<02:13,  7.01s/it]

ohh... Action 1: Finish[SUPPORTS]


LLM: GPT-4, Accuracy: 0.697:  66%|██████▌   | 33/50 [03:10<01:25,  5.03s/it]

ohh... Without specific information on her marital status or personal relationships from the search results, there is NOT ENOUGH INFORMATION to determine if the claim that Tatum O'Neal spent her life single is true or not.


LLM: GPT-4, Accuracy: 0.676:  68%|██████▊   | 34/50 [03:18<01:36,  6.06s/it]

ohh... Action 2: Finish[REFUTES]


LLM: GPT-4, Accuracy: 0.684:  76%|███████▌  | 38/50 [03:35<00:55,  4.60s/it]

ohh... Action 2: Finish[SUPPORTS]


LLM: GPT-4, Accuracy: 0.660: 100%|██████████| 50/50 [04:41<00:00,  5.63s/it]
