# Setup

In [1]:
from backend import call_llm_react


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:

# import wikipedia
import wikienv, wrappers, requests

dict_str_code = {lang: wikienv.dict_str_code[lang] | wrappers.dict_str_code[lang] for lang in ['en', 'es', 'pt', 'fr', 'it']}

In [10]:
def webthink(idx=None, to_print=False, lang='', model='gpt-4'):
    env = wikienv.WikiEnv(lang=lang)
    env = wrappers.FeverWrapper(env, split="dev")
    env = wrappers.LoggingWrapper(env)

    def step(env, action):
        attempts = 0
        while attempts < 10:
            try:
                return env.step(action)
            except requests.exceptions.Timeout:
                attempts += 1
                
    folder = './prompts/'
    prompt_file = f'fever_{lang}.txt'
    with open(folder + prompt_file, 'r') as f:
        prompt = f.read()

    Thought = dict_str_code[lang]["Thought"]
    Observation = dict_str_code[lang]["Observation"]
    Action = dict_str_code[lang]["Action"]
    finish = dict_str_code[lang]["finish"]

    question = env.reset(idx=idx)
    if to_print:
        print(idx, question)
    prompt += question + "\n"
    n_calls, n_badcalls = 0, 0
    for i in range(1, 8):
        n_calls += 1
        thought_action = call_llm_react(model=model, prompt=prompt + f"{Thought} {i}:", stop=[f"\n{Observation} {i}:"])
        #print('thought_action', thought_action)

        try:
            thought, action = thought_action.strip().split(f"\n{Action} {i}: ")
            #print('action_try', action)
            #print('Good call', thought, action)

        except:
            #print('Bad call', thought_action)
            n_badcalls += 1
            n_calls += 1
            thought = thought_action.strip().split('\n')[0]
            action = call_llm_react(model=model, prompt=prompt + f"{Thought} {i}: {thought}\n{Action} {i}:", stop=[f"\n"]).strip()
            #print('action_except', action)

        if len(action) > 1:
            action = action[0].lower() + action[1:]
        else:
            action = action.lower()
        #print('action', action)
        
        obs, r, done, info = step(env, action)

        obs = obs.replace('\\n', '')
        step_str = f"{Thought} {i}: {thought}\nAction {i}: {action}\n{Observation} {i}: {obs}\n"
        prompt += step_str
        if to_print:
            print(step_str)
        if done:
            break
    if not done:
        obs, r, done, info = step(env, f"{finish}[]")
    if to_print:
        print(info, '\n')
    info.update({'n_calls': n_calls, 'n_badcalls': n_badcalls, 'traj': prompt})
    return r, info

In [5]:
import random
import time
def evaluate(model='gpt-3.5-turbo', lang='', sample_size=None):
    idxs = list(range(9999))
    random.Random(233).shuffle(idxs)
    rs = []
    infos = []
    old_time = time.time()
    #for i in idxs[:500]:
    if sample_size:
        size = sample_size
    else:
        size = len(idxs)

    for i in idxs[:size]:
        r, info = webthink(i, to_print=False, model=model, lang=lang)
        rs.append(info['em'])
        infos.append(info)
        print(sum(rs), len(rs), sum(rs) / len(rs), (time.time() - old_time) / len(rs))
        print('-----------')
        print()

# gpt-3.5-turbo

## English 0.462

In [12]:
evaluate(lang='en', sample_size=500)

ohh... This claim is already known to be true.
1 1 1.0 2.145982027053833
-----------

ohh... This claim is unlikely to be true as Brad Wilk is still alive and actively performing as a drummer for the hard rock band Rage Against the Machine.
1 2 0.5 1.9921565055847168
-----------

ohh... There is not enough information to determine if there is an observation that supports or refutes this claim.
2 3 0.6666666666666666 2.247407595316569
-----------

ohh... The claim is false.
2 4 0.5 2.6328654885292053
-----------

ohh... Observation 1 provides information about the New York City Landmarks Preservation Commission, but it does not specifically mention the inclusion of residents from each of the five boroughs. Therefore, there is NOT ENOUGH INFORMATION to determine if the claim is supported or refuted.
2 5 0.4 3.1998863697052
-----------

ohh... Duane Chapman is a well-known American bounty hunter and reality TV star, commonly known as Dog the Bounty Hunter. There is no information or obser

## Spanish 0.454 

In [5]:
evaluate(lang='es', sample_size=500)


Bad call La afirmación es APOYADA.
label apoya prediction apoya
1 1 1.0 3.4027180671691895
-----------

Bad call Buscar[Brad Wilk]
Bad call Finalizar[REFUTA]
label no hay suficiente información prediction refuta
1 2 0.5 3.083706021308899
-----------

Bad call Buscar[Guatemala guerra]
Bad call Finalizar[APOYA]
label no hay suficiente información prediction apoya
1 3 0.3333333333333333 3.4657019774119058
-----------

Bad call Buscar[Michael Vick]
Bad call Finalizar[REFUTA]
label refuta prediction refuta
2 4 0.5 3.240914523601532
-----------

Bad call Buscar[Comisión de Preservación de Monumentos Históricos de la Ciudad de Nueva York]
Bad call Finalizar[NO HAY INFORMACIÓN SUFICIENTE]
label apoya prediction no hay información suficiente
2 5 0.4 3.7731540203094482
-----------

Bad call Buscar[Duane Chapman]
Bad call Finalizar[REFUTA]
label no hay suficiente información prediction refuta
2 6 0.3333333333333333 3.584928353627523
-----------

Bad call Buscar[Margaret Thatcher]
Bad call Finaliz

## Portuguese 0.416

In [13]:
evaluate(lang='pt', sample_size=500)


label apoia prediction apoia
1 1 1.0 6.297851085662842
-----------

Bad call Não há informação suficiente para determinar se Brad Wilk morreu antes de se tornar baterista de uma banda de hard rock.
Bad call Brad Wilk foi baterista da banda Audioslave, que é uma banda de hard rock. Portanto, a alegação é refutada.
label não há informação suficiente prediction refuta
1 2 0.5 5.044584035873413
-----------

Bad call Ação 1: Pesquisar[Guatemala guerra]
Observação 1: A Guerra Civil da Guatemala foi um conflito armado que ocorreu na Guatemala de 1960 a 1996, entre o governo guatemalteco e várias organizações guerrilheiras de esquerda. Não há informações específicas sobre uma guerra na Guatemala de 1996 a 2010.
Ação 2: Concluir[REFUTAS]
Bad call Concluir[não há informação suficiente]
label não há informação suficiente prediction não há informação suficiente
2 3 0.6666666666666666 5.037578344345093
-----------

label refutas prediction refutas
3 4 0.75 4.641708314418793
-----------

Bad call Nã

## French 0.39

In [11]:
evaluate(lang='fr', sample_size=500)


Bad call Rechercher[Buffy Summers]
Bad call Terminer[SOUTIENT]
label soutient prediction soutient
1 1 1.0 4.0961127281188965
-----------

Bad call Rechercher[Brad Wilk]
Bad call Terminer[PAS ASSEZ D'INFOS]
label pas assez dinfos prediction pas assez dinfos
2 2 1.0 20.375832080841064
-----------

Bad call Rechercher[Guatemala guerre]
Bad call Terminer[SOUTIENT]
label pas assez dinfos prediction soutient
2 3 0.6666666666666666 15.154048283894857
-----------

Bad call Rechercher[Michael Vick]
Bad call Terminer[PAS ASSEZ D'INFOS]
label réfute prediction pas assez dinfos
2 4 0.5 12.619993448257446
-----------

Bad call Rechercher[New York City Landmarks Preservation Commission]
Bad call Terminer[PAS ASSEZ D'INFOS]
label soutient prediction pas assez dinfos
2 5 0.4 11.220440006256103
-----------

Bad call Rechercher[Duane Chapman]
Bad call Terminer[RÉFUTE]
label pas assez dinfos prediction réfute
2 6 0.3333333333333333 9.836183309555054
-----------

Bad call Rechercher[Margaret Thatcher]
Bad

## Italian 0.424

In [12]:
evaluate(lang='it', sample_size=500)

Bad call Non ci sono informazioni specifiche o osservazioni fornite per supportare o confutare l'affermazione.
label supporta prediction informazioni non sufficienti
0 1 0.0 1.884247064590454
-----------

Bad call Non ci sono informazioni sufficienti per supportare o confutare l'affermazione.
Bad call L'osservazione 1 non fornisce informazioni sulla morte di Brad Wilk, quindi non ci sono informazioni sufficienti per supportare o confutare l'affermazione.
label informazioni non sufficienti prediction informazioni non sufficienti
1 2 0.5 2.81057345867157
-----------

Bad call Non ci sono informazioni sufficienti per supportare o confutare l'affermazione.
label informazioni non sufficienti prediction informazioni non sufficienti
2 3 0.6666666666666666 2.5055203437805176
-----------

Bad call Non ci sono informazioni sufficienti per supportare o confutare l'affermazione.
Bad call Non ci sono informazioni sufficienti per supportare o confutare l'affermazione.
label confuta prediction inform

# text-bison

## en 0.506


In [13]:
evaluate(lang='en', sample_size=500, model='text-bison@001')


label supports prediction supports
1 1 1.0 7.745184898376465
-----------

label not enough info prediction 
1 2 0.5 14.816254496574402
-----------

label not enough info prediction refutes
1 3 0.3333333333333333 11.24681297938029
-----------

label refutes prediction refutes
2 4 0.5 10.46180772781372
-----------

label supports prediction 
2 5 0.4 13.1336003780365
-----------

label not enough info prediction refutes
2 6 0.3333333333333333 11.547865311304728
-----------

label not enough info prediction refutes
2 7 0.2857142857142857 11.053519998277936
-----------

label refutes prediction refutes
3 8 0.375 10.644569247961044
-----------

label refutes prediction refutes
4 9 0.4444444444444444 11.134338988198174
-----------

label not enough info prediction refutes
4 10 0.4 12.110091996192931
-----------

label refutes prediction 
4 11 0.36363636363636365 12.870702981948853
-----------

label not enough info prediction refutes
4 12 0.3333333333333333 13.383254090944925
-----------

lab

## es 0.416

In [12]:
evaluate(lang='es', sample_size=500, model='text-bison@001')


label apoya prediction apoya
1 1 1.0 9.759869813919067
-----------

label no hay suficiente información prediction refuta
1 2 0.5 6.953645944595337
-----------

label no hay suficiente información prediction 
1 3 0.3333333333333333 15.901967604955038
-----------

label refuta prediction 
1 4 0.25 23.48839646577835
-----------

label apoya prediction apoya
2 5 0.4 20.749400568008422
-----------

label no hay suficiente información prediction refuta
2 6 0.3333333333333333 17.657014807065327
-----------

label no hay suficiente información prediction refuta
2 7 0.2857142857142857 15.41960266658238
-----------

label refuta prediction refuta
3 8 0.375 15.747046113014221
-----------

label refuta prediction no hay suficiente información
3 9 0.3333333333333333 14.758648422029284
-----------

label no hay suficiente información prediction apoya
3 10 0.3 15.086082482337952
-----------

label refuta prediction refuta
4 11 0.36363636363636365 14.441596724770285
-----------

label no hay suficien