In [203]:
import pandas as pd
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai.chat_models import ChatOpenAI
from dotenv import load_dotenv
from tqdm import tqdm
load_dotenv()

True

# Prompt Evaluation

In [None]:
# Few-shot prompt (in Portuguese) for the LLM judge that grades an AI answer
# against an expert answer.
#
# Placeholders filled in at invocation time: {questao}, {resposta_especialista},
# {resposta_ia}. The doubled braces ({{ and }}) around the JSON examples are
# escaped literal braces, so the template engine does not treat them as variables.
#
# The judge is asked to reply with a JSON object holding the keys `raciocinio`
# (justification) and `resultado` ('CORRETO' or 'ERRADO'); the key spelling in the
# instructions matches the one used in the examples below.
prompt_eval = """
Instruções:
Avalie a resposta gerada pela IA com base nos seguintes critérios:

1. Verifique se a Resposta da IA está contida na Resposta do Especialista, ou seja, se não existe nenhuma contradição. Ignore termos diferentes ou pequenas informações a mais ou a menos.
2. A Resposta do Especialista pode conter mais informações do que foi solicitado na pergunta, se a informação da Resposta do Especialista não for necessária para responder a pergunta não use para avaliar a Resposta da IA.
3. Se a Resposta da IA tiver mais informações do que a Resposta do Especialista não deve ser levada em consideração para avaliação desde que as informações estejam corretas.
4. Verifique se a resposta pode responder à questão. Para isso veja se a resposta fornece as informações solicitadas na questão, se é suficiente. Por exemplo, se a questão pode ser respondida com um simples "Não", isso é aceitável.

Inclua um raciocínio que justifique a Avaliação. Se os critérios forem atendidos, retorne 'CORRETO'. Se qualquer um dos critérios não for atendido, retorne 'ERRADO'.

A Avaliação deve ser um objeto JSON, com as chaves raciocinio e resultado.

Exemplos:

1. 
### Questão:
Os rendimentos, decorrentes da prestação de serviços de consultoria técnica,  pagos por pessoa jurídica domiciliada no Brasil a sua matriz no exterior estão  sujeitos à legislação de preços de transferência?

### Resposta do Especialista:
Em primeiro lugar, há que distinguir se a prestação dos serviços no Brasil implicou transferência de tecnologia. Na hipótese de ficar comprovada a transferência de tecnologia, com a anuência do Instituto Nacional de Propriedade Industrial (INPI), a transação não estará sujeita às regras de preços de transferência consoante o estabelecido pelo art. 55 da IN RFB nº 1.312, de 2012. Nessa hipótese, a dedução de tais despesas está sujeita aos limites estabelecidos pelos arts. 362 a 365 do RIR/2018. Caso inexista transferência de tecnologia, esses serviços passam a se submeter às regras de preços de transferência.

### Resposta da IA:
Sim, estão sujeitos, a menos que haja transferência de tecnologia com anuência do INPI.

### Avaliação:
{{
  "raciocinio": "A resposta da IA está alinhada com a resposta do especialista, abordando corretamente a questão sem contradições, embora seja mais curta.",
  "resultado": "CORRETO"

}}

2. 
### Questão:
O que deve ser considerado como "contraprestações creditadas"?

### Resposta do Especialista:
Para efeito do disposto no art. 175 da Instrução Normativa RFB nº 1.700, de 2017, consideram-se contraprestações creditadas as contraprestações vencidas.

### Resposta da IA:
As contraprestações vencidas.

### Avaliação:
{{
  "raciocinio": "A resposta da IA cobre as principais hipóteses mencionadas pelo especialista, sem apresentar contradições, embora seja menos detalhada.",
  "resultado": "CORRETO"
}}

3. 
### Questão:
Existe prazo para a compensação de prejuízos fiscais da atividade rural?

### Resposta do Especialista:
Não existe qualquer prazo para compensação de prejuízos fiscais da atividade rural.

### Resposta da IA:
O prazo é de 7 dias a partir da data do prejuízo, podendo se extender até 30 dias para compensação de prejuízos fiscais da atividade rural.

### Avaliação:
{{
  "raciocinio": "A resposta da IA está incorreta porque menciona '7 dias' e 'até 30 dias', que contradizem a resposta do especialista.",
  "resultado": "ERRADO"
}}

4. 
### Questão:
A base de cálculo negativa da CSLL poderá ser compensada com resultados  apurados em períodos subsequentes?

### Resposta do Especialista:
Sim. A base de cálculo da CSLL, quando negativa, poderá ser compensada até o limite de 30% dos resultados apurados em períodos subsequentes, ajustados pelas adições e exclusões previstas na legislação.

### Resposta da IA:
Não. A base de cálculo da CSLL poderá ser compensada dos resultados apurados em períodos subsequentes.

### Avaliação:
{{
  "raciocinio": "A resposta da IA contradiz a resposta do especialista, fornecendo uma informação oposta sobre a possibilidade de compensação da CSLL.",
  "resultado": "ERRADO"
}}

Agora pense passo a passo e faça essa Avaliação:

### Questão:
{questao}

### Resposta do Especialista:
{resposta_especialista}

### Resposta da IA:
{resposta_ia}

### Avaliação:
"""


# Model Judgment

In [206]:
# Identifiers, in "<org>/<name>" form, of the models whose generated answers
# are going to be judged.
models = [
    'mistralai/Mixtral-8x7B-Instruct-v0.1',
    'mistralai/Mistral-7B-Instruct-v0.2',
    'zero-one-ai/Yi-34B-Chat',
    'garage-bAInd/Platypus2-70B-instruct',
    'google/gemma-7b-it',
    'lmsys/vicuna-13b-v1.5',
    'lmsys/vicuna-7b-v1.5',
    'meta-llama/Llama-2-70b-chat-hf',
    'meta-llama/Llama-2-13b-chat-hf',
    'meta-llama/Llama-2-7b-chat-hf',
    'openchat/openchat-3.5-1210',
    'WizardLM/WizardLM-13B-V1.2',
    'Qwen/Qwen1.5-14B-Chat',
    'Qwen/Qwen1.5-72B-Chat',
    'upstage/SOLAR-10.7B-Instruct-v1.0',
    'meta-llama/Llama-3-70b-chat-hf',
    'mistralai/Mistral-7B-Instruct-v0.3',
    'mistralai/Mixtral-8x22B-Instruct-v0.1',
    'Qwen/Qwen2-72B-Instruct',
    'meta-llama/Llama-3-8b-chat-hf',
    'Qwen/Qwen1.5-110B-Chat',
    'teknium/OpenHermes-2p5-Mistral-7B',
    'openia/gpt-3.5-turbo',
]

# Judge every model's generated answers with GPT-4o and store the verdicts.
#
# For each entry of `models`, the per-model CSV of generated answers is read,
# each row (columns Q, A and A_model) is sent through the evaluation chain, the
# CORRETO / not-CORRETO verdicts are tallied and printed, and the frame is
# written to a new CSV with an extra `Avaliacao_gpt4` column holding the parsed
# judge output.

# The judge chain does not depend on the model being evaluated: build it once.
prompt = ChatPromptTemplate.from_template(prompt_eval)
model = ChatOpenAI(model='gpt-4o',temperature=0.1)  # the judge LLM, NOT the evaluated model
output_parser = JsonOutputParser()
chain = prompt | model | output_parser

for m in tqdm(models, desc="Model Progress"):
    print(f"Processing model: {m}")
    # `m` looks like "<org>/<name>"; only <name> is used in the file names.
    # This must come from the loop variable `m` -- `model` is the judge LLM object.
    model_name = m.split('/')[1]
    df = pd.read_csv(f'result_question_answer_generation/{model_name}_QAG.csv')
    total_questions = len(df)
    data = []
    # Passing `total` lets the inner bar show a percentage and an ETA.
    rows = tqdm(df.itertuples(index=False), total=total_questions, desc=f"Processing {model_name}", leave=False)
    for i, l in enumerate(rows):
        result = chain.invoke({'questao':l.Q,'resposta_especialista':l.A,'resposta_ia':l.A_model})
        data.append(result)
        if (i + 1) % 10 == 0 or (i + 1) == total_questions:
            print(f"Model {m}: Processed {i + 1}/{total_questions} questions")

    # Total number of judged entries.
    total_entradas = len(data)

    # Anything that is not an explicit 'CORRETO' verdict counts as wrong; this
    # includes judge replies that are not a JSON object or lack the key.
    num_correto = sum(
        1 for r in data
        if isinstance(r, dict) and r.get('resultado') == 'CORRETO'
    )
    num_errada = total_entradas - num_correto

    # Accuracy; guarded so an empty input file does not raise ZeroDivisionError.
    acuracia = num_correto / total_entradas if total_entradas else 0.0

    # Print the results.
    print(f"Model: {m}")
    print(f"Correto: {num_correto}")
    print(f"Errada: {num_errada}")
    print(f"Total de Entradas: {total_entradas}")
    print(f"Acurácia: {acuracia:.2f}")
    df['Avaliacao_gpt4'] = data
    df.to_csv(f'result_question_answer_llm_evaluation/{model_name}_QAG_gpt-4o.csv', index=False)
    

Model Progress:   0%|          | 0/20 [00:00<?, ?it/s]

Processing model: mistralai/Mixtral-8x7B-Instruct-v0.1




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 10/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 20/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 30/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 40/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 50/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 60/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 70/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 80/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 90/101 questions




Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 100/101 questions


Model Progress:   5%|▌         | 1/20 [03:55<1:14:35, 235.55s/it]

Model mistralai/Mixtral-8x7B-Instruct-v0.1: Processed 101/101 questions
Model: mistralai/Mixtral-8x7B-Instruct-v0.1
Correto: 50
Errada: 51
Total de Entradas: 101
Acurácia: 0.50
Processing model: mistralai/Mistral-7B-Instruct-v0.2




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 10/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 20/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 30/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 40/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 50/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 60/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 70/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 80/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 90/101 questions




Model mistralai/Mistral-7B-Instruct-v0.2: Processed 100/101 questions


Model Progress:  10%|█         | 2/20 [07:16<1:04:38, 215.45s/it]

Model mistralai/Mistral-7B-Instruct-v0.2: Processed 101/101 questions
Model: mistralai/Mistral-7B-Instruct-v0.2
Correto: 55
Errada: 46
Total de Entradas: 101
Acurácia: 0.54
Processing model: zero-one-ai/Yi-34B-Chat




Model zero-one-ai/Yi-34B-Chat: Processed 10/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 20/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 30/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 40/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 50/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 60/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 70/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 80/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 90/101 questions




Model zero-one-ai/Yi-34B-Chat: Processed 100/101 questions


Model Progress:  15%|█▌        | 3/20 [10:42<59:45, 210.94s/it]  

Model zero-one-ai/Yi-34B-Chat: Processed 101/101 questions
Model: zero-one-ai/Yi-34B-Chat
Correto: 53
Errada: 48
Total de Entradas: 101
Acurácia: 0.52
Processing model: garage-bAInd/Platypus2-70B-instruct




Model garage-bAInd/Platypus2-70B-instruct: Processed 10/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 20/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 30/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 40/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 50/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 60/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 70/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 80/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 90/101 questions




Model garage-bAInd/Platypus2-70B-instruct: Processed 100/101 questions


Model Progress:  20%|██        | 4/20 [14:16<56:32, 212.06s/it]

Model garage-bAInd/Platypus2-70B-instruct: Processed 101/101 questions
Model: garage-bAInd/Platypus2-70B-instruct
Correto: 58
Errada: 43
Total de Entradas: 101
Acurácia: 0.57
Processing model: google/gemma-7b-it




Model google/gemma-7b-it: Processed 10/101 questions




Model google/gemma-7b-it: Processed 20/101 questions




Model google/gemma-7b-it: Processed 30/101 questions




Model google/gemma-7b-it: Processed 40/101 questions




Model google/gemma-7b-it: Processed 50/101 questions




Model google/gemma-7b-it: Processed 60/101 questions




Model google/gemma-7b-it: Processed 70/101 questions




Model google/gemma-7b-it: Processed 80/101 questions




Model google/gemma-7b-it: Processed 90/101 questions




Model google/gemma-7b-it: Processed 100/101 questions


Model Progress:  25%|██▌       | 5/20 [17:54<53:33, 214.21s/it]

Model google/gemma-7b-it: Processed 101/101 questions
Model: google/gemma-7b-it
Correto: 45
Errada: 56
Total de Entradas: 101
Acurácia: 0.45
Processing model: lmsys/vicuna-13b-v1.5




Model lmsys/vicuna-13b-v1.5: Processed 10/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 20/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 30/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 40/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 50/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 60/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 70/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 80/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 90/101 questions




Model lmsys/vicuna-13b-v1.5: Processed 100/101 questions


Model Progress:  30%|███       | 6/20 [21:40<50:57, 218.39s/it]

Model lmsys/vicuna-13b-v1.5: Processed 101/101 questions
Model: lmsys/vicuna-13b-v1.5
Correto: 50
Errada: 51
Total de Entradas: 101
Acurácia: 0.50
Processing model: lmsys/vicuna-7b-v1.5




Model lmsys/vicuna-7b-v1.5: Processed 10/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 20/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 30/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 40/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 50/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 60/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 70/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 80/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 90/101 questions




Model lmsys/vicuna-7b-v1.5: Processed 100/101 questions


Model Progress:  35%|███▌      | 7/20 [25:29<48:01, 221.63s/it]

Model lmsys/vicuna-7b-v1.5: Processed 101/101 questions
Model: lmsys/vicuna-7b-v1.5
Correto: 39
Errada: 62
Total de Entradas: 101
Acurácia: 0.39
Processing model: meta-llama/Llama-2-70b-chat-hf




Model meta-llama/Llama-2-70b-chat-hf: Processed 10/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 20/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 30/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 40/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 50/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 60/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 70/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 80/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 90/101 questions




Model meta-llama/Llama-2-70b-chat-hf: Processed 100/101 questions


Model Progress:  40%|████      | 8/20 [29:10<44:19, 221.60s/it]

Model meta-llama/Llama-2-70b-chat-hf: Processed 101/101 questions
Model: meta-llama/Llama-2-70b-chat-hf
Correto: 49
Errada: 52
Total de Entradas: 101
Acurácia: 0.49
Processing model: meta-llama/Llama-2-13b-chat-hf




Model meta-llama/Llama-2-13b-chat-hf: Processed 10/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 20/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 30/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 40/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 50/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 60/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 70/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 80/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 90/101 questions




Model meta-llama/Llama-2-13b-chat-hf: Processed 100/101 questions


Model Progress:  45%|████▌     | 9/20 [32:50<40:32, 221.13s/it]

Model meta-llama/Llama-2-13b-chat-hf: Processed 101/101 questions
Model: meta-llama/Llama-2-13b-chat-hf
Correto: 43
Errada: 58
Total de Entradas: 101
Acurácia: 0.43
Processing model: meta-llama/Llama-2-7b-chat-hf




Model meta-llama/Llama-2-7b-chat-hf: Processed 10/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 20/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 30/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 40/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 50/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 60/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 70/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 80/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 90/101 questions




Model meta-llama/Llama-2-7b-chat-hf: Processed 100/101 questions


Model Progress:  50%|█████     | 10/20 [36:46<37:35, 225.51s/it]

Model meta-llama/Llama-2-7b-chat-hf: Processed 101/101 questions
Model: meta-llama/Llama-2-7b-chat-hf
Correto: 30
Errada: 71
Total de Entradas: 101
Acurácia: 0.30
Processing model: openchat/openchat-3.5-1210




Model openchat/openchat-3.5-1210: Processed 10/101 questions




Model openchat/openchat-3.5-1210: Processed 20/101 questions




Model openchat/openchat-3.5-1210: Processed 30/101 questions




Model openchat/openchat-3.5-1210: Processed 40/101 questions




Model openchat/openchat-3.5-1210: Processed 50/101 questions




Model openchat/openchat-3.5-1210: Processed 60/101 questions




Model openchat/openchat-3.5-1210: Processed 70/101 questions




Model openchat/openchat-3.5-1210: Processed 80/101 questions




Model openchat/openchat-3.5-1210: Processed 90/101 questions




Model openchat/openchat-3.5-1210: Processed 100/101 questions


Model Progress:  55%|█████▌    | 11/20 [40:21<33:21, 222.35s/it]

Model openchat/openchat-3.5-1210: Processed 101/101 questions
Model: openchat/openchat-3.5-1210
Correto: 56
Errada: 45
Total de Entradas: 101
Acurácia: 0.55
Processing model: WizardLM/WizardLM-13B-V1.2




Model WizardLM/WizardLM-13B-V1.2: Processed 10/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 20/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 30/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 40/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 50/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 60/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 70/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 80/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 90/101 questions




Model WizardLM/WizardLM-13B-V1.2: Processed 100/101 questions


Model Progress:  60%|██████    | 12/20 [44:16<30:09, 226.17s/it]

Model WizardLM/WizardLM-13B-V1.2: Processed 101/101 questions
Model: WizardLM/WizardLM-13B-V1.2
Correto: 49
Errada: 52
Total de Entradas: 101
Acurácia: 0.49
Processing model: Qwen/Qwen1.5-14B-Chat




Model Qwen/Qwen1.5-14B-Chat: Processed 10/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 20/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 30/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 40/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 50/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 60/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 70/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 80/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 90/101 questions




Model Qwen/Qwen1.5-14B-Chat: Processed 100/101 questions


Model Progress:  65%|██████▌   | 13/20 [48:13<26:45, 229.41s/it]

Model Qwen/Qwen1.5-14B-Chat: Processed 101/101 questions
Model: Qwen/Qwen1.5-14B-Chat
Correto: 48
Errada: 53
Total de Entradas: 101
Acurácia: 0.48
Processing model: Qwen/Qwen1.5-72B-Chat




Model Qwen/Qwen1.5-72B-Chat: Processed 10/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 20/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 30/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 40/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 50/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 60/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 70/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 80/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 90/101 questions




Model Qwen/Qwen1.5-72B-Chat: Processed 100/101 questions


Model Progress:  70%|███████   | 14/20 [51:59<22:50, 228.41s/it]

Model Qwen/Qwen1.5-72B-Chat: Processed 101/101 questions
Model: Qwen/Qwen1.5-72B-Chat
Correto: 63
Errada: 38
Total de Entradas: 101
Acurácia: 0.62
Processing model: upstage/SOLAR-10.7B-Instruct-v1.0




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 10/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 20/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 30/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 40/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 50/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 60/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 70/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 80/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 90/101 questions




Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 100/101 questions


Model Progress:  75%|███████▌  | 15/20 [55:37<18:46, 225.26s/it]

Model upstage/SOLAR-10.7B-Instruct-v1.0: Processed 101/101 questions
Model: upstage/SOLAR-10.7B-Instruct-v1.0
Correto: 52
Errada: 49
Total de Entradas: 101
Acurácia: 0.51
Processing model: meta-llama/Llama-3-70b-chat-hf




Model meta-llama/Llama-3-70b-chat-hf: Processed 10/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 20/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 30/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 40/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 50/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 60/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 70/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 80/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 90/101 questions




Model meta-llama/Llama-3-70b-chat-hf: Processed 100/101 questions


Model Progress:  80%|████████  | 16/20 [59:01<14:36, 219.11s/it]

Model meta-llama/Llama-3-70b-chat-hf: Processed 101/101 questions
Model: meta-llama/Llama-3-70b-chat-hf
Correto: 61
Errada: 40
Total de Entradas: 101
Acurácia: 0.60
Processing model: mistralai/Mistral-7B-Instruct-v0.3




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 10/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 20/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 30/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 40/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 50/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 60/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 70/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 80/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 90/101 questions




Model mistralai/Mistral-7B-Instruct-v0.3: Processed 100/101 questions


Model Progress:  85%|████████▌ | 17/20 [1:02:37<10:54, 218.20s/it]

Model mistralai/Mistral-7B-Instruct-v0.3: Processed 101/101 questions
Model: mistralai/Mistral-7B-Instruct-v0.3
Correto: 56
Errada: 45
Total de Entradas: 101
Acurácia: 0.55
Processing model: meta-llama/Llama-3-8b-chat-hf




Model meta-llama/Llama-3-8b-chat-hf: Processed 10/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 20/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 30/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 40/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 50/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 60/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 70/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 80/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 90/101 questions




Model meta-llama/Llama-3-8b-chat-hf: Processed 100/101 questions


Model Progress:  90%|█████████ | 18/20 [1:05:57<07:05, 212.69s/it]

Model meta-llama/Llama-3-8b-chat-hf: Processed 101/101 questions
Model: meta-llama/Llama-3-8b-chat-hf
Correto: 55
Errada: 46
Total de Entradas: 101
Acurácia: 0.54
Processing model: Qwen/Qwen1.5-110B-Chat




Model Qwen/Qwen1.5-110B-Chat: Processed 10/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 20/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 30/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 40/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 50/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 60/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 70/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 80/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 90/101 questions




Model Qwen/Qwen1.5-110B-Chat: Processed 100/101 questions


Model Progress:  95%|█████████▌| 19/20 [1:09:30<03:32, 212.80s/it]

Model Qwen/Qwen1.5-110B-Chat: Processed 101/101 questions
Model: Qwen/Qwen1.5-110B-Chat
Correto: 61
Errada: 40
Total de Entradas: 101
Acurácia: 0.60
Processing model: teknium/OpenHermes-2p5-Mistral-7B




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 10/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 20/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 30/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 40/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 50/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 60/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 70/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 80/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 90/101 questions




Model teknium/OpenHermes-2p5-Mistral-7B: Processed 100/101 questions


Model Progress: 100%|██████████| 20/20 [1:13:01<00:00, 219.07s/it]

Model teknium/OpenHermes-2p5-Mistral-7B: Processed 101/101 questions
Model: teknium/OpenHermes-2p5-Mistral-7B
Correto: 56
Errada: 45
Total de Entradas: 101
Acurácia: 0.55



