In [7]:
import json
from tqdm import tqdm
import numpy as np
import re

# Model whose generations are evaluated, e.g. gpt-4o-2024-11-20,
# claude-3-5-sonnet-20241022, gemini-1.5-pro, gpt-4o-mini, deepseek-reasoner...
model = "mistralai/mistral-small-3.1-24b-instruct"

dataset = "nfqa_rag"

# Not referenced by the cells visible here; kept for any code that reads it.
split_list = ["text"]

full_outputs = []

# Model generations for this dataset/model pair, one JSON object per line.
result_path = f"outputs/{dataset}/{model}/{dataset}/zero_shot/cot/{dataset}_output.jsonl"

# Decode explicitly as UTF-8 instead of the platform default, and skip blank
# lines (e.g. a trailing newline) that would otherwise make json.loads raise.
with open(result_path, "r", encoding="utf-8") as f:
    outputs = [json.loads(line) for line in f if line.strip()]

# Benchmark inputs the generations were produced from.
source_path = f"data/{dataset}/input/{dataset}_input.jsonl"
with open(source_path, "r", encoding="utf-8") as f:
    sources = [json.loads(line) for line in f if line.strip()]

# The output file must line up record-for-record with the input file; the
# messages point at the first record (or file) that does not.
assert len(sources) == len(outputs), (
    f"{source_path} has {len(sources)} records but {result_path} has {len(outputs)}"
)
for i, source in enumerate(sources):
    assert source['id'] == outputs[i]['id'], (
        f"id mismatch at record {i}: {source['id']!r} != {outputs[i]['id']!r}"
    )
    assert source['question'] == outputs[i]['question'], (
        f"question mismatch at record {i} (id {source['id']!r})"
    )

print(f"Loaded {len(outputs)} outputs")
full_outputs.extend(outputs)

Loaded 110 outputs


In [8]:
def split_string(s):
    """Split ``s`` on every case-insensitive occurrence of "final answer".

    Returns the list of pieces around the matches; when the phrase does not
    occur, the list holds ``s`` as its only element.
    """
    marker = re.compile(r'(?i)final answer')
    return marker.split(s)

# Model-specific post-processing: derive a single option letter from the raw
# generation and store it, plus its correctness, on each record. Models that
# match neither branch leave `full_outputs` untouched.
if "qvq" in model.lower():
    print(model)
    scored = []
    for record in tqdm(full_outputs):
        rationale = record["messages"][-1]["content"]

        # Whether the generation contains a "final answer" marker at all.
        has_marker = re.search(r'(?i)final answer', rationale) is not None

        # Text after the last marker (the whole text when there is none).
        answer = split_string(rationale)[-1].strip()

        # Ids starting with "text" allow options A-J, every other id A-E.
        is_text = record['id'].lower().startswith("text")
        u_pattern, l_pattern = (r"[A-J]", r"[a-j]") if is_text else (r"[A-E]", r"[a-e]")

        # With a marker the first letter after it is taken; without one, the
        # last letter found in the text.
        pick = 0 if has_marker else -1

        upper_hits = re.findall(u_pattern, answer)
        if upper_hits:
            answer = upper_hits[pick]
        else:
            # Fall back to lowercase option letters, normalised to uppercase.
            lower_hits = re.findall(l_pattern, answer)
            if lower_hits:
                answer = lower_hits[pick].upper()

        gold = record["label"][0]
        record["prediction"] = answer
        record["correct"] = answer == gold
        scored.append(record)
    full_outputs = scored
elif model == "deepseek-reasoner":
    print(model)
    scored = []
    for record in tqdm(full_outputs):
        # The second-to-last message must contain the "Put your final" instruction.
        assert "Put your final" in record['messages'][-2]["content"]

        response_text = record['response']

        # Same option ranges as above, but the letter must be wrapped in \boxed{}.
        is_text = record['id'].lower().startswith("text")
        pattern = r"\\boxed{([A-J])}" if is_text else r"\\boxed{([A-E])}"

        # First boxed letter wins; with none, the raw response is kept as-is.
        boxed = re.findall(pattern, response_text)
        answer = boxed[0] if boxed else response_text

        gold = record['label'][0]
        record["prediction"] = answer
        record["correct"] = answer == gold
        scored.append(record)
    full_outputs = scored

In [9]:
# Results stats
print(f"Model: {model}")

print("------------------- Main Results -------------------")
print(f"Model: {model}")

# Overall accuracy: records without a "correct" key count as incorrect, and an
# empty result list reports 0 rather than dividing by zero.
correct_count = sum(1 for item in full_outputs if item.get("correct", False))
total_count = len(full_outputs)
accuracy = correct_count / total_count if total_count > 0 else 0

print(f"Accuracy: {accuracy:.4f} ({correct_count}/{total_count})")

# Optional breakdown, shown only when at least one id starts with "text".
if any(item['id'].lower().startswith("text") for item in full_outputs):
    text_items = [item for item in full_outputs if item['id'].lower().startswith("text")]
    text_correct = sum(1 for item in text_items if item.get("correct", False))

    non_text_items = [item for item in full_outputs if not item['id'].lower().startswith("text")]
    non_text_correct = sum(1 for item in non_text_items if item.get("correct", False))

    # text_items is non-empty here because the enclosing `any(...)` was true.
    print(f"Text questions accuracy: {text_correct/len(text_items):.4f} ({text_correct}/{len(text_items)})")
    # When every id starts with "text" there are no other questions; report
    # N/A instead of raising ZeroDivisionError.
    if non_text_items:
        print(f"Other questions accuracy: {non_text_correct/len(non_text_items):.4f} ({non_text_correct}/{len(non_text_items)})")
    else:
        print("Other questions accuracy: N/A (0/0)")


Model: mistralai/mistral-small-3.1-24b-instruct
------------------- Main Results -------------------
Model: mistralai/mistral-small-3.1-24b-instruct
Accuracy: 0.7182 (79/110)


In [15]:

dataset = "nfqa"

print("------------------- NFQA Results -------------------")

model_list = ["openai/gpt-4o-mini",
              "deepseek/deepseek-r1-distill-qwen-32b",
              "mistralai/mistral-small-3.1-24b-instruct",
              "google/gemini-2.0-flash-001",
              "anthropic/claude-3.5-haiku",
              "meta-llama/llama-3.2-3b-instruct"]

# The loop variable is deliberately not named `model`: that name holds the
# notebook-level model selection read by the extraction and main-results
# cells, and overwriting it here would make re-running them report the wrong
# model.
for model_name in model_list:
    result_path = f"outputs/{dataset}/{model_name}/{dataset}/zero_shot/cot/{dataset}_output.jsonl"
    # Explicit UTF-8 decoding; blank lines (e.g. a trailing newline) are skipped.
    with open(result_path, "r", encoding="utf-8") as f:
        outputs = [json.loads(line) for line in f if line.strip()]
    print(f"Model: {model_name}")
    if not outputs:
        print("N/A")
    else:
        # Records without a "correct" key count as incorrect.
        n_correct = sum(1 for item in outputs if item.get('correct', False))
        print(f"Accuracy: {n_correct / len(outputs):.4f} ({n_correct}/{len(outputs)})")

------------------- NFQA Results -------------------
Model: openai/gpt-4o-mini
Accuracy: 0.7523 (82/109)
Model: deepseek/deepseek-r1-distill-qwen-32b
Accuracy: 0.5872 (64/109)
Model: mistralai/mistral-small-3.1-24b-instruct
Accuracy: 0.6606 (72/109)
Model: google/gemini-2.0-flash-001
Accuracy: 0.7064 (77/109)
Model: anthropic/claude-3.5-haiku
Accuracy: 0.6606 (72/109)
Model: meta-llama/llama-3.2-3b-instruct
Accuracy: 0.6239 (68/109)


In [17]:

dataset = "nfqa_rag"

print("------------------- NFQA RAG Results -------------------")

model_list = ["openai/gpt-4o-mini",
              "deepseek/deepseek-r1-distill-qwen-32b",
              "mistralai/mistral-small-3.1-24b-instruct",
              "google/gemini-2.0-flash-001",
              "anthropic/claude-3.5-haiku",
              "meta-llama/llama-3.2-3b-instruct"]

# The loop variable is deliberately not named `model`: that name holds the
# notebook-level model selection read by the extraction and main-results
# cells, and overwriting it here would make re-running them report the wrong
# model.
for model_name in model_list:
    result_path = f"outputs/{dataset}/{model_name}/{dataset}/zero_shot/cot/{dataset}_output.jsonl"
    # Explicit UTF-8 decoding; blank lines (e.g. a trailing newline) are skipped.
    with open(result_path, "r", encoding="utf-8") as f:
        outputs = [json.loads(line) for line in f if line.strip()]
    print(f"Model: {model_name}")
    if not outputs:
        print("N/A")
    else:
        # Records without a "correct" key count as incorrect.
        n_correct = sum(1 for item in outputs if item.get('correct', False))
        print(f"Accuracy: {n_correct / len(outputs):.4f} ({n_correct}/{len(outputs)})")

------------------- NFQA RAG Results -------------------
Model: openai/gpt-4o-mini
Accuracy: 0.8818 (97/110)
Model: deepseek/deepseek-r1-distill-qwen-32b
Accuracy: 0.6818 (75/110)
Model: mistralai/mistral-small-3.1-24b-instruct
Accuracy: 0.7182 (79/110)
Model: google/gemini-2.0-flash-001
Accuracy: 0.8364 (92/110)
Model: anthropic/claude-3.5-haiku
Accuracy: 0.7909 (87/110)
Model: meta-llama/llama-3.2-3b-instruct
Accuracy: 0.7909 (87/110)
