In [1]:
import os

In [2]:
import json
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics import f1_score
import torch

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (typically a trailing newline at end of file) hold
            # no record and would make json.loads raise, so skip them.
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
def calculate_metrics(predictions, labels):
    """Score predictions against reference labels with several text metrics.

    Args:
        predictions: Sequence of predicted strings.
        labels: Sequence of reference strings, aligned index-by-index with
            ``predictions``.

    Returns:
        dict with the keys:
            'exact_f1': fraction of pairs where ``prediction == label``.
                This is an exact-match rate, not an F1; the key name is
                kept so existing callers keep working.
            'rouge1', 'rouge2', 'rougeL': mean ROUGE F-measure over pairs.
            'bert_score': mean BERTScore F1 over pairs.

    Raises:
        ValueError: If the inputs are empty or have different lengths.
    """
    predictions = list(predictions)
    labels = list(labels)

    # zip() below would silently drop unmatched items, so a misaligned
    # predictions/labels pair must be rejected before any scoring happens.
    if len(predictions) != len(labels):
        raise ValueError(
            f"predictions and labels must have the same length "
            f"(got {len(predictions)} and {len(labels)})"
        )
    if not predictions:
        raise ValueError("predictions and labels must not be empty")

    # Initialize ROUGE scorer
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # Calculate per-pair ROUGE F-measures
    rouge_scores = {rouge_type: [] for rouge_type in rouge_types}
    for pred, label in zip(predictions, labels):
        # The reference is passed first, the prediction second.
        scores = scorer.score(label, pred)
        for rouge_type in rouge_types:
            rouge_scores[rouge_type].append(scores[rouge_type].fmeasure)

    # Calculate average ROUGE scores
    avg_rouge_scores = {
        rouge_type: np.mean(values) for rouge_type, values in rouge_scores.items()
    }

    # Calculate BERTScore (only the F1 tensor is used)
    _, _, F1 = score(predictions, labels, lang='en', verbose=False)
    bert_f1 = torch.mean(F1).item()

    # Calculate exact-match rate
    exact_matches = [1 if pred == label else 0 for pred, label in zip(predictions, labels)]
    exact_f1 = np.mean(exact_matches)

    return {
        'exact_f1': exact_f1,
        'rouge1': avg_rouge_scores['rouge1'],
        'rouge2': avg_rouge_scores['rouge2'],
        'rougeL': avg_rouge_scores['rougeL'],
        'bert_score': bert_f1
    }

In [15]:
# Evaluate one output file against the reference labels of the input file.
# NOTE(review): this output path has no sub-directory between the provider
# name and 'nfqa_open', unlike the per-model loops further down -- confirm it
# points at the intended run.
input_data = load_jsonl('data/nfqa_open/input/nfqa_open_input.jsonl')
output_data = load_jsonl('outputs/nfqa_open/anthropic/nfqa_open/zero_shot/cot/nfqa_open_output.jsonl')

# Predictions come from the output file; labels come from the input file.
# When a label is a list, only its first element is used as the reference.
predictions = [record['prediction'] for record in output_data]
labels = [
    record['label'][0] if isinstance(record['label'], list) else record['label']
    for record in input_data
]

metrics = calculate_metrics(predictions, labels)

# (display name, key in `metrics`) pairs, in the order they are reported.
report_rows = (
    ("Exact Match F1", 'exact_f1'),
    ("ROUGE-1", 'rouge1'),
    ("ROUGE-2", 'rouge2'),
    ("ROUGE-L", 'rougeL'),
    ("BERTScore", 'bert_score'),
)

print("\nEvaluation Metrics:")
print("\n".join(f"{title}: {metrics[key]:.4f}" for title, key in report_rows))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluation Metrics:
Exact Match F1: 0.0000
ROUGE-1: 0.1242
ROUGE-2: 0.0475
ROUGE-L: 0.1002
BERTScore: 0.8224




In [23]:
from tqdm import tqdm

input_data = load_jsonl('data/nfqa_open/input/nfqa_open_input.jsonl')

# The reference labels depend only on the input file, so build them once
# instead of on every loop iteration. When a label is a list, only its first
# element is used.
labels = [item['label'][0] if isinstance(item['label'], list) else item['label'] for item in input_data]

outputs_root = 'outputs/nfqa_open'

# os.listdir returns entries in arbitrary order and also returns stray files
# (e.g. .DS_Store). Keep directories only and sort them so the report order
# is reproducible across runs and machines.
model_names = sorted(
    name for name in os.listdir(outputs_root)
    if os.path.isdir(os.path.join(outputs_root, name))
)

for model in tqdm(model_names):
    # Sorted so that the sub-directory picked by [0] is deterministic.
    sub_dir = sorted(os.listdir(os.path.join(outputs_root, model)))
    output_data = load_jsonl(os.path.join(
        outputs_root, model, sub_dir[0],
        'nfqa_open', 'zero_shot', 'cot', 'nfqa_open_output.jsonl',
    ))

    predictions = [item['prediction'] for item in output_data]

    # Calculate metrics
    metrics = calculate_metrics(predictions, labels)

    # Print results
    print('\n\n')
    print(f"{model} Exact Match F1: {metrics['exact_f1']:.4f}")
    print(f"{model} ROUGE-1: {metrics['rouge1']:.4f}")
    print(f"{model} ROUGE-2: {metrics['rouge2']:.4f}")
    print(f"{model} ROUGE-L: {metrics['rougeL']:.4f}")
    print(f"{model} BERTScore: {metrics['bert_score']:.4f}")


  0%|          | 0/6 [00:00<?, ?it/s]Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█▋        | 1/6 [00:35<02:56, 35.39s/it]




mistralai Exact Match F1: 0.0000
mistralai ROUGE-1: 0.1597
mistralai ROUGE-2: 0.0689
mistralai ROUGE-L: 0.1294
mistralai BERTScore: 0.8417


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 2/6 [00:51<01:35, 23.85s/it]




google Exact Match F1: 0.0000
google ROUGE-1: 0.2531
google ROUGE-2: 0.1145
google ROUGE-L: 0.2100
google BERTScore: 0.8605


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|█████     | 3/6 [01:24<01:24, 28.18s/it]




deepseek Exact Match F1: 0.0000
deepseek ROUGE-1: 0.1242
deepseek ROUGE-2: 0.0475
deepseek ROUGE-L: 0.1002
deepseek BERTScore: 0.8224


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 67%|██████▋   | 4/6 [01:49<00:53, 26.97s/it]




anthropic Exact Match F1: 0.0000
anthropic ROUGE-1: 0.1662
anthropic ROUGE-2: 0.0668
anthropic ROUGE-L: 0.1297
anthropic BERTScore: 0.8443


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 83%|████████▎ | 5/6 [02:06<00:23, 23.36s/it]




meta-llama Exact Match F1: 0.0000
meta-llama ROUGE-1: 0.2811
meta-llama ROUGE-2: 0.1312
meta-llama ROUGE-L: 0.2357
meta-llama BERTScore: 0.8658


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 6/6 [02:41<00:00, 26.90s/it]




openai Exact Match F1: 0.0000
openai ROUGE-1: 0.1928
openai ROUGE-2: 0.0790
openai ROUGE-L: 0.1550
openai BERTScore: 0.8487





In [25]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Module-level lemmatizer instance, created once and reused by
# tokenize_and_lemmatize for every token.
lemmatizer = WordNetLemmatizer()


def compute_f1(pred_tokens, gt_tokens):
    """Compute the token-level F1 between a prediction and a ground truth.

    Overlap is counted as a multiset intersection, so a token that occurs
    k times in both inputs contributes k matches. This keeps the numerator
    consistent with the denominators, which are the full token counts
    (duplicates included).

    Args:
        pred_tokens: List of predicted tokens.
        gt_tokens: List of ground-truth tokens.

    Returns:
        float: F1 in [0.0, 1.0]; 0.0 when the inputs share no token
        (including when either input is empty).
    """
    # Counter & Counter keeps, per token, the minimum of the two counts.
    overlap = Counter(pred_tokens) & Counter(gt_tokens)
    num_common = sum(overlap.values())

    # Also guards the divisions below: an empty input has no common tokens.
    if num_common == 0:
        return 0.0

    precision = num_common / len(pred_tokens)
    recall = num_common / len(gt_tokens)

    return 2 * precision * recall / (precision + recall)

# adopted from opencompass
def general_postprocess(text: str) -> str:
    """Normalize a free-form answer down to its leading phrase.

    Steps, in order:
      1. Keep only the text before the first newline, period, or comma.
      2. Drop every character that is neither a word character nor whitespace.
      3. Remove the standalone articles "a", "an" and "the" (case-insensitive).
      4. Collapse runs of whitespace to a single space and strip the ends.

    Args:
        text: Raw answer text.

    Returns:
        str: The cleaned text; may be empty (e.g. when ``text`` starts with
        one of the cut-off characters).
    """
    # Cut off at the first newline, period, or comma. maxsplit is passed by
    # keyword: passing it positionally is deprecated since Python 3.13.
    truncated_text = re.split(r"[\n.,]", text, maxsplit=1)[0]

    # Remove punctuation
    no_punctuation = re.sub(r"[^\w\s]", "", truncated_text)

    # Remove articles
    no_articles = re.sub(r"\b(a|an|the)\b", "", no_punctuation, flags=re.IGNORECASE)

    # Remove duplicated blank spaces
    cleaned_text = re.sub(r"\s+", " ", no_articles).strip()

    return cleaned_text


def tokenize_and_lemmatize(text: str) -> list:
    """Lowercase ``text``, tokenize it, and lemmatize every token.

    Tokenization uses ``word_tokenize``; each resulting token is passed
    through the module-level ``lemmatizer``.

    Returns:
        list: The lemmatized tokens, in tokenization order.
    """
    words = word_tokenize(text.lower())
    return list(map(lemmatizer.lemmatize, words))



In [27]:
from token_f1 import calculate_token_f1


input_data = load_jsonl('data/nfqa_open/input/nfqa_open_input.jsonl')

# The reference labels depend only on the input file, so build them once
# instead of on every loop iteration. When a label is a list, only its first
# element is used.
labels = [item['label'][0] if isinstance(item['label'], list) else item['label'] for item in input_data]

outputs_root = 'outputs/nfqa_open'

# os.listdir returns entries in arbitrary order and also returns stray files
# (e.g. .DS_Store). Keep directories only and sort them so the report order
# is reproducible across runs and machines.
model_names = sorted(
    name for name in os.listdir(outputs_root)
    if os.path.isdir(os.path.join(outputs_root, name))
)

for model in tqdm(model_names):
    # Sorted so that the sub-directory picked by [0] is deterministic.
    sub_dir = sorted(os.listdir(os.path.join(outputs_root, model)))
    output_data = load_jsonl(os.path.join(
        outputs_root, model, sub_dir[0],
        'nfqa_open', 'zero_shot', 'cot', 'nfqa_open_output.jsonl',
    ))

    predictions = [item['prediction'] for item in output_data]

    # Stored under its own name: assigning to `f1_score` would shadow the
    # sklearn.metrics.f1_score function imported at the top of the notebook.
    adjusted_f1 = calculate_token_f1(predictions, labels)

    # Print results
    print('\n\n')
    print(f"{model} Adjust F1: {adjusted_f1:.4f}")




100%|██████████| 6/6 [00:00<00:00, 89.90it/s]




mistralai Adjust F1: 0.1408



google Adjust F1: 0.2237



deepseek Adjust F1: 0.1110



anthropic Adjust F1: 0.1452



meta-llama Adjust F1: 0.2539



openai Adjust F1: 0.1645



