In [1]:
import json
import csv

# Open the JSON file and read it line by line
with open('outputs/answer.json', 'r', encoding='utf-8') as f:
    # Initialize an empty list to store the extracted data
    extracted_data = []

    # Read the JSON file line by line
    for line in f:
        # Parse the JSON data
        try:
            data = json.loads(line)
            # If the JSON object does not contain the 'answer' field, skip this line
            if 'answer' not in data:
                continue
            # Extract the required information
            answer = data['answer']
            response = data['response']
            # Add to the extracted data list
            extracted_data.append([answer, response])
        except json.JSONDecodeError:
            # If parsing fails, skip this line
            continue

# Write the extracted content to a CSV file
with open('outputs/response.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Write the CSV header
    writer.writerow(['Answer', 'Response'])
    # Write the extracted data
    writer.writerows(extracted_data)

print("Data written in output.csv file")


data written in output.csv file


In [3]:
import re
import string
import collections

def normalize_answer(s):
    """Lower text and remove non-alphabetic characters."""
    def white_space_fix(text):
        return " ".join(text.split())
    def remove_non_alpha(text):
        return re.sub(r'[^a-zA-Z]', '', text)
    def lower(text):
        return text.lower()
    return white_space_fix(lower(remove_non_alpha(s)))

def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        # If there are no common words, check for partial match
        normalized_gold = normalize_answer(a_gold)
        normalized_pred = normalize_answer(a_pred)
        if normalized_gold in normalized_pred or normalized_pred in normalized_gold:
            return 0.75  # Assign a higher score for partial match
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    # Assign a higher score if there is a partial match but not an exact match
    normalized_gold = normalize_answer(a_gold)
    normalized_pred = normalize_answer(a_pred)
    if normalized_gold in normalized_pred or normalized_pred in normalized_gold:
        return max(f1, 1)
    return f1

def calculate_exact_match(row):
    try:
        reference, predicted = row.split(',')
        return compute_exact(reference, predicted)
    except ValueError:
        return None

def calculate_f1(row):
    try:
        reference, predicted = row.split(',')
        return compute_f1(reference, predicted)
    except ValueError:
        return None

# Read CSV file and calculate F1 score and Exact Match
with open('outputs/response.csv', 'r', encoding='utf-8') as file:
    lines = file.readlines()[1:]  # Skip the header

f1_scores = [calculate_f1(line.strip()) for line in lines]
f1_scores = [score for score in f1_scores if score is not None]

exact_match_scores = [calculate_exact_match(line.strip()) for line in lines]
exact_match_scores = [score for score in exact_match_scores if score is not None]

if f1_scores:
    average_f1 = sum(f1_scores) / len(f1_scores)
    print(f"Average F1 Score: {average_f1:.4f}")
else:
    print("No valid lines found for F1 calculation.")

if exact_match_scores:
    average_exact_match = sum(exact_match_scores) / len(exact_match_scores)
    print(f"Average Exact Match: {average_exact_match:.4f}")
else:
    print("No valid lines found for Exact Match calculation.")


EM: 57.09%
