In [1]:
import requests
import json
import pandas as pd
import re
import logging
import openai
from collections import Counter

In [None]:
# Directory holding the evaluation outputs; edit this to analyze a different run
folder = "mgsm/results_gemma"
results_csv = f"{folder}/Xfinal_result.csv"
df = pd.read_csv(results_csv)

# Preview the first three rows of the loaded results
display(df.head(3))

In [None]:
# Languages to report on, in output order
# (alternative language set: ['id', 'jv', 'su'])
lang_order = ['bn', 'en', 'es', 'ja', 'te', 'th']

# Rows whose manually reviewed verdict ('is_correct_rev2') is False
mistake_rows = df[df['is_correct_rev2'].eq(False)]

# Number of mistaken rows for every (language, question number) pair
lang_qno_mistakes = (
    mistake_rows
    .groupby(['lang', 'q_no'])
    .size()
    .reset_index(name='mistake_count')
)

# Per language: question numbers ordered from most to fewest mistakes
ranked_mistakes = lang_qno_mistakes.sort_values(
    ['lang', 'mistake_count'], ascending=[True, False]
)
most_mistakes_per_lang = (
    ranked_mistakes
    .groupby('lang')['q_no']
    .apply(list)
    .to_dict()
)

# Make sure every requested language has an entry, even when it has no mistakes
complete_result = {}
for lang in lang_order:
    complete_result[lang] = most_mistakes_per_lang.get(lang, [])

# Report the mistaken question numbers language by language
for lang, q_nos in complete_result.items():
    print(f"{lang}: {q_nos}")

In [None]:
# Flatten the per-language lists into one list of mistaken question numbers
all_mistakes = []
for qnos in complete_result.values():
    all_mistakes.extend(qnos)

# Tally how many times each question number occurs in the flattened list
qno_counts = Counter(all_mistakes)

# Report question numbers from most to least frequently mistaken
most_common = qno_counts.most_common()
print("Question numbers with most mistakes across all languages:")
for qno, count in most_common:
    print(f"q_no {qno} — {count} mistake(s)")

In [None]:
# Read the OpenAI API key from a local text file, dropping surrounding whitespace
api_key_file = 'apis/api.txt'
with open(api_key_file) as key_handle:
    api_key = key_handle.read().strip()

In [141]:
# Canonical category labels; downstream cells match on these exact strings.
_ERROR_CATEGORIES = ("Arithmetic Mistake", "Missing Calculation", "Hallucination")


def _normalize_category(raw_label):
    """Map a raw model reply onto a canonical category name when exactly one is named."""
    stripped = (raw_label or "").strip()
    if not stripped:
        return "Unknown"
    lowered = stripped.lower()
    matches = [name for name in _ERROR_CATEGORIES if name.lower() in lowered]
    # Rewrite the reply only when it is unambiguous; otherwise keep the model's text as-is.
    return matches[0] if len(matches) == 1 else stripped


def classify_error(error_text, api_key, model_name="gpt-4o"):
    """
    Classify the type of error in a generated answer using OpenAI's GPT model.

    The model's reply is normalised so that quoted, numbered, or differently
    cased answers (e.g. '"Arithmetic Mistake".' or '1. arithmetic mistake')
    come back as the exact canonical label.

    Args:
        error_text (str): The explanation or reasoning text to classify.
            None, NaN, and blank text are not sent to the API.
        api_key (str): OpenAI API key.
        model_name (str): The model name to use (default: 'gpt-4o').

    Returns:
        str: "Arithmetic Mistake", "Missing Calculation", or "Hallucination"
        when the reply names exactly one of them; the stripped raw reply when
        it names none or several; "Unknown" when the input text is missing,
        the reply is empty, or the API call raises.
    """
    # Missing text would otherwise be embedded in the prompt as the literal "None"/"nan".
    is_nan = isinstance(error_text, float) and error_text != error_text
    if error_text is None or is_nan or not str(error_text).strip():
        return "Unknown"

    # Client construction stays outside the try block so that configuration
    # problems (e.g. a missing key) surface instead of being reported as "Unknown".
    client = openai.OpenAI(api_key=api_key)

    prompt = f"""
Classify the following error into one of these categories:

1. Arithmetic Mistake: Miscalculation or flawed numerical logic.
   - Incorrect math operation or formula.
   - Misuse of units, percentages, or order of operations.
   - Logical errors that lead to wrong numeric results.

2. Missing Calculation: Incomplete reasoning or skipped steps.
   - Skipping necessary steps in the solution.
   - Leaving the final answer unfinished or abruptly ending.
   - Ignoring parts of the question or data.

3. Hallucination: Introducing unsupported or fabricated information.
   - Adding assumptions or data not present in the prompt.
   - Inventing variables, conditions, or steps.
   - Confidently stating something untrue or unrelated.

Error to classify: "{error_text}"

Return only the category name: "Arithmetic Mistake", "Missing Calculation", or "Hallucination".
"""
    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": "You are an expert in classifying math errors."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=10,
            temperature=0,
        )
        # The message content may be None; the normaliser maps that to "Unknown".
        return _normalize_category(response.choices[0].message.content)
    except Exception as e:
        # Best effort: one failed request must not abort classification of the other rows.
        print("ERROR:", e)
        return "Unknown"


In [None]:
# Keep an independent copy of the rows whose reviewed verdict is False
is_wrong = df["is_correct_rev2"].eq(False)
incorrect_df = df.loc[is_wrong].copy()

# Show how many rows will be classified, then preview the first three
print(len(incorrect_df))
display(incorrect_df.head(3))

In [143]:
# Classify each incorrect row's 'self_revision' text; the label is stored in 'ec_1'
incorrect_df["ec_1"] = incorrect_df["self_revision"].apply(classify_error, args=(api_key,))

# Show the classification next to the answer and correctness columns
review_columns = [
    "q_no", "lang", "answer_number", "self_revision", "gen_answer", "revised_answer2",
    "is_correct", "is_correct2", "is_correct_rev", "is_correct_rev2", "ec_1",
]
display(incorrect_df[review_columns])

In [None]:
# Write the classified rows (without the 'self_revision' text) to checker.csv in the results folder
export_columns = [
    "q_no", "lang", "answer_number", "gen_answer", "revised_answer2",
    "is_correct", "is_correct2", "is_correct_rev", "is_correct_rev2", "ec_1",
]
incorrect_df[export_columns].to_csv(f'{folder}/checker.csv', index=False)

In [None]:
# Classifier labels mapped to the short names used in the outputs
error_map = {
    'Arithmetic Mistake': 'arithmetic',
    'Missing Calculation': 'missing_calc',
    'Hallucination': 'hallucination'
}

# The labels in a fixed order (dict insertion order)
error_types = list(error_map)

# One row per language, one column per known label. Labels that never occurred
# are added as zero columns, labels outside error_map are dropped, and the
# columns are renamed to their short names.
summary = (
    incorrect_df
    .groupby('lang')['ec_1']
    .value_counts()
    .unstack(fill_value=0)
    .reindex(columns=error_types, fill_value=0)
    .reset_index()
    .rename(columns=error_map)
)

In [None]:
# Nested mapping: short error name -> language -> list of question numbers
grouped_errors = {short_name: {} for short_name in error_map.values()}

for _, record in incorrect_df.iterrows():
    short_name = error_map.get(record['ec_1'])
    if short_name is None:
        # Labels outside the known categories (e.g. "Unknown") are left out
        continue
    grouped_errors[short_name].setdefault(record['lang'], []).append(record['q_no'])

# Show the nested mapping in a readable layout
from pprint import pprint
pprint(grouped_errors)

In [None]:
# Persist the grouped errors as UTF-8 JSON (non-ASCII characters kept as-is)
output_path = f"{folder}/counts_error.json"
with open(output_path, 'w', encoding='utf-8') as out_file:
    out_file.write(json.dumps(grouped_errors, indent=4, ensure_ascii=False))

print(f"Saved error summary to {output_path}")