In [None]:
import ast
import math
import re

import pandas as pd

class Evaluation:
    """Score model responses against reference answers for one task and prompt.

    The predictions DataFrame is read through its 'Answer' and
    'Model Response' columns. ``task_id`` selects the comparison strategy
    ("task3": option answers, "task5": relation answers, anything else:
    numeric answers) and ``prompt`` selects the answer marker to look for
    ("bnap" uses the Bengali marker, every other value the English ones).
    """

    # Placeholder returned by the extract_*/convert helpers when nothing
    # usable is found in the text.
    _NO_ANSWER = "00000"
    _BENGALI_TO_ASCII_DIGITS = str.maketrans("০১২৩৪৫৬৭৮৯", "0123456789")
    _NUMERIC_CHARS = frozenset("0123456789./-")
    # Zeros that start an integer part ("05", "007.5"); Python rejects such
    # integer literals, so they are removed before parsing.
    _LEADING_ZEROS = re.compile(r"(?<![\d.])0+(?=\d)")
    # Everything parsing/evaluating a malformed numeric string may raise.
    _EVALUATION_ERRORS = (
        SyntaxError,
        ValueError,
        ZeroDivisionError,
        OverflowError,
        RecursionError,
        MemoryError,
    )

    def __init__(self, predictions_df, task_id, prompt):
        """
        Initializes the Evaluation object.

        Args:
            predictions_df: DataFrame with 'Answer' and 'Model Response' columns.
            task_id: Task identifier such as "task1"; see the class docstring.
            prompt: Prompt style identifier such as "bnap".
        """
        self.predictions = predictions_df
        self.task_id = task_id
        self.prompt = prompt

    def _answer_marker(self):
        """Return the regex fragment that introduces the answer for this prompt."""
        if self.prompt == "bnap":
            return r"উত্তর:"
        return r"(?:Answer:|answer is:)"

    def _text_after_marker(self, text):
        """Return everything following the answer marker, or None if it is absent."""
        # str() keeps non-string cells (e.g. NaN from an empty CSV field)
        # from raising inside re.search.
        match = re.search(self._answer_marker() + r"\s*(.*)", str(text), re.DOTALL)
        return match.group(1) if match else None

    def extract_numeric_answer(self, text):
        """
        Extracts a numeric answer from the given text using a regular expression.

        Returns:
            str: The first number found after the answer marker (digits are
            returned as written, Bengali or ASCII), or "00000" if none is found.
        """
        pattern = self._answer_marker() + r"\s.*?([+-]?\d*\.?\d+)"
        match = re.search(pattern, str(text), re.DOTALL)
        return match.group(1) if match else self._NO_ANSWER

    def extract_option_answer(self, text):
        """
        Extracts an option answer from the input text.
        For the 'bnap' prompt, it maps "উত্তর ১" to "Option 1" and "উত্তর ২" to "Option 2".

        Returns:
            str: The mapped option ('bnap'), the stripped text after the marker
            (other prompts), or "00000" when no answer can be determined.
        """
        answer = self._text_after_marker(text)
        if answer is None:
            return self._NO_ANSWER
        answer = answer.strip()
        if self.prompt != "bnap":
            return answer
        if "উত্তর ১" in answer:
            return "Option 1"
        if "উত্তর ২" in answer:
            return "Option 2"
        return self._NO_ANSWER

    def extract_relation_answer(self, text):
        """
        Extracts a relation answer from the input text and maps it to one of:
        "neutral", "contradiction", or "Entailment".

        Returns:
            str: The mapped relation ('bnap'), the raw text after the marker
            (other prompts), or "00000" when no answer can be determined.
        """
        answer = self._text_after_marker(text)
        if answer is None:
            return self._NO_ANSWER
        if self.prompt != "bnap":
            return answer
        if "নিরপেক্ষ" in answer:
            return "neutral"
        if "বিরোধ" in answer:
            return "contradiction"
        if "সমর্থন" in answer:
            return "Entailment"
        return self._NO_ANSWER

    def is_direct_answer_match(self, correct_ans, model_output):
        """
        Checks if the correct answer and model output match.
        For 'bnap' prompt, the match is direct (case-sensitive),
        otherwise it performs a case-insensitive substring check.
        """
        if self.prompt == "bnap":
            return correct_ans == model_output
        return correct_ans.lower() in model_output.lower()

    def convert_bengali_digits_to_english(self, text):
        """
        Converts Bengali numeric characters in the input text to their English equivalents.

        Only digits and the characters '.', '/' and '-' are kept; every other
        character is dropped. Non-string input is converted with str() first.

        Returns:
            str: The converted numeric text in English, or "00000" if no valid numeric characters are found.
        """
        translated = str(text).translate(self._BENGALI_TO_ASCII_DIGITS)
        english_text = "".join(
            char for char in translated if char in self._NUMERIC_CHARS
        )
        return english_text if english_text else self._NO_ANSWER

    @classmethod
    def _evaluate_node(cls, node):
        """Evaluate a parsed expression made only of numbers, '-', '/' and '//'."""
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub):
            return -cls._evaluate_node(node.operand)
        if isinstance(node, ast.BinOp):
            left = cls._evaluate_node(node.left)
            right = cls._evaluate_node(node.right)
            if isinstance(node.op, ast.Sub):
                return left - right
            if isinstance(node.op, ast.Div):
                return left / right
            if isinstance(node.op, ast.FloorDiv):
                return left // right
        raise ValueError("unsupported numeric expression")

    def _truncated_value(self, expression):
        """Return the expression's value truncated to two decimals, or None.

        None means the text is the no-answer placeholder or cannot be
        evaluated as a number (malformed text, division by zero, overflow).
        """
        if expression == self._NO_ANSWER:
            return None
        normalized = self._LEADING_ZEROS.sub("", expression)
        try:
            # NOTE: this replaces a former eval() call. The text originates
            # from model output, so it is parsed and only plain arithmetic
            # nodes are evaluated.
            tree = ast.parse(normalized, mode="eval")
            value = self._evaluate_node(tree.body)
            return int(value * 100) / 100
        except self._EVALUATION_ERRORS:
            return None

    def is_numeric_answer_match(self, ground_truth, model_ans):
        """
        Compares a numeric ground truth answer with the model answer after converting Bengali digits.

        Both values are truncated (not rounded) to two decimal places before
        the comparison. Simple fractions such as "3/4" are evaluated.

        Returns:
            bool: True when both sides evaluate to the same truncated value.
            False when they differ, or when either side is the "00000"
            placeholder or is not a well-formed number.
        """
        ground_truth_val = self._truncated_value(
            self.convert_bengali_digits_to_english(ground_truth)
        )
        model_ans_val = self._truncated_value(
            self.convert_bengali_digits_to_english(model_ans)
        )
        if ground_truth_val is None or model_ans_val is None:
            return False
        return ground_truth_val == model_ans_val

    def evaluate_accuracy(self):
        """
        Evaluates the model's accuracy on the predictions DataFrame by comparing the 'Answer' and
        'Model Response' columns.

        Returns:
            float: The percentage of rows whose extracted model answer matches
            the reference answer; 0.0 when the DataFrame has no rows.
        """
        total_data = len(self.predictions)
        if total_data == 0:
            # Avoid dividing by zero on an empty predictions table.
            return 0.0

        if self.task_id == "task3":
            extract = self.extract_option_answer
            is_match = self.is_direct_answer_match
        elif self.task_id == "task5":
            extract = self.extract_relation_answer
            is_match = self.is_direct_answer_match
        else:
            extract = self.extract_numeric_answer
            is_match = self.is_numeric_answer_match

        exact_matches = self.predictions.apply(
            lambda row: is_match(row['Answer'], extract(row['Model Response'])),
            axis=1
        ).sum()

        accuracy = (exact_matches / total_data) * 100
        return accuracy

In [None]:
import pandas as pd
import os

# Number of sampled items per task; the value is part of each response file name.
task_sizes = {
    "task1": 150, "task2": 250, "task3": 150,
    "task4": 150, "task5": 150, "task6": 150,
}

# Prompt styles and models whose response files are evaluated.
prompts = ["bnap", "xlp", "xcot"]
models = [
    "Mathstral_7B",
    "Llama_3.3_70B",
    "DeepSeek_R1_Distill_Llama_70B",
    "Gpt_4o",
    "Gemini_2.0_flash",
]

# Task numbers to score, in column order.
tasks = [1, 2, 3, 4, 5, 6]

# One list of per-model result rows for every prompt style.
results = {}
for prompt in prompts:
    results[prompt] = []

# Score every (prompt, model, task) response file.
for prompt in prompts:
    for model in models:
        model_results = {"Model": model}

        for task in tasks:
            taskId = f"task{task}"
            sample_size = task_sizes.get(taskId)
            filename = f"{model}_{prompt}_{taskId}_random_{sample_size}_responses.csv"
            csv_path = os.path.join("Codes", "Run_2", "Model Responses", model, filename)

            # Load the responses for this combination and compute its accuracy.
            predictions_df = pd.read_csv(csv_path)
            evaluator = Evaluation(predictions_df, taskId, prompt)
            accuracy = evaluator.evaluate_accuracy()
            model_results[taskId] = accuracy

        # Mean accuracy over all tasks (every entry except the model name).
        task_total = sum(
            score for column, score in model_results.items() if column != "Model"
        )
        model_results["Avg."] = task_total / len(tasks)

        results[prompt].append(model_results)

# One table per prompt style.
df_bnap = pd.DataFrame(results["bnap"])
df_xlp = pd.DataFrame(results["xlp"])
df_xcot = pd.DataFrame(results["xcot"])

# Print each table under its banner line.
for banner, frame in (
    ("--------------------------------------------BNaP Results--------------------------------------------", df_bnap),
    ("--------------------------------------------XLP Results---------------------------------------------", df_xlp),
    ("--------------------------------------------XCoT Results--------------------------------------------", df_xcot),
):
    print(banner)
    print(frame.to_string(index=False))