In [1]:
from evaluate import load
import csv
import pandas as pd
import numpy as np
import os

# --- Configuration ---
INPUT_FILE = r'D:\liulanqi1\MindMap-main\MindMap-main\output8_chatgpt_cols_with_llama_and_deepseek.csv'
OUTPUT_FILE = r'D:\liulanqi1\MindMap-main\MindMap-main\output_bert_score_multi_models_revised.csv' # Changed output filename

# Reference column name in INPUT_FILE
REFERENCE_COLUMN_NAME = "Label"

# Candidate output column names from your INPUT_FILE
CANDIDATE_COLUMN_NAMES = ['Mindmap', 'BM25_retrieval', 'KG_retrieval', 'KG_retrieval_multipath', 'KG_self-consistency', 'Zero_shot_GPT4o_mini_kg', 'zero_shot_gpt4_kg', 'Zero_shot_GPT4o_mini', 'Zero_shot_GPT4o', 'One_shot_GPT4o_mini', 'One_shot_GPT4o', 'Few_shot_GPT4o_mini', 'Few_shot_GPT4o', 'CoT_GPT4o_mini', 'CoT_GPT4o', 'AutoGen_Single_Result', 'AutoGen_Single_Result_nokg', 'AutoGen_Multi_Result', 'Multi-Agent Result_nokg', 'KG_ToT_Sampling_Comprehensive_BFS', 'KG_ToT_Sampling_Comprehensive_DFS', 'KG_ToT_Sampling_Comparative_BFS', 'KG_ToT_Sampling_Comparative_DFS', 'KG_ToT_Sequential_Comprehensive_BFS', 'KG_ToT_Sequential_Comprehensive_DFS', 'KG_ToT_Sequential_Comparative_BFS', 'KG_ToT_Sequential_Comparative_DFS','Mindmap_nokg','Zero_shot_GPT4o_mini_kg','zero_shot_gpt4_kg','AutoGen_Single_Result_nokg','Multi-Agent Result_nokg'
]


# BERTScore models to use
MODELS_TO_USE = [
    "distilbert-base-uncased",
    "xlm-roberta-large",
    "roberta-large",
]

# --- Helper Functions ---
def clean_text_for_csv(text):
    """Clean text: convert to string and replace newlines with spaces."""
    if pd.isna(text): # Handle NaN values explicitly
        return ""
    if not isinstance(text, str):
        text = str(text)
    return text.replace('\n', ' ').replace('\r', ' ')

# --- Main Script ---
try:
    # Load BERTScore evaluator
    print("Loading BERTScore model (this may take a moment)...")
    bertscore_evaluator = load("bertscore")
    print("BERTScore model loaded.")

    # Read input CSV
    print(f"Reading input file: {INPUT_FILE}")
    df = pd.read_csv(INPUT_FILE)
    print(f"Successfully read CSV file, found {len(df)} rows.")
    print(f"Input columns: {df.columns.tolist()}")

    # Verify reference column exists
    if REFERENCE_COLUMN_NAME not in df.columns:
        raise ValueError(f"Reference column '{REFERENCE_COLUMN_NAME}' not found in the input CSV.")

    # Verify all candidate columns exist
    missing_candidates = [col for col in CANDIDATE_COLUMN_NAMES if col not in df.columns]
    if missing_candidates:
        print(f"Warning: The following candidate columns were not found and will be skipped: {missing_candidates}")
        # Filter out missing columns from processing list
        CANDIDATE_COLUMN_NAMES = [col for col in CANDIDATE_COLUMN_NAMES if col in df.columns]

    if not CANDIDATE_COLUMN_NAMES:
        raise ValueError("No valid candidate columns found to process after checking input CSV.")


    # Initialize new columns for scores and averages in the DataFrame
    for model_full_name in MODELS_TO_USE:
        model_short_name = model_full_name.split("-")[0] # e.g., "distilbert", "xlm", "roberta"
        # Ensure short names are unique if first parts are same (e.g. xlm-roberta-large vs xlm-roberta-base)
        # For this set, they are unique. If not, adjust model_short_name logic.

        for candidate_name in CANDIDATE_COLUMN_NAMES:
            # Sanitize candidate_name if it contains characters not suitable for column names,
            # though the provided ones seem fine.
            sanitized_candidate_name = candidate_name.replace(" ", "_").replace("-", "_") # Basic sanitization

            df[f"{model_short_name}_{sanitized_candidate_name}_precision"] = np.nan
            df[f"{model_short_name}_{sanitized_candidate_name}_recall"] = np.nan
            df[f"{model_short_name}_{sanitized_candidate_name}_f1"] = np.nan

        # Columns for per-model average scores (average over all candidates for that row)
        df[f"{model_short_name}_avg_precision"] = np.nan
        df[f"{model_short_name}_avg_recall"] = np.nan
        df[f"{model_short_name}_avg_f1"] = np.nan
    print("Initialized new columns in DataFrame for BERTscore results.")

    # Process each row
    for idx, row in df.iterrows():
        print(f"\nProcessing row {idx + 1}/{len(df)}...")
        try:
            reference_text_cleaned = clean_text_for_csv(row[REFERENCE_COLUMN_NAME])
            if not reference_text_cleaned.strip():
                print(f"  Warning: Empty or whitespace-only reference text in row {idx + 1}. Skipping BERTscore for this row.")
                continue
            references_list = [reference_text_cleaned]

            for model_full_name in MODELS_TO_USE:
                model_short_name = model_full_name.split("-")[0]
                print(f"  Using BERTScore model: {model_full_name}")

                current_model_precisions = []
                current_model_recalls = []
                current_model_f1s = []

                for candidate_name in CANDIDATE_COLUMN_NAMES:
                    sanitized_candidate_name = candidate_name.replace(" ", "_").replace("-", "_")
                    prediction_text_cleaned = clean_text_for_csv(row[candidate_name])

                    if not prediction_text_cleaned.strip():
                        print(f"    Warning: Empty or whitespace-only prediction text for '{candidate_name}' in row {idx + 1}. Assigning NaN for this candidate.")
                        # Scores will remain NaN as initialized
                        continue

                    predictions_list = [prediction_text_cleaned]

                    try:
                        results = bertscore_evaluator.compute(
                            predictions=predictions_list,
                            references=references_list,
                            model_type=model_full_name, # Use the full model name for bertscore compute
                            # lang="en" # Specify language if not English or for multilingual models
                        )
                        precision = results["precision"][0]
                        recall = results["recall"][0]
                        f1 = results["f1"][0]

                        df.at[idx, f"{model_short_name}_{sanitized_candidate_name}_precision"] = precision
                        df.at[idx, f"{model_short_name}_{sanitized_candidate_name}_recall"] = recall
                        df.at[idx, f"{model_short_name}_{sanitized_candidate_name}_f1"] = f1

                        current_model_precisions.append(precision)
                        current_model_recalls.append(recall)
                        current_model_f1s.append(f1)
                        # print(f"    {candidate_name} - P: {precision:.4f}, R: {recall:.4f}, F1: {f1:.4f}")

                    except Exception as e_bert:
                        print(f"    Error computing BERTScore for '{candidate_name}' with model '{model_full_name}': {e_bert}")
                        # Scores will remain NaN

                # Calculate and store averages for the current model for this row
                if current_model_f1s: # Check if any scores were successfully computed
                    df.at[idx, f"{model_short_name}_avg_precision"] = np.mean(current_model_precisions)
                    df.at[idx, f"{model_short_name}_avg_recall"] = np.mean(current_model_recalls)
                    df.at[idx, f"{model_short_name}_avg_f1"] = np.mean(current_model_f1s)
                    # print(f"  Model {model_short_name} Avg - P: {np.mean(current_model_precisions):.4f}, R: {np.mean(current_model_recalls):.4f}, F1: {np.mean(current_model_f1s):.4f}")
                else:
                    print(f"  No valid BERTscores computed for model {model_short_name} in this row to average.")
        except KeyError as e_key:
            print(f"  Skipping row {idx+1} due to missing key: {e_key}. Ensure all specified columns exist.")
        except Exception as e_row:
            print(f"  An unexpected error occurred while processing row {idx + 1}: {e_row}")


    # Save the updated DataFrame
    print(f"\nSaving results to: {OUTPUT_FILE}")
    df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8-sig', quoting=csv.QUOTE_MINIMAL) # utf-8-sig for Excel compatibility
    print("Processing complete.")

except FileNotFoundError:
    print(f"Error: Input file '{INPUT_FILE}' not found.")
except ValueError as ve:
    print(f"Configuration Error: {ve}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    import traceback
    traceback.print_exc()

Loading BERTScore model (this may take a moment)...
BERTScore model loaded.
Reading input file: D:\liulanqi1\MindMap-main\MindMap-main\output8_chatgpt_cols_with_llama_and_deepseek.csv
Successfully read CSV file, found 29 rows.
Input columns: ['Question', 'Label', 'Zero_shot_GPT4o_mini', 'Zero_shot_GPT4o', 'One_shot_GPT4o_mini', 'One_shot_GPT4o', 'Few_shot_GPT4o_mini', 'Few_shot_GPT4o', 'CoT_GPT4o_mini', 'CoT_GPT4o', 'llama3_pred', 'DeepSeek_Pred']
Initialized new columns in DataFrame for BERTscore results.

Processing row 1/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 2/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 3/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 4/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 5/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 6/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 7/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 8/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 9/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 10/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 11/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 12/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 13/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 14/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 15/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 16/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 17/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 18/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 19/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 20/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 21/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 22/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 23/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 24/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 25/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 26/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 27/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 28/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Processing row 29/29...
  Using BERTScore model: distilbert-base-uncased
  Using BERTScore model: xlm-roberta-large
  Using BERTScore model: roberta-large


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Saving results to: D:\liulanqi1\MindMap-main\MindMap-main\output_bert_score_multi_models_revised.csv
Processing complete.


In [2]:
import pandas as pd
import numpy as np
import os

# --- Configuration ---
INPUT_FILE_AGG = r'D:\liulanqi1\MindMap-main\MindMap-main\output_bert_score_multi_models_revised.csv' # Output from Script 1
OUTPUT_FILE_AGG = r'D:\liulanqi1\MindMap-main\MindMap-main\bertscore_aggregates_and_reports_revised.csv'

# Map original candidate column names (used as identifiers in Script 1's output columns)
# to desired display names for the report tables.
# Keys MUST match the strings in CANDIDATE_COLUMN_NAMES from Script 1.
OUTPUT_NAME_MAP = {
    "Mindmap": "Mindmap",
    "BM25_retrieval": "BM25 Retrieval",
    "KG_retrieval": "KG Retrieval",
    "KG_retrieval_multipath": "KG Retrieval Multipath",
    "KG_self-consistency": "KG Self-Consistency",
    
    "Zero_shot_GPT4o_mini_kg": "Zero-shot GPT4o-mini (KG)",
    "zero_shot_gpt4_kg": "Zero-shot GPT4o (KG)",  # 自定义命名
    "Zero_shot_GPT4o_mini": "Zero-shot GPT4o-mini",
    "Zero_shot_GPT4o": "Zero-shot GPT4o",
    
    "One_shot_GPT4o_mini": "One-shot GPT4o-mini",
    "One_shot_GPT4o": "One-shot GPT4o",
    
    "Few_shot_GPT4o_mini": "Few-shot GPT4o-mini",
    "Few_shot_GPT4o": "Few-shot GPT4o",
    
    "CoT_GPT4o_mini": "CoT GPT4o-mini",
    "CoT_GPT4o": "CoT GPT4o",
    
    "AutoGen_Single_Result": "AutoGen Single",
    "AutoGen_Single_Result_nokg": "AutoGen Single (no KG)",
    "AutoGen_Multi_Result": "AutoGen Multi",
    
    "Multi-Agent Result_nokg": "Multi-Agent Result (no KG)",
    
    "KG_ToT_Sampling_Comprehensive_BFS": "KG-ToT Samp Comprehensive BFS",
    "KG_ToT_Sampling_Comprehensive_DFS": "KG-ToT Samp Comprehensive DFS",
    "KG_ToT_Sampling_Comparative_BFS": "KG-ToT Samp Comparative BFS",
    "KG_ToT_Sampling_Comparative_DFS": "KG-ToT Samp Comparative DFS",
    
    "KG_ToT_Sequential_Comprehensive_BFS": "KG-ToT Seq Comprehensive BFS",
    "KG_ToT_Sequential_Comprehensive_DFS": "KG-ToT Seq Comprehensive DFS",
    "KG_ToT_Sequential_Comparative_BFS": "KG-ToT Seq Comparative BFS",
    "KG_ToT_Sequential_Comparative_DFS": "KG-ToT Seq Comparative DFS",
    
    "Mindmap_nokg": "Mindmap (no KG)"  # 自定义命名
}



# These are the identifiers derived from CANDIDATE_COLUMN_NAMES in Script 1
# (after basic sanitization if any was applied there, here assumed to be the same)
CANDIDATE_IDENTIFIERS = [name.replace(" ", "_").replace("-", "_") for name in OUTPUT_NAME_MAP.keys()]


# --- Main Script ---
try:
    df = pd.read_csv(INPUT_FILE_AGG)
    print(f"Successfully read aggregated scores CSV: {INPUT_FILE_AGG}, found {len(df)} rows.")

    # Dynamically detect model prefixes from column names
    model_prefixes = set()
    for col in df.columns:
        # Example column: "distilbert_Mindmap_precision"
        parts = col.split('_')
        # A heuristic: if a part is followed by a known candidate identifier and then a metric
        for candidate_id in CANDIDATE_IDENTIFIERS:
            if f"_{candidate_id}_precision" in col or \
               f"_{candidate_id}_recall" in col or \
               f"_{candidate_id}_f1" in col:
                # Extract the prefix before the first candidate identifier match part
                # This is a bit tricky if model prefixes themselves contain underscores.
                # Assuming model_short_name from script 1 (e.g. "distilbert", "xlm", "roberta")
                # is the prefix we want.
                # Let's find the part of the column name before "_{candidate_id}_METRIC"
                try:
                    prefix = col.split(f"_{candidate_id}_")[0]
                    model_prefixes.add(prefix)
                    break # Found prefix for this column
                except: # Be robust
                    pass
    
    # A simpler way if prefixes are known (e.g., from MODELS_TO_USE in Script 1)
    # model_prefixes = sorted(list(set([m.split("-")[0] for m in ["distilbert-base-uncased", "xlm-roberta-large", "roberta-large"]])))


    if not model_prefixes:
        # Fallback or error if dynamic detection fails significantly
        print("Warning: Could not robustly detect model prefixes. Manually define or check column naming.")
        # As a fallback, assuming models from Script 1 if needed:
        # models_from_script1 = ["distilbert-base-uncased", "xlm-roberta-large", "roberta-large"]
        # model_prefixes = sorted(list(set([m.split("-")[0] for m in models_from_script1])))
        # For now, we proceed, but this might indicate an issue if the set is empty.
        # Let's try with the initial list of models to get prefixes
        _models_for_prefix = [
            "distilbert-base-uncased", "xlm-roberta-large", "roberta-large",
        ]
        model_prefixes = sorted(list(set([m.split("-")[0] for m in _models_for_prefix])))
        print(f"Using predefined model prefixes: {model_prefixes}")


    print(f"Detected/Using model prefixes: {sorted(list(model_prefixes))}")

    # --- 1. Calculate average score ACROSS BERTScore MODELS for each CANDIDATE METHOD ---
    candidate_method_avg_scores = []
    for original_candidate_name, display_name in OUTPUT_NAME_MAP.items():
        # Use the sanitized/identifier version for column lookup if different
        candidate_identifier = original_candidate_name.replace(" ", "_").replace("-", "_")

        metrics_sum = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        valid_models_count = 0
        for prefix in model_prefixes:
            precision_col = f"{prefix}_{candidate_identifier}_precision"
            recall_col = f"{prefix}_{candidate_identifier}_recall"
            f1_col = f"{prefix}_{candidate_identifier}_f1"

            if all(col_name in df.columns for col_name in [precision_col, recall_col, f1_col]):
                # Calculate mean for this specific candidate over all rows in df
                avg_p = df[precision_col].mean(skipna=True)
                avg_r = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not (np.isnan(avg_p) or np.isnan(avg_r) or np.isnan(avg_f1)):
                    metrics_sum['precision'] += avg_p
                    metrics_sum['recall'] += avg_r
                    metrics_sum['f1'] += avg_f1
                    valid_models_count += 1
            # else:
                # print(f"Debug: Missing columns for {prefix} and {candidate_identifier}")


        if valid_models_count > 0:
            candidate_method_avg_scores.append({
                'candidate_method_identifier': candidate_identifier, # Store the identifier
                'candidate_method_display_name': display_name,
                'avg_precision': metrics_sum['precision'] / valid_models_count,
                'avg_recall': metrics_sum['recall'] / valid_models_count,
                'avg_f1': metrics_sum['f1'] / valid_models_count,
                'models_contributed': valid_models_count
            })
        # else:
            # print(f"Debug: No valid models found for candidate {candidate_identifier}")


    # --- 2. Calculate OVERALL average (average of averages from step 1) ---
    overall_avg_metrics = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    if candidate_method_avg_scores:
        overall_avg_metrics['precision'] = np.mean([r['avg_precision'] for r in candidate_method_avg_scores if 'avg_precision' in r])
        overall_avg_metrics['recall'] = np.mean([r['avg_recall'] for r in candidate_method_avg_scores if 'avg_recall' in r])
        overall_avg_metrics['f1'] = np.mean([r['avg_f1'] for r in candidate_method_avg_scores if 'avg_f1' in r])

    # --- 3. Per-BERTScore-model averages + detailed per-output per-model ---
    per_bertscore_model_avg_scores = [] # For Table 2 (average of a model across all candidate methods)
    detailed_model_output_scores = []   # For Table 3

    for prefix in sorted(list(model_prefixes)):
        # For Table 2: Average this BERTscore model's performance across all candidate methods
        model_total_metrics = {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
        valid_candidates_for_model_count = 0

        for original_candidate_name, display_name in OUTPUT_NAME_MAP.items():
            candidate_identifier = original_candidate_name.replace(" ", "_").replace("-", "_")
            precision_col = f"{prefix}_{candidate_identifier}_precision"
            recall_col = f"{prefix}_{candidate_identifier}_recall"
            f1_col = f"{prefix}_{candidate_identifier}_f1"

            if all(col_name in df.columns for col_name in [precision_col, recall_col, f1_col]):
                # Mean score of this model for this candidate_identifier (across all CSV rows)
                avg_p = df[precision_col].mean(skipna=True)
                avg_r = df[recall_col].mean(skipna=True)
                avg_f1 = df[f1_col].mean(skipna=True)

                if not (np.isnan(avg_p) or np.isnan(avg_r) or np.isnan(avg_f1)):
                    # For Table 3
                    detailed_model_output_scores.append({
                        'bertscore_model': prefix,
                        'candidate_method_identifier': candidate_identifier,
                        'candidate_method_display_name': display_name,
                        'avg_precision': avg_p,
                        'avg_recall': avg_r,
                        'avg_f1': avg_f1
                    })
                    # For Table 2 accumulation
                    model_total_metrics['precision'] += avg_p
                    model_total_metrics['recall'] += avg_r
                    model_total_metrics['f1'] += avg_f1
                    valid_candidates_for_model_count += 1

        if valid_candidates_for_model_count > 0:
            per_bertscore_model_avg_scores.append({
                'bertscore_model': prefix,
                'avg_precision': model_total_metrics['precision'] / valid_candidates_for_model_count,
                'avg_recall': model_total_metrics['recall'] / valid_candidates_for_model_count,
                'avg_f1': model_total_metrics['f1'] / valid_candidates_for_model_count,
                'candidates_contributed': valid_candidates_for_model_count
            })

    # --- Save combined results to a CSV (optional, for easier data handling) ---
    # Create DataFrames from the lists of dictionaries
    df_candidate_avg = pd.DataFrame(candidate_method_avg_scores)
    df_bertscore_model_avg = pd.DataFrame(per_bertscore_model_avg_scores)
    df_detailed = pd.DataFrame(detailed_model_output_scores)

    # You might want to save these separately or combine them meaningfully
    # For now, let's just print them. If saving, decide on format.
    # Example of saving one:
    if not df_candidate_avg.empty:
      df_candidate_avg.to_csv(OUTPUT_FILE_AGG, index=False, float_format='%.6f')
      print(f"\nCandidate method average scores saved to {OUTPUT_FILE_AGG}")


    # --- Printing Tables ---

    # Table 1: Average BERTScore for each CANDIDATE METHOD (across BERTscore models), sorted by F1
    print("\n--- Table 1: Avg BERTScore per Candidate Method (across BERTScore Models) ---")
    print("Sorted by F1 score (descending)")
    print("=" * 100)
    header = f"{'Candidate Method':<45} | {'Avg Precision':^15} | {'Avg Recall':^15} | {'Avg F1':^15}"
    print(header)
    print("-" * len(header))
    if candidate_method_avg_scores:
        sorted_candidates = sorted(candidate_method_avg_scores, key=lambda x: x.get('avg_f1', 0), reverse=True)
        for r_cand in sorted_candidates:
            print(f"{r_cand.get('candidate_method_display_name', 'N/A'):<45} | {r_cand.get('avg_precision', 0):^15.4f} | {r_cand.get('avg_recall', 0):^15.4f} | {r_cand.get('avg_f1', 0):^15.4f}")
        print("-" * len(header))
        # Print overall average of these candidate averages
        if overall_avg_metrics['f1'] != 0.0 : # Check if overall_avg was computed
             print(f"{'OVERALL (Avg of above)':<45} | {overall_avg_metrics['precision']:^15.4f} | {overall_avg_metrics['recall']:^15.4f} | {overall_avg_metrics['f1']:^15.4f}")
        print("-" * len(header))
    else:
        print("No data for candidate method averages.")


    # Table 2: Average BERTScore for each BERTScore MODEL (across candidate methods), sorted by F1
    print("\n--- Table 2: Avg BERTScore per BERTScore Model (across Candidate Methods) ---")
    print("Sorted by F1 score (descending)")
    print("=" * 90)
    header_m = f"{'BERTScore Model':<30} | {'Avg Precision':^15} | {'Avg Recall':^15} | {'Avg F1':^15}"
    print(header_m)
    print("-" * len(header_m))
    if per_bertscore_model_avg_scores:
        sorted_models = sorted(per_bertscore_model_avg_scores, key=lambda x: x.get('avg_f1', 0), reverse=True)
        for r_model in sorted_models:
            print(f"{r_model.get('bertscore_model', 'N/A'):<30} | {r_model.get('avg_precision', 0):^15.4f} | {r_model.get('avg_recall', 0):^15.4f} | {r_model.get('avg_f1', 0):^15.4f}")
        print("-" * len(header_m))
    else:
        print("No data for BERTScore model averages.")


    # Table 3: Detailed BERTScore for each BERTScore MODEL on each CANDIDATE METHOD, sorted by F1
    print("\n--- Table 3: Detailed Avg BERTScore (BERTScore Model vs. Candidate Method) ---")
    print("Sorted by F1 score (descending)")
    print("=" * 120)
    header_d = f"{'BERTScore Model':<30} | {'Candidate Method':<45} | {'Avg Precision':^15} | {'Avg Recall':^15} | {'Avg F1':^15}"
    print(header_d)
    print("-" * len(header_d))
    if detailed_model_output_scores:
        # Use the DataFrame for easier sorting if created: df_detailed
        if not df_detailed.empty:
            sorted_detailed = df_detailed.sort_values(by='avg_f1', ascending=False).fillna(0)
            for _, d_row in sorted_detailed.iterrows():
                print(f"{d_row.get('bertscore_model', 'N/A'):<30} | {d_row.get('candidate_method_display_name', 'N/A'):<45} | {d_row.get('avg_precision', 0):^15.4f} | {d_row.get('avg_recall', 0):^15.4f} | {d_row.get('avg_f1', 0):^15.4f}")
        print("-" * len(header_d))
    else:
        print("No data for detailed model vs. candidate method scores.")

except FileNotFoundError:
    print(f"Error: Input file '{INPUT_FILE_AGG}' not found.")
except Exception as e_agg:
    print(f"An error occurred during aggregation: {e_agg}")
    import traceback
    traceback.print_exc()

Successfully read aggregated scores CSV: D:\liulanqi1\MindMap-main\MindMap-main\output_bert_score_multi_models_revised.csv, found 29 rows.
Detected/Using model prefixes: ['distilbert', 'roberta', 'xlm']

Candidate method average scores saved to D:\liulanqi1\MindMap-main\MindMap-main\bertscore_aggregates_and_reports_revised.csv

--- Table 1: Avg BERTScore per Candidate Method (across BERTScore Models) ---
Sorted by F1 score (descending)
Candidate Method                              |  Avg Precision  |   Avg Recall    |     Avg F1     
---------------------------------------------------------------------------------------------------
Few-shot GPT4o                                |     0.7844      |     0.8287      |     0.8058     
One-shot GPT4o                                |     0.7652      |     0.8323      |     0.7972     
Few-shot GPT4o-mini                           |     0.7580      |     0.8168      |     0.7861     
One-shot GPT4o-mini                           |     0.7491  