In [1]:
# ============================================================================
# CELL 0: ENHANCED JSON NORMALIZATION FOR REPORT-LEVEL SCHEMAS
# ============================================================================
"""
Normalizes both OLD and NEW schema formats:
- OLD: {"input": "...", "output": [...]}
- NEW: {"report": "...", "inputs": [{"input": "...", "output": [...]}]}
"""

import json
from pathlib import Path
from copy import deepcopy

# ============================================================================
# CANONICAL ENTITY FOR NEW FORMAT
# ============================================================================

# Template for a normalized NEW-format entity: every normalized entity has
# exactly these keys, with these values used when the source omits a key.
CANONICAL_ENTITY_NEW = {
    "general_finding": None,
    "specific_finding": None,
    "finding_presence": "unknown",  # present/absent/uncertain
    "location": [],
    "degree": [],
    "measurement": None,
    "comparison": None
}

def normalize_entity_new_format(entity: dict) -> dict:
    """Return a copy of *entity* reduced to the canonical NEW-format fields.

    Only the keys of ``CANONICAL_ENTITY_NEW`` are kept; keys missing from
    *entity* keep their canonical default. Values are cleaned as follows:

    - a string that is empty, whitespace-only or "none" (any case, ignoring
      surrounding whitespace) becomes ``None``;
    - ``location`` and ``degree`` always end up as lists, with ``None``,
      "none" and blank items removed;
    - ``finding_presence`` is stripped and lower-cased and must be one of
      "present", "absent" or "uncertain"; anything else, including
      non-string values, becomes "unknown".

    Args:
        entity: Raw entity mapping.

    Returns:
        A new dict; neither *entity* nor the canonical template is mutated.
    """
    normalized = deepcopy(CANONICAL_ENTITY_NEW)
    placeholders = ("", "none")
    valid_presence = ("present", "absent", "uncertain")

    for key in normalized:
        if key not in entity:
            continue

        value = entity[key]

        # Blank strings and the literal "None" both mean "no value".
        if isinstance(value, str) and value.strip().lower() in placeholders:
            value = None

        # location & degree MUST be lists
        if key in ("location", "degree"):
            if value is None:
                value = []
            elif isinstance(value, str):
                value = [value]
            elif not isinstance(value, list):
                value = [str(value)]
            # Drop placeholder items (None, "None", blank strings).
            value = [v for v in value if str(v).strip().lower() not in placeholders]

        # finding_presence must be one of the known labels
        if key == "finding_presence":
            if isinstance(value, str):
                value = value.strip().lower()
            # Also catches None and non-string values such as booleans.
            if value not in valid_presence:
                value = "unknown"

        normalized[key] = value

    return normalized

def detect_schema_format(data: dict) -> str:
    """Classify a loaded schema as "new", "old" or "unknown".

    A schema is "new" when it has both "report" and "inputs", "old" when it
    has both "input" and "output". The NEW check runs first, so a schema
    carrying both key pairs is reported as "new".
    """
    signatures = (
        ("new", ("report", "inputs")),
        ("old", ("input", "output")),
    )
    for label, required_keys in signatures:
        if all(key in data for key in required_keys):
            return label
    return "unknown"

def normalize_new_format_schema(data: dict) -> dict:
    """Normalize a NEW-format schema (``{"report": ..., "inputs": [...]}``).

    Every entity under ``inputs[*].output`` is passed through
    ``normalize_entity_new_format``; the report text and each input text are
    stripped of surrounding whitespace. Missing or ``null`` values for
    "report", "inputs", "input" and "output" are treated as empty rather
    than raising.

    Only the "report"/"inputs" keys of *data* and the "input"/"output" keys
    of each item are carried over; any other keys are dropped.

    Args:
        data: Parsed NEW-format schema.

    Returns:
        A new dict with the keys "report" and "inputs".
    """
    def clean_text(value) -> str:
        # JSON null (or an absent key) becomes an empty string.
        if value is None:
            return ""
        return str(value).strip()

    normalized_inputs = []

    for input_item in data.get("inputs") or []:
        raw_entities = input_item.get("output") or []
        normalized_inputs.append({
            "input": clean_text(input_item.get("input")),
            "output": [normalize_entity_new_format(entity) for entity in raw_entities]
        })

    return {
        "report": clean_text(data.get("report")),
        "inputs": normalized_inputs
    }

def auto_normalize_report_schemas(base_dir: str = "data_report"):
    """Normalize, in place, every NEW-format schema file under *base_dir*.

    Scans the sub-directories of *base_dir* whose names are all digits,
    picks up files matching ``gt*.json`` and ``sample*.json`` in each, and
    overwrites every NEW-format file with its normalized form. OLD-format
    files are reported and skipped. Files of unknown format, and files that
    raise while being read, normalized or written, are counted as errors.

    Args:
        base_dir: Directory that contains the numbered data directories.

    Returns:
        A ``(processed, errors)`` tuple. When *base_dir* is missing or is
        not a directory the result is ``(0, 1)``, so callers can always
        unpack the return value.
    """
    base_path = Path(base_dir)

    # is_dir() also rejects a plain file, which iterdir() would choke on.
    if not base_path.is_dir():
        print(f"‚ùå Base directory not found: {base_dir}")
        return 0, 1

    # Find all numbered directories
    data_dirs = sorted(d for d in base_path.iterdir() if d.is_dir() and d.name.isdigit())

    print("="*70)
    print("REPORT-LEVEL SCHEMA NORMALIZATION")
    print("="*70)
    print(f"\nBase directory: {base_dir}")
    print(f"Found {len(data_dirs)} data directories: {[d.name for d in data_dirs]}")

    total_processed = 0
    total_errors = 0

    for data_dir in data_dirs:
        print(f"\n{'='*70}")
        print(f"Processing: {data_dir}")
        print(f"{'='*70}")

        # Find all JSON files
        json_files = list(data_dir.glob("gt*.json")) + list(data_dir.glob("sample*.json"))

        if not json_files:
            print(f"‚ö† No JSON files found")
            continue

        print(f"Found {len(json_files)} JSON files")

        for json_file in sorted(json_files):
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    raw_data = json.load(f)

                schema_format = detect_schema_format(raw_data)

                if schema_format == "new":
                    # Build the full result before opening the file for
                    # writing, so a normalization failure cannot truncate it.
                    normalized = normalize_new_format_schema(raw_data)

                    with open(json_file, 'w', encoding='utf-8') as f:
                        json.dump(normalized, f, indent=2, ensure_ascii=False)

                    print(f"  ‚úÖ {json_file.name} (NEW format normalized)")
                    total_processed += 1

                elif schema_format == "old":
                    print(f"  ‚ÑπÔ∏è  {json_file.name} (OLD format - skipped)")

                else:
                    print(f"  ‚ö†Ô∏è  {json_file.name} (Unknown format)")
                    total_errors += 1

            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                print(f"  ‚ùå {json_file.name}: {e}")
                total_errors += 1

    print(f"\n{'='*70}")
    print("NORMALIZATION SUMMARY")
    print(f"{'='*70}")
    print(f"Processed: {total_processed}")
    print(f"Errors: {total_errors}")

    return total_processed, total_errors

# Entry point of this cell: normalize everything under "data_report" and
# report whether any file failed.
print("üîÑ Normalizing report-level schemas...\n")
processed, errors = auto_normalize_report_schemas("data_report")

if errors:
    print(f"\n‚ö†Ô∏è  {errors} errors occurred")
else:
    print("\n‚úÖ ALL SCHEMAS NORMALIZED!")

üîÑ Normalizing report-level schemas...

REPORT-LEVEL SCHEMA NORMALIZATION

Base directory: data_report
Found 4 data directories: ['0', '1', '2', '3']

Processing: data_report/0
Found 2 JSON files
  ‚úÖ gt0.json (NEW format normalized)
  ‚úÖ sample0.2.json (NEW format normalized)

Processing: data_report/1
Found 1 JSON files
  ‚úÖ gt1.json (NEW format normalized)

Processing: data_report/2
Found 1 JSON files
  ‚úÖ gt2.json (NEW format normalized)

Processing: data_report/3
Found 1 JSON files
  ‚úÖ gt3.json (NEW format normalized)

NORMALIZATION SUMMARY
Processed: 5
Errors: 0

‚úÖ ALL SCHEMAS NORMALIZED!


In [2]:
import importlib

import medical_schema_evaluator

# Reload BEFORE binding the class: importlib.reload() re-executes the module
# but does not update names that were already imported with
# "from ... import ...", so importing first would leave a stale class here.
importlib.reload(medical_schema_evaluator)
from medical_schema_evaluator import MedicalSchemaEvaluator

from sentence_transformers import SentenceTransformer
print("‚úÖ sentence-transformers y√ºklendi!")

# Test initialization
evaluator = MedicalSchemaEvaluator()
print("‚úÖ Evaluator initialized successfully!")

# google-genai is optional: record its availability instead of failing.
try:
    from google import genai
    from google.genai import types
    GEMINI_AVAILABLE = True
except ImportError:
    GEMINI_AVAILABLE = False
    # Define both names so later references fail on a None check rather
    # than with a NameError.
    genai = None
    types = None
    print("‚ö† google-genai kurulu deƒüil. L√ºtfen √ßalƒ±≈ütƒ±r:")
    print("  pip install google-genai")

  from .autonotebook import tqdm as notebook_tqdm


‚úÖ sentence-transformers y√ºklendi!
‚úÖ Evaluator initialized successfully!


In [1]:
# ============================================================================
# CELL 1: ENTITY-LEVEL EVALUATION WITH SEMANTIC MATCHING + ERROR DETECTION
# ============================================================================
"""
Entity-level evaluation with SapBERT semantic matching
+ Structural Error Detection (Merged/Split entities)
"""

import json
import os
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np

# ============================================================================
# IMPORT EVALUATORS
# ============================================================================

from entity_level_evaluator import EntityLevelEvaluator, SemanticMedicalMatcher
from llm_evaluator import LLMEvaluator
from multi_embedding_evaluator import EmbeddingEvaluator

# ============================================================================
# NEW: structural error detection functions
# ============================================================================


# ============================================================================
# USER CONFIGURATION (unchanged)
# ============================================================================
class UserConfig:
    """User-editable settings: which models to run and where the data lives.

    The settings are plain class attributes and are read directly from the
    class (``UserConfig.DATA_DIR`` and so on).
    """

    # ----------------------------------------------------------------
    # API KEYS
    # ----------------------------------------------------------------
    # Looked up by the "type" field of the selected LLM model entries.
    #
    # SECURITY: keys are read from the environment and must never be
    # committed to the notebook. A Gemini key was previously hard-coded
    # here; treat that key as leaked and revoke it.
    # Set GEMINI_API_KEY before starting the kernel; an empty value means
    # "no key".
    API_KEYS = {
        "gemini": os.environ.get("GEMINI_API_KEY", ""),
        #"gemma": "",
        #"glm": "",
        #"deepseek": "",
    }

    # ----------------------------------------------------------------
    # LLM MODELS TO USE
    # ----------------------------------------------------------------
    SELECTED_LLM_MODELS = [
        "gemini_pro",
    ]

    # ----------------------------------------------------------------
    # EMBEDDING MODELS TO USE
    # ----------------------------------------------------------------
    SELECTED_EMBEDDING_MODELS = [
        "pubmedbert",
        "s_pubmedbert",
        "general_baseline",
        "neuml_pubmedbert",
    ]

    # ----------------------------------------------------------------
    # DATA CONFIGURATION
    # ----------------------------------------------------------------
    DATA_DIR = "./data_report/0/"
    GT_FILENAME = "gt0.json"
    OUTPUT_DIR = None  # Auto-generated if None

    # Semantic matching threshold
    MATCH_THRESHOLD = 0.6  # Minimum score to consider entities matched

# ============================================================================
# END OF USER CONFIGURATION
# ============================================================================
def detect_entity_structural_errors(matches: List[Dict], gt_entities: List[Dict], pred_entities: List[Dict]) -> Dict:
    """Detect structural errors in a list of entity matches.

    Only matches whose ``match_type`` is ``'matched'`` and that carry both a
    ground-truth and a predicted entity dict are inspected. Four kinds of
    error are reported:

    1. MERGED: two or more GT entities matched to the same predicted entity
       (e.g. Cardiomegaly and Effusion folded into one entity).
    2. SPLIT: one GT entity matched to two or more predicted entities.
    3. DEGREE_MIXUP: a merged predicted entity whose ``degree`` list holds
       more than one mutually exclusive degree (e.g. ["mild", "trace"]).
    4. CONTRADICTION: a matched pair whose ``finding_presence`` values
       oppose each other (present/absent or normal/abnormal).

    Entities are located in *gt_entities* / *pred_entities* by object
    identity first and by equality only as a fallback, so two distinct
    entities with identical content are not mistaken for one.

    Args:
        matches: Match records with the keys ``match_type``, ``gt_entity``,
            ``pred_entity`` and optionally ``match_score``.
        gt_entities: Ground-truth entities the matches refer to.
        pred_entities: Predicted entities the matches refer to.

    Returns:
        Dict with the list-valued keys ``merged_entities``,
        ``split_entities``, ``degree_mixups`` and ``contradictions``.
    """
    errors = {
        'merged_entities': [],
        'split_entities': [],
        'degree_mixups': [],
        'contradictions': []
    }

    def build_positions(entities):
        # First occurrence wins, mirroring list.index().
        positions = {}
        for idx, ent in enumerate(entities):
            positions.setdefault(id(ent), idx)
        return positions

    def locate(entity, entities, positions):
        # Identity lookup; falls back to equality for copied entities.
        idx = positions.get(id(entity))
        if idx is not None:
            return idx
        try:
            return entities.index(entity)
        except ValueError:
            return None

    pred_positions = build_positions(pred_entities)
    gt_positions = build_positions(gt_entities)

    # Matched records that have both sides; anything else cannot be analysed.
    matched_pairs = [
        m for m in matches
        if m.get('match_type') == 'matched'
        and isinstance(m.get('gt_entity'), dict)
        and isinstance(m.get('pred_entity'), dict)
    ]

    # Degrees that cannot sensibly describe the same finding together.
    exclusive = {'mild', 'moderate', 'severe', 'trace', 'small', 'large'}

    # 1. MERGED ENTITY detection
    pred_to_gt = defaultdict(list)
    for match in matched_pairs:
        pred_idx = locate(match['pred_entity'], pred_entities, pred_positions)
        if pred_idx is not None:
            pred_to_gt[pred_idx].append({
                'gt_entity': match['gt_entity'],
                'match_score': match.get('match_score', 0)
            })

    for pred_idx, gt_matches in pred_to_gt.items():
        if len(gt_matches) <= 1:
            continue

        pred_ent = pred_entities[pred_idx]
        pred_finding = pred_ent.get('general_finding', 'Unknown')
        pred_degree = pred_ent.get('degree', [])

        # Collect the GT side of the merge
        gt_findings = []
        for gm in gt_matches:
            gt_ent = gm['gt_entity']
            gt_findings.append({
                'finding': gt_ent.get('general_finding'),
                'specific': gt_ent.get('specific_finding'),
                'degree': gt_ent.get('degree', []),
                'location': gt_ent.get('location', [])
            })

        errors['merged_entities'].append({
            'type': 'MERGED_ENTITY',
            'severity': 'HIGH',
            'description': f"{len(gt_matches)} farklƒ± GT entity tek entity'de birle≈ütirilmi≈ü",
            'gt_entities': gt_findings,
            'pred_entity': {
                'finding': pred_finding,
                'degree': pred_degree,
                'location': pred_ent.get('location', [])
            },
            'impact': f"{len(gt_matches)-1} entity kayƒ±p (FN)"
        })

        # 3. DEGREE MIXUP check (only for merged entities).
        # A missing degree is empty; a bare string is a single degree.
        if pred_degree is None:
            degree_values = []
        elif isinstance(pred_degree, str):
            degree_values = [pred_degree]
        else:
            degree_values = pred_degree

        if len(degree_values) > 1:
            degree_set = set(str(d).lower() for d in degree_values)
            if len(degree_set & exclusive) > 1:
                errors['degree_mixups'].append({
                    'finding': pred_finding,
                    'degrees': pred_degree,
                    'source_entities': [g['finding'] for g in gt_findings],
                    'reason': "Farklƒ± entity'lerden gelen degree'ler birle≈ütirilmi≈ü"
                })

    # 2. SPLIT ENTITY detection
    gt_to_pred = defaultdict(list)
    for match in matched_pairs:
        gt_idx = locate(match['gt_entity'], gt_entities, gt_positions)
        if gt_idx is not None:
            gt_to_pred[gt_idx].append(match['pred_entity'])

    for gt_idx, pred_ents in gt_to_pred.items():
        if len(pred_ents) > 1:
            gt_ent = gt_entities[gt_idx]
            errors['split_entities'].append({
                'type': 'SPLIT_ENTITY',
                'severity': 'MEDIUM',
                'gt_entity': {
                    'finding': gt_ent.get('general_finding'),
                    'degree': gt_ent.get('degree')
                },
                'split_into': [p.get('general_finding') for p in pred_ents],
                'count': len(pred_ents)
            })

    # 4. CONTRADICTION detection (both directions of each opposing pair)
    contradictions = {
        ('present', 'absent'), ('absent', 'present'),
        ('normal', 'abnormal'), ('abnormal', 'normal'),
    }
    for match in matched_pairs:
        gt_ent = match['gt_entity']
        pred_ent = match['pred_entity']

        gt_pres = str(gt_ent.get('finding_presence', '')).lower()
        pred_pres = str(pred_ent.get('finding_presence', '')).lower()

        if (gt_pres, pred_pres) in contradictions:
            errors['contradictions'].append({
                'finding': gt_ent.get('general_finding'),
                'gt_presence': gt_pres,
                'pred_presence': pred_pres
            })

    return errors

# ============================================================================
# MODEL CONFIGURATIONS
# ============================================================================

# Embedding models selectable by key: each entry maps the short key to the
# model identifier ("name") and a short human-readable "description".
EMBEDDING_MODELS = {
    key: {"name": model_name, "description": description}
    for key, model_name, description in (
        ("pubmedbert",
         "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
         "PubMed trained"),
        ("s_pubmedbert",
         "pritamdeka/S-PubMedBert-MS-MARCO",
         "Sentence-level"),
        ("general_baseline",
         "sentence-transformers/all-MiniLM-L6-v2",
         "General baseline"),
        ("neuml_pubmedbert",
         "NeuML/pubmedbert-base-embeddings",
         "NeuML embeddings"),
    )
}

# LLM models selectable by key: "type" names the provider and "name" the
# model identifier.
LLM_MODELS = {
    "gemini_pro": {"type": "gemini", "name": "models/gemini-2.5-pro"},
}
# Currently disabled alternatives (add them to LLM_MODELS to enable):
#   "gemini_flash": {"type": "gemini", "name": "models/gemini-2.5-flash"}
#   "gemma": {"type": "gemma", "name": "gemma-3-27b-it"}
#   "glm": {"type": "glm", "name": "glm-4-flash"}
#   "deepseek": {"type": "deepseek", "name": "deepseek-chat"}

# ============================================================================
# VALIDATION & DISPLAY
# ============================================================================

# Banner for this evaluation run.
print("\n" + "="*70)
print("ENTITY-LEVEL EVALUATION v2.0 (Semantic Matching)")
print("="*70)

# Validate selections: warn about selected keys that are not defined in
# LLM_MODELS, then drop them from UserConfig so later code only sees known
# keys. Note that this rewrites the UserConfig class attribute in place.
valid_llm_keys = set(LLM_MODELS.keys())
invalid_llm = [k for k in UserConfig.SELECTED_LLM_MODELS if k not in valid_llm_keys]
if invalid_llm:
    print(f"‚ö†Ô∏è  WARNING: Invalid LLM models: {invalid_llm}")
    UserConfig.SELECTED_LLM_MODELS = [k for k in UserConfig.SELECTED_LLM_MODELS if k in valid_llm_keys]

# Same validation for the embedding model selection.
valid_emb_keys = set(EMBEDDING_MODELS.keys())
invalid_emb = [k for k in UserConfig.SELECTED_EMBEDDING_MODELS if k not in valid_emb_keys]
if invalid_emb:
    print(f"‚ö†Ô∏è  WARNING: Invalid embedding models: {invalid_emb}")
    UserConfig.SELECTED_EMBEDDING_MODELS = [k for k in UserConfig.SELECTED_EMBEDDING_MODELS if k in valid_emb_keys]

# Echo the effective configuration.
print(f"\nüìÇ Data Directory: {UserConfig.DATA_DIR}")
print(f"üìÑ Ground Truth: {UserConfig.GT_FILENAME}")
print(f"\nü§ñ Selected LLM Models ({len(UserConfig.SELECTED_LLM_MODELS)}):")
for llm in UserConfig.SELECTED_LLM_MODELS:
    llm_info = LLM_MODELS[llm]
    # The marker shows whether a non-empty API key is configured for the
    # model's provider type.
    has_key = "‚úÖ" if UserConfig.API_KEYS.get(llm_info['type']) else "‚ùå"
    print(f"   {has_key} {llm}: {llm_info['name']}")

print(f"\nüß† Selected Embedding Models ({len(UserConfig.SELECTED_EMBEDDING_MODELS)}):")
for emb in UserConfig.SELECTED_EMBEDDING_MODELS:
    emb_info = EMBEDDING_MODELS[emb]
    print(f"   ‚úÖ {emb}: {emb_info['name']}")

print("="*70)

# ============================================================================
# EVALUATION CLASS
# ============================================================================


class EntityLevelReportEvaluator:
    """Evaluate a predicted report schema against its ground truth, entity by entity.

    One evaluation combines structural metrics from ``EntityLevelEvaluator``,
    structural error detection, a per-field comparison of matched entities,
    embedding similarities from the pre-loaded models and, optionally, LLM
    scores for a few matched pairs.
    """

    # Upper bound on matched pairs sent to each LLM per evaluation.
    MAX_LLM_PAIRS = 5
    # Number of leading report characters passed to the LLM as context.
    LLM_REPORT_CHARS = 500
    # Entity fields compared one by one in the per-field analysis.
    ANALYZED_FIELDS = ('general_finding', 'specific_finding', 'finding_presence', 'location', 'degree')

    def __init__(self, api_keys: dict, match_threshold: float = 0.6,
                 embedding_models: List[str] = None):
        """Create the matchers and pre-load the embedding models.

        Args:
            api_keys: API keys, looked up by the "type" of each LLM model.
            match_threshold: Minimum score to consider entities matched.
            embedding_models: Keys of ``EMBEDDING_MODELS`` to pre-load;
                ``None`` (the default) loads every configured model.
        """
        self.api_keys = api_keys
        # NOTE(review): stored but not passed to the matcher or evaluator
        # created below - confirm whether EntityLevelEvaluator should get it.
        self.match_threshold = match_threshold

        print("\nüîß Initializing Semantic Medical Matcher...")
        self.semantic_matcher = SemanticMedicalMatcher(use_embeddings=True)
        self.entity_evaluator = EntityLevelEvaluator(
            use_semantic_matching=True,
            use_llm_for_borderline=False,
            llm_evaluator=None
        )
        self.embedding_cache = {}
        print("üß† Pre-loading embedding models...")
        for emb_key, emb_config in EMBEDDING_MODELS.items():
            if embedding_models is not None and emb_key not in embedding_models:
                continue
            try:
                emb_eval = EmbeddingEvaluator(emb_config['name'])
                if emb_eval.model:
                    self.embedding_cache[emb_key] = emb_eval
                    print(f"   ‚úÖ Cached: {emb_key}")
                else:
                    print(f"   ‚ùå Failed: {emb_key}")
            except Exception as e:
                # A model that cannot be loaded is skipped, not fatal.
                print(f"   ‚ùå Error loading {emb_key}: {e}")

        print("‚úÖ Evaluator ready!")

    @staticmethod
    def _pair_similarity(emb_eval, gt_entity, pred_entity):
        """Return the embedding similarity of one GT/pred pair, or None on failure."""
        try:
            return float(emb_eval.compute_similarity(
                {'output': [gt_entity]},
                {'output': [pred_entity]}
            ))
        except Exception:
            # Best effort: a pair that cannot be scored is left out.
            return None

    def _best_similarity(self, emb_eval, gt_candidates, pred_candidates):
        """Return the highest similarity over all GT x pred pairs (0 if none scored)."""
        best_sim = 0
        for gt_ent in gt_candidates:
            for pred_ent in pred_candidates:
                sim = self._pair_similarity(emb_eval, gt_ent, pred_ent)
                if sim is not None:
                    best_sim = max(best_sim, sim)
        return best_sim

    @staticmethod
    def _score_stats(scores):
        """Summarize a non-empty list of scores as mean, std and count."""
        return {
            'mean': float(np.mean(scores)),
            'std': float(np.std(scores)),
            'count': len(scores)
        }

    def evaluate_single(self, gt_path: str, pred_path: str,
                       embedding_models: List[str] = None,
                       llm_models: List[str] = None) -> Dict:
        """Evaluate one predicted schema file against one ground-truth file.

        Args:
            gt_path: Path of the ground-truth JSON file.
            pred_path: Path of the predicted JSON file.
            embedding_models: Embedding model keys to score with; keys that
                were not pre-loaded are skipped. ``None`` or empty disables
                embedding evaluation.
            llm_models: Keys of ``LLM_MODELS`` to score with; models without
                an API key are skipped. ``None`` or empty disables it.

        Returns:
            A result dict. When the two "report" texts differ (ignoring
            surrounding whitespace) it has ``status == 'error'`` and an
            ``error`` message. Otherwise it has ``status == 'success'`` plus
            ``entity_metrics``, ``embedding_scores``, ``error_analysis``,
            ``llm_scores`` and ``detailed_matches``.
        """
        # The normalization step writes these files as UTF-8, so read them
        # as UTF-8 instead of relying on the platform default encoding.
        with open(gt_path, 'r', encoding='utf-8') as f:
            gt_schema = json.load(f)
        with open(pred_path, 'r', encoding='utf-8') as f:
            pred_schema = json.load(f)

        result = {
            'gt_file': Path(gt_path).name,
            'pred_file': Path(pred_path).name,
            'timestamp': datetime.now().isoformat(),
            'status': 'success'
        }

        # Report check: both schemas must describe the same report text.
        # A null or missing report counts as empty.
        gt_report = gt_schema.get('report') or ''
        pred_report = pred_schema.get('report') or ''
        if gt_report.strip() != pred_report.strip():
            return {**result, 'status': 'error', 'error': 'Reports do not match'}

        # Entity extraction
        gt_entities = self.entity_evaluator.flatten_entities(gt_schema)
        pred_entities = self.entity_evaluator.flatten_entities(pred_schema)

        print(f"\n{'='*70}")
        print(f"üìÅ Evaluating: {Path(pred_path).name}")
        print(f"{'='*70}")
        print(f"üìä Entity Count: GT={len(gt_entities)} | Pred={len(pred_entities)}")

        # Matching
        matches = self.entity_evaluator.match_entities(
            gt_entities, pred_entities, report_text=gt_report
        )

        # Metrics
        base_metrics = self.entity_evaluator.compute_metrics(matches)

        # Structural error detection (merged / split / contradictions)
        structural_errors = detect_entity_structural_errors(matches, gt_entities, pred_entities)

        # Per-field error analysis
        field_errors = self._analyze_field_errors(matches)

        result['entity_metrics'] = {
            'total_gt': len(gt_entities),
            'total_pred': len(pred_entities),
            'true_positives': base_metrics['true_positives'],
            'false_positives': base_metrics['false_positives'],
            'false_negatives': base_metrics['false_negatives'],
            'precision': base_metrics['precision'],
            'recall': base_metrics['recall'],
            'f1_score': base_metrics['f1_score'],
            'avg_match_quality': base_metrics['avg_match_quality'],
            'contradictions': base_metrics['contradiction_count'],
            'field_wise_accuracy': base_metrics.get('field_wise_accuracy', {}),
            'structural_errors': structural_errors,
            'field_error_analysis': field_errors
        }

        print(f"\nüìà STRUCTURAL (Entity-Level):")
        print(f"   Precision: {base_metrics['precision']:.3f}")
        print(f"   Recall:    {base_metrics['recall']:.3f}")
        print(f"   F1-Score:  {base_metrics['f1_score']:.3f}")
        print(f"   TP: {base_metrics['true_positives']} | FP: {base_metrics['false_positives']} | FN: {base_metrics['false_negatives']}")

        if structural_errors['merged_entities']:
            print(f"   üî¥ Merged Entities: {len(structural_errors['merged_entities'])}")
        if structural_errors['split_entities']:
            print(f"   üü° Split Entities: {len(structural_errors['split_entities'])}")
        if structural_errors['contradictions']:
            print(f"   ‚ö´ Contradictions: {len(structural_errors['contradictions'])}")

        # ==========================================
        # EMBEDDING EVALUATION (cached models + FN/FP analysis)
        # ==========================================
        result['embedding_scores'] = {}
        result['error_analysis'] = {
            'fp_semantic_scores': {},  # False Positive semantic similarities
            'fn_semantic_scores': {}   # False Negative semantic similarities
        }

        if embedding_models:
            print(f"\nüîç SEMANTIC (Embedding) Evaluation:")

            for emb_key in embedding_models:
                if emb_key not in self.embedding_cache:
                    continue

                emb_eval = self.embedding_cache[emb_key]
                print(f"   Using cached {emb_key}...")

                # 1. Matched pairs (TP)
                tp_scores = []
                for match in matches:
                    if match['match_type'] == 'matched':
                        sim = self._pair_similarity(emb_eval, match['gt_entity'], match['pred_entity'])
                        if sim is not None:
                            tp_scores.append(sim)

                # 2. False positives: similarity to the closest GT entity,
                #    to see whether the prediction is merely misaligned.
                fp_scores = [
                    self._best_similarity(emb_eval, gt_entities, [match['pred_entity']])
                    for match in matches
                    if match['match_type'] == 'false_positive' and match['pred_entity']
                ]

                # 3. False negatives: similarity to the closest predicted entity.
                fn_scores = [
                    self._best_similarity(emb_eval, [match['gt_entity']], pred_entities)
                    for match in matches
                    if match['match_type'] == 'false_negative' and match['gt_entity']
                ]

                # Store results
                if tp_scores:
                    result['embedding_scores'][emb_key] = {
                        **self._score_stats(tp_scores),
                        'type': 'true_positives'
                    }
                    print(f"      ‚úÖ TP: {np.mean(tp_scores):.3f} (n={len(tp_scores)})")

                if fp_scores:
                    result['error_analysis']['fp_semantic_scores'][emb_key] = self._score_stats(fp_scores)
                    print(f"      ‚ö†Ô∏è  FP (sem): {np.mean(fp_scores):.3f} (n={len(fp_scores)})")

                if fn_scores:
                    result['error_analysis']['fn_semantic_scores'][emb_key] = self._score_stats(fn_scores)
                    print(f"      ‚ö†Ô∏è  FN (sem): {np.mean(fn_scores):.3f} (n={len(fn_scores)})")

        # ==========================================
        # LLM EVALUATION
        # ==========================================
        result['llm_scores'] = {}

        if llm_models:
            print(f"\nü§ñ LLM Evaluation:")

            for llm_key in llm_models:
                if llm_key not in LLM_MODELS:
                    continue

                llm_config = LLM_MODELS[llm_key]
                api_key = self.api_keys.get(llm_config['type'])

                if not api_key:
                    print(f"   ‚ö†Ô∏è  No API key for {llm_key}")
                    continue

                print(f"   Testing {llm_key}...", end=" ")

                try:
                    llm_eval = LLMEvaluator(
                        model_type=llm_config['type'],
                        model_name=llm_config['name'],
                        api_key=api_key
                    )

                    # Only the first few matched pairs are scored, to limit
                    # the number of API calls.
                    llm_scores = []
                    matched_pairs = [m for m in matches if m['match_type'] == 'matched'][:self.MAX_LLM_PAIRS]

                    for i, match in enumerate(matched_pairs):
                        try:
                            llm_result = llm_eval.evaluate_schema_pair(
                                {'output': [match['gt_entity']], 'input': ''},
                                {'output': [match['pred_entity']], 'input': ''},
                                gt_report[:self.LLM_REPORT_CHARS]
                            )
                            # A response without a score is skipped rather
                            # than counted as 0, which would bias the mean.
                            score = llm_result.get('similarity_score')
                            if score is not None:
                                llm_scores.append(float(score))
                        except Exception as e:
                            print(f"\n      ‚ö†Ô∏è  Pair {i+1} failed: {str(e)[:50]}")
                            continue

                    if llm_scores:
                        avg_llm = np.mean(llm_scores)
                        result['llm_scores'][llm_key] = {
                            'mean': float(avg_llm),
                            'count': len(llm_scores)
                        }
                        print(f"‚úÖ {avg_llm:.3f} (n={len(llm_scores)})")
                    else:
                        print("‚ùå No scores")

                except Exception as e:
                    print(f"‚ùå Failed: {str(e)[:50]}")

        # Sample mismatches: show at most two of each kind.
        print(f"\nüîç Sample Mismatches:")
        fp_shown = fn_shown = 0
        for match in matches:
            if match['match_type'] == 'false_positive' and fp_shown < 2 and match.get('pred_entity'):
                ent = match['pred_entity']
                print(f"   FP: {ent.get('general_finding')} @ {ent.get('location')}")
                fp_shown += 1
            if match['match_type'] == 'false_negative' and fn_shown < 2 and match.get('gt_entity'):
                ent = match['gt_entity']
                print(f"   FN: {ent.get('general_finding')} @ {ent.get('location')}")
                fn_shown += 1

        result['detailed_matches'] = matches
        return result

    def _analyze_field_errors(self, matches: List[Dict]) -> Dict:
        """Compute per-field agreement over the matched entity pairs.

        Only matches with ``match_type == 'matched'`` are considered. For
        each field in ``ANALYZED_FIELDS`` the GT and predicted values are
        compared with ``_values_match``. A missing entity on either side is
        treated as having no values.

        Returns:
            Dict mapping each field name to ``accuracy``, ``total``,
            ``errors`` (count) and ``error_rate``; empty when there are no
            matched pairs.
        """
        field_stats = defaultdict(lambda: {'total': 0, 'correct': 0, 'errors': []})

        for match in matches:
            if match['match_type'] != 'matched':
                continue

            gt_ent = match.get('gt_entity') or {}
            pred_ent = match.get('pred_entity') or {}

            for field in self.ANALYZED_FIELDS:
                gt_val = gt_ent.get(field)
                pred_val = pred_ent.get(field)

                stats = field_stats[field]
                stats['total'] += 1

                # Normalized equality check (no semantic similarity here).
                if self._values_match(gt_val, pred_val):
                    stats['correct'] += 1
                else:
                    stats['errors'].append({
                        'gt': str(gt_val)[:50],
                        'pred': str(pred_val)[:50]
                    })

        # Calculate accuracy per field
        result = {}
        for field, stats in field_stats.items():
            if stats['total'] > 0:
                result[field] = {
                    'accuracy': stats['correct'] / stats['total'],
                    'total': stats['total'],
                    'errors': len(stats['errors']),
                    'error_rate': (stats['total'] - stats['correct']) / stats['total']
                }

        return result

    def _values_match(self, val1, val2) -> bool:
        """Return True when two field values are equal after normalization.

        Each value is reduced to a set of stripped, lower-cased strings: a
        list contributes its items, a scalar is a one-item set, and ``None``
        or blank items contribute nothing. So ``['mild']`` matches
        ``'Mild'``, and an empty list matches ``None``; order and duplicates
        inside lists are ignored.
        """
        def canonical(value):
            if value is None:
                return frozenset()
            items = value if isinstance(value, (list, tuple, set)) else [value]
            cleaned = (str(item).strip().lower() for item in items if item is not None)
            return frozenset(text for text in cleaned if text)

        return canonical(val1) == canonical(val2)
    

def bootstrap_confidence_interval(data: List[float], n_bootstrap: int = 1000, confidence: float = 0.95, seed: int = None) -> Tuple[float, float]:
    """
    Estimate a percentile-bootstrap confidence interval for the mean of ``data``.

    Args:
        data: Observed metric values, one per sample.
        n_bootstrap: Number of resamples drawn with replacement (must be >= 1
            when ``data`` holds two or more values).
        confidence: Coverage of the interval, e.g. 0.95 for a 95% interval.
        seed: Optional seed for a private random generator so the interval is
            reproducible. When omitted, NumPy's global random state is used,
            exactly as before.

    Returns:
        ``(lower, upper)`` as plain floats. With a single observation both
        bounds equal that value; with no observations both bounds are NaN.

    Raises:
        ValueError: If ``n_bootstrap`` is smaller than 1.
    """
    values = np.asarray(data, dtype=float)
    n = values.size

    # Degenerate inputs: nothing to resample. Handled explicitly so that an
    # empty list does not trigger NumPy's "mean of empty slice" warning.
    if n == 0:
        nan = float('nan')
        return (nan, nan)
    if n == 1:
        only = float(values[0])
        return (only, only)

    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    # A private generator keeps seeded runs reproducible without touching
    # the global random state that other code may rely on.
    rng = np.random.default_rng(seed) if seed is not None else np.random

    # Draw every resample at once: one row per bootstrap replicate.
    resamples = rng.choice(values, size=(n_bootstrap, n), replace=True)
    bootstrap_means = resamples.mean(axis=1)

    alpha = 1 - confidence
    lower, upper = np.percentile(
        bootstrap_means,
        [100 * alpha / 2, 100 * (1 - alpha / 2)],
    )

    return (float(lower), float(upper))

def calculate_correlations(results: List[Dict]) -> Dict:
    """
    Calculate correlations between Structural F1 and Semantic scores.

    For each known embedding model, the structural F1 of every successful
    result is paired with that model's mean semantic score. Results that have
    no score for the model are left out of the pairing instead of being
    counted as 0.0, which would distort the correlation.

    Args:
        results: Evaluation results; only entries whose ``status`` is
            ``'success'`` are used. Each is expected to provide
            ``entity_metrics['f1_score']`` and, optionally,
            ``embedding_scores[<model>]['mean']``.

    Returns:
        A dict keyed ``'structural_vs_<model>'`` with ``pearson_r`` and the
        mean/std of both paired series. A model is omitted when fewer than two
        results carry a score for it or when all of its scores are zero.
        ``pearson_r`` is 0.0 when either series is constant (the coefficient
        is undefined there). Returns ``{}`` with fewer than two successful
        results.
    """
    successful = [r for r in results if r.get('status') == 'success']
    if len(successful) < 2:
        return {}

    correlations = {}

    # Correlate with each embedding model
    for emb_key in ['pubmedbert', 's_pubmedbert', 'general_baseline', 'neuml_pubmedbert']:
        # Keep only the samples that actually have a score for this model.
        pairs = []
        for r in successful:
            emb_entry = (r.get('embedding_scores') or {}).get(emb_key) or {}
            emb_mean = emb_entry.get('mean')
            if emb_mean is None:
                continue
            pairs.append((r['entity_metrics']['f1_score'], emb_mean))

        if len(pairs) < 2:
            continue

        structural_scores = [p[0] for p in pairs]
        emb_scores = [p[1] for p in pairs]

        # All-zero scores carry no signal; such models were skipped before too.
        if not any(emb_scores):
            continue

        # Pearson r is undefined for a constant series (np.corrcoef would
        # return NaN with a warning), so report 0.0 instead.
        is_constant = (
            min(structural_scores) == max(structural_scores)
            or min(emb_scores) == max(emb_scores)
        )
        if is_constant:
            pearson_r = 0.0
        else:
            pearson_r = float(np.corrcoef(structural_scores, emb_scores)[0, 1])
            if not np.isfinite(pearson_r):
                pearson_r = 0.0

        correlations[f'structural_vs_{emb_key}'] = {
            'pearson_r': pearson_r,
            'structural_mean': float(np.mean(structural_scores)),
            'structural_std': float(np.std(structural_scores)),
            'semantic_mean': float(np.mean(emb_scores)),
            'semantic_std': float(np.std(emb_scores))
        }

    return correlations

def analyze_structural_vs_semantic_gap(results: List[Dict]) -> Dict:
    """
    Summarize how far the PubMedBERT semantic score sits from the structural F1.

    Only results with ``status == 'success'`` and a positive PubMedBERT mean
    contribute. For each, ``gap = semantic - structural`` and the normalized
    gap is ``gap / structural`` (0 when the structural score is not positive).

    Returns:
        ``{}`` when no result contributes; otherwise a dict with the mean gap,
        the mean normalized gap, the entries whose normalized gap exceeds 0.5,
        and the number of contributing samples.
    """
    entries = []

    for record in results:
        if record.get('status') == 'success':
            structural = record['entity_metrics']['f1_score']
            semantic = record.get('embedding_scores', {}).get('pubmedbert', {}).get('mean', 0)

            if semantic > 0:
                difference = semantic - structural
                if structural > 0:
                    relative = difference / structural
                else:
                    relative = 0

                if difference > 0.3:
                    verdict = 'Meaning captured but exact matching failed'
                else:
                    verdict = 'Aligned'

                entries.append({
                    'sample': record['pred_file'],
                    'structural': structural,
                    'semantic': semantic,
                    'gap': difference,
                    'normalized_gap': relative,
                    'interpretation': verdict
                })

    if len(entries) == 0:
        return {}

    raw_gaps = [entry['gap'] for entry in entries]
    relative_gaps = [entry['normalized_gap'] for entry in entries]
    divergent = [entry for entry in entries if entry['normalized_gap'] > 0.5]

    return {
        'mean_gap': float(np.mean(raw_gaps)),
        'mean_normalized_gap': float(np.mean(relative_gaps)),
        'high_divergence_samples': divergent,
        'sample_count': len(entries)
    }

def evaluate_directory(
    data_dir: str = None,
    gt_filename: str = None,
    output_dir: str = None,
    embedding_models: List[str] = None,
    llm_models: List[str] = None
):
    """
    Evaluate every ``sample*.json`` file in a directory against one ground-truth file.

    Each file is passed to ``EntityLevelReportEvaluator.evaluate_single``; the
    result (minus its ``'detailed_matches'`` entry) is saved as
    ``result_<stem>.json`` in the output directory, ``_generate_summary`` is
    called on all results, and a comparison table is printed.

    Args:
        data_dir: Directory holding the ground truth and the sample files.
            Falls back to ``UserConfig.DATA_DIR``.
        gt_filename: Ground-truth file name inside ``data_dir``. Falls back to
            ``UserConfig.GT_FILENAME``.
        output_dir: Where results are written (created if missing). Falls back
            to ``<data_dir>/entity_level_results``.
        embedding_models: Embedding model keys forwarded to the evaluator.
            Falls back to ``UserConfig.SELECTED_EMBEDDING_MODELS``.
        llm_models: LLM model keys forwarded to the evaluator. Falls back to
            ``UserConfig.SELECTED_LLM_MODELS``.

    Returns:
        The list of per-file result dicts, or ``None`` when the ground-truth
        file does not exist or no ``sample*.json`` file is found.
    """
    # Defaults
    data_dir = data_dir or UserConfig.DATA_DIR
    gt_filename = gt_filename or UserConfig.GT_FILENAME
    output_dir = output_dir or (Path(data_dir) / 'entity_level_results')
    embedding_models = embedding_models or UserConfig.SELECTED_EMBEDDING_MODELS
    llm_models = llm_models or UserConfig.SELECTED_LLM_MODELS

    data_path = Path(data_dir)
    gt_path = data_path / gt_filename
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    if not gt_path.exists():
        print(f"‚ùå GT file not found: {gt_path}")
        return

    # Find test files
    test_files = sorted(data_path.glob("sample*.json"))

    if not test_files:
        print(f"‚ö†Ô∏è No test files found in {data_dir}")
        return

    print(f"\n{'='*70}")
    print(f"BATCH EVALUATION")
    print(f"{'='*70}")
    print(f"Directory: {data_dir}")
    print(f"GT: {gt_filename}")
    print(f"Test files: {len(test_files)}")

    # Initialize evaluator
    evaluator = EntityLevelReportEvaluator(
        api_keys=UserConfig.API_KEYS,
        match_threshold=UserConfig.MATCH_THRESHOLD
    )

    # Evaluate each file
    all_results = []

    for test_file in test_files:
        result = evaluator.evaluate_single(
            str(gt_path),
            str(test_file),
            embedding_models=embedding_models,
            llm_models=llm_models
        )

        all_results.append(result)

        # Save individual result.
        # Don't save full matches to keep file size reasonable.
        result_summary = {k: v for k, v in result.items() if k != 'detailed_matches'}
        result_file = output_path / f"result_{test_file.stem}.json"
        # ensure_ascii=False writes raw non-ASCII text, so the encoding must be
        # pinned; the platform default may not be able to represent it.
        with open(result_file, 'w', encoding='utf-8') as f:
            json.dump(result_summary, f, indent=2, ensure_ascii=False)

        print(f"\n  üíæ Saved: {result_file.name}")

    # Generate summary
    _generate_summary(all_results, output_path)

    # Final comparison table
    print("\n" + "="*70)
    print("üìä COMPREHENSIVE COMPARISON TABLE")
    print("="*70)
    print(f"{'Sample':<15} {'Structural':<12} {'PubMedBERT':<12} {'S-PubMedB':<12} {'LLM':<10}")
    print("-"*70)

    for r in all_results:
        if r.get('status') != 'success':
            print(f"{r['pred_file']:<15} {'ERROR':<12}")
            continue

        sample = r['pred_file'].replace('sample', '').replace('.json', '')
        struct_f1 = r['entity_metrics']['f1_score']

        # Get embedding scores (default to 0 if missing)
        emb_pub = r.get('embedding_scores', {}).get('pubmedbert', {}).get('mean', 0)
        emb_spub = r.get('embedding_scores', {}).get('s_pubmedbert', {}).get('mean', 0)

        # Get LLM score
        llm_score = r.get('llm_scores', {}).get('gemini_pro', {}).get('mean', 0)

        print(f"{sample:<15} {struct_f1:<12.3f} {emb_pub:<12.3f} {emb_spub:<12.3f} {llm_score:<10.3f}")

    print("-"*70)

    # Interpretation guide
    print(f"\nüìã Interpretation:")
    print(f"   ‚Ä¢ Structural: Entity-level F1 (exact matching)")
    print(f"   ‚Ä¢ PubMedBERT: Semantic similarity (medical context)")
    print(f"   ‚Ä¢ S-PubMedB:  Sentence-level semantic similarity")
    print(f"   ‚Ä¢ LLM:        Gemini clinical assessment")
    print(f"\nüí° Note: If Semantic > Structural, model captures meaning")
    print(f"         but makes exact matching errors (synonyms, etc.)")

    return all_results


def _generate_summary(results: List[Dict], output_dir: Path):
    """Generate comprehensive summary report with statistics.

    Writes ``SUMMARY.txt`` (UTF-8) into ``output_dir`` with these sections:
    header counts, bootstrap confidence interval of the structural F1,
    per-field error totals, per-sample details, a comparison table,
    structural error examples, correlation analysis, semantic-structural gap
    analysis and aggregate averages.

    Args:
        results: Per-sample result dicts. Entries whose ``status`` is not
            ``'success'`` are reported as errors and excluded from statistics.
        output_dir: Existing directory that receives ``SUMMARY.txt``.
    """
    summary_path = output_dir / "SUMMARY.txt"

    successful = [r for r in results if r.get('status') == 'success']

    # Statistical calculations
    f1_scores = [r['entity_metrics']['f1_score'] for r in successful]
    f1_ci = bootstrap_confidence_interval(f1_scores) if f1_scores else (0, 0)
    correlations = calculate_correlations(results)
    gap_analysis = analyze_structural_vs_semantic_gap(results)

    # The report contains non-ASCII symbols, so the encoding is pinned instead
    # of relying on the platform default (which may fail to encode them).
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("="*70 + "\n")
        f.write("ENTITY-LEVEL EVALUATION SUMMARY v2.1 (Professional)\n")
        f.write("="*70 + "\n\n")
        f.write(f"Timestamp: {datetime.now().isoformat()}\n")
        f.write(f"Total samples: {len(results)}\n")
        f.write(f"Successful: {len(successful)}\n")
        f.write(f"Failed: {len(results) - len(successful)}\n\n")

        # Confidence Intervals
        if f1_scores:
            f.write("CONFIDENCE INTERVALS (95% Bootstrap):\n")
            f.write("-"*70 + "\n")
            f.write(f"Structural F1: {np.mean(f1_scores):.3f} [{f1_ci[0]:.3f}, {f1_ci[1]:.3f}]\n\n")

        # Per-field error analysis
        f.write("PER-FIELD ERROR ANALYSIS:\n")
        f.write("-"*70 + "\n")
        all_field_errors = defaultdict(lambda: {'total': 0, 'errors': 0})
        for r in successful:
            field_analysis = r['entity_metrics'].get('field_error_analysis', {})
            for field, stats in field_analysis.items():
                all_field_errors[field]['total'] += stats.get('total', 0)
                all_field_errors[field]['errors'] += stats.get('errors', 0)

        # Fields with the most errors come first.
        for field, stats in sorted(all_field_errors.items(), key=lambda x: x[1]['errors'], reverse=True):
            error_rate = stats['errors'] / stats['total'] * 100 if stats['total'] > 0 else 0
            f.write(f"  {field:25s}: {error_rate:5.1f}% error ({stats['errors']}/{stats['total']})\n")
        f.write("\n")

        # Per-sample detailed results
        f.write("PER-SAMPLE RESULTS:\n")
        f.write("-"*70 + "\n")

        for result in results:
            # Anything that is not a success is reported as an error, matching
            # the comparison table and the statistics above.
            if result.get('status') != 'success':
                error_msg = result.get('error', 'unknown error')
                f.write(f"\n{result['pred_file']}: ‚ùå ERROR - {error_msg}\n")
                continue

            metrics = result['entity_metrics']
            f.write(f"\n{result['pred_file']}:\n")
            f.write(f"  üìä STRUCTURAL (Entity-Level):\n")
            f.write(f"    Entities: GT={metrics['total_gt']}, Pred={metrics['total_pred']}\n")
            f.write(f"    Precision: {metrics['precision']:.3f}\n")
            f.write(f"    Recall:    {metrics['recall']:.3f}\n")
            f.write(f"    F1-Score:  {metrics['f1_score']:.3f}\n")
            f.write(f"    TP: {metrics['true_positives']}, FP: {metrics['false_positives']}, FN: {metrics['false_negatives']}\n")

            # Field-wise accuracy
            if metrics.get('field_wise_accuracy'):
                f.write(f"  üî¨ Field-wise Accuracy:\n")
                for field, acc in metrics['field_wise_accuracy'].items():
                    f.write(f"    {field}: {acc:.3f}\n")

            # Embedding scores
            if result.get('embedding_scores'):
                f.write(f"  üîç SEMANTIC (Embeddings):\n")
                for emb_name, emb_data in result['embedding_scores'].items():
                    f.write(f"    {emb_name}: {emb_data['mean']:.3f} (n={emb_data['count']})\n")

                # FN/FP Semantic scores
                if result.get('error_analysis'):
                    f.write(f"  ‚ö†Ô∏è  Error Semantic Analysis:\n")
                    for emb_name, fp_data in result['error_analysis'].get('fp_semantic_scores', {}).items():
                        f.write(f"    FP {emb_name}: {fp_data['mean']:.3f} (high=missed alignment)\n")

            # LLM scores
            if result.get('llm_scores'):
                f.write(f"  ü§ñ LLM:\n")
                for llm_name, llm_data in result['llm_scores'].items():
                    f.write(f"    {llm_name}: {llm_data['mean']:.3f}\n")

            # Structural errors
            s_errors = metrics.get('structural_errors', {})
            if s_errors.get('merged_entities'):
                f.write(f"  üî¥ Merged Entities: {len(s_errors['merged_entities'])}\n")
                for me in s_errors['merged_entities'][:2]:  # Show first 2
                    f.write(f"      - {me['description']}\n")
            if s_errors.get('contradictions'):
                f.write(f"  ‚ö´ Contradictions: {len(s_errors['contradictions'])}\n")

        # Comparison Table
        f.write("\n" + "="*70 + "\n")
        f.write("COMPARISON TABLE (Structural vs Semantic vs LLM)\n")
        f.write("-"*70 + "\n")
        f.write(f"{'Sample':<15} {'Structural':<12} {'PubMedBERT':<12} {'S-PubMedB':<12} {'LLM':<10}\n")
        f.write("-"*70 + "\n")

        for r in results:
            if r.get('status') != 'success':
                f.write(f"{r['pred_file']:<15} {'ERROR':<12}\n")
                continue

            sample_name = r['pred_file'].replace('sample', '').replace('.json', '')
            struct_f1 = r['entity_metrics']['f1_score']
            # Missing scores are shown as 0.
            emb_pub = r.get('embedding_scores', {}).get('pubmedbert', {}).get('mean', 0)
            emb_spub = r.get('embedding_scores', {}).get('s_pubmedbert', {}).get('mean', 0)
            llm_score = r.get('llm_scores', {}).get('gemini_pro', {}).get('mean', 0)

            f.write(f"{sample_name:<15} {struct_f1:<12.3f} {emb_pub:<12.3f} {emb_spub:<12.3f} {llm_score:<10.3f}\n")

        # Structural Error Summary - WITH EXAMPLES
        f.write("\n" + "="*70 + "\n")
        f.write("STRUCTURAL ERROR ANALYSIS\n")
        f.write("-"*70 + "\n")

        # Collect actual error objects from all samples (not just counts)
        all_merged = []
        all_splits = []
        all_contra = []

        for r in successful:
            errors = r['entity_metrics'].get('structural_errors', {})
            all_merged.extend(errors.get('merged_entities', []))
            all_splits.extend(errors.get('split_entities', []))
            all_contra.extend(errors.get('contradictions', []))

        f.write(f"Total Merged Entities: {len(all_merged)}\n")
        f.write(f"Total Split Entities: {len(all_splits)}\n")
        f.write(f"Total Contradictions: {len(all_contra)}\n")

        # Show actual merged entity examples
        if all_merged:
            f.write(f"\nüî¥ MERGED ENTITY EXAMPLES:\n")
            for i, me in enumerate(all_merged[:3], 1):
                f.write(f"  {i}. {me.get('description', 'N/A')}\n")
                f.write(f"     Severity: {me.get('severity', 'N/A')}\n")
                f.write(f"     Impact: {me.get('impact', 'N/A')}\n")
                gt_ents = me.get('gt_entities', [])
                if gt_ents:
                    # str() guards against None / non-string findings, which
                    # would make str.join raise.
                    findings = [str(g.get('finding', 'Unknown')) for g in gt_ents]
                    f.write(f"     Source entities: {', '.join(findings)}\n")
                pred = me.get('pred_entity', {})
                if pred:
                    degrees = pred.get('degree', [])
                    deg_str = ', '.join(str(d) for d in degrees) if degrees else 'None'
                    f.write(f"     Merged into: {pred.get('finding', 'Unknown')} "
                           f"(degrees: {deg_str})\n")
                f.write("\n")

            if len(all_merged) > 3:
                f.write(f"     ... and {len(all_merged) - 3} more\n")

        # Show split entity examples
        if all_splits:
            f.write(f"\nüü° SPLIT ENTITY EXAMPLES:\n")
            for i, se in enumerate(all_splits[:3], 1):
                gt_ent = se.get('gt_entity', {})
                split_into = ', '.join(str(part) for part in se.get('split_into', []))
                f.write(f"  {i}. GT: {gt_ent.get('finding', 'Unknown')} "
                       f"‚Üí Split into: {split_into}\n")
                f.write(f"     Severity: {se.get('severity', 'N/A')}\n")
                f.write(f"     Description: {se.get('description', 'N/A')}\n")

        # Show contradiction examples
        if all_contra:
            f.write(f"\n‚ö´ CONTRADICTION EXAMPLES:\n")
            for i, co in enumerate(all_contra[:3], 1):
                f.write(f"  {i}. {co.get('finding', 'Unknown')}: "
                       f"GT='{co.get('gt_presence', 'N/A')}' vs "
                       f"Pred='{co.get('pred_presence', 'N/A')}'\n")

        # Correlation Analysis
        if correlations:
            f.write("\n" + "="*70 + "\n")
            f.write("CORRELATION ANALYSIS (Structural vs Semantic)\n")
            f.write("-"*70 + "\n")
            for corr_name, corr_data in correlations.items():
                f.write(f"{corr_name}:\n")
                f.write(f"  Pearson r: {corr_data['pearson_r']:.3f}\n")
                f.write(f"  Structural: {corr_data['structural_mean']:.3f} ¬± {corr_data['structural_std']:.3f}\n")
                f.write(f"  Semantic:   {corr_data['semantic_mean']:.3f} ¬± {corr_data['semantic_std']:.3f}\n")
                if abs(corr_data['pearson_r']) < 0.3:
                    f.write(f"  ‚ö†Ô∏è  Low correlation: Metrics measure different aspects\n")
                elif corr_data['pearson_r'] > 0.7:
                    f.write(f"  ‚úÖ High correlation: Metrics are aligned\n")
                f.write("\n")

        # Semantic vs Structural Gap Analysis
        if gap_analysis:
            f.write("="*70 + "\n")
            f.write("SEMANTIC-STRUCTURAL DIVERGENCE ANALYSIS\n")
            f.write("-"*70 + "\n")
            f.write(f"Mean Gap: {gap_analysis['mean_gap']:.3f}\n")
            f.write(f"Mean Normalized Gap: {gap_analysis['mean_normalized_gap']:.3f}\n")
            if gap_analysis['high_divergence_samples']:
                f.write(f"\nHigh Divergence Samples (semantic >> structural):\n")
                for s in gap_analysis['high_divergence_samples'][:3]:
                    f.write(f"  - {s['sample']}: Structural={s['structural']:.3f}, Semantic={s['semantic']:.3f}\n")
                    f.write(f"    Interpretation: {s['interpretation']}\n")

        # Aggregate Statistics
        f.write("\n" + "="*70 + "\n")
        f.write("AGGREGATE STATISTICS\n")
        f.write("-"*70 + "\n")

        if successful:
            avg_precision = np.mean([r['entity_metrics']['precision'] for r in successful])
            avg_recall = np.mean([r['entity_metrics']['recall'] for r in successful])
            avg_f1 = np.mean(f1_scores)

            f.write(f"\nAverage Structural Precision: {avg_precision:.3f}\n")
            f.write(f"Average Structural Recall:    {avg_recall:.3f}\n")
            f.write(f"Average Structural F1-Score:  {avg_f1:.3f}\n")

            # Average embeddings (only over samples that have the model's score)
            for emb_name in ['pubmedbert', 's_pubmedbert', 'general_baseline', 'neuml_pubmedbert']:
                scores = [r.get('embedding_scores', {}).get(emb_name, {}).get('mean', 0)
                         for r in successful if r.get('embedding_scores', {}).get(emb_name)]
                if scores:
                    ci = bootstrap_confidence_interval(scores)
                    f.write(f"Average {emb_name}: {np.mean(scores):.3f} [{ci[0]:.3f}, {ci[1]:.3f}]\n")

    print(f"\nüìÑ Summary saved: {summary_path}")


# ============================================================================
# RUN EVALUATION
# ============================================================================

print(f"\n{'=' * 70}")
print("üöÄ STARTING EVALUATION")
print("=" * 70)

# Run the batch evaluation with its default arguments and keep the
# returned per-sample results in `results`.
results = evaluate_directory()

print("\n‚úÖ DONE!")

  from .autonotebook import tqdm as notebook_tqdm



ENTITY-LEVEL EVALUATION v2.0 (Semantic Matching)

📂 Data Directory: ./data_report/0/
📄 Ground Truth: gt0.json

🤖 Selected LLM Models (1):
   ✅ gemini_pro: models/gemini-2.5-pro

🧠 Selected Embedding Models (4):
   ✅ pubmedbert: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
   ✅ s_pubmedbert: pritamdeka/S-PubMedBert-MS-MARCO
   ✅ general_baseline: sentence-transformers/all-MiniLM-L6-v2
   ✅ neuml_pubmedbert: NeuML/pubmedbert-base-embeddings

🚀 STARTING EVALUATION

BATCH EVALUATION
Directory: ./data_report/0/
GT: gt0.json
Test files: 4

üîß Initializing Semantic Medical Matcher...
üîÑ Loading model: cambridgeltl/SapBERT-from-PubMedBERT-fulltext


No sentence-transformers model found with name cambridgeltl/SapBERT-from-PubMedBERT-fulltext. Creating a new one with mean pooling.
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 706.32it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: cambridgeltl/SapBERT-from-PubMedBERT-fulltext
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úÖ SapBERT loaded
üîÑ Loading model: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext


No sentence-transformers model found with name microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext. Creating a new one with mean pooling.
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 1123.92it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.decoder.bias               | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       

‚úÖ PubMedBERT loaded
‚ö° Using cached model: cambridgeltl/SapBERT-from-PubMedBERT-fulltext
‚úÖ SapBERT loaded
‚ö° Using cached model: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
‚úÖ PubMedBERT loaded
üß† Pre-loading embedding models...
    Loading: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext...


No sentence-transformers model found with name microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext. Creating a new one with mean pooling.
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 1389.71it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.decoder.bias               | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.bias                       

    ‚úÖ Loaded successfully
   ‚úÖ Cached: pubmedbert
    Loading: pritamdeka/S-PubMedBert-MS-MARCO...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 1634.87it/s, Materializing param=pooler.dense.weight]                               
BertModel LOAD REPORT from: pritamdeka/S-PubMedBert-MS-MARCO
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


    ‚úÖ Loaded successfully
   ‚úÖ Cached: s_pubmedbert
    Loading: sentence-transformers/all-MiniLM-L6-v2...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 1472.14it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


    ‚úÖ Loaded successfully
   ‚úÖ Cached: general_baseline
    Loading: NeuML/pubmedbert-base-embeddings...


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 199/199 [00:00<00:00, 1464.78it/s, Materializing param=pooler.dense.weight]                               


    ‚úÖ Loaded successfully
   ‚úÖ Cached: neuml_pubmedbert
‚úÖ Evaluator ready!

📁 Evaluating: sample0.0.json
📊 Entity Count: GT=30 | Pred=30

📈 STRUCTURAL (Entity-Level):
   Precision: 0.900
   Recall:    0.900
   F1-Score:  0.900
   TP: 27 | FP: 3 | FN: 3

🔍 SEMANTIC (Embedding) Evaluation:
   Using cached pubmedbert...
      ✅ TP: 0.998 (n=27)
      ⚠️  FP (sem): 0.995 (n=3)
      ⚠️  FN (sem): 1.000 (n=3)
   Using cached s_pubmedbert...
      ✅ TP: 0.992 (n=27)
      ⚠️  FP (sem): 0.981 (n=3)
      ⚠️  FN (sem): 1.000 (n=3)
   Using cached general_baseline...
      ✅ TP: 0.954 (n=27)
      ⚠️  FP (sem): 0.900 (n=3)
      ⚠️  FN (sem): 1.000 (n=3)
   Using cached neuml_pubmedbert...
      ✅ TP: 0.956 (n=27)
      ⚠️  FP (sem): 0.886 (n=3)
      ⚠️  FN (sem): 1.000 (n=3)

🤖 LLM Evaluation:
   Testing gemini_pro...    [DEBUG] Looking up rate limit for: 'gemini-2.5-pro'
   [DEBUG] Found sleep time: 20.0s
‚úì gemini modeli ba≈ülatƒ