In [None]:
import os
import json
import pandas as pd
from e_inference import run_inference_pipeline, CORRECTIONS_DIR, INPUT_DIR

def find_corresponding_input_file(base_name):
    """Locate the original input document that belongs to ``base_name``.

    ``INPUT_DIR`` is probed for ``base_name`` followed by each supported
    extension, in the order ``.pdf``, ``.png``, ``.jpg``, ``.jpeg``.

    Returns:
        The path of the first candidate that exists on disk, or ``None``
        when no candidate exists.
    """
    candidate_paths = (
        os.path.join(INPUT_DIR, base_name + suffix)
        for suffix in ['.pdf', '.png', '.jpg', '.jpeg']
    )
    # The generator is lazy, so probing stops at the first existing file.
    return next(
        (candidate for candidate in candidate_paths if os.path.exists(candidate)),
        None,
    )

def _normalize_eval_value(value):
    """Return a stripped, lower-cased comparison string for one field value."""
    # A field may arrive wrapped as {'value': ..., 'confidence': ...}; only the
    # extracted value should be compared, never the confidence score.
    if isinstance(value, dict) and 'value' in value:
        value = value['value']
    # None means "no value"; str(None) would otherwise yield the text 'none'.
    if value is None:
        return ''
    return str(value).strip().lower()


def flatten_json_for_eval(data):
    """Convert a nested extraction result into a flat set of comparable strings.

    Two kinds of items are produced:

    * ``patient_<key>=<value>`` for every non-empty entry of
      ``data['patient_info']``.
    * ``result_<row id>_<key>=<value>`` for every non-empty, non-primary
      field of every row in ``data['lab_results']``.

    Keys and values are stripped and lower-cased. A value wrapped as a dict
    with a ``'value'`` entry (e.g. ``{'value': 'x', 'confidence': 95.0}``) is
    unwrapped first, so plain and wrapped representations of the same field
    flatten to the same item. ``None`` values are treated as empty.

    Args:
        data: Dict with optional ``'patient_info'`` (dict) and
            ``'lab_results'`` (list of dicts) entries.

    Returns:
        A set of strings; empty when ``data`` is not a dict or holds no
        usable values.
    """
    flat_items = set()
    if not isinstance(data, dict):
        return flat_items

    # Flatten patient info. `or {}` also covers an explicit null entry.
    patient_info = data.get('patient_info') or {}
    if isinstance(patient_info, dict):
        for key, value in patient_info.items():
            key_norm = str(key).strip().lower()
            value_norm = _normalize_eval_value(value)
            if value_norm:  # Only add if there is a value
                flat_items.add(f"patient_{key_norm}={value_norm}")

    # Flatten lab results; rows that are not dicts cannot be flattened.
    raw_rows = data.get('lab_results') or []
    if not isinstance(raw_rows, (list, tuple)):
        return flat_items
    lab_results = [row for row in raw_rows if isinstance(row, dict)]
    if not lab_results:
        return flat_items

    # The row identifier column is chosen from the first row: the first key
    # mentioning 'investigation' or 'test', otherwise simply the first key.
    first_row_keys = list(lab_results[0])
    primary_key = next(
        (
            key for key in first_row_keys
            if 'investigation' in str(key).lower() or 'test' in str(key).lower()
        ),
        None,
    )
    if not primary_key:
        primary_key = next(iter(first_row_keys), None)
    if not primary_key:
        return flat_items

    for row in lab_results:
        # Rows without a usable identifier share the 'unknown_row' bucket.
        row_id = _normalize_eval_value(row.get(primary_key)) or 'unknown_row'
        for key, value in row.items():
            if key == primary_key:
                continue
            key_norm = str(key).strip().lower()
            value_norm = _normalize_eval_value(value)
            if value_norm:
                flat_items.add(f"result_{row_id}_{key_norm}={value_norm}")

    return flat_items

def compare_extractions(ground_truth, prediction):
    """
    Score a predicted extraction against its ground truth.

    Both structures are flattened with ``flatten_json_for_eval`` and the
    resulting sets are compared item by item.

    Returns:
        A ``(tp, fp, fn)`` tuple: items present in both sets, items only in
        the prediction, and items only in the ground truth.
    """
    expected_items = flatten_json_for_eval(ground_truth)
    predicted_items = flatten_json_for_eval(prediction)

    matched = expected_items & predicted_items
    spurious = predicted_items - expected_items
    missed = expected_items - predicted_items

    return len(matched), len(spurious), len(missed)

def run_evaluation():
    """Evaluate the inference pipeline against every corrected file.

    Each ``*_corrected.json`` file in ``CORRECTIONS_DIR`` is treated as ground
    truth. Its matching input document is looked up with
    ``find_corresponding_input_file``, run through ``run_inference_pipeline``,
    and compared to the ground truth with ``compare_extractions``. Files with
    no matching input document are skipped with a warning.

    A per-file table and the overall precision, recall and F1-score (computed
    from the summed counts) are printed. Nothing is returned.
    """
    suffix = '_corrected.json'
    total_tp, total_fp, total_fn = 0, 0, 0
    results = []

    # A missing corrections directory is handled like an empty one.
    try:
        dir_entries = os.listdir(CORRECTIONS_DIR)
    except FileNotFoundError:
        dir_entries = []

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # per-file report reproducible between runs.
    corrected_files = sorted(f for f in dir_entries if f.endswith(suffix))

    if not corrected_files:
        print(" No corrected files found. Cannot run evaluation.")
        return

    print(f"Found {len(corrected_files)} corrected files. Starting evaluation...")
    print("-" * 50)

    for filename in corrected_files:
        # Strip only the trailing suffix; str.replace would also remove any
        # earlier occurrence inside the file name.
        base_name = filename[:-len(suffix)]

        with open(os.path.join(CORRECTIONS_DIR, filename), 'r', encoding='utf-8') as f:
            ground_truth_data = json.load(f)

        original_file_path = find_corresponding_input_file(base_name)
        if not original_file_path:
            print(f"⚠️  Skipping '{filename}': Could not find original input file.")
            continue

        print(f"Processing: {os.path.basename(original_file_path)}...")
        predicted_data = run_inference_pipeline(original_file_path)

        tp, fp, fn = compare_extractions(ground_truth_data, predicted_data)

        total_tp += tp
        total_fp += fp
        total_fn += fn

        results.append({"File": base_name, "Correct (TP)": tp, "Incorrect (FP)": fp, "Missed (FN)": fn})

    # Each ratio falls back to 0 when its denominator is zero.
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("\n" + "="*50 + "\nEVALUATION RESULTS\n" + "="*50)
    if results:
        df = pd.DataFrame(results)
        print(df.to_string(index=False))

    print("\n" + "-"*50 + "\nOverall Performance Metrics:\n" + f"  Precision: {precision:.2%}\n" + f"  Recall:    {recall:.2%}\n" + f"  F1-Score:  {f1_score:.2%}\n" + "-"*50)

# Run the evaluation only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_evaluation()

Found 9 corrected files. Starting evaluation...
--------------------------------------------------
Processing: 17756177_50641000301.pdf...
{'patient_info': {'Name': 'Mr K P SHRAVAN Gender: Male LabID : 50641000301', 'Age’': '40 Years Mob. No. : 9035707662 Pt.ID : 6186848', 'B2B_ SC': 'Ref. By © VIKRAMKAMATH Pt. Loc', 'Reg Date and Time =': '29-Jun-2025 06:37 Report Date and Time : Ref Id1'}, 'lab_results': [{'Test Name': {'value': 'Adjusted Calcium', 'confidence': 95.0}, 'Result Value': {'value': '8.1', 'confidence': 96.0}, 'Unit': {'value': 'mg/dL', 'confidence': 81.0}, 'Reference Range': {'value': '8.8 10.4', 'confidence': 90.0}}, {'Test Name': {'value': 'Chloride', 'confidence': 96.0}, 'Result Value': {'value': '109', 'confidence': 96.0}, 'Unit': {'value': 'mEq/L', 'confidence': 82.0}, 'Reference Range': {'value': '98 107', 'confidence': 92.0}}, {'Test Name': {'value': 'LDL Cholesterol (Direct)', 'confidence': 95.0}, 'Result Value': {'value': '32', 'confidence': 96.0}, 'Unit': {'val