In [25]:
import json
from collections import defaultdict

def load_json(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

def compare_extracted_criteria(gpt_data, ground_truth_data):
    results = []
    mismatch_counts = defaultdict(lambda: defaultdict(int))  # Track mismatches by section and key

    for gpt_trial, gt_trial in zip(gpt_data, ground_truth_data):
        gpt_criteria = gpt_trial.get("Extracted Criteria", {})
        gt_criteria = gt_trial.get("Extracted Criteria", {})

        for section in gpt_criteria:
            # Skip specific sections
            if section in ["text_relating_to_medical_history", "unused_text"]:
                continue

            gpt_section = gpt_criteria.get(section, {})
            gt_section = gt_criteria.get(section, {})

            for key, gt_value in gt_section.items():
                gpt_value = gpt_section.get(key, None)
                if (gpt_value in ["Not required", "Both allowed","Allowed",0] and not gt_value) or \
                   (gt_value in ["Not required", "Both allowed", "Allowed",0] and not gpt_value):
                    match = True
                else:
                    match = gt_value == gpt_value


                results.append({
                    "section": section,
                    "key": key,
                    "ground_truth": gt_value,
                    "gpt_extracted": gpt_value,
                    "match": match
                })

                # Increment mismatch count for the key in the section if there's a mismatch
                if not match:
                    mismatch_counts[section][key] += 1

    return results, mismatch_counts


def calculate_accuracy(results):
    total = len(results)
    matches = sum(1 for result in results if result["match"])
    return matches / total if total > 0 else 0


def main():
    gpt_file = "extracted_criteria_30_new_merged.json"
    ground_truth_file = "ground_truth_30.json"

    # Load data
    gpt_data = load_json(gpt_file)
    ground_truth_data = load_json(ground_truth_file)

    # Compare criteria
    comparison_results, mismatch_counts = compare_extracted_criteria(gpt_data, ground_truth_data)

    accuracy = calculate_accuracy(comparison_results)
    print(f"Overall Accuracy: {accuracy:.2f}")

    mismatches = [r for r in comparison_results if not r["match"]]
    if mismatches:
        print("\nMismatched Entries:")
        for mismatch in mismatches:
            print(mismatch)

    print("\nMismatch Count by Section and Key:")
    for section, keys in mismatch_counts.items():
        print(f"Section: {section}")
        for key, count in keys.items():
            print(f"  {key}: {count} mismatches")

if __name__ == "__main__":
    main()

Overall Accuracy: 0.72

Mismatched Entries:
{'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
{'section': 'disease_characteristics', 'key': 'disease_progression_after_previous_treatment', 'ground_truth': 'Required', 'gpt_extracted': None, 'match': False}
{'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
{'section': 'health_and_organ_function', 'key': 'anc_level_lower_bound_10_9_l', 'ground_truth': 3, 'gpt_extracted': None, 'match': False}
{'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
{'section': 'demographics_and_general_characteristics', 'key': 'months_of_life_expectancy', 'ground_truth': 4, 'gpt_extracted': 3, 'match': False}
{'section': 'di

In [22]:
import json
from collections import defaultdict

def load_json(filepath):
    with open(filepath, 'r') as f:
        return json.load(f)

def compare_extracted_criteria(gpt_data, ground_truth_data):
    results = []
    mismatch_counts = defaultdict(lambda: defaultdict(int))  # Track mismatches by section and key

    for trial_index, (gpt_trial, gt_trial) in enumerate(zip(gpt_data, ground_truth_data), start=1):
        gpt_criteria = gpt_trial.get("Extracted Criteria", {})
        gt_criteria = gt_trial.get("Extracted Criteria", {})

        for section in gpt_criteria:
            # Skip specific sections
            if section in ["text_relating_to_medical_history", "unused_text"]:
                continue

            gpt_section = gpt_criteria.get(section, {})
            gt_section = gt_criteria.get(section, {})

            for key, gt_value in gt_section.items():
                gpt_value = gpt_section.get(key, None)
                if (gpt_value in ["Not required", "Both allowed", "Allowed", 0] and not gt_value) or \
                   (gt_value in ["Not required", "Both allowed", "Allowed", 0] and not gpt_value):
                    match = True
                else:
                    match = gt_value == gpt_value

                results.append({
                    "trial_number": trial_index,  # Add trial number
                    "section": section,
                    "key": key,
                    "ground_truth": gt_value,
                    "gpt_extracted": gpt_value,
                    "match": match
                })

                # Increment mismatch count for the key in the section if there's a mismatch
                if not match:
                    mismatch_counts[section][key] += 1

    return results, mismatch_counts


def calculate_accuracy(results):
    total = len(results)
    matches = sum(1 for result in results if result["match"])
    return matches / total if total > 0 else 0


def main():
    gpt_file = "extracted_criteria_30_new_merged.json"
    ground_truth_file = "ground_truth_30.json"

    # Load data
    gpt_data = load_json(gpt_file)
    ground_truth_data = load_json(ground_truth_file)

    # Compare criteria
    comparison_results, mismatch_counts = compare_extracted_criteria(gpt_data, ground_truth_data)

    accuracy = calculate_accuracy(comparison_results)
    print(f"Overall Accuracy: {accuracy:.2f}")

    mismatches = [r for r in comparison_results if not r["match"]]
    if mismatches:
        print("\nMismatched Entries:")
        for mismatch in mismatches:
            print(f"Trial {mismatch['trial_number']}: {mismatch}")

    print("\nMismatch Count by Section and Key:")
    for section, keys in mismatch_counts.items():
        print(f"Section: {section}")
        for key, count in keys.items():
            print(f"  {key}: {count} mismatches")

if __name__ == "__main__":
    main()

Overall Accuracy: 0.72

Mismatched Entries:
Trial 5: {'trial_number': 5, 'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
Trial 9: {'trial_number': 9, 'section': 'disease_characteristics', 'key': 'disease_progression_after_previous_treatment', 'ground_truth': 'Required', 'gpt_extracted': None, 'match': False}
Trial 9: {'trial_number': 9, 'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
Trial 10: {'trial_number': 10, 'section': 'health_and_organ_function', 'key': 'anc_level_lower_bound_10_9_l', 'ground_truth': 3, 'gpt_extracted': None, 'match': False}
Trial 10: {'trial_number': 10, 'section': 'health_and_organ_function', 'key': 'untreated_central_nervous_system_metastases', 'ground_truth': 'Not allowed', 'gpt_extracted': None, 'match': False}
Trial 13: {'trial_

In [27]:
import json
import difflib

def normalize_value(value):
    """
    Normalize values to make comparison more flexible
    """
    if value is None:
        return None
    
    # Convert to string and lowercase
    str_value = str(value).lower().strip()
    
    # Handle numeric values
    if str_value.isdigit():
        return int(str_value)
    
    # Normalize boolean-like values
    if str_value in ['0', 'none', 'not allowed', 'not required', 'false']:
        return 'not allowed'
    
    if str_value in ['1', 'required', 'allowed', 'true']:
        return 'allowed'
    
    return str_value

def filter_criteria(criteria):
    """
    Remove 'text_relating_to_medical_history' and 'unused_text' from criteria
    """
    if not isinstance(criteria, dict):
        return criteria
    
    # Create a copy of the dictionary to avoid modifying the original
    filtered = {}
    for key, value in criteria.items():
        if key not in ['text_relating_to_medical_history', 'unused_text']:
            if isinstance(value, dict):
                filtered[key] = filter_criteria(value)
            else:
                filtered[key] = value
    
    return filtered

def compare_extracted_criteria(ground_truth_data, extracted_data):
    """
    Compare extracted criteria for each trial text
    """
    # Create dictionaries keyed by trial text for easy matching
    ground_truth_map = {
        item.get('Trial Text', ''): filter_criteria(item.get('Extracted Criteria', {})) 
        for item in ground_truth_data
    }
    extracted_map = {
        item.get('Trial Text', ''): filter_criteria(item.get('Extracted Criteria', {})) 
        for item in extracted_data
    }
    
    # Tracking results
    total_trials = 0
    matched_trials = 0
    trial_results = []
    
    # Compare each trial
    for trial_text, ground_truth_criteria in ground_truth_map.items():
        if not trial_text:
            continue
        
        total_trials += 1
        
        # Find matching extracted criteria
        extracted_criteria = extracted_map.get(trial_text, {})
        
        if not extracted_criteria:
            trial_results.append({
                'trial_text': trial_text,
                'match': False,
                'reason': 'No matching extracted criteria found'
            })
            continue
        
        # Deep comparison of criteria
        match_result = compare_criteria(ground_truth_criteria, extracted_criteria)
        
        trial_results.append({
            'trial_text': trial_text,
            'match': match_result['full_match'],
            'accuracy': match_result['accuracy'],
            'matches': match_result['matches'],
            'mismatches': match_result['mismatches']
        })
        
        if match_result['full_match']:
            matched_trials += 1
    
    # Calculate overall accuracy
    overall_accuracy = (matched_trials / total_trials * 100) if total_trials > 0 else 0
    
    return {
        'total_trials': total_trials,
        'matched_trials': matched_trials,
        'overall_accuracy': round(overall_accuracy, 2),
        'trial_results': trial_results
    }

def compare_criteria(ground_truth, extracted):
    """
    Compare individual criteria between ground truth and extracted data
    """
    matches = []
    mismatches = []
    total_comparisons = 0
    
    def recursive_compare(gt_dict, ext_dict, path=''):
        nonlocal matches, mismatches, total_comparisons
        
        # Ensure both are dictionaries
        if not isinstance(gt_dict, dict) or not isinstance(ext_dict, dict):
            return
        
        # Compare keys
        all_keys = set(list(gt_dict.keys()) + list(ext_dict.keys()))
        
        for key in all_keys:
            current_path = f"{path}.{key}" if path else key
            
            # Skip if key not in both dictionaries
            if key not in gt_dict or key not in ext_dict:
                mismatches.append(current_path)
                continue
            
            # Get values
            gt_value = gt_dict[key]
            ext_value = ext_dict[key]
            
            # If nested dictionary, recurse
            if isinstance(gt_value, dict) and isinstance(ext_value, dict):
                recursive_compare(gt_value, ext_value, current_path)
                continue
            
            # Normalize and compare values
            total_comparisons += 1
            norm_gt = normalize_value(gt_value)
            norm_ext = normalize_value(ext_value)
            
            if norm_gt == norm_ext:
                matches.append(current_path)
            else:
                mismatches.append(f"{current_path}: GT={norm_gt}, Ext={norm_ext}")
    
    # Perform recursive comparison
    recursive_compare(ground_truth, extracted)
    
    # Calculate accuracy
    accuracy = (len(matches) / total_comparisons * 100) if total_comparisons > 0 else 0
    
    return {
        'full_match': len(mismatches) == 0,
        'accuracy': round(accuracy, 2),
        'matches': matches,
        'mismatches': mismatches
    }

# Load ground truth and extracted data
with open('ground_truth_30.json', 'r') as gt_file:
    ground_truth_data = json.load(gt_file)

with open('extracted_criteria_30_new_merged.json', 'r') as ext_file:
    extracted_data = json.load(ext_file)

# Run comparison
comparison_results = compare_extracted_criteria(ground_truth_data, extracted_data)

# Print detailed results
print("Comparison Results:")
print(f"Total Trials: {comparison_results['total_trials']}")
print(f"Matched Trials: {comparison_results['matched_trials']}")
print(f"Overall Accuracy: {comparison_results['overall_accuracy']}%")

# Print details of a few trials
print("\nSample Trial Results:")
for trial in comparison_results['trial_results'][:5]:
    print(f"\nTrial Text (first 100 chars): {trial['trial_text'][:100]}...")
    print(f"Match: {trial.get('match', 'N/A')}")
    print(f"Accuracy: {trial.get('accuracy', 'N/A')}%")
    
    if not trial.get('match', True):
        print("Reason:", trial.get('reason', 'Unknown mismatch'))
    
    # Print mismatches if available
    if trial.get('mismatches'):
        print("Mismatches:")
        for mismatch in trial['mismatches'][:5]:
            print(f"- {mismatch}")

# Optional: Write detailed results to a file
with open('comparison_results.json', 'w') as results_file:
    json.dump(comparison_results, results_file, indent=2)

print("\nDetailed results have been saved to 'comparison_results.json'")

Comparison Results:
Total Trials: 23
Matched Trials: 7
Overall Accuracy: 30.43%

Sample Trial Results:

Trial Text (first 100 chars): Inclusion Criteria:

Patients with a history of histological and/or cytological proven HER2-positive...
Match: True
Accuracy: 100.0%

Trial Text (first 100 chars): SELECTION OF PATIENTS (MOST IMPORTANT CRITERIA)

Inclusion criteria for first-line therapy

‚Ä¢ Hist...
Match: False
Accuracy: 100.0%
Reason: Unknown mismatch
Mismatches:
- disease_characteristics.confirmed_locally_recurrent_breast_cancer
- disease_characteristics.measurable_lesion

Trial Text (first 100 chars): Inclusion Criteria:

Histologically confirmed invasive cancer of the breast.
Presence of at least on...
Match: True
Accuracy: 100.0%

Trial Text (first 100 chars): Inclusion Criteria:

Patients diagnosed with primary breast cancer attending hospital for the resect...
Match: False
Accuracy: 100.0%
Reason: Unknown mismatch
Mismatches:
- demographics_and_general_characteristics

Trial Tex

In [33]:
import json

def normalize_value(value):
    """
    Normalize values to make comparison more flexible
    """
    if value is None:
        return None
    
    # Convert to string and lowercase
    str_value = str(value).lower().strip()
    
    # Handle numeric values
    if str_value.isdigit():
        return int(str_value)
    
    # Normalize boolean-like values
    if str_value in ['allowed', 'not required', 'both allowed']:
        return 'allowed'
    
    if str_value in ['0']:
        return ''
    
    return str_value

def filter_criteria(criteria):
    """
    Remove 'text_relating_to_medical_history' and 'unused_text' from criteria
    """
    if not isinstance(criteria, dict):
        return criteria
    
    # Create a copy of the dictionary to avoid modifying the original
    filtered = {}
    for key, value in criteria.items():
        if key not in ['text_relating_to_medical_history', 'unused_text']:
            if isinstance(value, dict):
                filtered_nested = filter_criteria(value)
                if filtered_nested:  # Only add if not empty
                    filtered[key] = filtered_nested
            else:
                filtered[key] = value
    
    return filtered

def compare_criteria_detailed(ground_truth, extracted):
    """
    Compare individual criteria between ground truth and extracted data
    Provides detailed tracking of matches and mismatches
    """
    total_comparisons = 0
    correct_comparisons = 0
    detailed_mismatches = []
    
    def recursive_compare(gt_dict, ext_dict, path=''):
        nonlocal total_comparisons, correct_comparisons, detailed_mismatches
        
        # Ensure both are dictionaries
        if not isinstance(gt_dict, dict) or not isinstance(ext_dict, dict):
            return
        
        # Compare keys
        all_keys = set(list(gt_dict.keys()) + list(ext_dict.keys()))
        
        for key in all_keys:
            current_path = f"{path}.{key}" if path else key
            
            # Skip if key not in both dictionaries
            if key not in gt_dict or key not in ext_dict:
                detailed_mismatches.append({
                    'path': current_path,
                    'reason': 'Key missing in one of the dictionaries'
                })
                continue
            
            # Get values
            gt_value = gt_dict[key]
            ext_value = ext_dict[key]
            
            # If nested dictionary, recurse
            if isinstance(gt_value, dict) and isinstance(ext_value, dict):
                recursive_compare(gt_value, ext_value, current_path)
                continue
            
            # Normalize and compare values
            total_comparisons += 1
            norm_gt = normalize_value(gt_value)
            norm_ext = normalize_value(ext_value)
            
            if norm_gt == norm_ext:
                correct_comparisons += 1
            else:
                detailed_mismatches.append({
                    'path': current_path,
                    'ground_truth': norm_gt,
                    'extracted': norm_ext
                })
    
    # Perform recursive comparison
    recursive_compare(ground_truth, extracted)
    
    return {
        'total_comparisons': total_comparisons,
        'correct_comparisons': correct_comparisons,
        'accuracy_percentage': (correct_comparisons / total_comparisons * 100) if total_comparisons > 0 else 0,
        'detailed_mismatches': detailed_mismatches
    }

def comprehensive_comparison(ground_truth_data, extracted_data):
    """
    Compare extracted criteria across all trials
    """
    # Create dictionaries keyed by trial text for easy matching
    ground_truth_map = {
        item.get('Trial Text', ''): filter_criteria(item.get('Extracted Criteria', {})) 
        for item in ground_truth_data
    }
    extracted_map = {
        item.get('Trial Text', ''): filter_criteria(item.get('Extracted Criteria', {})) 
        for item in extracted_data
    }
    
    # Tracking overall results
    overall_total_comparisons = 0
    overall_correct_comparisons = 0
    overall_detailed_mismatches = []
    trial_results = []
    
    # Compare each trial
    for trial_text, ground_truth_criteria in ground_truth_map.items():
        if not trial_text:
            continue
        
        # Find matching extracted criteria
        extracted_criteria = extracted_map.get(trial_text, {})
        
        if not extracted_criteria:
            continue
        
        # Detailed comparison for this trial
        trial_comparison = compare_criteria_detailed(ground_truth_criteria, extracted_criteria)
        
        # Accumulate results
        overall_total_comparisons += trial_comparison['total_comparisons']
        overall_correct_comparisons += trial_comparison['correct_comparisons']
        overall_detailed_mismatches.extend(trial_comparison['detailed_mismatches'])
        
        # Store trial-level results
        trial_results.append({
            'trial_text': trial_text,
            'total_comparisons': trial_comparison['total_comparisons'],
            'correct_comparisons': trial_comparison['correct_comparisons'],
            'accuracy_percentage': trial_comparison['accuracy_percentage']
        })
    
    # Calculate overall accuracy
    overall_accuracy = (overall_correct_comparisons / overall_total_comparisons * 100) if overall_total_comparisons > 0 else 0
    
    return {
        'total_parameters_compared': overall_total_comparisons,
        'total_correct_parameters': overall_correct_comparisons,
        'overall_accuracy_percentage': round(overall_accuracy, 2),
        'trial_results': trial_results,
        'detailed_mismatches': overall_detailed_mismatches
    }

# Load ground truth and extracted data
with open('ground_truth_30.json', 'r') as gt_file:
    ground_truth_data = json.load(gt_file)

with open('extracted_criteria_30_new_merged.json', 'r') as ext_file:
    extracted_data = json.load(ext_file)

# Run comprehensive comparison
comparison_results = comprehensive_comparison(ground_truth_data, extracted_data)

# Print detailed results
print("Comprehensive Comparison Results:")
print(f"Total Parameters Compared: {comparison_results['total_parameters_compared']}")
print(f"Total Correct Parameters: {comparison_results['total_correct_parameters']}")
print(f"Overall Accuracy: {comparison_results['overall_accuracy_percentage']}%")

# Print top mismatches
print("\nTop 10 Mismatches:")
top_mismatches = sorted(
    comparison_results['detailed_mismatches'], 
    key=lambda x: 1 if 'ground_truth' in x else 0
)[:10]
for mismatch in top_mismatches:
    if 'ground_truth' in mismatch:
        print(f"Path: {mismatch['path']}")
        print(f"Ground Truth: {mismatch['ground_truth']}")
        print(f"Extracted: {mismatch['extracted']}")
        print("---")
    else:
        print(f"Path: {mismatch['path']}, Reason: {mismatch['reason']}")
        print("---")

# Save detailed results
with open('comprehensive_comparison_results.json', 'w') as results_file:
    json.dump(comparison_results, results_file, indent=2)

print("\nDetailed results have been saved to 'comprehensive_comparison_results.json'")

Comprehensive Comparison Results:
Total Parameters Compared: 327
Total Correct Parameters: 321
Overall Accuracy: 98.17%

Top 10 Mismatches:
Path: disease_characteristics.confirmed_locally_recurrent_breast_cancer, Reason: Key missing in one of the dictionaries
---
Path: disease_characteristics.measurable_lesion, Reason: Key missing in one of the dictionaries
---
Path: health_and_organ_function.untreated_central_nervous_system_metastases, Reason: Key missing in one of the dictionaries
---
Path: disease_characteristics.er_pr_status, Reason: Key missing in one of the dictionaries
---
Path: disease_characteristics.her2_status, Reason: Key missing in one of the dictionaries
---
Path: health_and_organ_function, Reason: Key missing in one of the dictionaries
---
Path: demographics_and_general_characteristics, Reason: Key missing in one of the dictionaries
---
Path: health_and_organ_function, Reason: Key missing in one of the dictionaries
---
Path: health_and_organ_function.untreated_central_ne