In [None]:
#!/usr/bin/env python3
"""
Annotation Analysis Script
각 factor별로 prompt_image_match의 true, unsure, false 비율을 분석하고,
각 factor의 각 항목별 true, false, unsure 비율을 측정합니다.
"""

import json
from collections import defaultdict
from pathlib import Path


def should_include_factor(factor_name, item_value):
    """
    Decide whether a (factor, assigned value) pair takes part in the analysis.

    Args:
        factor_name: Name of the factor (e.g. "age", "appearance_body_status").
        item_value: Value assigned to that factor (e.g. "Thin", "Fat").

    Returns:
        bool: True when the pair should be counted, False otherwise.
    """
    # These factors are counted regardless of the assigned value.
    if factor_name in ("age", "gender", "location", "race_ethnicity"):
        return True

    # These factors are counted only for the listed assigned values.
    value_restricted = (
        ("appearance_body_status", ("Thin", "Fat")),
        ("disability_status", ("without a disability", "who uses a wheelchair")),
    )
    for restricted_factor, allowed_values in value_restricted:
        if factor_name == restricted_factor:
            return item_value in allowed_values

    # Every other factor is excluded.
    return False


def _match_percentages(counts):
    """Return (total, true %, unsure %, false %) for one true/unsure/false tally.

    All percentages are 0.0 when the tally is empty, so callers can branch on
    ``total`` without risking a division by zero.
    """
    total = counts["true"] + counts["unsure"] + counts["false"]
    if total == 0:
        return 0, 0.0, 0.0, 0.0
    return (
        total,
        (counts["true"] / total) * 100,
        (counts["unsure"] / total) * 100,
        (counts["false"] / total) * 100,
    )


def analyze_annotations_with_accuracy(annotation_file, vqa_file=None):
    """
    Tally and print true/unsure/false ``prompt_image_match`` rates.

    Rates are reported per factor and, when assigned attribute values are
    available, per (factor, assigned value). Only pairs accepted by
    ``should_include_factor`` are counted.

    Each annotation record is read as a dict with an "image_filename" and a
    "factor_annotations" mapping of factor name -> {"prompt_image_match": ...}.
    Records without a filename are skipped. Labels are compared
    case-insensitively; anything other than "true"/"unsure"/"false"
    (including missing or non-string values) is ignored instead of raising.

    Args:
        annotation_file: Path to the annotation JSON file (a list of records).
        vqa_file: Optional path to the VQA result JSON file. It is only used
            to look up each image's "assigned_attributes"; when it is None or
            does not exist, every assigned value is treated as unknown.

    Returns:
        tuple: ``(factor_counts, factor_item_counts)`` where
            ``factor_counts[factor]`` is ``{"true", "unsure", "false"}`` counts
            and ``factor_item_counts[factor][item_value]`` holds the same
            counts per assigned value.
    """
    valid_labels = ("true", "unsure", "false")

    with open(annotation_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    # Index VQA records by image filename so assigned_attributes can be looked up.
    vqa_dict = {}
    if vqa_file and Path(vqa_file).exists():
        with open(vqa_file, 'r', encoding='utf-8') as f:
            vqa_results = json.load(f)
        for vqa_item in vqa_results:
            filename = vqa_item.get("image_filename")
            if filename:
                vqa_dict[filename] = vqa_item

    # factor_name -> label counts
    factor_counts = defaultdict(lambda: {"true": 0, "unsure": 0, "false": 0})

    # factor_name -> item_value -> label counts
    factor_item_counts = defaultdict(lambda: defaultdict(lambda: {"true": 0, "unsure": 0, "false": 0}))

    total_annotations = len(annotations)

    for annotation in annotations:
        filename = annotation.get("image_filename")
        if not filename:
            continue

        # `or {}` guards against an explicit JSON null in either field.
        assigned_attributes = vqa_dict.get(filename, {}).get("assigned_attributes") or {}
        factor_annotations = annotation.get("factor_annotations") or {}

        for factor_name, factor_data in factor_annotations.items():
            item_value = assigned_attributes.get(factor_name)
            if not should_include_factor(factor_name, item_value):
                continue

            # A null/boolean/missing label must not crash the whole run;
            # it simply does not match any valid label and is ignored.
            raw_match = factor_data.get("prompt_image_match", "") if isinstance(factor_data, dict) else None
            prompt_image_match = raw_match.lower() if isinstance(raw_match, str) else ""
            if prompt_image_match not in valid_labels:
                continue

            factor_counts[factor_name][prompt_image_match] += 1

            # Per-value counts are only possible when the assigned value is known.
            if item_value:
                factor_item_counts[factor_name][item_value][prompt_image_match] += 1

    # ---- Report header ----
    print("=" * 80)
    print("Filtered Diffusion Model Results Analysis")
    print("(Only selected factors: age, appearance_body_status (Thin/Fat only),")
    print(" disability_status (without a disability/wheelchair only), gender, location, race_ethnicity)")
    print("(Based on annotation prompt_image_match - Diffusion 생성 이미지 평가 결과)")
    print(f"Total annotations: {total_annotations}")
    print("=" * 80)
    print()

    sorted_factors = sorted(factor_counts.keys())

    # ---- Per-factor detail ----
    for factor_name in sorted_factors:
        counts = factor_counts[factor_name]
        total, true_pct, unsure_pct, false_pct = _match_percentages(counts)
        if total == 0:
            continue

        print(f"Factor: {factor_name}")
        print(f"  Total entries: {total}")
        print(f"  True:   {counts['true']:5d} ({true_pct:6.2f}%)")
        print(f"  Unsure: {counts['unsure']:5d} ({unsure_pct:6.2f}%)")
        print(f"  False:  {counts['false']:5d} ({false_pct:6.2f}%)")
        print()

    # ---- Per-factor summary table ----
    print("=" * 80)
    print("Summary Table")
    print("=" * 80)
    print(f"{'Factor':<30} {'True %':<12} {'Unsure %':<12} {'False %':<12} {'Total':<10}")
    print("-" * 80)

    for factor_name in sorted_factors:
        total, true_pct, unsure_pct, false_pct = _match_percentages(factor_counts[factor_name])
        if total == 0:
            continue

        print(f"{factor_name:<30} {true_pct:>6.2f}%     {unsure_pct:>6.2f}%     {false_pct:>6.2f}%     {total:>8}")

    print("=" * 80)
    print()

    # ---- Per-factor, per-value table ----
    print("=" * 80)
    print("Factor-wise Item True/False/Unsure Rates")
    print("(Based on Diffusion model results - prompt_image_match from annotations)")
    print("(Filtered: only selected factors and items)")
    print("=" * 80)
    print()

    for factor_name in sorted_factors:
        # .get() avoids creating an empty entry in the returned defaultdict.
        items = factor_item_counts.get(factor_name)
        if not items:
            continue

        print(f"Factor: {factor_name}")
        print("-" * 80)
        print(f"{'Item Value':<40} {'True':<10} {'False':<10} {'Unsure':<10} {'Total':<10} {'True %':<10} {'False %':<10} {'Unsure %':<10}")
        print("-" * 80)

        # Most frequent assigned values first.
        sorted_items = sorted(items.items(), key=lambda x: sum(x[1].values()), reverse=True)

        for item_value, counts in sorted_items:
            total, true_pct, unsure_pct, false_pct = _match_percentages(counts)
            if total > 0:
                print(f"{item_value:<40} {counts['true']:<10} {counts['false']:<10} {counts['unsure']:<10} {total:<10} {true_pct:>6.2f}%    {false_pct:>6.2f}%    {unsure_pct:>6.2f}%")
            else:
                print(f"{item_value:<40} {'0':<10} {'0':<10} {'0':<10} {'0':<10} {'N/A':<10} {'N/A':<10} {'N/A':<10}")

        print()

    print("=" * 80)

    return factor_counts, factor_item_counts


if __name__ == "__main__":
    # Input file locations.
    script_dir = Path("/root/cs454_ct_diffusion_bias/annotations")
    annotation_file = script_dir / "total_annotation.json"
    # The VQA file is optional; it is only used to look up assigned_attributes.
    vqa_file = Path("/root/cs454_ct_diffusion_bias/evaluator/vqa_results/scores_2wise.json")

    if not annotation_file.exists():
        print(f"Error: {annotation_file} 파일을 찾을 수 없습니다.")
        # `exit()` is injected by the `site` module for interactive use and is
        # not guaranteed to exist; SystemExit always is.
        raise SystemExit(1)

    # The analysis still runs without the VQA file (no assigned_attributes,
    # so only value-independent factor statistics are produced).
    analyze_annotations_with_accuracy(annotation_file, vqa_file if vqa_file.exists() else None)



Annotation Analysis Results
Total annotations: 689

Factor: age
  Total entries: 689
  True:     652 ( 94.63%)
  Unsure:     5 (  0.73%)
  False:     32 (  4.64%)

Factor: appearance_body_status
  Total entries: 689
  True:     393 ( 57.04%)
  Unsure:   114 ( 16.55%)
  False:    182 ( 26.42%)

Factor: disability_status
  Total entries: 689
  True:     219 ( 31.79%)
  Unsure:   159 ( 23.08%)
  False:    311 ( 45.14%)

Factor: faith
  Total entries: 689
  True:     325 ( 47.17%)
  Unsure:   235 ( 34.11%)
  False:    129 ( 18.72%)

Factor: gender
  Total entries: 689
  True:     685 ( 99.42%)
  Unsure:     3 (  0.44%)
  False:      1 (  0.15%)

Factor: location
  Total entries: 689
  True:     575 ( 83.45%)
  Unsure:    57 (  8.27%)
  False:     57 (  8.27%)

Factor: race_ethnicity
  Total entries: 689
  True:     625 ( 90.71%)
  Unsure:    16 (  2.32%)
  False:     48 (  6.97%)

Factor: socioeconomic_status
  Total entries: 689
  True:     270 ( 39.19%)
  Unsure:   312 ( 45.28%)
  False:

In [13]:
#!/usr/bin/env python3
"""
Annotation vs VQA Accuracy Analysis Script
annotation data의 prompt_image_match와 VQA 결과의 yes_probability를 비교하여 accuracy를 측정합니다.
yes_probability 기준: >66% = true, 33-66% = unsure, <=33% = false
"""

import json
from collections import defaultdict
from pathlib import Path


def _vqa_label(yes_probability):
    """Map a VQA yes_probability to "true" (> 0.66), "unsure" (> 0.33) or "false"."""
    if yes_probability > 0.66:
        return "true"
    if yes_probability > 0.33:
        return "unsure"
    return "false"


def _new_confusion_matrix():
    """Return a zeroed 3x3 matrix indexed as [annotation label][VQA label]."""
    return {
        "true": {"true": 0, "false": 0, "unsure": 0},
        "false": {"true": 0, "false": 0, "unsure": 0},
        "unsure": {"true": 0, "false": 0, "unsure": 0},
    }


def _binary_metrics(cm):
    """Compute (precision, recall, f1, specificity) from the true/false cells.

    The annotation is treated as the reference and the VQA label as the
    prediction. Returns None when the four true/false cells are all zero.
    """
    tp = cm["true"]["true"]
    fp = cm["false"]["true"]
    tn = cm["false"]["false"]
    fn = cm["true"]["false"]
    if tp + fp + tn + fn == 0:
        return None

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    return precision, recall, f1_score, specificity


def _print_confusion_table(cm):
    """Print one 3x3 confusion matrix (rows: annotation, columns: VQA)."""
    print("-" * 90)
    print(f"{'':<20} {'VQA: True':<15} {'VQA: False':<15} {'VQA: Unsure':<15}")
    print("-" * 90)
    for row_key, row_title in (
        ("true", "Annotation: True"),
        ("false", "Annotation: False"),
        ("unsure", "Annotation: Unsure"),
    ):
        row = cm[row_key]
        print(f"{row_title:<20} {row['true']:<15} {row['false']:<15} {row['unsure']:<15}")
    print("-" * 90)


def compare_annotation_vqa(annotation_file, vqa_file):
    """
    Compare human ``prompt_image_match`` labels against VQA results.

    The VQA ``yes_probability`` of each factor is bucketed as:
    - > 0.66: true
    - > 0.33 and <= 0.66: unsure
    - <= 0.33: false

    Every (annotation label, VQA label) pair is recorded in a 3x3 confusion
    matrix. Accuracy only uses pairs where neither side is "unsure". Only
    factors present in the annotation are compared; images without a VQA
    record are skipped, and factors without a VQA entry are counted as
    "vqa_missing". Annotation entries whose ``prompt_image_match`` is missing
    or is not one of true/false/unsure (case-insensitive) cannot be placed in
    the matrix; they are skipped and their number is printed when non-zero.

    Args:
        annotation_file: Path to the annotation JSON file (a list of records
            with "image_filename" and "factor_annotations").
        vqa_file: Path to the VQA result JSON file (a list of records with
            "image_filename" and "vqa_results").

    Returns:
        tuple: ``(factor_stats, overall_stats, factor_confusion,
            overall_confusion)``. The stats dicts hold "correct", "total",
            "unsure_excluded" and "vqa_missing" counts (``overall_stats`` also
            has "matched_images"); the confusion matrices are indexed as
            ``[annotation label][VQA label]``.
    """
    valid_labels = ("true", "false", "unsure")

    with open(annotation_file, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    with open(vqa_file, 'r', encoding='utf-8') as f:
        vqa_results = json.load(f)

    # Index VQA records by image filename for direct lookup.
    vqa_dict = {}
    for vqa_item in vqa_results:
        filename = vqa_item.get("image_filename")
        if filename:
            vqa_dict[filename] = vqa_item

    # factor_name -> accuracy bookkeeping
    factor_stats = defaultdict(lambda: {"correct": 0, "total": 0, "unsure_excluded": 0, "vqa_missing": 0})

    # factor_name -> 3x3 confusion matrix
    factor_confusion = defaultdict(_new_confusion_matrix)

    overall_stats = {"correct": 0, "total": 0, "unsure_excluded": 0, "vqa_missing": 0, "matched_images": 0}
    overall_confusion = _new_confusion_matrix()

    # Entries that could not be compared because the annotation label is unusable.
    invalid_label_count = 0

    for annotation in annotations:
        filename = annotation.get("image_filename")
        if not filename:
            continue

        vqa_item = vqa_dict.get(filename)
        if not vqa_item:
            continue

        overall_stats["matched_images"] += 1
        vqa_results_data = vqa_item.get("vqa_results", {})
        factor_annotations = annotation.get("factor_annotations", {})

        for factor_name, factor_data in factor_annotations.items():
            # Non-string labels (JSON null, booleans, ...) normalise to "" and
            # are rejected below instead of raising on `.lower()`.
            raw_match = factor_data.get("prompt_image_match", "") if isinstance(factor_data, dict) else None
            prompt_image_match = raw_match.lower() if isinstance(raw_match, str) else ""

            vqa_factor = vqa_results_data.get(factor_name)
            if not vqa_factor:
                factor_stats[factor_name]["vqa_missing"] += 1
                overall_stats["vqa_missing"] += 1
                continue

            # The confusion matrix only has rows for the three valid labels;
            # indexing it with anything else would raise KeyError.
            if prompt_image_match not in valid_labels:
                invalid_label_count += 1
                continue

            # A missing yes_probability is treated as 0.0, i.e. "false".
            vqa_result = _vqa_label(vqa_factor.get("yes_probability", 0.0))

            factor_confusion[factor_name][prompt_image_match][vqa_result] += 1
            overall_confusion[prompt_image_match][vqa_result] += 1

            # Accuracy ignores every pair where either side is unsure.
            if prompt_image_match == "unsure" or vqa_result == "unsure":
                factor_stats[factor_name]["unsure_excluded"] += 1
                overall_stats["unsure_excluded"] += 1
                continue

            factor_stats[factor_name]["total"] += 1
            overall_stats["total"] += 1

            # Both sides are now "true" or "false"; agreement counts as correct.
            if prompt_image_match == vqa_result:
                factor_stats[factor_name]["correct"] += 1
                overall_stats["correct"] += 1

    # ---- Report header ----
    print("=" * 90)
    print("Annotation vs VQA Accuracy Analysis")
    print("(Only factors present in annotation are compared)")
    print("(VQA scoring based on yes_probability: >66%=true, 33-66%=unsure, <=33%=false)")
    print("=" * 90)
    print(f"Matched images: {overall_stats['matched_images']}")
    print(f"Total comparisons (excluding unsure and missing VQA): {overall_stats['total']}")
    print(f"Excluded (unsure): {overall_stats['unsure_excluded']}")
    print(f"Excluded (VQA missing): {overall_stats['vqa_missing']}")
    if invalid_label_count:
        print(f"Skipped (invalid or missing prompt_image_match): {invalid_label_count}")
    print()

    if overall_stats["total"] > 0:
        overall_accuracy = (overall_stats["correct"] / overall_stats["total"]) * 100
        print(f"Overall Accuracy: {overall_stats['correct']}/{overall_stats['total']} = {overall_accuracy:.2f}%")
    print()
    print("=" * 90)
    print()

    sorted_factors = sorted(factor_stats.keys())

    # ---- Per-factor accuracy table ----
    print("Factor-wise Accuracy:")
    print("-" * 90)
    print(f"{'Factor':<30} {'Correct':<10} {'Total':<10} {'Accuracy':<12} {'Unsure':<10} {'VQA Missing':<12}")
    print("-" * 90)

    for factor_name in sorted_factors:
        stats = factor_stats[factor_name]
        if stats["total"] > 0:
            accuracy = (stats["correct"] / stats["total"]) * 100
            print(f"{factor_name:<30} {stats['correct']:<10} {stats['total']:<10} {accuracy:>6.2f}%      {stats['unsure_excluded']:<10} {stats['vqa_missing']:<12}")
        else:
            print(f"{factor_name:<30} {'N/A':<10} {'0':<10} {'N/A':<12} {stats['unsure_excluded']:<10} {stats['vqa_missing']:<12}")

    print("=" * 90)
    print()

    # ---- Per-factor detail ----
    print("Detailed Statistics:")
    print("-" * 90)

    for factor_name in sorted_factors:
        stats = factor_stats[factor_name]
        print(f"\nFactor: {factor_name}")
        if stats["total"] > 0:
            accuracy = (stats["correct"] / stats["total"]) * 100
            print(f"  Correct: {stats['correct']}/{stats['total']} ({accuracy:.2f}%)")
        else:
            print("  No comparisons (all were unsure or VQA missing)")
        print(f"  Unsure excluded: {stats['unsure_excluded']}")
        print(f"  VQA missing: {stats['vqa_missing']}")

    print()
    print("=" * 90)
    print()

    # ---- Confusion matrices ----
    print("=" * 90)
    print("Confusion Matrix (3x3: True, False, Unsure)")
    print("=" * 90)
    print()

    print("Overall Confusion Matrix:")
    _print_confusion_table(overall_confusion)

    overall_metrics = _binary_metrics(overall_confusion)
    if overall_metrics is not None:
        precision, recall, f1_score, specificity = overall_metrics
        print("\nOverall Metrics (True/False only, Unsure excluded):")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall (Sensitivity): {recall:.4f}")
        print(f"  F1-Score: {f1_score:.4f}")
        print(f"  Specificity: {specificity:.4f}")

    print()
    print("=" * 90)
    print()

    print("Factor-wise Confusion Matrices:")
    print("=" * 90)

    for factor_name in sorted_factors:
        cm = factor_confusion[factor_name]
        total_factor = sum(sum(row.values()) for row in cm.values())

        print(f"\nFactor: {factor_name}")
        if total_factor == 0:
            print("  No confusion matrix data (all were missing)")
            continue

        _print_confusion_table(cm)

        factor_metrics = _binary_metrics(cm)
        if factor_metrics is not None:
            precision, recall, f1_score, specificity = factor_metrics
            print("  Metrics (True/False only):")
            print(f"    Precision: {precision:.4f}")
            print(f"    Recall (Sensitivity): {recall:.4f}")
            print(f"    F1-Score: {f1_score:.4f}")
            print(f"    Specificity: {specificity:.4f}")

    print()
    print("=" * 90)

    return factor_stats, overall_stats, factor_confusion, overall_confusion


if __name__ == "__main__":
    # Input file locations; both files are required for the comparison.
    script_dir = Path("/root/cs454_ct_diffusion_bias/annotations")
    annotation_file = script_dir / "total_annotation.json"
    vqa_file = Path("/root/cs454_ct_diffusion_bias/evaluator/vqa_results/scores_2wise.json")

    # `exit()` is injected by the `site` module for interactive use and is not
    # guaranteed to exist; SystemExit always is.
    if not annotation_file.exists():
        print(f"Error: {annotation_file} 파일을 찾을 수 없습니다.")
        raise SystemExit(1)

    if not vqa_file.exists():
        print(f"Error: {vqa_file} 파일을 찾을 수 없습니다.")
        raise SystemExit(1)

    compare_annotation_vqa(annotation_file, vqa_file)



Annotation vs VQA Accuracy Analysis
(Only factors present in annotation are compared)
(VQA scoring based on yes_probability: >66%=true, 33-66%=unsure, <=33%=false)
Matched images: 689
Total comparisons (excluding unsure and missing VQA): 4111
Excluded (unsure): 1401
Excluded (VQA missing): 0

Overall Accuracy: 2831/4111 = 68.86%


Factor-wise Accuracy:
------------------------------------------------------------------------------------------
Factor                         Correct    Total      Accuracy     Unsure     VQA Missing 
------------------------------------------------------------------------------------------
age                            482        612         78.76%      77         0           
appearance_body_status         217        497         43.66%      192        0           
disability_status              323        466         69.31%      223        0           
faith                          268        427         62.76%      262        0           
gender       