In [2]:
import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import random

# ============================================================
# Paths
# ============================================================
# Input annotations export and output directory. Both are relative paths,
# so they resolve against the current working directory.
DATA_FILE = "../annotations/train.json"
RESULTS_DIR = "../results"
# parents=True keeps this working if RESULTS_DIR is changed to a nested
# path whose intermediate directories do not exist yet.
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

# ============================================================
# Load Data
# ============================================================
# Read the whole annotations export as UTF-8 text and parse it as JSON.
data = json.loads(Path(DATA_FILE).read_text(encoding="utf-8"))

print(f"Loaded {len(data)} annotation items.")

# ============================================================
# Extract Texts and Taxonomy Labels
# ============================================================
# Parallel lists: index i in each list describes data[i].
texts = []
level1_labels = []
level2_labels = []
annotated_flags = []

for item in data:
    text = item.get("data", {}).get("text")
    ann_list = item.get("annotations", [])

    # Only the first result of the first annotation is inspected.
    res_list = ann_list[0].get("result", []) if ann_list else []
    taxonomy = res_list[0].get("value", {}).get("taxonomy", []) if res_list else []

    # An item counts as annotated when its first taxonomy entry is a list
    # (read below as [level1, level2, ...]); that list may still be empty,
    # in which case both labels stay None.
    path = taxonomy[0] if taxonomy and isinstance(taxonomy[0], list) else None
    is_annotated = path is not None
    level1 = path[0] if path else None
    level2 = path[1] if path is not None and len(path) > 1 else None

    texts.append(text)
    level1_labels.append(level1)
    level2_labels.append(level2)
    annotated_flags.append(is_annotated)

# ============================================================
# Annotation Statistics
# ============================================================
# One entry per loaded item, whether or not it carries a label or a text.
total_annotations = len(texts)
# Frequency of each non-empty label at both taxonomy levels.
level1_counts = Counter(label for label in level1_labels if label)
level2_counts = Counter(label for label in level2_labels if label)

missing_labels = sum(1 for label in level1_labels if not label)

# Missing/empty texts are reported on their own, so they must not be
# counted again as duplicates: compare non-empty texts only with each other.
non_empty_texts = [t for t in texts if t]
missing_texts = len(texts) - len(non_empty_texts)
duplicate_texts = len(non_empty_texts) - len(set(non_empty_texts))
incomplete_annotations = sum(1 for a in annotated_flags if not a)

# Text length stats (whitespace-separated word counts of non-empty texts)
text_lengths = [len(t.split()) for t in non_empty_texts]
if text_lengths:
    min_len = min(text_lengths)
    max_len = max(text_lengths)
    avg_len = sum(text_lengths) / len(text_lengths)
else:
    # No usable text at all: report zeros rather than failing on min()/max().
    min_len = max_len = avg_len = 0

# ============================================================
# Extra 1: Imbalance Ratio
# ============================================================
# Size of the largest level-1 class; falls back to 1 when nothing was counted.
max_count = max(level1_counts.values(), default=1)
# Each class size as a fraction of the largest class, rounded to 2 decimals.
imbalance_ratio = {}
for label, count in level1_counts.items():
    imbalance_ratio[label] = round(count / max_count, 2)

# ============================================================
# Extra 2: Taxonomy Consistency Check
# (Define expected parent-child structure for validation)
# ============================================================
# Allowed level-2 children for each known level-1 parent.
expected_map = {
    "Sports": ["Basketball", "Football", "Cricket", "Tennis"],
    "Entertainment": ["Movies", "Music", "TV Shows", "Celebrities"],
    "Technology": ["AI", "Gadgets", "Software", "Hardware"]
}

# Keep every (level1, level2) pair whose parent is in expected_map but whose
# child is not listed under it. Pairs with a missing level or with a parent
# that is absent from expected_map are not checked.
inconsistent_pairs = [
    (parent, child)
    for parent, child in zip(level1_labels, level2_labels)
    if parent and child and parent in expected_map
    and child not in expected_map[parent]
]

# ============================================================
# Extra 3: Completeness (unannotated tasks)
# ============================================================
# Percentage of items that carry a usable taxonomy annotation.
# An empty dataset has nothing annotated, so report 0% instead of raising
# ZeroDivisionError.
if total_annotations:
    annotation_completeness = 100 * (1 - incomplete_annotations / total_annotations)
else:
    annotation_completeness = 0.0

# ============================================================
# Save annotations_stats.txt
# ============================================================
# Plain-text statistics dump: counts per level, quality metrics, imbalance
# ratios and the full list of inconsistent pairs, written section by section.
with open(f"{RESULTS_DIR}/annotations_stats.txt", "w", encoding="utf-8") as f:
    f.write(f"Total annotations: {total_annotations}\n\n")

    f.write("Level 1 Taxonomy Counts:\n")
    f.writelines(f"- {name}: {n}\n" for name, n in level1_counts.items())

    f.write("\nLevel 2 Taxonomy Counts:\n")
    f.writelines(f"- {name}: {n}\n" for name, n in level2_counts.items())

    f.writelines([
        f"\nMissing taxonomy labels: {missing_labels}\n",
        f"Incomplete annotations: {incomplete_annotations}\n",
        f"Annotation completeness: {annotation_completeness:.2f}%\n",
        f"Missing texts: {missing_texts}\n",
        f"Duplicate texts: {duplicate_texts}\n",
        f"Text length (words) - min: {min_len}, max: {max_len}, avg: {avg_len:.2f}\n\n",
    ])

    f.write("Imbalance Ratio (vs. largest class):\n")
    f.writelines(f"- {name}: {share}\n" for name, share in imbalance_ratio.items())

    f.write("\nInconsistent taxonomy pairs:\n")
    f.writelines(f"- {parent} → {child}\n" for parent, child in inconsistent_pairs)

# ============================================================
# Save eval_summary.txt
# ============================================================
# Short summary file: headline metrics only, no per-label breakdown.
with open(f"{RESULTS_DIR}/eval_summary.txt", "w", encoding="utf-8") as f:
    f.writelines([
        "Evaluation Summary (Taxonomy Annotations)\n",
        "-------------------------------------------\n",
        f"Total annotations: {total_annotations}\n",
        f"Incomplete annotations: {incomplete_annotations}\n",
        f"Annotation completeness: {annotation_completeness:.2f}%\n",
        f"Missing taxonomy labels: {missing_labels}\n",
        f"Missing texts: {missing_texts}\n",
        f"Duplicate texts: {duplicate_texts}\n",
        f"Inconsistent taxonomy pairs: {len(inconsistent_pairs)}\n",
        f"Text length (words) - min: {min_len}, max: {max_len}, avg: {avg_len:.2f}\n",
    ])

# ============================================================
# Generate Bar Charts
# ============================================================
def _save_bar_chart(counts, figsize, color, title, xlabel, out_path):
    """Plot `counts` (label -> count mapping) as a bar chart and save it.

    Creates a new pyplot figure of size `figsize`, draws one bar per entry
    in `color`, applies `title` and `xlabel`, labels the y axis "Count",
    rotates the tick labels, saves the figure to `out_path` and closes it.
    """
    plt.figure(figsize=figsize)
    plt.bar(counts.keys(), counts.values(), color=color)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()


# The level 1 chart is always produced.
_save_bar_chart(
    level1_counts,
    (8, 4),
    "skyblue",
    "Level 1 Taxonomy Distribution",
    "Level 1 Categories",
    f"{RESULTS_DIR}/level1_distribution.png",
)

# The level 2 chart is only produced when at least one level-2 label exists.
if level2_counts:
    _save_bar_chart(
        level2_counts,
        (10, 4),
        "lightgreen",
        "Level 2 Taxonomy Distribution",
        "Level 2 Categories",
        f"{RESULTS_DIR}/level2_distribution.png",
    )

# ============================================================
# Generate report.md
# ============================================================
# Markdown report: counts, quality metrics, imbalance ratios, a capped list of
# inconsistent pairs, random text samples per level-1 label and the plots.
with open(f"{RESULTS_DIR}/report.md", "w", encoding="utf-8") as f:
    f.write("# Taxonomy Annotation Report\n\n")
    f.write(f"Total annotations: **{total_annotations}**\n\n")

    f.write("## Level 1 Taxonomy Counts\n")
    f.writelines(f"- **{name}**: {n}\n" for name, n in level1_counts.items())

    f.write("\n## Level 2 Taxonomy Counts\n")
    f.writelines(f"- **{name}**: {n}\n" for name, n in level2_counts.items())

    # Headline metrics, each rendered as its own level-2 heading.
    f.writelines([
        f"\n## Missing taxonomy labels: {missing_labels}\n",
        f"## Incomplete annotations: {incomplete_annotations}\n",
        f"## Annotation completeness: {annotation_completeness:.2f}%\n",
        f"## Missing texts: {missing_texts}\n",
        f"## Duplicate texts: {duplicate_texts}\n",
        f"## Inconsistent taxonomy pairs: {len(inconsistent_pairs)}\n",
        f"## Text Length Stats (words) - min: {min_len}, max: {max_len}, avg: {avg_len:.2f}\n",
    ])

    f.write("\n## Imbalance Ratio (vs. largest class)\n")
    f.writelines(f"- {name}: {share}\n" for name, share in imbalance_ratio.items())

    # Show at most the first 10 inconsistent pairs, then summarise the rest.
    if inconsistent_pairs:
        f.write("\n## Inconsistent Taxonomy Pairs Found\n")
        f.writelines(f"- {parent} → {child}\n" for parent, child in inconsistent_pairs[:10])
        overflow = len(inconsistent_pairs) - 10
        if overflow > 0:
            f.write(f"...and {overflow} more\n")

    # Random sample texts by Level 1 category (up to 5 per label; unseeded,
    # so the selection differs between runs).
    f.write("\n## Random Sample Texts by Level 1 Category\n")
    for category in level1_counts:
        candidates = [t for t, lab in zip(texts, level1_labels) if lab == category and t]
        if not candidates:
            continue
        f.write(f"\n### {category}\n")
        picked = random.sample(candidates, min(5, len(candidates)))
        f.writelines(f"- {txt}\n" for txt in picked)

    # Add plots (image paths are relative to the report file's own folder).
    f.write("\n## Taxonomy Distribution Plots\n")
    f.write("![Level 1 Taxonomy Distribution](level1_distribution.png)\n")
    if level2_counts:
        f.write("![Level 2 Taxonomy Distribution](level2_distribution.png)\n")

# Final status line. The folder name in the message is a fixed literal and is
# not derived from RESULTS_DIR.
print("✅ Taxonomy annotation analysis complete. Results saved in 'results/' folder.")


Loaded 100 annotation items.
✅ Taxonomy annotation analysis complete. Results saved in 'results/' folder.
