In [21]:
# notebooks/re_annotations_static_mpl.ipynb

from pathlib import Path
import json
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx

# -----------------------------
# 1. Paths
# -----------------------------
# Input: the annotation file to analyse (relative to the current working dir).
DATA_PATH = Path("../annotations/train.json")

# Output: every generated artefact lives in this single directory.
RESULTS_PATH = Path("../results")
# exist_ok=True makes re-running the cell harmless when the folder is present.
RESULTS_PATH.mkdir(parents=True, exist_ok=True)

# Text outputs.
STATS_FILE = RESULTS_PATH.joinpath("annotations_stats.txt")
EVAL_FILE = RESULTS_PATH.joinpath("eval_summary.txt")
REPORT_FILE = RESULTS_PATH.joinpath("report.md")

# Image outputs (referenced by file name from the markdown report).
GRAPH_FILE = RESULTS_PATH.joinpath("relation_graph.png")
ENTITY_BAR_FILE = RESULTS_PATH.joinpath("entity_counts.png")
RELATION_BAR_FILE = RESULTS_PATH.joinpath("relation_counts.png")

# -----------------------------
# 2. Load Data
# -----------------------------
# Read the whole file as UTF-8 text and parse it in one step. The result is
# iterated task by task further down, and len() of it is reported here.
data = json.loads(DATA_PATH.read_text(encoding="utf-8"))

print(f"Loaded {len(data)} annotation tasks from {DATA_PATH}")

# -----------------------------
# 3. Extract Entities and Relations
# -----------------------------
# Relation name used when a relation result carries no label at all. Without
# it, res["labels"][0] raised KeyError/IndexError on such results.
# NOTE(review): presumably a Label Studio export, where relation labels are
# optional -- confirm against the annotation config.
UNLABELED_RELATION = "unlabeled"

all_entities = []   # one dict per labelled span, across all annotations
all_relations = []  # one dict per relation whose two endpoints were resolved

for item in data:
    # `or []` covers both a missing key and an explicit null value.
    for ann in item.get("annotations") or []:
        results = ann.get("result") or []

        # First pass: collect the labelled spans and index them by result id
        # so relations (which refer to ids) can be resolved afterwards.
        # The index is per annotation: ids are only looked up within the
        # annotation that declared them.
        id2entity = {}
        for res in results:
            if res.get("type") != "labels":
                continue
            value = res["value"]
            ent_info = {
                "text": value["text"],
                "start": value["start"],
                "end": value["end"],
                "labels": value["labels"],
            }
            id2entity[res["id"]] = ent_info
            all_entities.append(ent_info)

        # Second pass: resolve relations against the spans found above.
        for res in results:
            if res.get("type") != "relation":
                continue
            from_ent = id2entity.get(res["from_id"])
            to_ent = id2entity.get(res["to_id"])
            if from_ent is None or to_ent is None:
                # An endpoint is not a labelled span of this annotation.
                continue
            rel_labels = res.get("labels") or []
            all_relations.append({
                "from_text": from_ent["text"],
                "from_labels": from_ent["labels"],
                "to_text": to_ent["text"],
                "to_labels": to_ent["labels"],
                # Only the first relation label is kept, as before.
                "relation": rel_labels[0] if rel_labels else UNLABELED_RELATION,
                "direction": res.get("direction"),
            })

# -----------------------------
# 4. Annotation Statistics
# -----------------------------
# An entity may carry several labels; every one of them is tallied.
entity_counts = Counter()
for ent in all_entities:
    entity_counts.update(ent["labels"])

# Each relation contributes exactly one relation name.
relation_counts = Counter(rel["relation"] for rel in all_relations)

# Build the report lines first, then write them in a single call. Counts are
# listed in first-seen order (Counter iteration order), not by frequency.
stats_lines = ["=== Entity Counts ===\n"]
stats_lines.extend(
    f"{label}: {count}\n" for label, count in entity_counts.items()
)
stats_lines.append("\n=== Relation Counts ===\n")
stats_lines.extend(
    f"{rel}: {count}\n" for rel, count in relation_counts.items()
)

with STATS_FILE.open("w", encoding="utf-8") as f:
    f.writelines(stats_lines)

# -----------------------------
# 5. Bar Plots (Matplotlib)
# -----------------------------
def plot_bar(counter, title, filename):
    """Save a bar chart of the counts in *counter*, most frequent first.

    Args:
        counter: A ``collections.Counter`` mapping label -> count.
        title: Title drawn above the chart.
        filename: Destination handed to ``plt.savefig``.

    An empty counter produces a placeholder image instead of raising, so the
    output file always exists after the call.
    """
    ranked = counter.most_common()
    fig = plt.figure(figsize=(10, 6))
    try:
        if ranked:
            labels, counts = zip(*ranked)
            plt.bar(labels, counts, color=plt.cm.tab20.colors)
            plt.xticks(rotation=45, ha='right')
        else:
            # zip(*[]) yields nothing, so the two-name unpacking above would
            # raise ValueError for an empty counter.
            plt.text(0.5, 0.5, "No data", ha='center', va='center',
                     transform=plt.gca().transAxes)
            plt.xticks([])
        plt.ylabel("Counts")
        plt.title(title)
        plt.tight_layout()
        plt.savefig(filename)
    finally:
        # Close the figure even when drawing or saving fails, so a failed
        # call does not leave an open figure behind.
        plt.close(fig)

# One chart per counter, each saved to its own PNG.
plot_bar(counter=entity_counts, title="Entity Counts", filename=ENTITY_BAR_FILE)
plot_bar(counter=relation_counts, title="Relation Counts", filename=RELATION_BAR_FILE)

# -----------------------------
# 6. Evaluation Summary
# -----------------------------
# Headline numbers plus the distinct label names, in first-seen order.
eval_summary = dict(
    num_annotation_tasks=len(data),
    num_entities=len(all_entities),
    num_relations=len(all_relations),
    unique_entity_labels=list(entity_counts),
    unique_relation_labels=list(relation_counts),
)

# One "key: value" line per summary entry.
EVAL_FILE.write_text(
    "".join(f"{key}: {val}\n" for key, val in eval_summary.items()),
    encoding="utf-8",
)

# -----------------------------
# 7. Relation Graph (Matplotlib + NetworkX)
# -----------------------------
G = nx.DiGraph()

# Nodes are keyed by the entity's text; the first label seen for a given text
# decides the node's colour.
for ent in all_entities:
    if ent["text"] in G:
        continue
    # An entity with an empty label list gets None rather than raising
    # IndexError; it is drawn with the fallback colour below.
    G.add_node(ent["text"], label=ent["labels"][0] if ent["labels"] else None)

# A DiGraph holds one edge per ordered node pair, and adding the same edge
# again overwrites its attributes. Collect every distinct relation name per
# pair first so none of them is silently dropped from the drawing.
pair_relations = {}
for rel in all_relations:
    names = pair_relations.setdefault((rel["from_text"], rel["to_text"]), [])
    if rel["relation"] not in names:
        names.append(rel["relation"])

for (src, dst), names in pair_relations.items():
    G.add_edge(src, dst, label=", ".join(map(str, names)))

# Node colors by entity label
unique_labels = list(entity_counts.keys())
color_map = {label: plt.cm.tab20(i / len(unique_labels)) for i, label in enumerate(unique_labels)}
fallback_color = (0.6, 0.6, 0.6, 1.0)  # grey RGBA for nodes without a known label
node_colors = [color_map.get(G.nodes[n]['label'], fallback_color) for n in G.nodes]

plt.figure(figsize=(14,10))
# Fixed seed keeps the layout reproducible between runs.
pos = nx.spring_layout(G, k=0.5, seed=42)
nx.draw_networkx_nodes(G, pos, node_color=node_colors, node_size=1200, alpha=0.9)
nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')
nx.draw_networkx_edges(G, pos, arrowstyle='-|>', arrowsize=20, edge_color='gray', width=2)
edge_labels = nx.get_edge_attributes(G, 'label')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red', font_size=9)

plt.title("Relation Extraction Graph", fontsize=16)
plt.axis('off')
plt.tight_layout()
plt.savefig(GRAPH_FILE)
plt.close()

# -----------------------------
# 8. Markdown Report
# -----------------------------
# Assemble the report as a list of fragments and join once at the end.
# Images are referenced by bare file name, so the report expects them to sit
# next to it in the results directory.
report_sections = [
    "# Relation Extraction Annotation Report\n\n",
    "## Summary Statistics\n\n",
    f"- Number of annotation tasks: {eval_summary['num_annotation_tasks']}\n",
    f"- Number of entities: {eval_summary['num_entities']}\n",
    f"- Number of relations: {eval_summary['num_relations']}\n",
    f"- Entity labels: {', '.join(eval_summary['unique_entity_labels'])}\n",
    f"- Relation types: {', '.join(eval_summary['unique_relation_labels'])}\n\n",
    "## Entity Counts\n\n",
    f"![Entity Counts]({ENTITY_BAR_FILE.name})\n\n",
    "## Relation Counts\n\n",
    f"![Relation Counts]({RELATION_BAR_FILE.name})\n\n",
    "## Relation Graph\n\n",
    f"![Relation Graph]({GRAPH_FILE.name})\n",
]
report_md = "".join(report_sections)

REPORT_FILE.write_text(report_md, encoding="utf-8")

print(f"\n✅ RE Annotation analysis complete! Check {RESULTS_PATH} for stats, report, and PNG graphs.")


Loaded 100 annotation tasks from ../annotations/train.json

✅ RE Annotation analysis complete! Check ../results for stats, report, and PNG graphs.
