In [3]:
# -----------------------------
# 1. Imports
# -----------------------------
import spacy
from spacy.tokens import Doc, Span
from spacy import displacy
from collections import Counter
from pathlib import Path

# -----------------------------
# 2. Paths
# -----------------------------
# Input: the CoNLL-formatted training annotations.
DATA_PATH = Path("../annotations/train.conll")

# Output: every generated artefact is written directly under this folder.
RESULTS_PATH = Path("../results")
STATS_FILE = RESULTS_PATH.joinpath("annotation_stats.txt")
EVAL_FILE = RESULTS_PATH.joinpath("eval_summary.txt")
REPORT_FILE = RESULTS_PATH.joinpath("report.md")

# Make sure the output folder (and any missing parents) exists before writing.
RESULTS_PATH.mkdir(exist_ok=True, parents=True)

# -----------------------------
# 3. Read CoNLL file
# -----------------------------
def read_conll(filepath):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line is split on whitespace: the first field is taken as
    the token and the last field as its tag. Blank lines separate sentences.
    Lines with fewer than two fields are reported on stdout and skipped.

    Args:
        filepath: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists. ``sentences[i]``
        is the list of tokens of the i-th sentence and ``labels[i]`` the list
        of tags aligned with it.
    """
    sentences, labels = [], []
    sent, lab = [], []
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line closes the sentence in progress; runs of blank
                # lines must not produce empty sentences.
                if sent:
                    sentences.append(sent)
                    labels.append(lab)
                    sent, lab = [], []
                continue
            parts = line.split()
            if len(parts) < 2:
                print(f"⚠️ Skipping malformed line: {line}")
                continue
            sent.append(parts[0])
            lab.append(parts[-1])
    # The file may end without a trailing blank line; without this flush the
    # final sentence would be silently dropped.
    if sent:
        sentences.append(sent)
        labels.append(lab)
    return sentences, labels

# Parse the training annotations into two parallel lists: one list of tokens
# and one list of tags per sentence.
sentences, labels = read_conll(DATA_PATH)
print(f"✅ Loaded {len(sentences)} sentences from {DATA_PATH}")

# -----------------------------
# 4. Initialize SpaCy blank model
# -----------------------------
# Blank English pipeline. In the code visible here it is only used for its
# vocab when Doc objects are built for visualization.
nlp = spacy.blank("en")

# -----------------------------
# 5. Visualization Function (Notebook Only)
# -----------------------------
def visualize_sentence(tokens, tags, show_jupyter=True):
    """
    Render one BIO-tagged sentence with displaCy's entity view.

    Args:
        tokens: The words of the sentence.
        tags: One BIO tag per token ("B-X", "I-X" or anything else).
        show_jupyter: When true, render inline in the notebook; when false,
            nothing is rendered. No HTML is written or returned either way.
    """
    # Walk the tags once, collecting (start, end, label) token spans.
    spans = []
    open_at = None
    open_label = None
    for idx, tag in enumerate(tags):
        # An "I-" tag with the same label as the open entity simply extends it.
        if tag.startswith("I-") and tag[2:] == open_label:
            continue
        # Every other tag ends whatever entity is currently open ...
        if open_at is not None:
            spans.append((open_at, idx, open_label))
            open_at, open_label = None, None
        # ... and only a "B-" tag opens a new one. An "I-" tag that does not
        # continue the open entity is therefore not part of any span.
        if tag.startswith("B-"):
            open_at, open_label = idx, tag[2:]
    # An entity that runs up to the last token is closed here.
    if open_at is not None:
        spans.append((open_at, len(tags), open_label))

    doc = Doc(nlp.vocab, words=tokens)
    ent_spans = []
    for first, stop, label in spans:
        ent_spans.append(Span(doc, first, stop, label=label))
    doc.ents = ent_spans

    if not show_jupyter:
        return
    displacy.render(doc, style="ent", jupyter=True)

# -----------------------------
# 6. Visualize Sample Sentences
# -----------------------------
# Show at most the first five sentences (fewer if the corpus is smaller).
N = min(5, len(sentences))
for sample_tokens, sample_tags in zip(sentences[:N], labels[:N]):
    visualize_sentence(sample_tokens, sample_tags, show_jupyter=True)
print(f"✅ Displayed {N} sentences in notebook")

# -----------------------------
# 7. Annotation Statistics
# -----------------------------
# Flatten the tags of all sentences into one list, leaving out "O".
all_labels = []
for sent_tags in labels:
    all_labels.extend(tag for tag in sent_tags if tag != "O")

# Tallied per full tag string, so "B-X" and "I-X" are counted separately.
label_counts = Counter(all_labels)

# One "<tag>: <count>" line per distinct tag, in first-seen order.
with open(STATS_FILE, "w", encoding="utf-8") as f:
    f.writelines(f"{label}: {count}\n" for label, count in label_counts.items())
print(f"✅ Saved annotation stats in {STATS_FILE}")

# -----------------------------
# 8. Evaluation Summary
# -----------------------------
total_sentences = len(sentences)
total_tokens = sum(map(len, sentences))

# Single pass over the tag sequences collecting all three tallies.
sentences_with_entities = 0
annotated_tokens = 0
bio_errors = 0
for sent_tags in labels:
    non_o_here = sum(1 for tag in sent_tags if tag != "O")
    annotated_tokens += non_o_here
    if non_o_here:
        sentences_with_entities += 1

    # BIO consistency: an "I-X" tag is an error unless the tag right before
    # it (taken as "O" at the start of a sentence) carries the same label X.
    previous = "O"
    for current in sent_tags:
        continues_entity = previous != "O" and previous[2:] == current[2:]
        if current.startswith("I-") and not continues_entity:
            bio_errors += 1
        previous = current

# Guard against an empty corpus to avoid dividing by zero.
if total_tokens:
    percent_annotated = annotated_tokens / total_tokens * 100
else:
    percent_annotated = 0

# Assemble the whole summary first, then write it in one go.
summary_lines = [
    f"Total sentences: {total_sentences}\n",
    f"Sentences with entities: {sentences_with_entities}\n",
    f"Total tokens: {total_tokens}\n",
    f"Annotated tokens: {annotated_tokens} ({percent_annotated:.2f}%)\n\n",
    "Entity counts:\n",
]
# These are per-tag counts ("B-X" and "I-X" listed separately), in first-seen order.
summary_lines.extend(f"    {label}: {count}\n" for label, count in label_counts.items())
summary_lines.append(f"\nBIO consistency errors: {bio_errors}\n")

with open(EVAL_FILE, "w", encoding="utf-8") as f:
    f.writelines(summary_lines)
print(f"✅ Saved evaluation summary in {EVAL_FILE}")

# -----------------------------
# 9. Generate GitHub-Ready Markdown Report
# -----------------------------
# One table row per tag, sorted alphabetically by tag.
table_rows = [f"| {label} | {count} |" for label, count in sorted(label_counts.items())]

# The report is assembled as a list of lines and joined with newlines at the end.
md_lines = [
    "# Named Entity Recognition Annotation Report",
    "",
    "## 1. Evaluation Summary",
    "",
    f"- **Total sentences:** {total_sentences}",
    f"- **Sentences with entities:** {sentences_with_entities}",
    f"- **Total tokens:** {total_tokens}",
    f"- **Annotated tokens:** {annotated_tokens} ({percent_annotated:.2f}%)",
    "",
    "### Entity Counts (Markdown Table)",
    "",
    "| Entity Label | Count |",
    "|---------------|--------|",
    *table_rows,
    "",
    f"**BIO consistency errors:** {bio_errors}",
    "",
    "## 2. Sample Visualizations",
    "",
    "Visualizations are displayed directly in the notebook. No HTML files generated.",
]

REPORT_FILE.write_text("\n".join(md_lines), encoding="utf-8")
print(f"✅ Markdown report generated at: {REPORT_FILE}")


✅ Loaded 100 sentences from ../annotations/train.conll


✅ Displayed 5 sentences in notebook
✅ Saved annotation stats in ../results/annotation_stats.txt
✅ Saved evaluation summary in ../results/eval_summary.txt
✅ Markdown report generated at: ../results/report.md
