In [4]:
# notebooks/sentiment_annotations.ipynb

import json
from pathlib import Path
from collections import Counter
import matplotlib.pyplot as plt
import random

# Paths (relative to the notebook's working directory)
DATA_FILE = "../annotations/train.json"
RESULTS_DIR = "../results"
# parents=True keeps this working if RESULTS_DIR is later pointed at a nested path.
Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)

# Expected labels (adjust as per your project)
EXPECTED_LABELS = {"Positive", "Negative", "Neutral"}

# Load data.
# The encoding is pinned to UTF-8 so the file decodes the same way on every
# platform instead of depending on the locale's default encoding.
# NOTE(review): the records look like a Label Studio JSON export - confirm.
with open(DATA_FILE, "r", encoding="utf-8") as f:
    data = json.load(f)

# Collect one text and one label per record, keeping both lists aligned.
texts = []
labels = []

for item in data:
    text = item.get("data", {}).get("text")

    # Follow annotations[0] -> result[0] -> value.choices[0]; any empty or
    # missing level along the way leaves the label as None.
    anns = item.get("annotations", [])
    first_results = anns[0].get("result", []) if anns else []
    picked = first_results[0].get("value", {}).get("choices", []) if first_results else []
    label = picked[0] if picked else None

    texts.append(text)
    labels.append(label)

# Annotation statistics
# total_annotations counts every record, labeled or not.
total_annotations = len(labels)
label_counts = Counter(lbl for lbl in labels if lbl)
# Shares are relative to all records, so unlabeled records lower every percentage.
label_distribution = {k: f"{v/total_annotations*100:.2f}%" for k, v in label_counts.items()}

# Missing and unexpected labels
missing_labels = sum(1 for lbl in labels if not lbl)
unexpected_labels = [lbl for lbl in labels if lbl and lbl not in EXPECTED_LABELS]

# Missing texts
missing_texts = sum(1 for t in texts if not t)

# Text length stats (whitespace-separated words, non-empty texts only)
non_empty_texts = [t for t in texts if t]
text_lengths = [len(t.split()) for t in non_empty_texts]
if text_lengths:
    min_len = min(text_lengths)
    max_len = max(text_lengths)
    avg_len = sum(text_lengths) / len(text_lengths)
else:
    min_len = max_len = avg_len = 0

# Duplicate texts: number of non-empty texts that repeat an earlier one.
# Both sides of the subtraction use the non-empty texts only; measuring against
# len(texts) would count every missing text as a duplicate.
duplicate_count = len(non_empty_texts) - len(set(non_empty_texts))

# Lines shared verbatim by annotations_stats.txt and eval_summary.txt; built
# once so the two files cannot drift apart.
summary_lines = [
    f"Missing labels: {missing_labels}\n",
    f"Unexpected labels: {unexpected_labels}\n",
    f"Missing texts: {missing_texts}\n",
    f"Duplicate texts: {duplicate_count}\n",
    f"Text length (words) - min: {min_len}, max: {max_len}, avg: {avg_len:.2f}\n",
]

# Save annotations_stats.txt
# UTF-8 is explicit so non-ASCII labels are written identically on every
# platform rather than failing under a narrow locale encoding.
with open(f"{RESULTS_DIR}/annotations_stats.txt", "w", encoding="utf-8") as f:
    f.write(f"Total annotations: {total_annotations}\n\n")
    f.write("Label counts:\n")
    for label, count in label_counts.items():
        f.write(f"{label}: {count}\n")
    f.write("\nLabel distribution:\n")
    for label, perc in label_distribution.items():
        f.write(f"{label}: {perc}\n")
    # Blank separator line before the shared summary section.
    f.write("\n")
    f.writelines(summary_lines)

# Save eval_summary.txt
with open(f"{RESULTS_DIR}/eval_summary.txt", "w", encoding="utf-8") as f:
    f.write(f"Total annotations: {total_annotations}\n")
    f.writelines(summary_lines)

# Generate bar chart for label distribution
# Colors are looked up per label. Bars follow label_counts' insertion order
# (first occurrence in the data), so a positional color list would paint the
# wrong sentiment whenever the data does not start Positive, Negative, Neutral.
label_colors = {"Positive": "green", "Negative": "red", "Neutral": "gray"}
fallback_color = "tab:blue"  # used for any label outside the expected three

bar_labels = list(label_counts)
bar_heights = [label_counts[name] for name in bar_labels]
bar_colors = [label_colors.get(name, fallback_color) for name in bar_labels]

plt.figure(figsize=(6,4))
plt.bar(bar_labels, bar_heights, color=bar_colors)
plt.title("Sentiment Label Distribution")
plt.xlabel("Labels")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig(f"{RESULTS_DIR}/label_distribution.png")
plt.close()

# Generate markdown report
# UTF-8 is explicit because the report embeds raw sample texts, which may
# contain characters the platform's default encoding cannot represent.
with open(f"{RESULTS_DIR}/report.md", "w", encoding="utf-8") as f:
    f.write("# Sentiment Annotation Report\n\n")
    f.write(f"Total annotations: **{total_annotations}**\n\n")
    f.write("## Label Counts\n")
    for label, count in label_counts.items():
        f.write(f"- **{label}**: {count} ({label_distribution[label]})\n")
    f.write(f"\n## Missing labels: {missing_labels}\n")
    f.write(f"## Unexpected labels: {unexpected_labels}\n")
    f.write(f"## Missing texts: {missing_texts}\n")
    f.write(f"## Duplicate texts: {duplicate_count}\n")
    f.write(f"## Text Length Stats (words) - min: {min_len}, max: {max_len}, avg: {avg_len:.2f}\n")

    # Add random samples per label (skip empty texts).
    # sorted() fixes the section order; iterating the set directly can change
    # order between interpreter runs because string hashing is randomized.
    f.write("\n## Random Sample Texts by Label\n")
    for label in sorted(EXPECTED_LABELS):
        sample_texts = [t for t, lbl in zip(texts, labels) if lbl == label and t]
        if sample_texts:
            f.write(f"\n### {label}\n")
            for txt in random.sample(sample_texts, min(5, len(sample_texts))):
                # Collapse newlines/extra whitespace so each sample stays a
                # single, well-formed markdown bullet.
                one_line = " ".join(txt.split())
                f.write(f"- {one_line}\n")

    # Link to plot (relative path: the image is saved next to report.md)
    f.write("\n## Label Distribution Plot\n")
    f.write("![Label Distribution](label_distribution.png)\n")

print("Sentiment annotation stats generated in 'results/' folder.")


Sentiment annotation stats generated in 'results/' folder.
