In [2]:
import csv
import json
import os
import statistics
from collections import Counter, defaultdict
from pathlib import Path

In [8]:

def find_json_files(base_path):
    """Return a list of every ``*.json`` path found recursively under *base_path*."""
    return list(Path(base_path).rglob("*.json"))

def load_church_father_names(csv_path):
    """Load the ID -> name mapping of church fathers from a CSV file.

    The file is read as semicolon-delimited with a header row; the ``ID`` and
    ``Name`` columns are used. Header names and cell values are stripped of
    surrounding whitespace. Rows lacking either an ID or a name are skipped.

    Loading is best-effort: if the file is missing or cannot be read, a
    warning is printed and whatever was collected so far (possibly an empty
    dict) is returned instead of raising.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        dict mapping each ID string to its name string.
    """
    father_names = {}

    try:
        # utf-8-sig transparently drops a leading BOM; newline='' is what the
        # csv module asks for so that it can handle line endings itself.
        with open(csv_path, 'r', encoding='utf-8-sig', newline='') as f:
            reader = csv.DictReader(f, delimiter=';')
            for row in reader:
                cleaned = {}
                for key, value in row.items():
                    # DictReader files surplus cells under the key None (as a
                    # list) and fills missing cells with None; neither may
                    # abort the whole load.
                    if key is None:
                        continue
                    cleaned[key.strip()] = value.strip() if value else ''
                p_id = cleaned.get('ID', '')
                name = cleaned.get('Name', '')
                if p_id and name:
                    father_names[p_id] = name
        print(f"Loaded {len(father_names)} church father names from {csv_path}\n")
    except FileNotFoundError:
        print(f"Warning: Could not find file at {csv_path}")
        print("Will display P-numbers only.\n")
    except Exception as e:
        # Deliberately broad: the name table is optional, so any read or
        # parse failure degrades to "IDs only" rather than stopping the run.
        print(f"Warning: Error loading: {e}")
        print("Will display P-numbers only.\n")

    return father_names

def extract_annotations(json_file):
    """Return the PatristicReference annotation dicts stored in *json_file*.

    The file is parsed as UTF-8 JSON. When the top level is an object, every
    list-valued entry is scanned and each dict element whose ``%TYPE`` equals
    ``webanno.custom.PatristicReference`` is collected, in file order. Any
    other top-level shape yields an empty list.
    """
    wanted_type = 'webanno.custom.PatristicReference'

    with open(json_file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    if not isinstance(payload, dict):
        return []

    matches = []
    for entries in payload.values():
        if not isinstance(entries, list):
            continue
        matches.extend(
            entry
            for entry in entries
            if isinstance(entry, dict) and entry.get('%TYPE') == wanted_type
        )
    return matches

def _tally_field(annotations, field):
    """Count the values of *field* across annotation dicts, keyed by string.

    A missing key or a null value is counted as 'unknown'; any other value is
    converted with str() so it is hashable and works with string format specs.
    """
    tally = Counter()
    for ann in annotations:
        value = ann.get(field)
        tally['unknown' if value is None else str(value)] += 1
    return tally


def _print_breakdown(title, tally, total):
    """Print *title* followed by one count/percentage line per tally entry."""
    print(title)
    for label, count in tally.most_common():
        percentage = (count / total) * 100
        print(f"{label:20s}: {count:4d} ({percentage:5.1f}%)")
    print()


def analyse_annotations(base_path, church_father_csv_path):
    """Analyse all annotations under *base_path* and print summary statistics.

    Every JSON file below *base_path* is scanned for PatristicReference
    annotations. The name of a file's parent directory is used as its letter
    ID, and counts from several files in the same directory are added
    together. The report printed to stdout contains the totals, breakdowns by
    ``reference_type``, ``detection_source`` and ``church_fathers``, and the
    min / max / median number of annotations per letter.

    Args:
        base_path: Directory searched recursively for ``*.json`` files.
        church_father_csv_path: CSV file used to resolve church-father IDs to
            names; IDs without a match are shown as "Unknown".

    Returns:
        None. All results are printed.
    """
    print(f"Analysing annotations in: {base_path}\n")
    print("=" * 80)

    # Load church father names
    father_names = load_church_father_names(church_father_csv_path)

    # Find all JSON files
    json_files = find_json_files(base_path)
    print(f"Found {len(json_files)} JSON files\n")

    # Collect all annotations
    all_annotations = []
    annotations_per_letter = Counter()

    for json_file in json_files:
        try:
            annotations = extract_annotations(json_file)
        except Exception as e:
            # Best-effort: one unreadable file must not abort the whole run.
            print(f"Error processing {json_file}: {e}")
            continue
        if annotations:
            # Accumulate rather than assign, so several files in one letter
            # directory stay consistent with the overall total.
            letter_id = json_file.parent.name
            annotations_per_letter[letter_id] += len(annotations)
            all_annotations.extend(annotations)

    if not all_annotations:
        print("No annotations found!")
        return

    # From here on there is at least one annotation, hence at least one
    # letter, so the divisions and min/max below cannot hit empty input.
    total_annotations = len(all_annotations)
    num_letters = len(annotations_per_letter)
    avg_annotations = total_annotations / num_letters

    reference_types = _tally_field(all_annotations, 'reference_type')
    church_fathers = _tally_field(all_annotations, 'church_fathers')
    detection_sources = _tally_field(all_annotations, 'detection_source')

    # Print results
    print(f"Total annotations: {total_annotations}")
    print(f"Total letters with annotations: {num_letters}")
    print(f"Average annotations per letter: {avg_annotations:.2f}")

    _print_breakdown("Reference Type", reference_types, total_annotations)
    _print_breakdown("Detection Source", detection_sources, total_annotations)

    print("Church Fathers Distribution")
    print(f"{'P-Number':<15s} {'Name':<40s} {'Count':>8s} {'Percentage':>12s}")
    print("-" * 80)
    for father, count in church_fathers.most_common():
        percentage = (count / total_annotations) * 100
        name = father_names.get(father, "Unknown")
        print(f"{father:<15s} {name:<40s} {count:8d} {percentage:11.1f}%")
    print()

    annotation_counts = list(annotations_per_letter.values())
    print(f"Min annotations per letter: {min(annotation_counts)}")
    print(f"Max annotations per letter: {max(annotation_counts)}")
    # statistics.median averages the two middle values for an even number of
    # letters instead of picking the upper one.
    print(f"Median annotations per letter: {statistics.median(annotation_counts)}")

# Script entry point: run the analysis with the default input locations.
# Both paths are relative, so they resolve against the current working
# directory at run time.
if __name__ == "__main__":
    # Directory that is searched recursively for annotation JSON files.
    base_path = "../annotations/annotations-json"
    # CSV file passed on as the source of church-father names.
    church_father_csv = "../data/church-fathers/church-fathers-gnd-cc.csv"
    analyse_annotations(base_path, church_father_csv)

Analysing annotations in: ../annotations/annotations-json

Loaded 44 church father names from ../data/church-fathers/church-fathers-gnd-cc.csv

Found 67 JSON files

Total annotations: 100
Total letters with annotations: 67
Average annotations per letter: 1.49
Reference Type
implicit            :   50 ( 50.0%)
explicit            :   50 ( 50.0%)

Detection Source
tei                 :   54 ( 54.0%)
passim              :   42 ( 42.0%)
scholar             :    4 (  4.0%)

Church Fathers Distribution
P-Number        Name                                        Count   Percentage
--------------------------------------------------------------------------------
P18700          Augustinus von Hippo                           54        54.0%
P18986          Sophronius Eusebius Hieronymus                 11        11.0%
P17746          Quintus Septimius Florens Tertullian            9         9.0%
P18048          Hilarius von Poitiers                           5         5.0%
P18988          Cypria