In [1]:
import json
import csv
from pathlib import Path
from collections import Counter, defaultdict
import re
from cltk import NLP
from cltk.alphabet.lat import normalize_lat

In [2]:
def setup_cltk():
    """Initialize CLTK for Latin."""

    nlp = NLP(language="lat")
    print("CLTK initialized successfully.\n")
    return nlp


def lemmatize_latin(text, nlp):
    """Lemmatize Latin text using CLTK."""

    # Normalize and process
    text = normalize_lat(text)
    doc = nlp.analyze(text)
        
    # Extract lemmas
    lemmas = []
    for word in doc.words:
        if hasattr(word, 'lemma') and word.lemma:
            lemmas.append(word.lemma.lower())
        else:
            lemmas.append(word.string.lower())
        
    return set(lemmas)

def is_latin(text):
    """
    Detect if text is primarily Latin.
    Returns True if text contains mostly Latin characters and common Latin words.
    """
    # Latin words as indicators
    latin_indicators = ['est', 'sunt', 'et', 'non', 'in', 'ut', 'ad', 'per', 'cum', 
                        'sed', 'qui', 'quae', 'quod', 'esse', 'sit', 'de']
    
    text_lower = text.lower()
    
    # Check if at least one Latin indicator is present
    has_latin_indicator = any(indicator in text_lower for indicator in latin_indicators)
    
    # Check if text is primarily Latin alphabet (excluding Greek, High New German, etc.)
    latin_chars = sum(1 for c in text if c.isalpha() and ord(c) < 591)  # Basic Latin + Latin Extended
    total_chars = sum(1 for c in text if c.isalpha())
    
    if total_chars == 0:
        return False
    
    latin_ratio = latin_chars / total_chars if total_chars > 0 else 0
    
    return has_latin_indicator and latin_ratio > 0.9

def jaccard_coefficient(set1, set2):
    """Calculate Jaccard coefficient between two sets."""
    if len(set1) == 0 and len(set2) == 0:
        return 0.0
    
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    
    return intersection / union if union > 0 else 0.0


def find_json_files(base_path):
    """Recursively find all JSON files."""
    return list(Path(base_path).rglob("*.json"))


def extract_text_from_json(json_file):
    """Extract the full text (sofaString) from JSON file."""
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Find the sofaString in the Sofa object
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and item.get('%TYPE') == 'uima.cas.Sofa':
                        return item.get('sofaString', '')
    return ''


def extract_annotations(json_file):
    """Extract PatristicReference annotations from JSON."""
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    annotations = []
    
    if isinstance(data, dict):
        for key, value in data.items():
            if isinstance(value, list):
                for item in value:
                    if isinstance(item, dict) and item.get('%TYPE') == 'webanno.custom.PatristicReference':
                        annotations.append(item)
    
    return annotations


def analyze_jaccard(base_path):
    """Main analysis function."""
    

    print("Annotation Jaccard Analysis")

    print()
    
    # Initialize CLTK
    nlp = setup_cltk()
    
    # Find all JSON files
    json_files = find_json_files(base_path)
    print(f"Found {len(json_files)} JSON files")
    print(f"Processing annotations (lemmatizing only annotated spans)...\n")
    
    # Collect results
    all_results = []
    latin_count = 0
    non_latin_count = 0
    processed_files = 0
    
    # Process each file
    for json_file in json_files:
        try:
            # Get full text (only used to extract the annotated spans by index)
            full_text = extract_text_from_json(json_file)
            if not full_text:
                continue
            
            # Get annotations
            annotations = extract_annotations(json_file)
            
            if annotations:
                processed_files += 1
                print(f"Processing letter {json_file.parent.name} ({processed_files} with annotations)... ", end='', flush=True)
                
                file_annotation_count = 0
            
            if annotations:
                processed_files += 1
                print(f"Processing letter {processed_files}/{len(json_files)}: {json_file.parent.name} ({len(annotations)} annotations)")
            
            for ann in annotations:
                # Extract annotation data
                begin = ann.get('begin')
                end = ann.get('end')
                patristic_text = ann.get('patristic_text', '')
                reference_type = ann.get('reference_type', 'unknown')
                church_father = ann.get('church_fathers', 'unknown')
                confidence = ann.get('confidence', 'unknown')
                detection_source = ann.get('detection_source', 'unknown')
                
                if begin is None or end is None or not patristic_text:
                    continue
                
                # Extract letter text span (ONLY the annotated portion - much faster!)
                letter_span = full_text[begin:end]
                
                # Check if both texts are Latin
                if not (is_latin(letter_span) and is_latin(patristic_text)):
                    non_latin_count += 1
                    continue
                
                latin_count += 1
                file_annotation_count += 1
                
                # Lemmatize both texts
                letter_tokens = lemmatize_latin(letter_span, nlp)
                patristic_tokens = lemmatize_latin(patristic_text, nlp)
                
                # Calculate Jaccard coefficient
                jaccard = jaccard_coefficient(letter_tokens, patristic_tokens)
                
                # Calculate overlap stats
                intersection = letter_tokens.intersection(patristic_tokens)
                union = letter_tokens.union(patristic_tokens)
                
                # Store results
                result = {
                    'letter_id': json_file.parent.name,
                    'reference_type': reference_type,
                    'church_father': church_father,
                    'confidence': confidence,
                    'detection_source': detection_source,
                    'jaccard': jaccard,
                    'letter_tokens': len(letter_tokens),
                    'patristic_tokens': len(patristic_tokens),
                    'intersection': len(intersection),
                    'union': len(union),
                    'letter_span': letter_span[:100] + '...' if len(letter_span) > 100 else letter_span
                }
                
                all_results.append(result)
            
            if file_annotation_count > 0:
                print(f"‚úì {file_annotation_count} Latin annotations")
        
        except Exception as e:
            print(f"\n‚ö† Error processing {json_file}: {e}")
    
    print()
    print(f"Completed processing!")
    print(f"  - Letters processed: {processed_files}")
    print(f"  - Latin annotations: {latin_count}")
    print(f"  - Non-Latin (skipped): {non_latin_count}")
    print()
    
    if not all_results:
        print("No Latin annotations found to analyze!")
        return
    
    # Calculate overall statistics
    jaccards = [r['jaccard'] for r in all_results]
    avg_jaccard = sum(jaccards) / len(jaccards)
    
    print(f"Total Latin annotations analyzed: {len(all_results)}")
    print(f"Average Jaccard coefficient: {avg_jaccard:.4f}")
    print(f"Min Jaccard: {min(jaccards):.4f}")
    print(f"Max Jaccard: {max(jaccards):.4f}")
    print(f"Median Jaccard: {sorted(jaccards)[len(jaccards)//2]:.4f}")
    print()
    
    # Token overlap statistics

    print("Token overlap statistics")

    
    avg_letter_tokens = sum(r['letter_tokens'] for r in all_results) / len(all_results)
    avg_patristic_tokens = sum(r['patristic_tokens'] for r in all_results) / len(all_results)
    avg_intersection = sum(r['intersection'] for r in all_results) / len(all_results)
    avg_union = sum(r['union'] for r in all_results) / len(all_results)
    
    print(f"Average letter span tokens: {avg_letter_tokens:.2f}")
    print(f"Average patristic text tokens: {avg_patristic_tokens:.2f}")
    print(f"Average intersection size: {avg_intersection:.2f}")
    print(f"Average union size: {avg_union:.2f}")
    print()
    
    # Analysis by reference type
    print("Analysis by reference type")
    
    by_ref_type = defaultdict(list)
    for result in all_results:
        by_ref_type[result['reference_type']].append(result['jaccard'])
    
    print(f"{'Reference Type':<20s} {'Count':>8s} {'Avg Jaccard':>15s} {'Min':>10s} {'Max':>10s}")
    
    for ref_type in sorted(by_ref_type.keys()):
        jaccards = by_ref_type[ref_type]
        count = len(jaccards)
        avg = sum(jaccards) / count
        min_j = min(jaccards)
        max_j = max(jaccards)
        print(f"{ref_type:<20s} {count:8d} {avg:15.4f} {min_j:10.4f} {max_j:10.4f}")
    
    print()
    
    # Show top 10 highest Jaccard scores
    print("Top 10 highest Jaccard scores")
    
    sorted_results = sorted(all_results, key=lambda x: x['jaccard'], reverse=True)
    
    print(f"{'Letter':<12s} {'Type':<10s} {'Jaccard':>10s} {'Tokens(L/P)':>15s} {'Preview':<40s}")
    
    for result in sorted_results[10:]:
        letter_id = result['letter_id']
        ref_type = result['reference_type']
        jaccard = result['jaccard']
        tokens_str = f"{result['letter_tokens']}/{result['patristic_tokens']}"
        preview = result['letter_span'][:37] + '...' if len(result['letter_span']) > 37 else result['letter_span']
        
        print(f"{letter_id:<12s} {ref_type:<10s} {jaccard:10.4f} {tokens_str:>15s} {preview:<40s}")
    
    print()
    
    # Show top 10 lowest Jaccard scores
    print("Top 10 lowest Jaccard Scores")

    print(f"{'Letter':<12s} {'Type':<10s} {'Jaccard':>10s} {'Tokens(L/P)':>15s} {'Preview':<40s}")

    for result in sorted_results[-10:]:
        letter_id = result['letter_id']
        ref_type = result['reference_type']
        jaccard = result['jaccard']
        tokens_str = f"{result['letter_tokens']}/{result['patristic_tokens']}"
        preview = result['letter_span'][:37] + '...' if len(result['letter_span']) > 37 else result['letter_span']
        
        print(f"{letter_id:<12s} {ref_type:<10s} {jaccard:10.4f} {tokens_str:>15s} {preview:<40s}")
    
    print()

    print("All Jaccard Scores")
    sorted_results = sorted(all_results, key=lambda x: x['jaccard'], reverse=True)
    print(f"{'#':<4s} {'Letter':<12s} {'Type':<10s} {'Jaccard':>10s} {'Tokens(L/P)':>15s} {'Preview':<40s}")

    for i, result in enumerate(sorted_results, 1):
        letter_id = result['letter_id']
        ref_type = result['reference_type']
        jaccard = result['jaccard']
        tokens_str = f"{result['letter_tokens']}/{result['patristic_tokens']}"
        preview = result['letter_span'][:37] + '...' if len(result['letter_span']) > 37 else result['letter_span']
    
        print(f"{i:<4d} {letter_id:<12s} {ref_type:<10s} {jaccard:10.4f} {tokens_str:>15s} {preview:<40s}")

    print()


if __name__ == "__main__":
    base_path = "/Users/lenap/Desktop/Master-Thesis/bullinger-patristic-annotations/annotations/annotations-json"
    analyze_jaccard(base_path)
 

Annotation Jaccard Analysis

‚Äéê§Ä CLTK version '1.5.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.

‚∏ñ ``LatinStanzaProcess`` using Stanza model from the Stanford NLP Group: https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
‚∏ñ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/
‚∏ñ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).

‚∏é To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.
CLTK initialized successfully.

Found 67 JSON files
Processing annotations (lemmatizing only annotated spans)...

Processing letter 12805.txt (1 with annotations)... Processing letter 2/67: 12805.txt (1 annotatio