In [9]:
import json
import re
import statistics
from collections import Counter, defaultdict

import pandas as pd

In [10]:
# Location of the passim output; the loop below parses it as JSON Lines
# (one JSON object per non-blank line).
input_file = '../data/passim/passim-output.json'

print(f"\nLoading results from: {input_file}")

# Each parsed line becomes one dict in `matches`.
matches = []
# Decode explicitly as UTF-8 instead of relying on the platform default:
# the passage texts contain non-ASCII punctuation, and the annotation CSV
# produced later in this notebook is also written as UTF-8.
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines so json.loads never sees an empty string.
        if line.strip():
            matches.append(json.loads(line))

print(f"Loaded {len(matches):,} matches")


Loading results from: ../data/passim/passim-output.json
Loaded 974 matches


In [12]:
# Group the loaded records by their `cluster` value. Records without a
# `cluster` key all land under the key None.
clusters = defaultdict(list)
for match in matches:
    cluster_id = match.get('cluster')
    clusters[cluster_id].append(match)

print(f"Total clusters: {len(clusters)}")
print(f"Total matches: {len(matches)}")

# Cluster size distribution (number of records per cluster).
cluster_sizes = [len(members) for members in clusters.values()]
print(f"\nCluster size statistics:")
if cluster_sizes:
    print(f"  Min: {min(cluster_sizes)}")
    print(f"  Max: {max(cluster_sizes)}")
    print(f"  Mean: {sum(cluster_sizes)/len(cluster_sizes):.1f}")
    # statistics.median averages the two middle values for even-length
    # input; indexing the sorted list at len//2 would report the upper
    # middle element instead. `:g` keeps whole-number medians printed
    # without a trailing ".0".
    print(f"  Median: {statistics.median(cluster_sizes):g}")
else:
    # min()/max() and the mean are undefined when nothing was loaded.
    print("  (no clusters to summarise)")

# Top clusters by size. sorted() is stable, so clusters of equal size keep
# their first-seen order.
print(f"\nTop 10 largest clusters:")
sorted_clusters = sorted(clusters.items(), key=lambda x: len(x[1]), reverse=True)
for cluster_id, cluster_matches in sorted_clusters[:10]:
    print(f"  Cluster {cluster_id}: {len(cluster_matches)} matches")
    # Show the first record of the cluster as an example, truncated to 80 chars.
    example = cluster_matches[0]
    print(f"    Example: {example['text'][:80]}...")


# Tally how often each patristic source and each patristic author shows up
# among the loaded matches.
patristic_citations = Counter()
patristic_authors = Counter()

for match in matches:
    corpus = match.get('corpus')

    if corpus == 'bullinger':
        # Credit the sources listed in `src`, not the Bullinger letter's own
        # id: counting `match['id']` here would rank Bullinger letters among
        # the "most cited patristic sources".
        # NOTE(review): assumes every `src` entry of a Bullinger record points
        # at a patristic passage -- confirm against the passim output schema.
        # NOTE(review): these keys are `uid` values while the branch below keys
        # on `id`; confirm whether the two should be unified or de-duplicated.
        for src in match.get('src', []):
            if 'uid' in src:
                patristic_citations[src['uid']] += 1

    elif corpus == 'patristic':
        # A patristic record counts once towards its own document id.
        doc_id = match['id']
        patristic_citations[doc_id] += 1

        # Only count authors that are actually present in the metadata;
        # a missing author is treated as 'Unknown' and left out of the tally.
        metadata = match.get('metadata', {})
        author = metadata.get('author', 'Unknown')
        if author != 'Unknown':
            patristic_authors[author] += 1

print(f"\nTop 20 most cited patristic sources:")
for i, (source_id, count) in enumerate(patristic_citations.most_common(20), 1):
    print(f"{i:2}. {source_id}: {count} citations")

print(f"\nTop 20 most cited authors:")
for i, (author, count) in enumerate(patristic_authors.most_common(20), 1):
    print(f"{i:2}. {author}: {count} citations")


# Per-letter tally: how many match records each Bullinger letter has, plus the
# metadata dict from the first record seen for that letter.
bullinger_matches = [m for m in matches if m.get('corpus') == 'bullinger']

bullinger_citations = Counter(m['id'] for m in bullinger_matches)

bullinger_metadata = {}
for m in bullinger_matches:
    # setdefault keeps the metadata of the first record for each letter.
    bullinger_metadata.setdefault(m['id'], m.get('metadata', {}))

print(f"\nTop 20 letters with most patristic citations:")
ranked_letters = bullinger_citations.most_common(20)
for rank, (letter_id, n_citations) in enumerate(ranked_letters, start=1):
    letter_meta = bullinger_metadata.get(letter_id, {})
    sent_on = letter_meta.get('date', 'Unknown')
    addressee = letter_meta.get('recipient', 'Unknown')
    print(f"{rank:2}. {letter_id} ({sent_on}): {n_citations} citations")
    print(f"     To: {addressee}")


# Flatten every Bullinger / patristic record into one row and write the rows
# to a CSV. The two corpora use different column sets; `citation_type` and
# `notes` are left empty so they can be filled in by hand.
annotation_data = []

for match in matches:
    corpus = match.get('corpus')
    # Records from any other corpus are not exported.
    if corpus not in ('bullinger', 'patristic'):
        continue

    meta = match.get('metadata', {})

    if corpus == 'bullinger':
        row = {
            'cluster_id': match.get('cluster'),
            'bullinger_id': match['id'],
            'bullinger_text': match['text'],
            'bullinger_date': meta.get('date', ''),
            'bullinger_recipient': meta.get('recipient', ''),
            'bullinger_sender': meta.get('sender', ''),
            'patristic_sources': '; '.join(
                str(source.get('uid', '')) for source in match.get('src', [])
            ),
            'cluster_size': match.get('size', 0),
            'citation_type': '',  # filled in during manual annotation
            'notes': '',  # filled in during manual annotation
        }
    else:
        row = {
            'cluster_id': match.get('cluster'),
            'patristic_id': match['id'],
            'patristic_text': match['text'],
            'patristic_author': meta.get('author', ''),
            'patristic_title': meta.get('title', ''),
            'bullinger_sources': '; '.join(
                str(source.get('uid', '')) for source in match.get('src', [])
            ),
            'cluster_size': match.get('size', 0),
            'citation_type': '',
            'notes': '',
        }

    annotation_data.append(row)

# Build the frame once from the collected records, then write it out.
df = pd.DataFrame(annotation_data)
csv_file = '../data/citations/output_passim_for_annotation.csv'
df.to_csv(csv_file, index=False, encoding='utf-8')
print(f"Exported to CSV: {csv_file}")
print(f"Rows: {len(df)}")

Total clusters: 354
Total matches: 974

Cluster size statistics:
  Min: 2
  Max: 19
  Mean: 2.8
  Median: 2

Top 10 largest clusters:
  Cluster 146: 19 matches
    Example: propheta pronunciat beatus vir qui non abiit in consilio impiorum et in via pecc...
  Cluster 3: 11 matches
    Example: us apud matth dixit veniet super vos omnis sanguis iustus qui effusus est super ...
  Cluster 80: 10 matches
    Example: elio clamat dominus si quis sitit veniat ad me et bibat qui credit in me sicut d...
  Cluster 92: 9 matches
    Example: inus ego sum vitis“ ait vos palmites qui manet in me et ego in eo hic fert fruct...
  Cluster 185: 9 matches
    Example: s dixerit ’ecce hic christus aut illic’ nolite credere surgent enim pseudochrist...
  Cluster 50: 8 matches
    Example: omnes qui laboratis et onerati estis et ego reficiam vos tollite iugum meum supe...
  Cluster 57: 8 matches
    Example: scribit hunc cibum potum societatem vult intelligi corporis et mendacia laborum ...
  Cluster 172: 