In [1]:
import json
import subprocess
import passim
from pathlib import Path
from xml.etree import ElementTree as ET
from collections import defaultdict, Counter
import pandas as pd
import networkx as nx
from IPython.display import display, Markdown

In [None]:
# Configuration paths
LETTERS_DIR = "/Master-Thesis/bullinger-letters/data/letters"
SOURCES_DIR = "../data/cc-tei" 

# Output files
LETTERS_JSON = "../data/passim/bullinger-letters.json"
SOURCES_JSON = "../data/passim/patristic-sources.json"
OUTPUT_DIR = "../data/passim"

## Bullinger Letters

In [3]:
# TEI namespace
TEI_NS = {'tei': 'http://www.tei-c.org/ns/1.0'}

def extract_text_from_letter(xml_file):
    """
    Extract text content from a Bullinger letter XML file.
    """
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        
        # Extract document ID
        file_id = Path(xml_file).stem
        tei_id = root.get('{http://www.w3.org/XML/1998/namespace}id', file_id)
        
        # Extract metadata
        metadata = {}
        
        title_elem = root.find('.//tei:titleStmt/tei:title', TEI_NS)
        if title_elem is not None and title_elem.text:
            metadata['title'] = title_elem.text.strip()
        
        date_elem = root.find('.//tei:correspAction[@type="sent"]/tei:date', TEI_NS)
        if date_elem is not None:
            metadata['date'] = date_elem.get('when', '')
        
        sender_elem = root.find('.//tei:correspAction[@type="sent"]/tei:persName', TEI_NS)
        if sender_elem is not None:
            metadata['sender'] = sender_elem.get('ref', '')
        
        recipient_elem = root.find('.//tei:correspAction[@type="received"]/tei:persName', TEI_NS)
        if recipient_elem is not None:
            metadata['recipient'] = recipient_elem.get('ref', '')
        
        # Extract text from body
        text_parts = []
        
        # Find all text in <s> (sentence) tags
        for s_elem in root.findall('.//tei:body//tei:s', TEI_NS):
            if s_elem.text:
                text_parts.append(s_elem.text.strip())
            for child in s_elem:
                if child.tail:
                    text_parts.append(child.tail.strip())
        
        full_text = ' '.join(text_parts)
        full_text = ' '.join(full_text.split())  # Clean whitespace
        
        return {
            'id': tei_id,
            'text': full_text,
            'series': 'bullinger_letters',
            'metadata': metadata
        }
    
    except Exception as e:
        print(f"Error processing {xml_file}: {e}")
        return None

def extract_letters(input_dir, output_file):
    """
    Process all XML files in the letters directory.
    """
    documents = []
    input_path = Path(input_dir)
    xml_files = list(input_path.glob('**/*.xml'))
    
    print(f"Found {len(xml_files)} XML files")
    
    for i, xml_file in enumerate(xml_files, 1):
        if i % 100 == 0:
            print(f"  Processing: {i}/{len(xml_files)}...")
        
        doc = extract_text_from_letter(xml_file)
        if doc and doc['text']:
            documents.append(doc)
    
    # Create output directory
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    
    # Write to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        for doc in documents:
            json.dump(doc, f, ensure_ascii=False)
            f.write('\n')
    
    # Statistics
    total_chars = sum(len(doc['text']) for doc in documents)
    avg_length = total_chars / len(documents) if documents else 0
    
    print(f"\n✓ Successfully extracted {len(documents)} letters")
    print(f"  Total characters: {total_chars:,}")
    print(f"  Average letter length: {avg_length:.0f} characters")
    print(f"  Saved to: {output_file}")
    
    return documents

# Extract letters
print("Extracting Bullinger letters.")
letters = extract_letters(LETTERS_DIR, LETTERS_JSON)

# Show sample
if letters:
    print("\nSample letter:")
    sample = letters[0]
    print(f"  ID: {sample['id']}")
    print(f"  Text preview: {sample['text'][:200]}...")
    print(f"  Metadata: {sample['metadata']}")

Extracting Bullinger letters.
Found 13114 XML files
  Processing: 100/13114...
  Processing: 200/13114...
  Processing: 300/13114...
  Processing: 400/13114...
  Processing: 500/13114...
  Processing: 600/13114...
  Processing: 700/13114...
  Processing: 800/13114...
  Processing: 900/13114...
  Processing: 1000/13114...
  Processing: 1100/13114...
  Processing: 1200/13114...
  Processing: 1300/13114...
  Processing: 1400/13114...
  Processing: 1500/13114...
  Processing: 1600/13114...
  Processing: 1700/13114...
  Processing: 1800/13114...
  Processing: 1900/13114...
  Processing: 2000/13114...
  Processing: 2100/13114...
  Processing: 2200/13114...
  Processing: 2300/13114...
  Processing: 2400/13114...
  Processing: 2500/13114...
  Processing: 2600/13114...
  Processing: 2700/13114...
  Processing: 2800/13114...
  Processing: 2900/13114...
  Processing: 3000/13114...
  Processing: 3100/13114...
  Processing: 3200/13114...
  Processing: 3300/13114...
  Processing: 3400/13114...
  Pro

## Patristic Sources (Subset from CC)

In [7]:
# Chunking for sources
CHUNK_SIZE = 300   # Words per chunk
CHUNK_OVERLAP = 50 # Overlapping words

In [8]:
def extract_text_from_source(xml_file):
    """
    Extract text content from a CC XML file.
    """
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        
        file_id = Path(xml_file).stem
        
        # Extract metadata
        metadata = {}
        
        title_elem = root.find('.//tei:titleStmt/tei:title', TEI_NS)
        if title_elem is not None and title_elem.text:
            metadata['title'] = title_elem.text.strip()
        
        author_elem = root.find('.//tei:titleStmt/tei:author', TEI_NS)
        if author_elem is not None:
            author_text = ''.join(author_elem.itertext()).strip()
            metadata['author'] = author_text
            viaf_ref = author_elem.get('ref', '')
            if viaf_ref:
                metadata['author_ref'] = viaf_ref
        
        series_elem = root.find('.//tei:seriesStmt/tei:title', TEI_NS)
        if series_elem is not None and series_elem.text:
            metadata['series'] = series_elem.text.strip()
        
        series_id_elem = root.find('.//tei:seriesStmt/tei:idno', TEI_NS)
        if series_id_elem is not None and series_id_elem.text:
            metadata['series_id'] = series_id_elem.text.strip()
        
        # Extract text from body
        text_parts = []
        for p_elem in root.findall('.//tei:body//tei:p', TEI_NS):
            text = ''.join(p_elem.itertext()).strip()
            if text:
                text_parts.append(text)
        
        full_text = ' '.join(text_parts)
        full_text = ' '.join(full_text.split())
        
        series_name = metadata.get('series_id', metadata.get('author', file_id))
        
        return {
            'id': file_id,
            'text': full_text,
            'series': series_name,
            'metadata': metadata
        }
    
    except Exception as e:
        print(f"Error processing {xml_file}: {e}")
        return None

def chunk_document(doc, chunk_size=300, overlap=50):
    """
    Split a document into overlapping chunks.
    """
    words = doc['text'].split()
    
    if len(words) <= chunk_size:
        return [doc]
    
    chunks = []
    chunk_id = 0
    
    for i in range(0, len(words), chunk_size - overlap):
        chunk_words = words[i:i + chunk_size]
        if not chunk_words:
            break
        
        chunk_text = ' '.join(chunk_words)
        
        chunk_doc = {
            'id': f"{doc['id']}_chunk{chunk_id}",
            'text': chunk_text,
            'series': doc['series'],
            'metadata': {
                **doc['metadata'],
                'chunk_id': chunk_id,
                'parent_id': doc['id'],
                'word_offset': i
            }
        }
        chunks.append(chunk_doc)
        chunk_id += 1
    
    return chunks

def extract_sources(input_dir, output_file, chunk_size=300, overlap=50):
    """
    Process all XML files in the sources directory.
    """
    documents = []
    input_path = Path(input_dir)
    xml_files = list(input_path.glob('**/*.xml'))
    
    print(f"Found {len(xml_files)} XML files")
    
    for xml_file in xml_files:
        doc = extract_text_from_source(xml_file)
        if doc and doc['text']:
            word_count = len(doc['text'].split())
            if word_count > chunk_size:
                chunks = chunk_document(doc, chunk_size=chunk_size, overlap=overlap)
                print(f"  {xml_file.name}: {word_count} words → {len(chunks)} chunks")
                documents.extend(chunks)
            else:
                print(f"  {xml_file.name}: {word_count} words (no chunking)")
                documents.append(doc)
    
    # Create output directory
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Write to JSON file
    with open(output_file, 'w', encoding='utf-8') as f:
        for doc in documents:
            json.dump(doc, f, ensure_ascii=False)
            f.write('\n')
    
    # Statistics
    total_chars = sum(len(doc['text']) for doc in documents)
    avg_length = total_chars / len(documents) if documents else 0
    
    print(f"Successfully extracted {len(documents)} documents (including chunks)")
    print(f"Total characters: {total_chars:,}")
    print(f"Average document length: {avg_length:.0f} characters")
    print(f"Saved to: {output_file}")
    
    return documents

# Extract sources
print("Extracting patristic sources.")
sources = extract_sources(SOURCES_DIR, SOURCES_JSON, 
                         chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP)

# Show sample
if sources:
    print("\nSample source:")
    sample = sources[0]
    print(f"  ID: {sample['id']}")
    print(f"  Series: {sample['series']}")
    print(f"  Text preview: {sample['text'][:200]}...")
    print(f"  Metadata: {sample['metadata']}")

Extracting patristic sources.
Found 726 XML files
  Firmicus_Maternus-Mathesis.xml: 125089 words → 501 chunks
  012_Julius-Firmicus-Maternus_De-errore-profanarum-religionum.xml: 14228 words → 57 chunks
  dlt000187.xml: 125651 words → 503 chunks
  013_Athanasius-Alexandrinus_Epistolae-ad-Luciferum.xml: 1140 words → 5 chunks
  103_Athanasius-Alexandrinus_De-observationibus-monachorum.xml: 2325 words → 10 chunks
  073_Athanasius-Alexandrinus_-Evagrius-Antiochensis_Vita-B.-Antonii-abbatis.xml: 16901 words → 68 chunks
  062_Athanasius-Alexandrinus_Epistola-ad-Luciferum.xml: 3796 words → 16 chunks
  103_Athanasius-Alexandrinus_Exhortatio-ad-sponsam-Christi.xml: 5448 words → 22 chunks
  043_Augustinus-Hipponensis_Contra-Cresconium-grammaticum-Donatistam.xml: 58644 words → 235 chunks
  042_Augustinus-Hipponensis_De-utilitate-credendi.xml: 11155 words → 45 chunks
  040_Augustinus-Hipponensis_Sermo-I-de-symbolo.xml: 3991 words → 16 chunks
  032_Augustinus-Hipponensis_De-ordine.xml: 17673 words →