In [1]:
from lxml import etree
import os
import glob

In [None]:
def extract_letter_text(tei_file, output_file):
    """Extract the sentence text of one TEI letter into a plain-text file.

    Every ``<s>`` element found anywhere below ``<text>/<body>`` is read,
    text located inside ``<note>`` elements is left out, each sentence is
    stripped of surrounding whitespace, and the non-empty sentences are
    joined with single spaces. The result is written as UTF-8; an empty
    file is written when no sentence text is found.

    Args:
        tei_file: Path of the TEI XML file to parse.
        output_file: Path of the text file to create or overwrite.

    Returns:
        None. A one-line progress message is printed to stdout.

    Errors raised while parsing ``tei_file`` or writing ``output_file``
    are not caught here and propagate to the caller.
    """
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Parse TEI; all element lookups below use the TEI namespace prefix
    tree = etree.parse(tei_file)

    # Prefer the root element's xml:id. Indexing the XPath result directly
    # would raise IndexError for a file without that attribute, so fall back
    # to the file name (without extension) to keep a batch run going.
    id_matches = tree.xpath('/tei:TEI/@xml:id', namespaces=ns)
    if id_matches:
        letter_id = id_matches[0]
    else:
        letter_id = os.path.splitext(os.path.basename(tei_file))[0]

    # Extract all sentence elements from body
    sentences = tree.xpath('//tei:text/tei:body//tei:s', namespaces=ns)

    # Combine text, filtering out footnote content
    text_parts = []
    for s in sentences:
        # Text nodes with a <note> ancestor are excluded; text following a
        # note (its tail) has no such ancestor and is therefore kept.
        text = ''.join(s.xpath('.//text()[not(ancestor::tei:note)]', namespaces=ns))
        text = text.strip()
        if text:
            text_parts.append(text)

    full_text = ' '.join(text_parts)

    # Write to plain text file
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(full_text)

    print(f"Extracted: {letter_id} -> {output_file}")

def process_directory(input_dir, output_dir):
    """Convert every ``*.xml`` file directly inside a directory to plain text.

    Args:
        input_dir: Directory searched (non-recursively) for ``*.xml`` files.
        output_dir: Directory receiving one ``.txt`` file per input file;
            created, including missing parents, if it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    # glob returns files in arbitrary order; sort for reproducible runs.
    for tei_file in sorted(glob.glob(os.path.join(input_dir, '*.xml'))):
        # Swap only the final extension. str.replace would also rewrite any
        # other '.xml' occurring inside the file name.
        stem = os.path.splitext(os.path.basename(tei_file))[0]
        output_file = os.path.join(output_dir, stem + '.txt')
        extract_letter_text(tei_file, output_file)

if __name__ == '__main__':
    # Script entry point. Edit both locations to match the local checkout
    # before running.
    output_dir = '../data/bullinger_plaintext'  # where the .txt files go
    input_dir = '/Master-Thesis/bullinger-letters/data/letters'  # TEI letters

    process_directory(input_dir=input_dir, output_dir=output_dir)

Extracted: file5848 -> ../data/bullinger_plaintext/5848.txt
Extracted: file5690 -> ../data/bullinger_plaintext/5690.txt
Extracted: file162 -> ../data/bullinger_plaintext/162.txt
Extracted: file6399 -> ../data/bullinger_plaintext/6399.txt
Extracted: file7087 -> ../data/bullinger_plaintext/7087.txt
Extracted: file9914 -> ../data/bullinger_plaintext/9914.txt
Extracted: file1390 -> ../data/bullinger_plaintext/1390.txt
Extracted: file604 -> ../data/bullinger_plaintext/604.txt
Extracted: file7939 -> ../data/bullinger_plaintext/7939.txt
Extracted: file2699 -> ../data/bullinger_plaintext/2699.txt
Extracted: file2841 -> ../data/bullinger_plaintext/2841.txt
Extracted: file3587 -> ../data/bullinger_plaintext/3587.txt
Extracted: file2855 -> ../data/bullinger_plaintext/2855.txt
Extracted: file3593 -> ../data/bullinger_plaintext/3593.txt
Extracted: file9900 -> ../data/bullinger_plaintext/9900.txt
Extracted: file88 -> ../data/bullinger_plaintext/88.txt
Extracted: file610 -> ../data/bullinger_plaintex