In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
from pathlib import Path
import shutil
from typing import Set, List

In [None]:
csv_file = "../data/church-fathers/church-fathers-gnd-cc.csv"          
input_dir = "/Master-Thesis/bullinger-letters/data/letters/"               
output_dir = "../data/bullinger-filtered-letters/"     

Path(output_dir).mkdir(parents=True, exist_ok=True)

In [3]:
# Read CSV file
df = pd.read_csv(csv_file, sep=';', names=['ID', 'Name', 'gnd_id', 'extra'], header=0)

# Extract church father IDs and convert to lowercase 'p' format
church_father_ids = set(df['ID'].str.lower())

print(f"Loaded {len(church_father_ids)} church fathers:")
for person_id in sorted(church_father_ids):
    name = df[df['ID'].str.lower() == person_id]['Name'].values[0]
    print(f"  {person_id}: {name}")

Loaded 44 church fathers:
  p16781: Gaius Marius Victorinus
  p17746: Quintus Septimius Florens Tertullian
  p17752: Clemens von Alexandrien
  p17753: Origenes
  p17828: Benedikt von Nursia
  p17867: Cyrill von Alexandrien
  p17892: Epiphanius von Salamis
  p17899: Eusebius von Caesarea
  p17933: Gennadius von Marseille
  p17965: Gregor von Nazianz
  p18048: Hilarius von Poitiers
  p18056: Irenäus von Lyon
  p18057: Isidor von Sevilla
  p18058: Iulius Firmicus Maternus
  p18343: Sulpicius Severus
  p18346: Theodoret von Kyrrhos
  p18357: Thomas von Aquin
  p18454: Didymus der Blinde
  p18489: Papst Gregor I. (der Große)
  p18512: Hippolyt von Rom
  p18513: Hippolytos von Rom
  p18700: Augustinus von Hippo
  p18881: Ignatius von Antiochien
  p18887: Basilius der Grosse (Caesariensis)
  p18889: Rufinus von Aquileia
  p18895: Athanasius von Alexandrien
  p18974: Arnobius d.Ä.
  p18986: Sophronius Eusebius Hieronymus
  p18988: Cyprian
  p18989: Anicius Manlius Severinus Boethius
  p18993: 

In [4]:
def contains_church_father(xml_file: Path, church_father_ids: Set[str]) -> tuple[bool, List[str]]:
    """
    Check if a TEI XML file contains references to any church fathers.
    
    Args:
        xml_file: Path to the XML file
        church_father_ids: Set of church father IDs to look for (lowercase)
    
    Returns:
        Tuple of (found: bool, matching_ids: List[str])
    """
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        
        # TEI namespace
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        
        # Find all persName elements with ref attributes
        person_refs = root.findall('.//tei:persName[@ref]', ns)
        
        found_ids = []
        
        for person in person_refs:
            ref = person.get('ref')
            if ref and ref.lower() in church_father_ids:
                found_ids.append(ref.lower())
        
        return len(found_ids) > 0, list(set(found_ids))  # Remove duplicates
        
    except Exception as e:
        print(f"Error processing {xml_file}: {e}")
        return False, []

In [5]:
# Get all XML files
xml_files = list(Path(input_dir).glob('*.xml'))

print(f"Found {len(xml_files)} XML files to process\n")

# Process each file
matched_files = []
church_father_mentions = {cf_id: [] for cf_id in church_father_ids}

for xml_file in xml_files:
    contains, found_ids = contains_church_father(xml_file, church_father_ids)
    
    if contains:
        # Copy file to output directory
        shutil.copy2(xml_file, Path(output_dir) / xml_file.name)
        matched_files.append(xml_file.name)
        
        # Track which church fathers appear in which files
        for cf_id in found_ids:
            church_father_mentions[cf_id].append(xml_file.name)
        
        print(f"✓ {xml_file.name} - Contains: {', '.join(found_ids)}")


print(f"SUMMARY")
print(f"Total files processed: {len(xml_files)}")
print(f"Files with church fathers: {len(matched_files)}")
print(f"Files copied to: {output_dir}")

Found 13114 XML files to process

✓ 9914.xml - Contains: p18346
✓ 10484.xml - Contains: p18895, p18700
✓ 10309.xml - Contains: p4541
✓ 11981.xml - Contains: p17828, p19014
✓ 4217.xml - Contains: p18700
✓ 11995.xml - Contains: p18986
✓ 11567.xml - Contains: p18357
✓ 12046.xml - Contains: p18700
✓ 10137.xml - Contains: p4541, p18700, p17746, p18993
✓ 11229.xml - Contains: p4541, p18700, p18993, p18986
✓ 12697.xml - Contains: p18700
✓ 2262.xml - Contains: p19001
✓ 10043.xml - Contains: p19001, p18700, p18993
✓ 9848.xml - Contains: p18346
✓ 12330.xml - Contains: p18513, p18988
✓ 8564.xml - Contains: p18489
✓ 10042.xml - Contains: p18881, p17746
✓ 11374.xml - Contains: p4541
✓ 12090.xml - Contains: p18700, p18986
✓ 5493.xml - Contains: p18700
✓ 4202.xml - Contains: p18895
✓ 11943.xml - Contains: p4541
✓ 12292.xml - Contains: p18700
✓ 1391.xml - Contains: p17746
✓ 9917.xml - Contains: p18346
✓ 5863.xml - Contains: p18895
✓ 8412.xml - Contains: p16781
✓ 10478.xml - Contains: p18700
✓ 10915.xm

In [6]:
print(f"CHURCH FATHER MENTIONS")

for cf_id in sorted(church_father_ids):
    files = church_father_mentions[cf_id]
    name = df[df['ID'].str.lower() == cf_id]['Name'].values[0]
    
    if files:
        print(f"{cf_id} ({name}): {len(files)} mention(s)")
        for f in files:
            print(f"  - {f}")
    else:
        print(f"{cf_id} ({name}): 0 mentions")

CHURCH FATHER MENTIONS
p16781 (Gaius Marius Victorinus): 3 mention(s)
  - 8412.xml
  - 11471.xml
  - 7030.xml
p17746 (Quintus Septimius Florens Tertullian): 56 mention(s)
  - 10137.xml
  - 10042.xml
  - 1391.xml
  - 10040.xml
  - 12326.xml
  - 10041.xml
  - 1221.xml
  - 8210.xml
  - 10044.xml
  - 10468.xml
  - 10454.xml
  - 11990.xml
  - 12080.xml
  - 10250.xml
  - 11826.xml
  - 11749.xml
  - 10140.xml
  - 11699.xml
  - 5196.xml
  - 10036.xml
  - 10022.xml
  - 5756.xml
  - 10033.xml
  - 10027.xml
  - 11729.xml
  - 5769.xml
  - 10015.xml
  - 10217.xml
  - 10202.xml
  - 11888.xml
  - 10201.xml
  - 10598.xml
  - 1878.xml
  - 2801.xml
  - 11912.xml
  - 2024.xml
  - 10172.xml
  - 4720.xml
  - 521.xml
  - 11326.xml
  - 10038.xml
  - 12389.xml
  - 10159.xml
  - 11591.xml
  - 4342.xml
  - 8419.xml
  - 5473.xml
  - 12448.xml
  - 592.xml
  - 10501.xml
  - 11999.xml
  - 1820.xml
  - 10067.xml
  - 10099.xml
  - 10112.xml
  - 6828.xml
p17752 (Clemens von Alexandrien): 1 mention(s)
  - 10050.xml
p17