In [1]:
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import re
from typing import List, Dict, Tuple

In [10]:
# Path to your filtered letters directory (from previous notebook)
letters_dir = "../data/bullinger-filtered-letters"

# Path to church fathers CSV
church_fathers_csv = "../data/church-fathers/church-fathers-gnd-cc.csv"

# Output file for annotation preparation
output_csv = "../data/citations/patristic_citations_for_annotation.csv"

In [20]:
# Load church fathers mapping
cf_df = pd.read_csv(church_fathers_csv, sep=';', names=['ID', 'Name', 'gnd_id', 'extra'], header=0)
cf_dict = dict(zip(cf_df['ID'].str.lower(), cf_df['Name']))

print(f"Loaded {len(cf_dict)} church fathers")

Loaded 44 church fathers


In [None]:
def extract_patristic_references(xml_file: Path, church_father_ids: set) -> List[Dict]:
    """
    Extract patristic references from a TEI XML file.
    
    Returns list of dictionaries with:
    - file_name
    - file_id (from xml:id)
    - sentence_text
    - church_father_id
    - church_father_name
    - footnote_text (with citation)
    - citation_parsed (extracted citation)
    """
    references = []
    
    try:
        tree = ET.parse(xml_file)
        root = tree.getroot()
        ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
        
        # Get file ID
        file_id = root.get('{http://www.w3.org/XML/1998/namespace}id', 'unknown')
        
        # Find all sentences in the body
        sentences = root.findall('.//tei:body//tei:s', ns)
        
        for sentence in sentences:
            # Get sentence text
            sentence_text = ''.join(sentence.itertext())
            
            # Find persName tags in this sentence
            person_refs = sentence.findall('.//tei:persName[@ref]', ns)
            
            for person in person_refs:
                person_id = person.get('ref', '').lower()
                
                # Check if it's a church father
                if person_id in church_father_ids:
                    person_name = cf_dict.get(person_id, 'Unknown')
                    
                    # Look for footnotes in this sentence
                    footnotes = sentence.findall('.//tei:note[@type="footnote"]', ns)
                    
                    footnote_text = ""
                    citation_parsed = ""
                    
                    if footnotes:
                        for fn in footnotes:
                            fn_text = ''.join(fn.itertext())
                            
                            # Check if footnote contains patristic citation
                            # Look for patterns like "Vgl.", "Cyprian", "Epist.", etc.
                            if any(keyword in fn_text for keyword in ['Vgl.', 'Siehe', 'Cf.', 'CChr', 'CSEL', 'PG', 'PL']):
                                footnote_text = fn_text
                                citation_parsed = parse_citation(fn_text)
                    
                    references.append({
                        'file_name': xml_file.name,
                        'file_id': file_id,
                        'sentence_text': sentence_text.strip(),
                        'church_father_id': person_id,
                        'church_father_name': person_name,
                        'footnote_text': footnote_text,
                        'citation_parsed': citation_parsed,
                        'detection_source': 'TEI',
                        'reference_type': '',  # To be filled for annotation
                        'patristic_work': '',  # To be filled in preparation for annotation
                        'patristic_text': '',  # To be filled in preparation for annotation
                        'confidence': '',  # To be filled for annotation
                        'notes': ''  # For notes
                    })
        
    except Exception as e:
        print(f"Error processing {xml_file}: {e}")
    
    return references


def parse_citation(footnote_text: str) -> str:
    """
    Extract structured citation from footnote text.
    
    Examples:
    - "Vgl. bes. Cyprian, Epist. 63, 10, 2-11, 1" → "Cyprian, Epist. 63, 10, 2-11, 1"
    - "Origenes author est..." → extract citation pattern
    """
    # Remove common prefixes
    citation = footnote_text
    for prefix in ['Vgl. bes.', 'Vgl.', 'Siehe', 'Cf.', 'S.']:
        citation = citation.replace(prefix, '').strip()
    
    # Try to extract structured citation patterns
    # Pattern: Author, Work, numbers
    citation_pattern = r'([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?),\s*([^,]+(?:,\s*\d+[^,]*)+)'
    match = re.search(citation_pattern, citation)
    
    if match:
        return match.group(0)
    
    # Return first 200 chars if no pattern match
    return citation[:200]


In [18]:
# Get all XML files
xml_files = list(Path(letters_dir).glob('*.xml'))

print(f"Processing {len(xml_files)} XML files...\n")

# Extract all references
all_references = []
church_father_ids = set(cf_dict.keys())

for xml_file in xml_files:
    refs = extract_patristic_references(xml_file, church_father_ids)
    all_references.extend(refs)
    
    if refs:
        print(f"{xml_file.name}: Found {len(refs)} reference(s)")

print(f"\nTotal references extracted: {len(all_references)}")

Processing 322 XML files...

9914.xml: Found 1 reference(s)
10484.xml: Found 2 reference(s)
10309.xml: Found 2 reference(s)
11981.xml: Found 2 reference(s)
4217.xml: Found 1 reference(s)
11995.xml: Found 1 reference(s)
11567.xml: Found 4 reference(s)
12046.xml: Found 2 reference(s)
10137.xml: Found 4 reference(s)
11229.xml: Found 4 reference(s)
12697.xml: Found 1 reference(s)
2262.xml: Found 1 reference(s)
10043.xml: Found 7 reference(s)
9848.xml: Found 1 reference(s)
12330.xml: Found 2 reference(s)
8564.xml: Found 1 reference(s)
10042.xml: Found 3 reference(s)
11374.xml: Found 1 reference(s)
5493.xml: Found 1 reference(s)
4202.xml: Found 1 reference(s)
11943.xml: Found 1 reference(s)
1391.xml: Found 1 reference(s)
9917.xml: Found 1 reference(s)
5863.xml: Found 1 reference(s)
8412.xml: Found 2 reference(s)
10478.xml: Found 3 reference(s)
10915.xml: Found 1 reference(s)
7325.xml: Found 2 reference(s)
10040.xml: Found 16 reference(s)
12125.xml: Found 1 reference(s)
3396.xml: Found 2 refe

In [11]:
# Create DataFrame
df = pd.DataFrame(all_references)

# Display first few rows
print("Preview of extracted references:")
display(df.head())

# Save to CSV
df.to_csv(output_csv, index=False, encoding='utf-8')
print(f"Saved to: {output_csv}")

Preview of extracted references:


Unnamed: 0,file_name,file_id,sentence_text,church_father_id,church_father_name,footnote_text,citation_parsed,detection_source,reference_type,patristic_work,patristic_text,confidence,notes
0,9914.xml,file9914,Theodoreti exempla quottidie expecto sicut et ...,p18346,Theodoret von Kyrrhos,,,TEI,,,,,
1,10484.xml,file10484,Recte enim dictum est deum comprehensibilem no...,p18700,Augustinus von Hippo,"Vgl. Augustin, Sermo CVII, 3, 5 (MPL XXXVIII 6...","Augustin, Sermo CVII, 3, 5 (MPL XXXVIII 663).",TEI,,,,,
2,10484.xml,file10484,Dei enim proprium est incomprehensibilemVgl. A...,p18895,Athanasius von Alexandrien,"Vgl. Athanasius von Alexandrien, Oratio contra...","Alexandrien, Oratio contra gentes, 35 (MPG XXV...",TEI,,,,,
3,10309.xml,file10309,Chrysostomus interpretans hunc locum ait: «Eti...,p4541,Johannes Chrysostomus,"Johannes Chrysostomus, Commentarius in Epistol...","Romanos, Homilia 23, 1 (MPG LX 615).",TEI,,,,,
4,10309.xml,file10309,Chrysostomus interpretans hunc locum ait: «Eti...,p4541,Johannes Chrysostomus,"Johannes Chrysostomus, Commentarius in Epistol...","Romanos, Homilia 23, 1 (MPG LX 615).",TEI,,,,,


Saved to: ../data/citations/patristic_citations_for_annotation.csv


## Statistics

In [13]:
print(f"Total references found: {len(df)}")
print(f"References with citations: {df['citation_parsed'].astype(bool).sum()}")
print(f"References with footnotes: {df['footnote_text'].astype(bool).sum()}")

print("\nReferences by Church Father:")
print(df['church_father_name'].value_counts())

print("\nReferences by File:")
print(df['file_name'].value_counts())

Total references found: 1222
References with citations: 610
References with footnotes: 610

References by Church Father:
church_father_name
Augustinus von Hippo                               388
Quintus Septimius Florens Tertullian               127
Sophronius Eusebius Hieronymus                     106
Ambrosius von Mailand                               59
Cyprian                                             50
Cyrill von Alexandrien                              49
Athanasius von Alexandrien                          47
Lucius Caecilius Firmianus Lactantius (Laktanz)     46
Johannes Chrysostomus                               43
Eusebius von Caesarea                               39
Theodoret von Kyrrhos                               35
Thomas von Aquin                                    27
Hilarius von Poitiers                               27
Origenes                                            22
Basilius der Grosse (Caesariensis)                  15
Papst Gregor I. (der Große)        

## Generate Corpus Corporum Lookup List

In [17]:
citations_to_lookup = df[df['citation_parsed'] != ''].copy()
citations_to_lookup = citations_to_lookup[['church_father_id', 'church_father_name', 'citation_parsed']].drop_duplicates()

print("Citations to look up in Corpus Corporum:\n")
for idx, row in citations_to_lookup.iterrows():
    print(f"- {row['church_father_id'], row['church_father_name']}: {row['citation_parsed']}")

# Save lookup list
citations_to_lookup.to_csv('../data/citations/corpus_corporum_lookup_list.csv', index=False)
print(f"Saved lookup list to: corpus_corporum_lookup_list.csv")

Citations to look up in Corpus Corporum:

- ('p18700', 'Augustinus von Hippo'): Augustin, Sermo CVII, 3, 5 (MPL XXXVIII 663).
- ('p18895', 'Athanasius von Alexandrien'): Alexandrien, Oratio contra gentes, 35 (MPG XXV 69).
- ('p4541', 'Johannes Chrysostomus'): Romanos, Homilia 23, 1 (MPG LX 615).
- ('p19014', 'Salamanes Hermeias Sozomenos'): Einer der vier Teile des "Corpus iuris civilis", der eine Sammlung von Kaiserkonstitutionen des 2. bis 6. Jh.s n. Chr. in zwölf Büchern darbietet.  HBBW VII [Nr. 944] 56, Anm. 46.
- ('p18700', 'Augustinus von Hippo'): Augustin, Epist. 53, 2 (CChr-L XXXI 222, 30-45).  dazu Arne Hogrefe
- ('p18700', 'Augustinus von Hippo'): unten Nr. 1200, 60f.
- ('p4541', 'Johannes Chrysostomus'): unten Nr. 1200, 60f.
- ('p18986', 'Sophronius Eusebius Hieronymus'): Gedacht ist an Ambrosiasters Auslegung von Phil 2, 9 (CSEL LXXXI/3 141-145); vgl. Frecht an Capito, 31. Oktober 1538 (Capito, Corr. 244, Nr. 694).
- ('p18993', 'Ambrosius von Mailand'): Gedacht ist an Ambr