In [56]:
import xml.etree.ElementTree as ET
import pandas as pd
from pathlib import Path
from io import StringIO

In [57]:
# Read the hand-curated list of Church Fathers (semicolon-separated, UTF-8)
church_fathers_df = pd.read_csv(
    "../data/church-fathers/manual-church-fathers.csv",
    sep=';',
    encoding='utf-8',
)

# Quick sanity check: first rows and total number of curated entries
print(church_fathers_df.head())
print(f"Number of Church Fathers in CSV: {len(church_fathers_df.index)}")

# Location of persons.xml, which is parsed in the following cell
persons_xml_path = Path('../data/church-fathers/persons.xml')

       ID                  Surname     Forename
0  P18881  Ignatius von Antiochien          NaN
1  P19930                 Smyrnäus   Polycarpus
2  P19894              Atheniensis  Athenagoras
3  P21365              Antiochenus   Theophilus
4  P18056                 von Lyon      Irenäus
Number of Church Fathers in CSV: 46


In [58]:
# Prefix -> URI mapping used by every 'tei:...' path expression below
ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

# Load persons.xml and keep both the tree and its root element
tree = ET.parse(persons_xml_path)
root = tree.getroot()

# Report how many <person> elements exist anywhere below the root
person_elements = root.findall('.//tei:person', ns)
print(f"Number of person elements: {len(person_elements)}")

Number of person elements: 6043


In [59]:
# Extract GND IDs and Wikipedia links for all persons in the XML

# Fully qualified ({namespace-uri}local-name) form of the xml:id attribute
XML_ID_ATTR = '{http://www.w3.org/XML/1998/namespace}id'

# Column order of the resulting frame; passing it explicitly keeps the columns
# present even when no person qualifies (otherwise the prints below would
# raise KeyError on an empty, column-less DataFrame).
EXTRACTED_COLUMNS = ['ID', 'name', 'gnd_id', 'gnd_url', 'wikipedia_url']


def _stripped_text(elem):
    """Return ``elem.text`` without surrounding whitespace, or None.

    None is returned when ``elem`` is None, has no text, or its text is
    whitespace only, so that "no value" is never represented by an empty
    string (which ``notna()`` would count as a value).
    """
    if elem is None or not elem.text:
        return None
    return elem.text.strip() or None


def _extract_person(person, ns):
    """Build the identifier record for one ``<person>`` element.

    Parameters
    ----------
    person : xml.etree.ElementTree.Element
        A ``tei:person`` element.
    ns : dict
        Prefix-to-URI mapping that resolves the ``tei`` prefix.

    Returns
    -------
    dict or None
        A dict with the keys listed in ``EXTRACTED_COLUMNS``, or None when the
        person has no xml:id or has neither a GND nor a Wikipedia identifier.
    """
    person_id = person.get(XML_ID_ATTR)
    if not person_id:
        return None

    gnd_url = _stripped_text(person.find('.//tei:idno[@subtype="gnd"]', ns))
    # The identifier is the last path segment of the URL; rstrip('/') keeps a
    # trailing slash from turning that segment into an empty string.
    gnd_id = (gnd_url.rstrip('/').split('/')[-1] or None) if gnd_url else None

    wikipedia_url = _stripped_text(person.find('.//tei:idno[@subtype="wiki"]', ns))

    # Only keep persons that have at least GND or Wikipedia
    if not (gnd_id or wikipedia_url):
        return None

    # Name = "<forename> <surname>" from the first persName, skipping parts
    # that are missing or blank; empty string when there is no persName.
    pers_name = person.find('.//tei:persName', ns)
    if pers_name is None:
        full_name = ''
    else:
        name_parts = (
            _stripped_text(pers_name.find(tag, ns))
            for tag in ('tei:forename', 'tei:surname')
        )
        full_name = ' '.join(part for part in name_parts if part)

    return {
        'ID': person_id,
        'name': full_name,
        'gnd_id': gnd_id,
        'gnd_url': gnd_url,
        'wikipedia_url': wikipedia_url,
    }


extracted_persons = [
    record
    for record in (
        _extract_person(person, ns) for person in root.findall('.//tei:person', ns)
    )
    if record is not None
]

extracted_df = pd.DataFrame(extracted_persons, columns=EXTRACTED_COLUMNS)

print(f"Extracted {len(extracted_df)} persons with GND IDs and/or Wikipedia links from XML")
print(f"Persons with GND IDs: {extracted_df['gnd_id'].notna().sum()}")
print(f"Persons with Wikipedia links: {extracted_df['wikipedia_url'].notna().sum()}")
print(f"\nSample of extracted data:")
print(extracted_df.head(10))

Extracted 2620 persons with GND IDs and/or Wikipedia links from XML
Persons with GND IDs: 2405
Persons with Wikipedia links: 1746

Sample of extracted data:
    ID                         name      gnd_id  \
0   P1                 Johannes Aal   118500015   
1   P8              Joachim Aberlin   119600153   
2   P9                  Thomas Abel   119600110   
3  P12              Heinrich Aberli        None   
4  P13               Peter Abaelard   11850004X   
5  P18            Zanobi Acciaiuoli   119600404   
6  P19              Israel Achatius   119600498   
7  P41                 Michael Adam   119600935   
8  P43  Barbara Welser (geb. Adler)  1019757337   
9  P66        Johannes Adlischwyler  1163345547   

                            gnd_url  \
0   https://d-nb.info/gnd/118500015   
1   https://d-nb.info/gnd/119600153   
2   https://d-nb.info/gnd/119600110   
3                              None   
4   https://d-nb.info/gnd/11850004X   
5   https://d-nb.info/gnd/119600404   
6   http

In [60]:
# Left-join the extracted identifiers onto the curated Church Fathers list:
# every curated row is kept, with missing values where no record with the
# same ID exists in extracted_df.
merged_df = pd.merge(church_fathers_df, extracted_df, how='left', on='ID')

# Row masks: True where the join supplied a GND ID / a Wikipedia link
has_gnd = merged_df['gnd_id'].notna()
has_wiki = merged_df['wikipedia_url'].notna()

print(f"Merged results:")
print(f"Total Church Fathers list: {len(merged_df)}")
print(f"Church Fathers with GND IDs found: {has_gnd.sum()}")
print(f"Church Fathers with Wikipedia links found: {has_wiki.sum()}")
print(f"Church Fathers missing both GND and Wikipedia: {(~has_gnd & ~has_wiki).sum()}")

# Full listing of the identifier columns for visual inspection
report_columns = ['ID', 'Surname', 'Forename', 'gnd_id', 'wikipedia_url']
print(merged_df[report_columns].to_string(index=False))

# Persist the merged table in the same CSV dialect as the input (';', UTF-8)
merged_df.to_csv(
    "../data/church-fathers/church-fathers-gnd.csv",
    sep=';',
    encoding='utf-8',
    index=False,
    header=True,
)

Merged results:
Total Church Fathers list: 46
Church Fathers with GND IDs found: 38
Church Fathers with Wikipedia links found: 26
Church Fathers missing both GND and Wikipedia: 8
    ID                 Surname                                 Forename     gnd_id                                                wikipedia_url
P18881 Ignatius von Antiochien                                      NaN        NaN                                                          NaN
P19930                Smyrnäus                               Polycarpus  11859558X            https://de.wikipedia.org/wiki/Polykarp_von_Smyrna
P19894             Atheniensis                              Athenagoras  118646141          https://de.wikipedia.org/wiki/Athenagoras_von_Athen
P21365             Antiochenus                               Theophilus  118756923                                                         None
P18056                von Lyon                                  Irenäus  118555766          https://d

In [61]:
# Read the manually edited Church Fathers CSV (with GND and Wikipedia links);
# it is only used for statistics.
edited_church_fathers_df = pd.read_csv(
    "../data/church-fathers/manual-church-fathers-gnd.csv",
    sep=';',
    encoding='utf-8',
)
print(f"Number of Church Fathers in CSV: {len(edited_church_fathers_df.index)}")

Number of Church Fathers in CSV: 44
