In [39]:
import os
import time
import requests
import xml.etree.ElementTree as ET
import pandas as pd

In [40]:
# CONFIGURATION
# Semicolon-separated author table; main() reads its `cc_id` and `Name` columns.
CSV_PATH = "../data/church-fathers/church-fathers-gnd-cc.csv"
# Root output directory; process_author() creates one sub-directory per author ID.
OUTPUT_BASE = "../data/cc-tei"
# Navigation endpoint: callers append "<author>[/<work>[/<text>]]&group_by=".
BASE_API = "https://mlat.uzh.ch/php_modules/navigate.php?load="
# Host and public data root used to turn server-side file paths into URLs.
BASE_HOST = "https://mlat.uzh.ch"
BASE_DATA = "https://mlat.uzh.ch/data"
# Prefix -> namespace URI map passed to every ElementTree find/findall/findtext call.
NS = {"cc": "http://mlat.uzh.ch/2.0"}

# HTTP SESSION
# One shared Session so every request carries the same User-Agent header.
# NOTE(review): the contact URL is example.org — presumably a placeholder; confirm
# and replace it with a real contact address before crawling.
session = requests.Session()
session.headers.update({"User-Agent": "tei-downloader/1.1 (+https://example.org)"})


def fetch_xml_root(url: str, max_retries=3, backoff=1.0):
    """GET ``url`` through the shared session and return the parsed XML root.

    Parameters
    ----------
    url : str
        Address to request.
    max_retries : int, default 3
        Total number of attempts; must be at least 1.
    backoff : float, default 1.0
        Base delay in seconds. After failed attempt ``k`` the function sleeps
        ``backoff * k`` seconds before trying again.

    Returns
    -------
    xml.etree.ElementTree.Element
        Root element of the response body.

    Raises
    ------
    ValueError
        If ``max_retries`` is smaller than 1 (no attempt could be made).
    OSError, xml.etree.ElementTree.ParseError
        The error of the final attempt, re-raised unchanged once all attempts
        are used up. Any other exception type is treated as a programming
        error and propagates immediately without a retry.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries!r}")

    for attempt in range(1, max_retries + 1):
        try:
            print(f"Fetching: {url}")
            r = session.get(url, timeout=20)
            r.raise_for_status()
            return ET.fromstring(r.content)
        # requests.RequestException derives from IOError (an alias of OSError),
        # so this covers connection errors, timeouts and HTTP error statuses;
        # ParseError covers a truncated or otherwise malformed XML body.
        except (OSError, ET.ParseError) as e:
            print(f"Warning: fetch failed ({e}), attempt {attempt}/{max_retries}")
            if attempt == max_retries:
                raise
            # Linear backoff: wait longer after each consecutive failure.
            time.sleep(backoff * attempt)

def normalize_xml_path_to_url(xml_path_text: str) -> "str | None":
    """Convert a server-side XML path to its public URL.

    Rules, applied in order to the whitespace-stripped input:

    1. If the path contains ``/data/``, everything from the first ``/data/``
       onwards is appended to ``BASE_HOST``.
    2. If it starts with ``/var/www/html``, that prefix is dropped and the
       remainder (given a leading ``/`` if it lacks one) is appended to
       ``BASE_HOST``.
    3. If it starts with ``/data``, the whole path is appended to ``BASE_HOST``.
    4. If it starts with ``data/``, the part after ``data`` is appended to
       ``BASE_DATA``.
    5. Anything else is treated as relative to ``BASE_DATA``.

    Parameters
    ----------
    xml_path_text : str
        Path as reported by the server; ``None`` is tolerated.

    Returns
    -------
    str or None
        The public URL, or ``None`` when the input is ``None``, empty or
        consists only of whitespace.
    """
    s = (xml_path_text or "").strip()
    if not s:
        # Nothing usable: without this guard a blank path would turn into the
        # bare data root URL and be "downloaded" as if it were a file.
        return None

    data_at = s.find("/data/")
    if data_at != -1:
        return BASE_HOST + s[data_at:]

    web_root = "/var/www/html"
    if s.startswith(web_root):
        # No "/data/" inside (handled above), so just strip the web root.
        remainder = s[len(web_root):]
        return BASE_HOST + (remainder if remainder.startswith("/") else "/" + remainder)

    if s.startswith("/data"):
        return BASE_HOST + s
    if s.startswith("data/"):
        return BASE_DATA + s[len("data"):]
    return BASE_DATA + "/" + s.lstrip("/")


def get_works_for_author(author_id: str):
    """List the works the navigation API reports for one author.

    Every ``cc:work`` element in the response that has a non-empty
    ``cc:idno`` becomes one ``{"id": ..., "name": ...}`` dict with both
    values stripped of surrounding whitespace; a missing name becomes ``""``.
    Works without an id are skipped.
    """
    listing = fetch_xml_root(f"{BASE_API}{author_id}&group_by=")

    collected = []
    for node in listing.iterfind(".//cc:work", NS):
        raw_id = node.findtext("cc:idno", default="", namespaces=NS)
        if not raw_id:
            continue
        raw_name = node.findtext("cc:name", default="", namespaces=NS)
        collected.append({"id": raw_id.strip(), "name": (raw_name or "").strip()})
    return collected


def get_texts_for_work(author_id: str, work_id: str):
    """List the texts the navigation API reports for one work of an author.

    Only ``cc:text`` elements directly below a ``cc:contents`` element are
    considered. Each one with a non-empty ``cc:idno`` becomes one
    ``{"id": ..., "name": ...}`` dict with both values stripped; a missing
    name becomes ``""``. Texts without an id are skipped.
    """
    listing = fetch_xml_root(f"{BASE_API}{author_id}/{work_id}&group_by=")

    collected = []
    for node in listing.iterfind(".//cc:contents/cc:text", NS):
        raw_id = node.findtext("cc:idno", default="", namespaces=NS)
        if not raw_id:
            continue
        raw_name = node.findtext("cc:name", default="", namespaces=NS)
        collected.append({"id": raw_id.strip(), "name": (raw_name or "").strip()})
    return collected


def download_text_xml(author_dir: str, author_id: str, work_id: str, text_id: str):
    """Download the TEI file behind one text (or one work) into ``author_dir``.

    The navigation API is queried for ``author/work/text``, or for
    ``author/work`` when ``text_id`` is empty. The response's
    ``cc:xml_file_path`` is converted to a public URL and the file is saved
    under its URL basename inside ``author_dir``.

    Returns ``True`` when a file was written. Returns ``False`` when the
    response has no file path, when ``cc:xml_file_downloadable`` is present
    with a value other than "true", when the path cannot be normalised, or
    when the download or the write fails. Errors from the navigation request
    itself are not caught here.
    """
    load_target = f"{author_id}/{work_id}/{text_id}" if text_id else f"{author_id}/{work_id}"
    root = fetch_xml_root(f"{BASE_API}{load_target}&group_by=")

    path_node = root.find(".//cc:xml_file_path", NS)
    flag_node = root.find(".//cc:xml_file_downloadable", NS)

    # A missing or empty flag counts as "downloadable"; only an explicit
    # value other than "true" (case-insensitive) blocks the download.
    flag_text = flag_node.text if flag_node is not None else None
    blocked = bool(flag_text) and flag_text.strip().lower() != "true"
    has_path = path_node is not None and bool(path_node.text)

    if blocked or not has_path:
        print(f"No downloadable TEI at {author_id}/{work_id}/{text_id}")
        return False

    xml_path_text = path_node.text.strip()
    file_url = normalize_xml_path_to_url(xml_path_text)
    if not file_url:
        print(f"Could not normalize xml path: {xml_path_text}")
        return False

    out_path = os.path.join(author_dir, os.path.basename(file_url))

    print(f"Downloading TEI: {file_url}")
    try:
        response = session.get(file_url, timeout=30)
        response.raise_for_status()
        with open(out_path, "wb") as sink:
            sink.write(response.content)
        print(f"Saved: {out_path}")
        return True
    except Exception as e:
        # Best effort: one failed file must not abort the surrounding crawl.
        print(f"Download failed: {e}")
        return False


def process_author(author_id: str, author_name: str, request_delay: float = 0.15):
    """Download every available TEI file of one author.

    Files are stored in ``OUTPUT_BASE/<author_id>`` (created if missing). For
    each work of the author, every listed text is downloaded; a work without
    listed texts is tried once at work level by passing an empty text id.

    Parameters
    ----------
    author_id : str
        Corpus Corporum author ID; also used as the output directory name.
    author_name : str
        Human-readable name, used only in progress messages.
    request_delay : float, default 0.15
        Pause in seconds after each download attempt, to keep the request
        rate against the server low.

    Returns
    -------
    int
        Number of files saved for this author.

    Notes
    -----
    Errors raised while listing works or texts are not caught here; they
    propagate to the caller and stop the run.
    """
    author_dir = os.path.join(OUTPUT_BASE, author_id)
    os.makedirs(author_dir, exist_ok=True)

    print(f"Downloading TEI XML for {author_name} (ID {author_id})")
    works = get_works_for_author(author_id)
    print(f"Found {len(works)} works for {author_name}.")

    total_downloaded = 0
    for w in works:
        wid = w["id"]
        wname = w.get("name", "")
        print(f"\n→ Work: {wname} (ID {wid})")
        texts = get_texts_for_work(author_id, wid)
        print(f"Found {len(texts)} text(s).")

        if not texts:
            # No texts listed: the work itself may still carry a TEI file.
            if download_text_xml(author_dir, author_id, wid, ""):
                total_downloaded += 1
            # Throttle this path as well so text-less works are not hit
            # back-to-back without any pause.
            time.sleep(request_delay)
            continue

        for t in texts:
            tid = t["id"]
            tname = t.get("name", "")
            print(f"  - Text: {tname} (ID {tid})")
            if download_text_xml(author_dir, author_id, wid, tid):
                total_downloaded += 1
            time.sleep(request_delay)
    print(f"Done for {author_name}: {total_downloaded} files saved in {author_dir}")
    return total_downloaded


def main():
    """Run the crawl for every author in the CSV that has a Corpus Corporum ID.

    Reads ``CSV_PATH`` (semicolon-separated, UTF-8), keeps the rows whose
    ``cc_id`` is not missing, and calls ``process_author`` for each of them
    in file order, pausing one second between authors.
    """
    authors = pd.read_csv(CSV_PATH, sep=";", encoding="utf-8")
    linked = authors.loc[authors["cc_id"].notna()]
    print(f"Found {len(linked)} authors with Corpus Corporum IDs.\n")

    for record in linked.to_dict("records"):
        # The column is parsed as float when some IDs are missing,
        # so 876.0 has to become '876'.
        process_author(str(int(record["cc_id"])), record["Name"])
        time.sleep(1)


# Entry point. `__name__` is "__main__" when this is run as a script and also
# when the cell is executed in a notebook kernel, so running it starts the crawl.
if __name__ == "__main__":
    main()


Found 28 authors with Corpus Corporum IDs.

Downloading TEI XML for Quintus Septimius Florens Tertullian (ID 867)
Fetching: https://mlat.uzh.ch/php_modules/navigate.php?load=867&group_by=
Found 48 works for Quintus Septimius Florens Tertullian.

→ Work: Ad Scapulam (ID 136)
Fetching: https://mlat.uzh.ch/php_modules/navigate.php?load=867/136&group_by=
Found 2 text(s).
  - Text: Responsa ex libris Digestorum (Auctor incertus), J. P. Migne (ID 37)
Fetching: https://mlat.uzh.ch/php_modules/navigate.php?load=867/136/37&group_by=
Downloading TEI: https://mlat.uzh.ch/data/Corpus2_PL/001_Auctor-incertus_Responsa-ex-libris-Digestorum.xml
Saved: ../data/cc-tei/867/001_Auctor-incertus_Responsa-ex-libris-Digestorum.xml
  - Text: Ad Scapulam (Tertullianus), J. P. Migne (ID 39)
Fetching: https://mlat.uzh.ch/php_modules/navigate.php?load=867/136/39&group_by=
Downloading TEI: https://mlat.uzh.ch/data/Corpus2_PL/001_Tertullianus_Ad-Scapulam.xml
Saved: ../data/cc-tei/867/001_Tertullianus_Ad-Scapulam.xml