<a href="https://colab.research.google.com/github/isomjd-code/latin-courthand-correction/blob/main/latin_courthand_llm_correction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ==============================================================================
#               Claude-Based Transkribus Correction Script (Single Document)
# ==============================================================================
#
# DESCRIPTION:
# This script is designed to be run in a Google Colab notebook. It takes a single
# document from a Transkribus collection, processes it line-by-line in chunks,
# and uses an Anthropic Claude model (see ANTHROPIC_MODEL_NAME) to correct the transcription.
#
# It performs a "two-call" correction for each chunk of text:
# 1. It sends the image chunk and HTR text to Claude twice to get two independent
#    transcriptions (Run A and Run B).
# 2. It compares the two Claude versions. If they differ significantly, the line
#    is flagged as "unclear" in the final XML.
# 3. It selects the best transcription for each line by comparing Claude's outputs
#    to the original HTR text and choosing the one with the lowest Character
#    Error Rate (CER).
# 4. It writes the corrected text and metadata back to Transkribus, overwriting
#    the latest version of the page transcription.
#
# HOW TO USE IN GOOGLE COLAB:
# 1. In the left sidebar, click the key icon and create secrets for:
#    - TRANKRIBUS_USER
#    - TRANKRIBUS_PASSWORD
#    - anthropic_key
# 2. Paste this entire script into a single cell in a new Colab notebook.
# 3. Fill in the document details in the "USER CONFIGURATION" section below.
# 4. Run the cell.
#
# ==============================================================================

# --- Step 0: Install Dependencies (only runs if in a Colab-like environment) ---
try:
    # Imported only to detect the Colab runtime; ImportError means "not Colab".
    import google.colab  # noqa: F401
    import subprocess
    import sys

    print("Installing required libraries for Google Colab...")
    # Run pip through the current interpreter rather than the IPython-only
    # `!pip ...` shell escape. The escape is a SyntaxError in plain Python,
    # which made the "not in Colab" branch below unreachable outside a notebook.
    # -q keeps the installation output quiet.
    _pip_result = subprocess.run(
        [
            sys.executable, "-m", "pip", "install", "-q",
            "anthropic", "python-dotenv", "beautifulsoup4", "lxml",
            "python-levenshtein", "Pillow", "requests", "numpy",
        ],
        capture_output=True,
        text=True,
        check=False,  # best effort: a failed install is reported, not raised
    )
    if _pip_result.returncode == 0:
        print("Installation complete.")
    else:
        # Surface pip's own diagnostics so a broken install is not silent.
        print(f"Warning: pip exited with status {_pip_result.returncode}; some libraries may be missing.")
        if _pip_result.stderr:
            print(_pip_result.stderr)
except ImportError:
    print("Not running in Google Colab. Assuming libraries are already installed.")

# --- Library Imports ---
import xml.etree.ElementTree as ET
import requests
from PIL import Image, ImageDraw, ImageFont
import anthropic
import os
import io
import sys
import json
from bs4 import BeautifulSoup
import Levenshtein
import datetime
import math
import numpy as np
import base64
import re
import time
from typing import List, Dict, Any, Optional, Tuple
# Import userdata for secrets management in Colab
try:
    from google.colab import userdata
except ImportError:
    userdata = None # Will cause an error later if not in Colab, which is intended.


# ==============================================================================
#                           1. USER CONFIGURATION
# ==============================================================================
# --- Transkribus Credentials (from Colab Secrets) ---
# Ensure you have set these secrets in your Colab environment (left sidebar, key icon)
# Credentials are resolved once, when this cell/module is executed.
# Note: the secret names are spelled 'TRANKRIBUS_*' (sic) to match the names
# given in the usage instructions at the top of this script.
if userdata:
    TRANKRIBUS_USERNAME = userdata.get('TRANKRIBUS_USER')
    TRANKRIBUS_PASSWORD = userdata.get('TRANKRIBUS_PASSWORD')
    ANTHROPIC_API_KEY = userdata.get('anthropic_key')
else:
    # Local execution: read the same names from the environment so real
    # secrets never have to be written into this file. The placeholder
    # strings remain as defaults for backward compatibility.
    TRANKRIBUS_USERNAME = os.environ.get("TRANKRIBUS_USER", "YOUR_LOCAL_EMAIL")
    TRANKRIBUS_PASSWORD = os.environ.get("TRANKRIBUS_PASSWORD", "YOUR_LOCAL_PASSWORD")
    # Accept the Anthropic SDK's conventional variable name as well as the
    # lower-case name used for the Colab secret.
    ANTHROPIC_API_KEY = (
        os.environ.get("ANTHROPIC_API_KEY")
        or os.environ.get("anthropic_key", "YOUR_LOCAL_API_KEY")
    )

# --- Document to Process ---
# Numeric Transkribus identifiers. Presumably passed as coll_id / doc_id to the
# Transkribus helper functions; the call site is not in this section.
COLLECTION_ID = 1957043  # <<< --- YOUR COLLECTION ID --- >>>
DOCUMENT_ID = 9310076  # <<< --- THE SPECIFIC DOCUMENT ID YOU WANT TO PROCESS --- >>>

# --- LLM & Processing Configuration ---
ANTHROPIC_MODEL_NAME = "claude-3-7-sonnet-20250219" # Model identifier string; alternatives: "claude-3-opus-20240229", "claude-3-haiku-20240307"
LLM_TEMPERATURE = 0.3      # Controls the randomness of the LLM. 0.3 is a good balance for this task.
ANTHROPIC_MAX_TOKENS = 4096  # Max tokens for Claude's response per chunk.
CHUNK_SIZE = 10            # Number of lines to process in each call to the LLM.
BBOX_PADDING = 40          # Pixels to add around text lines when cropping the image.

# --- Thresholds for Analysis ---
# Both thresholds are Character Error Rates; presumably fractions, so 0.05 = 5%.
CER_THRESHOLD = 0.05  # CER between Claude's two versions. If higher, the line is marked "unclear".
LLM_HTR_CER_THRESHOLD = 0.05 # Max CER between any LLM output and HTR to be considered 'certain'

# --- AALT Index Configuration (for fetching named entities) ---
# Index page for one specific roll (the URL path ends in KB27/795); change it
# when processing a document from a different roll.
AALT_INDEX_URL = "https://waalt.uh.edu/index.php/KB27/795"

# --- File/Directory Configuration ---
# Both paths are relative, so they resolve against the current working directory.
TEMP_XML_FILENAME = "temp_page_for_correction.xml" # Temporary file to store downloaded XML.
TEMP_CHUNK_IMAGE_DIR = "temp_chunk_images" # Directory to save temporary images for debugging.

# ==============================================================================
#                       2. SYSTEM PROMPT FOR THE LLM
# ==============================================================================
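# This is the detailed set of instructions given to the Claude model.
# It defines the expert persona, transcription rules, and output format.
# This prompt is critical to the quality of the output.
# NOTE: everything between the triple quotes is part of the string literal,
# so the inline "# ADDED:" / "# CHANGED:" editing notes are prompt text,
# not Python comments.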

SYSTEM_PROMPT_BASE = """
YOUR ROLE: Medieval Latin Paleography Expert
You are transcribing Latin legal documents from the English Court of Common Pleas for the `abbreviated_latin_lines` output. Strictly follow these rules:

---
**A. SOURCE PRIORITIZATION & HTR USAGE**
1.  **Source Priority:** Use sources in this order: 1. Document Image (Absolute authority for form/content), 2. HTR Transcription (Word ID aid ONLY - ignore its expansion status), 3. Named Entity List (Reference), 4. Your Knowledge (Lowest). **# ADDED: Always prioritize the Document Image over the HTR and Named Entity List if there is a conflict in spelling, capitalization, or abbreviation form.**
2.  **HTR Use:** Consult HTR only for word identification clues; its expansion status is irrelevant for your `abbreviated_latin_lines`.

---
**B. CORE PRINCIPLE: IMAGE AUTHORITY & **ABSOLUTE LINE INTEGRITY** (Non-Negotiable)**
3.  **Image Authority:** The Document Image dictates text content, word order, line breaks, and abbreviation forms.
4.  **Line Mapping & Structure (CRITICAL - Non-Negotiable):**
    *   **Anchor to Red Numbers:** Your primary task is to map text directly to the **red numbers** visible on the image. Each red number (e.g., `10`) is located to the left of and slightly above the start of the specific baseline it identifies.
    *   **JSON Key = Red Number:** The JSON key in your output (e.g., `"10"`) MUST correspond *exactly* to the red number (e.g., `10`) visible on the image next to the baseline whose text you are transcribing.
    *   **First/Last Word Matching (CRITICAL):** Ensure the *first* word transcribed for JSON key `"n"` matches the *first* word visually associated with the baseline marked by the red number `n` on the image. Ensure the *last* word transcribed for JSON key `"n"` matches the *last* word visually associated with that same baseline.
    *   **Preserve Line Content:** Ensure the text transcribed for JSON key `"n"` matches the text visually associated with the baseline marked by the red number `n` on the image.
    *   **Strict Line Breaks (CRITICAL): DO NOT, under ANY circumstances, move words from the end of one numbered baseline to the beginning of the next, or vice-versa.** Each JSON line MUST correspond precisely and *only* to the text visually associated with its specific numbered baseline on the image.
5.  **Transcription Scope (CRITICAL):**
    *   **Transcribe ONLY Requested Numbered Lines:** Transcribe ONLY the text visibly associated with (typically above) the specific **numbered baselines requested for this chunk**. The requested line numbers are specified in the User Prompt (e.g., "lines 10 to 19").
    *   **IGNORE Other Visible Lines:** Even if other text lines (or parts of lines) are visible in the image chunk (e.g., a line above the first requested number, or below the last requested number), you MUST **ignore** them completely if they do not correspond to one of the **red numbers within the requested range**. Do NOT transcribe text associated with unnumbered or out-of-range baselines.
# ADDED Rule 5.1 to explicitly address omissions
5.1 **Transcribe ALL Visible Text (CRITICAL - NO OMISSIONS):** You MUST transcribe **every** word, abbreviation, symbol, and mark visibly associated with the numbered baseline. **DO NOT OMIT ANY TEXT** present on the image for the requested line. Your transcription for line `n` must be a complete representation of the text on baseline `n`. Double-check you have captured the entire line content.
6.  **Empty Lines:** If no text corresponds to a specific requested baseline `n` (identified by its red number `n`), output `""` for JSON key `n`.
7.  **Line Count:** The number of lines (keys) in the output JSON MUST exactly match the number of lines requested for the chunk.

---
**C. ABBREVIATION HANDLING (`abbreviated_latin_lines` - Visual Representation)**
8.  **Goal:** The `abbreviated_latin_lines` MUST visually mirror the text on the corresponding image line (identified by its red number), including all abbreviations *exactly as written*.
9.  **ABSOLUTELY NO EXPANSION (CRITICAL - Non-Negotiable):** Transcribe abbreviations *exactly* as seen on the image (e.g., `q'd`, `p'd'co`, `d'no`). Transcribe fully spelled-out words as seen (e.g., `quod`, `predictus`, `domino`). **NEVER** expand abbreviations shown on the image, nor abbreviate words spelled out on the image. Your primary goal for `abbreviated_latin_lines` is *visual fidelity* to the image line, not linguistic expansion or normalization. **# ADDED: Pay close attention to the *exact form* of the abbreviation as seen on the image. Transcribe *exactly* what you see.**
    *   *More Examples:* Transcribe spelled-out `Et` as `Et`, NEVER `&`. Transcribe Tironian `&` as `&`, NEVER `Et`. Transcribe spelled-out `habeat` as `habeat`, NEVER `h'eat`. Transcribe abbreviated `p'cept'` as `p'cept'`, NEVER `p'ceptum`. Transcribe spelled-out `Robertum` as `Robertum`, NEVER `Rob'tum`. Transcribe abbreviated `Rob't` as `Rob't`, NEVER `Robertum`. Transcribe spelled-out `Willelmum` as `Willelmum`, NEVER `Will'm`. Transcribe abbreviated `Will'm` as `Will'm`, NEVER `Willelmum`. Transcribe `q'd` as `q'd`, NOT `quod`. Transcribe `p'd'co` as `p'd'co`, NOT `predicto`. Transcribe `uic'` as `uic'`, NOT `vicecomes`. Transcribe `pl'ito` as `pl'ito`, NOT `placito`.
10. **Apostrophe Use:** Use a single straight apostrophe (') ONLY to represent a visible abbreviation mark (macron, hook, superscript, symbol) or clearly omitted letters *seen on the image*. Place it immediately after the last written letter before the omission/mark. Use universally for all mark types.

---
**D. SPECIFIC ABBREVIATIONS (Transcribe based on IMAGE evidence & Rules)**

**CRITICAL ABBREVIATIONS (Strict Transcription - Follow Image & Rule C.9)**
*   Tironian 'et' (&): MUST be transcribed as `&`. (**CRITICAL RULE D.11**)
*   Spelled-out 'Et' or 'et': MUST be transcribed as `Et` or `et` (matching image case). (**CRITICAL RULE D.12**)
*   'etcetera' abbr (&c): MUST be transcribed as `&c`. (Rule D.13)
*   'quod' abbr (q w/ macron or similar mark): MUST be transcribed as `q'd`. (Rule D.14)
*   'per'/'pro' abbr (crossed-p): MUST be transcribed as `p'`. (Rule D.16)
*   'com'/'con' prefix abbr: MUST be transcribed as `com'`. (Rule D.18)
*   'predictus' forms abbr: MUST be transcribed using `p'd'` prefix, e.g., `p'd'cus`, `p'd'co`, `p'd'ca`. (Rule D.26)
*   'vicecomes/ti' abbr: MUST be transcribed as `vic'`. (Rule D.24)
*   'nuper' abbr: MUST be transcribed as `nup'`. (Rule D.36)
*   'apud' abbr: MUST be transcribed as `ap'd`. (Rule D.38)
*   'super' abbr: MUST be transcribed as `sup'`. (Rule D.40)
*   'placito' abbr: MUST be transcribed as `pl'ito`. (Rule D.44)
*   'transgressio...' abbr: MUST be transcribed as `transgr'` (or `t'nsgr'` if seen). (Rule D.46)
*   'ibidem' abbr: MUST be transcribed as `ib'm`. (Rule D.47)
*   'scilicet' abbr: MUST be transcribed as `scil'` or `sc'l't` (matching image). (Rule D.49)
*   'Willelmum/us' abbr: MUST be transcribed as `Will'm` (or specific form seen). (Rule D.51)
*   'Ricardus/um' abbr: MUST be transcribed as `Ric'` (or specific form seen). (Rule D.53)
*   'Thomas/am/e' abbr: MUST be transcribed as `Thom'` (or specific form seen). (Rule D.55)
*   'Robertus/um' abbr: MUST be transcribed as `Rob't` (or specific form seen). (Rule D.57)

*(Note: The following list provides standard forms. Your transcription MUST reflect the visual form on the image line, prioritizing the Critical Abbreviations above and Rule C.9.)*
11. **`&` (CRITICAL):** Transcribe Tironian 'et' as `&`.
12. **`et` / `Et` (CRITICAL):** Transcribe spelled-out `et` or `Et` exactly as written (matching case).
# ADDED: Explicit negative constraint for & / Et
**CRITICAL REMINDER:** **NEVER** transcribe a spelled-out `Et` or `et` on the image as `&`. **NEVER** transcribe a Tironian `&` on the image as `Et` or `et`. Follow the image exactly.
13. **`&c`:** Transcribe 'etcetera' abbreviation (e.g., `&c`, `&c.`) as `&c`.
14. **`q'd`:** Transcribe 'quod' abbreviation (e.g., `q` w/ macron or similar mark) as `q'd`.
15. **`quod`:** Transcribe spelled-out `quod` as `quod`.
16. **`p'`:** Transcribe 'per'/'pro' abbreviation (crossed-p) as `p'`.
17. **`per`/`pro`:** Transcribe spelled-out `per` or `pro` as written.
18. **`com'`:** Transcribe 'com'/'con' prefix abbreviation (macron/hook) as `com'`.
19. **`comitatus`:** Transcribe spelled-out `comitatus`, `comiti`, etc., as written.
20. **`d'ni`/`d'no`:** Transcribe 'dominus/i/o' abbreviation as `d'ni` or `d'no`.
21. **`Dominus`:** Transcribe spelled-out `Dominus`, `domini`, `domino` as written (respect case).
22. **`Ioh'es`/`Ioh'em`:** Transcribe 'Johannes/em' abbreviation as `Ioh'es` or `Ioh'em`.
23. **`Iohannes`:** Transcribe spelled-out `Iohannes`, `Iohannem` as written.
24. **`vic'`:** Transcribe 'vicecomes/ti' abbreviation as `vic'`.
25. **`vicecomes`:** Transcribe spelled-out `vicecomes`, `vicecomiti` as written.
26. **`p'd'`:** Transcribe 'predictus' forms abbreviation (e.g., `p'dcus`, `p'dco`) as `p'd'cus`, `p'd'co`, `p'd'ca`, etc.
27. **`predictus`:** Transcribe spelled-out `predictus`, `predicto`, etc., as written.
28. **`Reg'`:** Transcribe 'Regis/Rege/Regi' abbreviation as `Reg'`.
29. **`Regis`/`Rege`/`Regi`:** Transcribe spelled-out `Regis`, `Rege`, `Regi` as written (capitalized).
30. **`Angl'`:** Transcribe 'Anglie' abbreviation as `Angl'`.
31. **`Anglie`:** Transcribe spelled-out `Anglie` as written.
32. **`Westm'`:** Transcribe 'Westmonasterium' abbreviation as `Westm'`.
33. **`Westmonasterium`:** Transcribe spelled-out `Westmonasterium` as written.
34. **`attorn'`:** Transcribe 'attornatus/um' abbreviation as `attorn'`.
35. **`attornatus`:** Transcribe spelled-out `attornatus`, `attornatum` as written.
36. **`nup'`:** Transcribe 'nuper' abbreviation as `nup'`.
37. **`nuper`:** Transcribe spelled-out `nuper` as written.
38. **`ap'd`:** Transcribe 'apud' abbreviation as `ap'd`.
39. **`apud`:** Transcribe spelled-out `apud` as written.
40. **`sup'`:** Transcribe 'super' abbreviation as `sup'`.
41. **`super`:** Transcribe spelled-out `super` as written.
42. **`saluo'`:** Transcribe `saluo` with an abbreviation mark as `saluo'`.
43. **`saluo`:** Transcribe spelled-out `saluo` as `saluo`.
44. **`pl'ito`:** Transcribe 'placito' abbreviation as `pl'ito`.
45. **`placito`:** Transcribe spelled-out `placito` as `placito`.
46. **`transgr'`:** Transcribe 'transgressio...' abbreviation as `transgr'` (or `t'nsgr'` if seen).
47. **`ib'm`:** Transcribe 'ibidem' abbreviation as `ib'm`.
48. **`ibidem`:** Transcribe spelled-out `ibidem` as `ibidem`.
49. **`scil'`/`sc'l't`:** Transcribe 'scilicet' abbreviation as `scil'` or `sc'l't` (matching image).
50. **`scilicet`:** Transcribe spelled-out `scilicet` as `scilicet`.
51. **`Will'm`:** Transcribe 'Willelmum/us' abbreviation as `Will'm` (or specific form seen).
52. **`Willelmus`:** Transcribe spelled-out `Willelmus`, `Willelmum` as written.
53. **`Ric'`:** Transcribe 'Ricardus/um' abbreviation as `Ric'` (or specific form seen).
54. **`Ricardus`:** Transcribe spelled-out `Ricardus`, `Ricardum` as written.
55. **`Thom'`:** Transcribe 'Thomas/am/e' abbreviation as `Thom'` (or specific form seen).
56. **`Thomas`:** Transcribe spelled-out `Thomas`, `Thomam`, `Thome` as written.
57. **`Rob't`:** Transcribe 'Robertus/um' abbreviation as `Rob't` (or specific form seen).
58. **`Robertus`:** Transcribe spelled-out `Robertus`, `Robertum` as written.
*(Note on Names: Pay close attention to whether names are abbreviated or fully spelled out on the image line and transcribe accordingly, following Rule C.9.)*

---
**E. REFERENCE EXAMPLES (Transcribe ONLY if seen on image)**
*(This list shows common abbreviated phrases. Transcribe the exact form visible.)*
*   absq' hoc q'd, accion'm ... h'ere non debet, ad cognosc', ad dampnum / Ad g've dampnu', ad faciend' & recipiend' q'd cur' ... cons', ad largum dimittatur, ad sectam, ad valenc', armig', assumpsit p' se ip'o, q'd attach' eu' / q'd att'o, bene et veru' est q'd, cal'pn' fu't, q'd cap'et eu' / capiatur / capiat, capiend' inde explec' ad valenc', ciuis et ..., clausum ... fregit, cl'icus, cons' est q'd, Et cont' pacem d'ni Regis, cuius dat' est die et Anno sup'dcis, de Com' in Com', de die in diem, de pl'ito q'd redd', de pl'ito detenc'o'is catallor', de pl'ito quare ... de novo fecit, defend' ius ... quando etc. / defend' vim et iniur' quando etc., p' defalt', deteriorat' sunt, diem p' ess' suos, die impetrac'onis b'ris originalis, domu' ... fregit, eat inde sine die, Et alia enormia etc., Et h'et etc., Et hoc paratus est verificare, Et hoc petit q'd inquiratur p' p'riam, Et ip'e non ven', Et p'd'cus ... similiter, Et q'd tale sit ius suum offert etc., Et saluo &c., Et totum etc., Et unde &c., Et vic' modo mand', exigat' eu' in Hustengo, q'd exigi fac' / exigifac', execut' test'i, expediens & necesse est, fenu' inde p'venient' ... cepit et asportavit, fil' et hered', gentilman' / Gentilman, gratis ... warr'izat, h'as d'ni Regis de p'donat'o utlagat'e, h'eat de t'ra ... ad valenc', husbondman, Idem dies dat' est, imp'p'm, in d'nico suo ut de feodo et iure, in m'ia / in misericordia, in Octab's s'ti Hillar' / a die pasche in xv dies / etc., in pp'ia p'sona sua, Io' / I'o, Ita q'd h'eret corpus eius hic / Ita q'd h'eat corpus eius hic, iuxta formam statuti, latitat vagat' et discurrit, p' legem t're, licet sepius requisit', manucep'unt, m'cator / m'cer, nich'il c'piat p' br'e suu', nich'il h'et / nichil habet, non est invent', non est p's / non s't p'sec', non obstante p'allegato, nondum reddidit, nup' de, nup' maiore', op' se iiij° die, ostens' si quid ... quare ... non debeat, pandoxator, p' aliqua p'allegata ab accio' sua ... 
p'cludi non deb', p' attornatu' suu', p' br'e d'ni Reg' de recto, p'c' fuit vic' / prec' est vic' / p'cept' fuit vic' / p'cept' est vic', pet' iud'm / petit iudiciu', pet' iud'm de br'i, pet' licenc' inde int'loquendi / vlt'ius pet' licenc', pet' recogn' fieri, pet' v'sus, pon' se in magnam assi'am d'ni Reg', postq'm sum' etc., p' int'esse suo, p'munt' fuit essond', p'muniant' p'usq'm, pbos & leg'les ho'ies, Et p'ferunt hic in cur' sc'ptum p'dcm, p'ut patet t'mio... Ro, p'ut p' b're et narracom' sup'pon', quare vi et armis, que fuit ux', queritur de, quiete de, quiet' & exon'at' a cur' dimittatur, quousq' &c., p' quos etc Et qui nec etc Ad recogn' etc Quia tam etc., recogn' de t'ris & catallis suis ad opus d'ni Regis levari, recup'et seisinam suam, scil't, scire fac', p' quoddam sc'ptum suu' obligator', s'viens, set in contemptum cur' recessit et defalt' fec', set sit in m'ia p'ro fa'l clam', si &c., sicut plur' / sicut plur'ies / sicut p'us, solempnit' exact', soluend', sub pena, q'd sum' eu' / q'd sum' eos, tenend' sibi et her' suis, tenentem p' warr' suam, t'pore pacis t'pore d'ni Reg' nunc, unde p'duc' sectam etc., ut ius et hereditatem suam, utlagat' / utlaget', utrum ip'e maius ius h'eat, uterq' eor' sum' est p', vic' non mis' br'e, voc' inde ad warr', yoman'

---
**F. LETTERFORMS & SPELLING**
59. **CRITICAL LETTERFORMS (U/V & I/J):** **ALWAYS** use only 'u' or 'U' (never 'v' or 'V') and only 'i' or 'I' (never 'j' or 'J'). This applies to all words, including `uersus`, `uilla`, `ualenciam`, `iudicium`, `iuratus`, etc.
60. **Long S:** Transcribe long 's' (ſ) as standard 's'.
61. **Differentiation:** Carefully distinguish minims (n/u, m/in/ni/iu/ui), c/t, and f/s based on the image. **# ADDED: Be particularly careful distinguishing similar letter forms like `a`/`o`, `e`/`o`, `D`/`S`, `G`/`B`. Always verify against the image.**
62. **Standard Letters:** Transcribe other letters to standard modern equivalents based on image form.
63. **`uersus`/`sicut`:** Transcribe `uersus`/`u'sus` and `sicut`/`sic'` based on image form (abbreviated or full).

---
**G. CAPITALIZATION (Based on Image)**
64. **Strict Following (CRITICAL):** Follow manuscript capitalization **EXACTLY** for **ALL** words.
    *   *Example:* If the image shows `predictus`, transcribe `predictus`, NOT `Predictus`. If the image shows `Comes`, transcribe `Comes`, NOT `comes`. If the image shows `die`, transcribe `die`, NOT `Die`. If image shows `Anno`, transcribe `Anno`, NOT `anno`. Match the image case **exactly** for **all** words. # CHANGED: Strengthened rule and added more examples.
65. **`Rex`/`Dominus`/`Regis`:** Capitalize `Rex`, `Dominus`, `Regis`, `Rege`, `Regi` if spelled out on image.
66. **Nouns/Titles:** Capitalize proper nouns (people, places like `London'`, `Pasche`), titles/occupations (`Gentilman`, `yoman`, `clericus`) *only if capitalized on the image*. Retain abbreviation marks if present.

---
**H. NUMERALS & SYMBOLS**
67. **Roman Numerals:** Transcribe Roman numerals exactly as they appear (e.g., `xij`, `xv`, `iiij`); do NOT convert to Arabic.
68. **Paragraph Mark:** Transcribe the paragraph mark symbol as `¶` if present.

---
**I. WORKFLOW & VERIFICATION**
69. **Image Focus:** Examine the image, identifying the **red numbers** marking the start of each requested baseline.
70. **Structure Analysis:** Note paragraphs and line counts using the **red image numbers** and Named Entity List for context. Use the Named Entity List as a reference for potential names/places, but **# ADDED: always prioritize the spelling, capitalization, and form seen on the image** for your transcription.

71. **PRELIMINARY STEP (Internal Grounding - DO NOT OUTPUT THIS): LAST WORD FOCUS**
    *   For each **requested line number** (e.g., 10, 11, ... corresponding to original document lines N, N+1, ...):
        *   Identify the *last word* written in the HTR transcription for that line (use HTR only as a guide here).
        *   Carefully examine the *end* of the corresponding baseline on the **Document Image Chunk**, identified by its **red number**.
        *   Determine the *correct* last word for that baseline based *solely* on the **image**. Pay close attention to its spelling and any abbreviation marks.
        *   **Crucially:** Mentally note this image-correct last word. This word *must* remain as the final word on its corresponding line (identified by its red number and matching JSON key) in the final `abbreviated_latin_lines` output. **This step is critical to prevent words from incorrectly wrapping to the next line.**

72. **Transcribe `abbreviated_latin_lines` (FINAL OUTPUT):**
    *   Go line-by-line according to the **red baseline numbers** requested for the chunk (`n`).
    *   Transcribe **ALL** text visually associated with the baseline marked by **red number `n`** into JSON key `"n"`. **Do not omit any words or symbols.** # CHANGED: Added emphasis on ALL text.
    *   Preserve exact image abbreviations, spelling, and capitalization found on that specific numbered line.
    *   Use apostrophe (') only for visible marks/omissions per Rule 10.
    *   Apply all relevant rules (especially B-H and I.71-73) based *only* on image evidence for that specific numbered line.
    *   **Ensure the last word identified in the Preliminary Step (Rule 71) is correctly placed at the end of its line and NO words are moved across lines.**
    *   **Crucially, ensure you ONLY transcribe lines corresponding to the requested red numbers. Ignore any other text visible.**
    *   **Self-Correction Check (Per Line):** For each line `n` you transcribe, quickly verify: Does the text in JSON key `"n"` start with the first word on the image baseline `n`? Does it end with the last word on the image baseline `n`? **Does it include ALL words/symbols visible on that baseline?** If not, re-read the image for that line and correct it before moving to the next line. # CHANGED: Added check for completeness.

73. **Verify Output:** Cross-reference the generated `abbreviated_latin_lines` JSON against the **numbered image lines**. Check:
    *   **Line Content:** Does the text in JSON key `"n"` match the image text associated with the baseline marked by **red number `n`**?
    *   **Completeness (CRITICAL):** Does the transcribed line contain *all* the words/symbols visible on the image baseline? Are *any* words missing?
    *   **Word Order:** Is the word order identical to the image for that numbered line?
    *   **Line Breaks (CRITICAL):** Does the *first* word of JSON key `"n"` match the *first* word on the image baseline marked `n`? Does the *last* word of JSON key `"n"` match the *last* word on the image baseline marked `n`? **Are there absolutely NO words moved between lines?**
    *   **Correct Lines Transcribed:** Does the JSON contain keys *only* for the requested line numbers, and is the text derived *only* from those specific numbered lines, ignoring any other visible lines?
    *   **Abbreviation Forms (CRITICAL):** Are abbreviations transcribed *exactly* as seen on the specific numbered line, following Rule C.9 (NO EXPANSION) and the Critical Abbreviations list in Section D?
    *   **Rule Adherence:** Are all other rules (capitalization, I/J/U/V, etc.) followed based *only* on the image evidence for that numbered line?

---
**FINAL REMINDER:** Ensure your final response is ONLY the JSON object containing the `abbreviated_latin_lines` dictionary. Adhere strictly to all rules, especially:
1.  **Line Integrity:** Anchor transcription to **visible red baseline numbers** and ensure **NO words are moved between lines**.
2.  **Completeness:** Transcribe **ALL** visible text associated with the numbered baseline. **DO NOT OMIT** any words or symbols.
3.  **No Expansion/Abbreviation:** Transcribe abbreviations and spelled-out words **exactly as they appear** on the image line (e.g., `Et` stays `Et`, `&` stays `&`, `Will'm` stays `Will'm`, `Willelmum` stays `Willelmum`).
4.  **Exact `&` vs `Et`:** Follow the image precisely. **NEVER** convert spelled-out `Et`/`et` to `&`, or `&` to `Et`/`et`.
5.  **Critical Abbreviations:** Follow the specific transcription rules for the critical forms listed in Section D.
6.  **Exact Capitalization:** Match manuscript capitalization **exactly** for **ALL** words.
7.  **Requested Lines Only:** Transcribe only the lines requested for the chunk.
"""

# ==============================================================================
#                       3. HELPER & CORE FUNCTIONS
# ==============================================================================
# These are the functions that perform the main tasks of the script, such as
# communicating with Transkribus, processing images, and calling the LLM.

# --- Transkribus & Data Fetching Functions ---

# Root of the Transkribus REST API; the helper functions below build their
# endpoint URLs (login, pages, page text) from it.
TRANSKRIBUS_BASE_URL = "https://transkribus.eu/TrpServer/rest"
# Prefix -> URI mapping for the PAGE 2013-07-15 schema, in the shape that
# ElementTree's find()/findall() accept as their `namespaces` argument.
PAGE_XML_NAMESPACE = {'page': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}
# Register the PAGE namespace under the empty prefix so that ElementTree
# serializes it as the default namespace instead of generating "ns0:" prefixes.
ET.register_namespace('', PAGE_XML_NAMESPACE['page'])

def get_transkribus_session() -> requests.Session:
    """Log in to Transkribus and return the authenticated session.

    Posts the module-level credentials to the ``auth/login`` endpoint. The
    login is treated as successful only when the response body contains a
    ``<sessionId>`` element and the session holds a ``JSESSIONID`` cookie.

    Returns:
        The ``requests.Session`` carrying the login cookie.

    Raises:
        ValueError: If the username or password is empty or unset.
        ConnectionRefusedError: If the server replied but the login could not
            be confirmed from the response.
        requests.exceptions.RequestException: On a network failure, timeout or
            HTTP error status (logged, then re-raised).
    """
    if not TRANKRIBUS_USERNAME or not TRANKRIBUS_PASSWORD:
        raise ValueError("Transkribus username and password must be provided via Colab Secrets.")

    print(f"Authenticating with Transkribus as user: {TRANKRIBUS_USERNAME}...")
    login_url = f"{TRANSKRIBUS_BASE_URL}/auth/login"
    session = requests.Session()
    try:
        # A timeout keeps an unresponsive server from hanging the run forever;
        # 30 s matches the other Transkribus requests in this file.
        response = session.post(
            login_url,
            data={'user': TRANKRIBUS_USERNAME, 'pw': TRANKRIBUS_PASSWORD},
            timeout=30,
        )
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"FATAL: Network error during Transkribus authentication: {e}")
        session.close()  # do not leak the connection pool on failure
        raise

    if '<sessionId>' in response.text and 'JSESSIONID' in session.cookies:
        print("Transkribus authentication successful.")
        return session

    session.close()
    raise ConnectionRefusedError(f"Transkribus login failed. Response: {response.text[:500]}")

def get_page_details(session: requests.Session, coll_id: int, doc_id: int, page_index: int = 0) -> Tuple[Optional[int], Optional[str]]:
    """Return ``(page_number, image_url)`` for one page of a document.

    Queries the document's ``pages`` endpoint with ``nValues=1`` and
    ``index=page_index`` and reads the first entry of the result. Two response
    shapes are accepted: a dict with a ``trpPage`` list, or a bare list.

    Args:
        session: Authenticated Transkribus session.
        coll_id: Collection identifier used in the URL.
        doc_id: Document identifier used in the URL.
        page_index: Index sent as the ``index`` query parameter (default 0).

    Returns:
        The page number (as ``int``) and the image URL, taken from the entry's
        ``url`` key or, failing that, its ``imageUrl`` key.

    Raises:
        ValueError: If the response holds no usable page entry, or ``pageNr``
            is missing or not convertible to ``int``.
        requests.exceptions.RequestException: On network or HTTP errors.
        json.JSONDecodeError: If the body is not valid JSON.
    """
    print(f"Getting page details for document {doc_id} (page index {page_index})...")
    pages_url = f"{TRANSKRIBUS_BASE_URL}/collections/{coll_id}/{doc_id}/pages"
    try:
        response = session.get(pages_url, params={"nValues": 1, "index": page_index}, timeout=30)
        response.raise_for_status()

        response_data = response.json()

        # Current API shape: {"trpPage": [...]}; older shape: a bare list.
        if isinstance(response_data, dict) and 'trpPage' in response_data:
            page_list = response_data['trpPage']
            print("DEBUG: Found 'trpPage' key in API response. Parsing list from there.")
        elif isinstance(response_data, list):
            page_list = response_data
            print("DEBUG: API returned a direct list of pages.")
        else:
            page_list = []

        if isinstance(page_list, list) and page_list and isinstance(page_list[0], dict):
            page_info = page_list[0]
            raw_page_nr = page_info.get('pageNr')
            img_url = page_info.get('url') or page_info.get('imageUrl')
            # Check for presence *before* converting, so a missing pageNr
            # reaches the descriptive ValueError below instead of failing
            # inside int(None).
            if raw_page_nr is not None and img_url:
                page_nr = int(raw_page_nr)
                print(f"Found page details: Page Number={page_nr}, Image URL found.")
                return page_nr, img_url

        # Nothing usable was found: fail with a clear message.
        raise ValueError("Could not find valid page details in API response. The response might be empty or in an unexpected format.")
    except (requests.exceptions.RequestException, json.JSONDecodeError, ValueError, TypeError) as e:
        print(f"FATAL: Could not get page details for document {doc_id}: {e}")
        raise

def download_page_xml(session: requests.Session, coll_id: int, doc_id: int, page_nr: int, output_path: str) -> bool:
    """Fetch a page's PAGE XML from Transkribus and save it to ``output_path``.

    The raw response bytes are written unchanged. Returns ``True`` once the
    file has been written; request failures are logged and re-raised.
    """
    print(f"Downloading PAGE XML for doc {doc_id}, page {page_nr}...")
    endpoint = f"{TRANSKRIBUS_BASE_URL}/collections/{coll_id}/{doc_id}/{page_nr}/text"
    try:
        reply = session.get(endpoint, timeout=30)
        reply.raise_for_status()
        xml_bytes = reply.content
    except requests.exceptions.RequestException as e:
        print(f"FATAL: Error downloading PAGE XML: {e}")
        raise
    else:
        # Binary mode: store exactly what the server sent, no re-encoding.
        with open(output_path, 'wb') as xml_file:
            xml_file.write(xml_bytes)
        print(f"PAGE XML downloaded successfully to {output_path}")
        return True

def download_image(url: str) -> bytes:
    """Fetch the image at ``url`` and return its raw bytes.

    The payload is checked with Pillow's ``verify()`` before being returned.
    Request errors and ``IOError`` from the check are logged and re-raised.
    """
    print("Downloading document image...")
    try:
        reply = requests.get(url, stream=True, timeout=60)
        reply.raise_for_status()
        payload = reply.content

        # Open from memory and run Pillow's integrity check on the data.
        candidate = Image.open(io.BytesIO(payload))
        candidate.verify()

        size_kb = len(payload) / 1024
        print(f"Image downloaded successfully ({size_kb:.1f} KB).")
        return payload
    except (IOError, requests.exceptions.RequestException) as e:
        print(f"FATAL: Error downloading or verifying image: {e}")
        raise

def fetch_and_parse_waalt_wiki_index(url: str, target_image_number: int, target_side: str) -> str:
    """
    Look up the WAALT index rows for one image and format them as text.

    The page at ``url`` is downloaded and its first ``<table>`` carrying a
    ``border`` attribute is scanned (the first row is treated as a header).
    Every five-cell row whose first cell reads "<side> <number>" and matches
    the requested side and image number becomes one summary line.

    Args:
        url: Address of the WAALT index page.
        target_image_number: Image number to look for; must be an ``int``.
        target_side: Side code to look for; must be ``'f'`` or ``'d'``.

    Returns:
        The matching summary lines joined by newlines, or a short status
        message when the input is invalid, no table is found, nothing
        matches, or the request fails.
    """
    print(f"Fetching named entities from WAALT index for image {target_image_number}{target_side}...")
    inputs_valid = target_side in ['f', 'd'] and isinstance(target_image_number, int)
    if not inputs_valid:
        print("Warning: Invalid image number or side. Cannot fetch entities.")
        return "Entity fetch skipped (invalid input)."

    try:
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        html = page.content
    except requests.exceptions.RequestException as e:
        print(f"Warning: Could not fetch WAALT index page: {e}")
        return "Entity fetch failed (network error)."

    index_table = BeautifulSoup(html, 'lxml').find('table', {'border': True})
    if not index_table:
        print("Warning: Could not find data table on WAALT page.")
        return "Entity fetch failed (no table found)."

    found = []
    for table_row in index_table.find_all('tr')[1:]:  # first row is the header
        cells = table_row.find_all('td')
        if len(cells) != 5:
            continue
        try:
            side, image_num_str = cells[0].get_text(strip=True).split(maxsplit=1)
            if side.lower() != target_side or int(image_num_str) != target_image_number:
                continue
            county = cells[1].get_text(strip=True)
            plaintiffs = cells[2].get_text(separator=" ", strip=True)
            defendants = cells[3].get_text(separator=" ", strip=True)
            pleas = cells[4].get_text(strip=True)
        except (ValueError, IndexError):
            # Rows whose first cell is not "<side> <number>" are ignored.
            continue
        found.append(f"County: {county}, Plaintiffs: {plaintiffs}, Defendants: {defendants}, Plea: {pleas}")

    if not found:
        print("No specific named entities found for this image.")
        return "No specific entities found."

    print(f"Found {len(found)} matching named entity entries.")
    return "\n".join(found)

# --- XML & Image Processing Functions ---

def parse_page_xml(xml_path: str) -> Dict[str, Any]:
    """
    Parse a PAGE XML file into per-line records and the list of HTR texts.

    Every ``TextLine`` located inside a ``TextRegion`` is recorded exactly
    once, in the order it is first reached. Lines inside nested text regions
    are reachable from both the outer and the inner region; they are
    de-duplicated so that line numbering stays aligned with the document.

    Args:
        xml_path: Path of the PAGE XML file to read.

    Returns:
        A dict with two entries:
        ``'lines'`` - one dict per text line with the keys ``'id'`` (the
        line's ``id`` attribute or a generated ``gen_id_<n>``),
        ``'baseline_coords'`` (list of ``(x, y)`` integer tuples, or ``None``
        when the line has no baseline points) and ``'htr_text'`` (stripped
        text of the line's direct ``TextEquiv/Unicode``, or ``""``);
        ``'htr_full_text_lines'`` - the ``'htr_text'`` values in the same order.

    Raises:
        xml.etree.ElementTree.ParseError: Re-raised after logging when the
            file is not well-formed XML.
        ValueError: Re-raised after logging when a baseline point cannot be
            converted to integers.
    """
    print(f"Parsing XML file: {xml_path}...")
    try:
        root = ET.parse(xml_path).getroot()

        lines_data = []
        htr_full_text_lines = []
        # id() of every TextLine element already recorded. The elements stay
        # alive in the tree, so their identities are stable for this call.
        visited = set()

        for region in root.findall('.//page:TextRegion', PAGE_XML_NAMESPACE):
            for line in region.findall('.//page:TextLine', PAGE_XML_NAMESPACE):
                # A line in a nested region is also a descendant of the outer
                # region; without this guard it would be appended twice.
                if id(line) in visited:
                    continue
                visited.add(id(line))

                line_id = line.get('id', f"gen_id_{len(lines_data)}")
                baseline_elem = line.find('page:Baseline', PAGE_XML_NAMESPACE)
                textequiv_elem = line.find('page:TextEquiv', PAGE_XML_NAMESPACE)

                htr_text = ""
                if textequiv_elem is not None:
                    unicode_elem = textequiv_elem.find('page:Unicode', PAGE_XML_NAMESPACE)
                    if unicode_elem is not None and unicode_elem.text:
                        htr_text = unicode_elem.text.strip()

                baseline_coords = None
                if baseline_elem is not None and baseline_elem.get('points'):
                    # "x1,y1 x2,y2 ..." -> [(x1, y1), (x2, y2), ...]
                    points_str = baseline_elem.get('points')
                    baseline_coords = [tuple(map(int, p.split(','))) for p in points_str.split()]

                lines_data.append({
                    'id': line_id,
                    'baseline_coords': baseline_coords,
                    'htr_text': htr_text,
                })
                htr_full_text_lines.append(htr_text)

        print(f"Successfully parsed XML. Found {len(lines_data)} text lines.")
        return {'lines': lines_data, 'htr_full_text_lines': htr_full_text_lines}
    except (ET.ParseError, ValueError) as e:
        print(f"FATAL: Error parsing XML file: {e}")
        raise

def calculate_bounding_box(lines_data: List[Dict[str, Any]], start_index: int, end_index: int) -> Optional[Tuple[int, int, int, int]]:
    """
    Compute the padded bounding box around the baselines of a run of lines.

    Args:
        lines_data: Line records; each may carry ``'baseline_coords'``.
        start_index: Index of the first line of the run (inclusive).
        end_index: Index just past the last line of the run (exclusive).

    Returns:
        ``(left, top, right, bottom)`` as integers, padded by ``BBOX_PADDING``
        horizontally and twice that vertically, with left/top clamped at 0.
        ``None`` when no line in the run has any baseline point.
    """
    xs, ys = [], []
    for idx in range(start_index, end_index):
        for point in lines_data[idx].get('baseline_coords') or ():
            xs.append(float(point[0]))
            ys.append(float(point[1]))

    if not xs:
        return None

    pad_x = BBOX_PADDING
    pad_y = BBOX_PADDING * 2  # more room above/below than left/right
    left = max(0, int(min(xs) - pad_x))
    top = max(0, int(min(ys) - pad_y))
    right = int(max(xs) + pad_x)
    bottom = int(max(ys) + pad_y)
    return (left, top, right, bottom)

def calculate_chunk_baseline_angle(lines_data: List[Dict[str, Any]], start_index: int, end_index: int) -> float:
    """
    Return the mean direction (in radians) of the baselines in a run of lines.

    Each usable baseline contributes the unit vector from its first to its
    last point; the angle of the summed vectors is returned. Baselines with
    fewer than two points, or whose end points (nearly) coincide, are ignored.

    Args:
        lines_data: Line records; each may carry ``'baseline_coords'``.
        start_index: Index of the first line of the run (inclusive).
        end_index: Index just past the last line of the run (exclusive).

    Returns:
        ``atan2`` of the summed unit vectors, or ``0.0`` when no baseline in
        the run is usable.
    """
    unit_vectors = []
    for idx in range(start_index, end_index):
        pts = lines_data[idx].get('baseline_coords')
        if not pts or len(pts) < 2:
            continue
        first, last = pts[0], pts[-1]
        dx = last[0] - first[0]
        dy = last[1] - first[1]
        span = math.hypot(dx, dy)
        if span <= 1e-6:
            continue
        unit_vectors.append((dx / span, dy / span))

    if not unit_vectors:
        return 0.0

    total_x, total_y = 0.0, 0.0
    for ux, uy in unit_vectors:
        total_x += ux
        total_y += uy
    return math.atan2(total_y, total_x)

def _draw_single_number(draw, text, base_point, font, offset_x, offset_y, img_width, img_height, anchor_side='left'):
    """Helper to draw a line number on the image."""
    font_size = font.size
    if anchor_side == 'left':
        text_x = base_point[0] - offset_x
    else: # 'right'
        text_x = base_point[0] + offset_x
    text_y = base_point[1] - offset_y - font_size

    # Use textbbox for more accurate positioning if available
    try:
        bbox = draw.textbbox((text_x, text_y), text, font=font)
    except AttributeError: # Fallback for older Pillow versions
        text_width, text_height = draw.textsize(text, font=font)
        bbox = (text_x, text_y, text_x + text_width, text_y + text_height)

    # Draw a semi-transparent background for readability
    bg_box = (bbox[0] - 3, bbox[1] - 3, bbox[2] + 3, bbox[3] + 3)
    draw.rectangle(bg_box, fill=(255, 255, 255, 180))
    draw.text((bbox[0], bbox[1]), text, fill=(255, 0, 0, 255), font=font)

def draw_chunk_baselines_and_numbers(image_object, lines_data, start_index, end_index, crop_box) -> Image.Image:
    """
    Return an RGBA copy of a cropped image with baselines and labels drawn.

    Args:
        image_object: The cropped image; it is copied, not modified.
        lines_data: Line records; each may carry ``'baseline_coords'`` in
            the coordinate system of the uncropped page.
        start_index: Index of the first line to draw (inclusive).
        end_index: Index just past the last line to draw (exclusive).
        crop_box: Crop rectangle; its first two values (left, top) are
            subtracted from every baseline point.

    Returns:
        The annotated RGBA image. Lines with fewer than two baseline points
        are left undrawn; labels are the 1-based line index.
    """
    canvas = image_object.copy().convert("RGBA")
    pen = ImageDraw.Draw(canvas, "RGBA")

    try:
        label_font = ImageFont.truetype("DejaVuSans.ttf", 20)
    except IOError:
        print("Warning: DejaVuSans.ttf not found. Using default font.")
        label_font = ImageFont.load_default()

    origin_x, origin_y = crop_box[0], crop_box[1]

    for idx in range(start_index, end_index):
        page_points = lines_data[idx].get('baseline_coords')
        if not page_points or len(page_points) < 2:
            continue

        # Shift page coordinates into the cropped image's coordinate system.
        shifted = [(pt[0] - origin_x, pt[1] - origin_y) for pt in page_points]
        pen.line(shifted, fill=(255, 0, 0, 255), width=2)

        # The same label goes at both ends of the baseline.
        label = str(idx + 1)
        for anchor_point, side in ((shifted[0], 'left'), (shifted[-1], 'right')):
            _draw_single_number(pen, label, anchor_point, label_font, 25, 10, canvas.width, canvas.height, side)

    return canvas

def image_to_base64(image: Image.Image, format: str = "JPEG") -> str:
    """
    Encode a PIL image in the given file format and return it as base64 text.

    Args:
        image: Image to encode; it is not modified.
        format: Pillow format name passed to ``Image.save``.

    Returns:
        The encoded file contents as a base64 string.

    For JPEG output, images whose mode the JPEG writer cannot store (for
    example RGBA, LA or palette images) are converted to RGB first instead of
    failing inside ``save``.
    """
    # Modes Pillow's JPEG writer accepts as-is; anything else must be converted.
    jpeg_writable_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    buffered = io.BytesIO()
    img_to_save = image
    if format == "JPEG" and image.mode not in jpeg_writable_modes:
        img_to_save = image.convert('RGB')
    img_to_save.save(buffered, format=format)
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def calculate_cer(s1: str, s2: str) -> float:
    """
    Return the Character Error Rate between two strings.

    The rate is the Levenshtein distance divided by the length of the longer
    string. Two empty strings give 0.0; a single empty string, or any
    non-string argument, gives 1.0.
    """
    if not (isinstance(s1, str) and isinstance(s2, str)):
        return 1.0

    longest = max(len(s1), len(s2))
    if longest == 0:
        return 0.0
    if not s1 or not s2:
        return 1.0
    return Levenshtein.distance(s1, s2) / longest

# --- Core LLM and Transkribus Update Functions ---

def get_claude_corrections_for_chunk(
    client: anthropic.Anthropic,
    user_prompt_content: List[Dict[str, Any]]
) -> Tuple[Optional[str], Optional[str]]:
    """
    Send the same prompt to Claude twice and return both reply texts.

    The two requests ("Run A" and "Run B") are made one after the other with
    the module-level model, token limit, temperature and system prompt. A
    one-second pause follows each request.

    Args:
        client: Anthropic client used for the requests.
        user_prompt_content: Content blocks of the single user message.

    Returns:
        ``(text_a, text_b)``. An entry is ``None`` when that reply had no
        content or its first content block was not text.

    Raises:
        anthropic.APIError: Re-raised after logging; a failure on either run
            aborts the call.
    """
    texts = []
    for run_name in ("A", "B"):
        print(f"    - Making LLM call for Run {run_name}...")
        reply_text = None
        try:
            reply = client.messages.create(
                model=ANTHROPIC_MODEL_NAME,
                max_tokens=ANTHROPIC_MAX_TOKENS,
                temperature=LLM_TEMPERATURE,
                system=SYSTEM_PROMPT_BASE,
                messages=[{"role": "user", "content": user_prompt_content}]
            )
            blocks = reply.content
            if blocks and blocks[0].type == 'text':
                reply_text = blocks[0].text
                print(f"    - Run {run_name} successful.")
            else:
                print(f"    - WARNING: Run {run_name} response was empty or invalid.")
        except anthropic.APIError as e:
            print(f"    - FATAL: Anthropic API Error on Run {run_name}: {e}")
            # In a single-document script, one failure is enough to stop.
            raise
        texts.append(reply_text)
        time.sleep(1)  # Be polite to the API

    return tuple(texts)

def _select_best_llm_line(htr_text, candidate_a, candidate_b) -> Optional[str]:
    """Pick the usable LLM candidate closest to the HTR text, or None if neither is usable."""
    usable = [c for c in (candidate_a, candidate_b) if isinstance(c, str) and c.strip()]
    if not usable:
        return None
    if len(usable) == 1:
        return usable[0]
    # Both runs produced text: prefer the one with the lower CER against the
    # HTR line; Run A wins ties.
    cer_a = calculate_cer(htr_text, candidate_a)
    cer_b = calculate_cer(htr_text, candidate_b)
    return candidate_a if cer_a <= cer_b else candidate_b


def modify_xml_tree(
    xml_path: str,
    original_lines_data: List[Dict[str, Any]],
    htr_lines: List[str],
    llm_lines_A: List[str],
    llm_lines_B: List[str],
    uncertainty_flags: List[bool]
) -> bytes:
    """
    Write the chosen LLM transcription of every line into the PAGE XML.

    The XML at ``xml_path`` is re-parsed, four ``MetadataItem`` elements
    describing this processing step are appended to its ``Metadata`` element
    (created if missing), and each ``TextLine`` whose ``id`` appears in
    ``original_lines_data`` gets:

    * its ``TextEquiv/Unicode`` text replaced by the better of the two LLM
      candidates (lower CER against the HTR text). A candidate that is empty,
      whitespace-only or not a string is never chosen; when neither candidate
      is usable the existing text is left untouched rather than blanked;
    * any existing ``unclear {...}`` entry removed from its ``custom``
      attribute, and a new one covering the whole line added when the line's
      uncertainty flag is set.

    Args:
        xml_path: Path of the PAGE XML file to read.
        original_lines_data: Line records whose ``'id'`` values map line IDs
            to indices into the four parallel lists below.
        htr_lines: Original HTR text per line.
        llm_lines_A: Run A transcription per line.
        llm_lines_B: Run B transcription per line.
        uncertainty_flags: Whether each line is to be tagged as unclear.

    Returns:
        The modified document serialized as UTF-8 XML with an XML declaration.
    """
    print("Modifying XML with corrected transcriptions...")
    tree = ET.parse(xml_path)
    root = tree.getroot()
    ns_uri = PAGE_XML_NAMESPACE['page']

    # Add/update metadata to show this script ran
    metadata = root.find('.//page:Metadata', PAGE_XML_NAMESPACE)
    if metadata is None:
        metadata = ET.Element(f"{{{ns_uri}}}Metadata")
        root.insert(0, metadata)

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()
    processing_items = (
        ("tool", "Claude Correction Script (SingleDoc v1.0)"),
        ("description", "Page text reviewed by Anthropic Claude (2 runs)."),
        ("model", ANTHROPIC_MODEL_NAME),
        ("timestampUTC", now_iso),
    )
    for item_name, item_value in processing_items:
        ET.SubElement(metadata, f"{{{ns_uri}}}MetadataItem", attrib={
            "type": "processingStep", "name": item_name, "value": item_value
        })

    # Create a map of line ID to its index for quick lookup
    line_id_to_index = {line_info['id']: i for i, line_info in enumerate(original_lines_data)}
    lines_updated = 0
    uncertain_tags_added = 0

    for text_line in root.findall('.//page:TextLine', PAGE_XML_NAMESPACE):
        line_id = text_line.get('id')
        if line_id not in line_id_to_index:
            continue
        idx = line_id_to_index[line_id]

        best_line_text = _select_best_llm_line(htr_lines[idx], llm_lines_A[idx], llm_lines_B[idx])

        text_equiv = text_line.find('page:TextEquiv', PAGE_XML_NAMESPACE)
        unicode_elem = None
        if text_equiv is not None:
            unicode_elem = text_equiv.find('page:Unicode', PAGE_XML_NAMESPACE)

        if best_line_text is None:
            # Neither run produced text for this line (skipped chunk,
            # unparsable response, ...): keep whatever the XML already holds.
            final_text = ""
            if unicode_elem is not None and unicode_elem.text:
                final_text = unicode_elem.text
        else:
            if text_equiv is None:
                text_equiv = ET.SubElement(text_line, f"{{{ns_uri}}}TextEquiv")
            if unicode_elem is None:
                unicode_elem = ET.SubElement(text_equiv, f"{{{ns_uri}}}Unicode")
            if unicode_elem.text != best_line_text:
                unicode_elem.text = best_line_text
                lines_updated += 1
            final_text = best_line_text

        # Add or remove 'unclear' tag based on uncertainty flag.
        # Any existing unclear tag is removed first so it is never duplicated.
        custom_attr = text_line.get('custom', '')
        custom_attr = re.sub(r"unclear\s*\{[^}]*\}\s*", "", custom_attr).strip()

        if uncertainty_flags[idx]:
            new_tag = f"unclear {{offset:0; length:{len(final_text)};}}"
            text_line.set('custom', (custom_attr + " " + new_tag).strip())
            uncertain_tags_added += 1
        elif custom_attr:
            text_line.set('custom', custom_attr)
        elif 'custom' in text_line.attrib:
            # Nothing left in the attribute: drop it instead of writing custom="".
            del text_line.attrib['custom']

    print(f"XML modification complete. Lines updated: {lines_updated}. Lines flagged as uncertain: {uncertain_tags_added}.")

    if hasattr(ET, 'indent'): # Pretty-print the XML
        ET.indent(tree, space="  ", level=0)

    return ET.tostring(root, encoding='utf-8', method='xml', xml_declaration=True)

def update_transkribus_page_xml(session: requests.Session, coll_id: int, doc_id: int, page_nr: int, xml_bytes: bytes) -> bool:
    """
    Upload modified PAGE XML for one page back to Transkribus.

    The XML is POSTed to the page's ``/text`` endpoint together with the
    status ``DONE``, a note and a tool name.

    Args:
        session: Requests session used to perform the POST request.
        coll_id: Collection ID placed in the request URL.
        doc_id: Document ID placed in the request URL.
        page_nr: Page number placed in the request URL.
        xml_bytes: Serialized PAGE XML sent as the request body.

    Returns:
        True when the server accepted the upload.

    Raises:
        requests.exceptions.RequestException: Re-raised after logging the
            HTTP status and response body (when a response exists).
    """
    print(f"Uploading corrected XML to Transkribus for doc {doc_id}, page {page_nr}...")
    update_url = f"{TRANSKRIBUS_BASE_URL}/collections/{coll_id}/{doc_id}/{page_nr}/text"
    params = {
        'status': 'DONE',
        'note': f'Corrected via Claude-SingleDoc v1.0 using {ANTHROPIC_MODEL_NAME}',
        'toolName': 'Claude Correction Script (SingleDoc v1.0)'
    }
    headers = {'Content-Type': 'application/xml;charset=UTF-8'}
    try:
        response = session.post(update_url, headers=headers, params=params, data=xml_bytes, timeout=60)
        response.raise_for_status()
        print("✅ Successfully updated page in Transkribus!")
        return True
    except requests.exceptions.RequestException as e:
        # A Response is falsy for 4xx/5xx statuses, so a plain truthiness test
        # would hide the status and body of the failures we want to report.
        failed_response = e.response
        has_response = failed_response is not None
        status = failed_response.status_code if has_response else 'N/A'
        body = failed_response.text if has_response else 'N/A'
        print(f"FATAL: Failed to upload updated XML to Transkribus. Status: {status}, Body: {body}")
        raise

# ==============================================================================
#                           4. MAIN EXECUTION WORKFLOW
# ==============================================================================

def _parse_llm_chunk_lines(raw_result: Optional[str], start_line: int, chunk_len: int) -> List[str]:
    """
    Extract the corrected lines of one chunk from a raw LLM reply.

    The reply is searched for a JSON object (first ``{`` to last ``}``) whose
    ``abbreviated_latin_lines`` member maps 1-based document line numbers to
    text. Anything malformed (no JSON, undecodable JSON, a non-object payload,
    non-numeric keys, out-of-range line numbers, non-string values) is skipped
    with the affected lines left as ``""`` instead of raising.

    Args:
        raw_result: Reply text, or ``None``/empty when the run gave no text.
        start_line: 0-based document index of the chunk's first line.
        chunk_len: Number of lines in the chunk.

    Returns:
        A list of ``chunk_len`` strings, one per line of the chunk.
    """
    chunk_lines = [""] * chunk_len
    if not raw_result:
        return chunk_lines

    # Extract JSON from the response text
    json_match = re.search(r"\{.*\}", raw_result, re.DOTALL)
    if not json_match:
        print("    - WARNING: No JSON object found in LLM response.")
        return chunk_lines

    try:
        parsed_json = json.loads(json_match.group(0))
    except json.JSONDecodeError:
        print("    - WARNING: Could not decode JSON from LLM response.")
        return chunk_lines

    draft_dict = parsed_json.get('abbreviated_latin_lines', {})
    if not isinstance(draft_dict, dict):
        print("    - WARNING: 'abbreviated_latin_lines' in LLM response is not a JSON object.")
        return chunk_lines

    for line_num_str, line_text in draft_dict.items():
        try:
            line_idx = int(line_num_str) - 1 - start_line
        except (TypeError, ValueError):
            # One bad key must not abort the whole document run.
            print(f"    - WARNING: Ignoring non-numeric line key {line_num_str!r} in LLM response.")
            continue
        if 0 <= line_idx < chunk_len and isinstance(line_text, str):
            chunk_lines[line_idx] = line_text
    return chunk_lines


def main():
    """
    Run the end-to-end correction process for the configured document.

    Steps: initialize the Anthropic client and Transkribus session, fetch the
    page details, download the PAGE XML and image, parse the XML, look up
    WAALT entities from the document title, send the page to Claude chunk by
    chunk (two runs each), flag uncertain lines, write the result into the
    XML and upload it. The temporary XML file is removed at the end whether
    or not the run succeeded. Errors of the handled types are printed and end
    the run; they are not re-raised.
    """
    start_time = time.time()
    print("==============================================================")
    print("          STARTING TRANSKRIBUS CORRECTION PROCESS")
    print(f"          Document ID: {DOCUMENT_ID} in Collection: {COLLECTION_ID}")
    print("==============================================================")

    # --- Ensure temp directory exists ---
    os.makedirs(TEMP_CHUNK_IMAGE_DIR, exist_ok=True)

    # --- Initialize API Clients ---
    try:
        if not ANTHROPIC_API_KEY or "YOUR_" in ANTHROPIC_API_KEY:
            raise ValueError("Anthropic API Key is not set. Please configure it in Colab Secrets.")
        if not TRANKRIBUS_USERNAME or "YOUR_" in TRANKRIBUS_USERNAME:
            raise ValueError("Transkribus credentials are not set. Please configure them in Colab Secrets.")

        anthropic_client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        print("Anthropic client initialized.")

        session = get_transkribus_session()
    except (ValueError, ConnectionRefusedError, requests.exceptions.RequestException, anthropic.APIError) as e:
        print(f"\n--- SCRIPT HALTED: Could not initialize APIs. Error: {e} ---")
        return # Stop execution

    try:
        # --- Step 1: Get Page Info ---
        print("\n--- [Step 1/7] Fetching Page Details from Transkribus ---")
        page_nr, img_url = get_page_details(session, COLLECTION_ID, DOCUMENT_ID)

        # --- Step 2: Download Data ---
        print("\n--- [Step 2/7] Downloading Document Data ---")
        download_page_xml(session, COLLECTION_ID, DOCUMENT_ID, page_nr, TEMP_XML_FILENAME)
        original_image_bytes = download_image(img_url)
        full_original_image = Image.open(io.BytesIO(original_image_bytes))

        # --- Step 3: Parse Data and Fetch Entities ---
        print("\n--- [Step 3/7] Parsing Local Data and Fetching Entities ---")
        xml_data = parse_page_xml(TEMP_XML_FILENAME)
        htr_lines_from_xml = xml_data['htr_full_text_lines']
        original_lines_data = xml_data['lines']
        num_total_lines = len(htr_lines_from_xml)
        if num_total_lines == 0:
            print("\n--- SCRIPT HALTED: The downloaded XML contains 0 text lines. ---")
            return

        # Fetch WAALT entities based on document title (assuming a standard format).
        # The timeout keeps a stalled connection from hanging the run, like the
        # other requests in this script.
        doc_title_response = session.get(
            f"{TRANSKRIBUS_BASE_URL}/collections/{COLLECTION_ID}/{DOCUMENT_ID}/fulldoc",
            timeout=30
        ).json()
        doc_title = doc_title_response.get('md', {}).get('title', '')
        print(f"Document Title: {doc_title}")

        named_entity_list = "Entity fetch skipped (title pattern mismatch)."
        match = re.search(r"_([fd])_(\d+)\.JPG", doc_title, re.IGNORECASE)
        if match:
            side, img_num = match.group(1).lower(), int(match.group(2))
            named_entity_list = fetch_and_parse_waalt_wiki_index(AALT_INDEX_URL, img_num, side)
        else:
            print("Warning: Could not parse side/number from document title. Skipping entity fetch.")

        # --- Step 4: Process Document in Chunks with LLM ---
        print(f"\n--- [Step 4/7] Processing {num_total_lines} lines in chunks of {CHUNK_SIZE} ---")
        num_chunks = math.ceil(num_total_lines / CHUNK_SIZE)
        all_results_A = []
        all_results_B = []

        for i in range(num_chunks):
            start_line = i * CHUNK_SIZE
            end_line = min(start_line + CHUNK_SIZE, num_total_lines)
            chunk_len = end_line - start_line
            print(f"\n  Processing Chunk {i+1}/{num_chunks} (Lines {start_line+1}-{end_line})...")

            # 1. Prepare image for this chunk
            print("    - Preparing image chunk...")
            bbox = calculate_bounding_box(original_lines_data, start_line, end_line)
            if bbox is None:
                print(f"    - WARNING: Could not calculate bounding box for chunk {i+1}. Skipping.")
                # Add empty results to maintain list length
                all_results_A.extend([""] * chunk_len)
                all_results_B.extend([""] * chunk_len)
                continue

            cropped_image = full_original_image.crop(bbox)
            numbered_cropped_image = draw_chunk_baselines_and_numbers(cropped_image, original_lines_data, start_line, end_line, bbox)

            # Rotate image to align text horizontally
            chunk_angle_rad = calculate_chunk_baseline_angle(original_lines_data, start_line, end_line)
            rotated_image = numbered_cropped_image.rotate(
                math.degrees(chunk_angle_rad),
                resample=Image.Resampling.BICUBIC,
                expand=True,
                fillcolor=(255, 255, 255, 0)
            )

            # Save a temporary image for debugging if needed
            temp_chunk_path = os.path.join(TEMP_CHUNK_IMAGE_DIR, f"chunk_{i+1}.jpg")
            rotated_image.convert("RGB").save(temp_chunk_path, "JPEG")
            print(f"    - Saved numbered chunk image to {temp_chunk_path}")

            image_base64 = image_to_base64(rotated_image)

            # 2. Prepare prompt for this chunk
            htr_chunk_str = "\n".join([f'  "{start_line + 1 + j}": "{line}"' for j, line in enumerate(htr_lines_from_xml[start_line:end_line])])
            user_prompt_text = f"""
INPUT MATERIALS FOR THIS CHUNK:
1.  **Document Image Chunk:** (Provided as input image) Contains lines {start_line + 1} to {end_line}.
2.  **Named Entity List (Full Document):**
    {named_entity_list}
3.  **HTR Transcription (This Chunk Only):**
    {{
    {htr_chunk_str}
    }}
---
**JSON OUTPUT FORMAT (Strict Requirement - For This Chunk)**
Output ONLY the JSON object containing `abbreviated_latin_lines` for lines {start_line + 1} through {end_line}.
"""
            user_prompt_content = [
                {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": image_base64}},
                {"type": "text", "text": user_prompt_text}
            ]

            # 3. Get corrections from Claude
            result_A_raw, result_B_raw = get_claude_corrections_for_chunk(anthropic_client, user_prompt_content)

            # 4. Parse and store results. Malformed replies yield empty lines
            #    for this chunk instead of aborting the whole document.
            for raw_result, result_list in [(result_A_raw, all_results_A), (result_B_raw, all_results_B)]:
                result_list.extend(_parse_llm_chunk_lines(raw_result, start_line, chunk_len))

        # --- Step 5: Analyze LLM Results ---
        print("\n--- [Step 5/7] Analyzing LLM Results and Flagging Uncertainties ---")
        uncertainty_flags = [False] * num_total_lines
        lines_with_high_cer = 0
        for i in range(num_total_lines):
            cer_AB = calculate_cer(all_results_A[i], all_results_B[i])
            cer_A_HTR = calculate_cer(all_results_A[i], htr_lines_from_xml[i])
            cer_B_HTR = calculate_cer(all_results_B[i], htr_lines_from_xml[i])

            # Flag a line when the two runs disagree with each other, or when
            # either run strays far from the HTR text.
            if (cer_AB >= CER_THRESHOLD) or (max(cer_A_HTR, cer_B_HTR) >= LLM_HTR_CER_THRESHOLD):
                uncertainty_flags[i] = True
                lines_with_high_cer += 1
        print(f"Analysis complete. Found {lines_with_high_cer} lines with high CER to be flagged as 'unclear'.")

        # --- Step 6: Modify XML ---
        print("\n--- [Step 6/7] Generating Final Corrected XML ---")
        modified_xml_bytes = modify_xml_tree(
            TEMP_XML_FILENAME,
            original_lines_data,
            htr_lines_from_xml,
            all_results_A,
            all_results_B,
            uncertainty_flags
        )

        # --- Step 7: Upload to Transkribus ---
        print("\n--- [Step 7/7] Uploading Final XML to Transkribus ---")
        update_transkribus_page_xml(session, COLLECTION_ID, DOCUMENT_ID, page_nr, modified_xml_bytes)

    except (ValueError, ConnectionRefusedError, requests.exceptions.RequestException, anthropic.APIError, ET.ParseError, IOError) as e:
        print(f"\n--- SCRIPT HALTED DUE TO A CRITICAL ERROR ---")
        print(f"Error: {e}")
        # In a real script, you might want more detailed error logging here.
    finally:
        # --- Cleanup ---
        print("\n--- Cleaning up temporary files... ---")
        if os.path.exists(TEMP_XML_FILENAME):
            os.remove(TEMP_XML_FILENAME)
            print(f"Removed {TEMP_XML_FILENAME}")

        # You can optionally clear the temp_chunk_images directory
        # for f in os.listdir(TEMP_CHUNK_IMAGE_DIR):
        #     os.remove(os.path.join(TEMP_CHUNK_IMAGE_DIR, f))
        # print(f"Cleared {TEMP_CHUNK_IMAGE_DIR}")

        end_time = time.time()
        print("\n==============================================================")
        print("            PROCESS COMPLETE")
        print(f"            Total execution time: {end_time - start_time:.2f} seconds.")
        print("==============================================================")

# --- Execute the main function when the script is run ---
if __name__ == "__main__":
    # Run the workflow only when this file is the entry module (executed as a
    # script or as a notebook cell), not when it is imported by other code.
    main()

Installing required libraries for Google Colab...
Installation complete.
          STARTING TRANSKRIBUS CORRECTION PROCESS
          Document ID: 9310076 in Collection: 1957043
Anthropic client initialized.
Authenticating with Transkribus as user: isomjd@gmail.com...
Transkribus authentication successful.

--- [Step 1/7] Fetching Page Details from Transkribus ---
Getting page details for document 9310076 (page index 0)...
DEBUG: Found 'trpPage' key in API response. Parsing list from there.
Found page details: Page Number=1, Image URL found.

--- [Step 2/7] Downloading Document Data ---
Downloading PAGE XML for doc 9310076, page 1...
PAGE XML downloaded successfully to temp_page_for_correction.xml
Downloading document image...
Image downloaded successfully (349.7 KB).

--- [Step 3/7] Parsing Local Data and Fetching Entities ---
Parsing XML file: temp_page_for_correction.xml...
Successfully parsed XML. Found 37 text lines.
Document Title: IMG_0276.jpg

--- [Step 4/7] Processing 37 lines 