In [None]:
%pip install reader

In [None]:
# This cell walks the input directory tree and sends every JPG page image to the
# model, which extracts the individual articles (OCR) as JSON.
# The prompt used in the code was automatically optimised in the OpenAI Playground.

import glob
import base64
import time
from openai import OpenAI
from pathlib import Path


# ----------------------------------------------------------------------
# 1. Set up the OpenAI client
# ----------------------------------------------------------------------

# Replace the placeholder with a real key before running.
# NOTE(review): avoid committing a real key inside the notebook; the OpenAI
# client can presumably pick the key up from the OPENAI_API_KEY environment
# variable when `api_key` is omitted — confirm against the client docs.
client = OpenAI(api_key="ADD_YOUR_API_KEY_HERE")

# ----------------------------------------------------------------------
# 2. Set up input and output paths
# ----------------------------------------------------------------------

input_path = Path("pages")                              # root folder searched recursively for *.jpg pages
output_path = input_path.parent / "extractions"        # sibling of "pages" (NOT named "articles")
# Create the output root up front; no error if it already exists.
output_path.mkdir(parents=True, exist_ok=True)

# ----------------------------------------------------------------------
# Utility helpers
# ---------------------------------------------------------------------

def encode_image(input_path: str) -> str:
    """Read the file at *input_path* and return it as an inline JPEG data URL.

    The whole file is read in binary mode, Base64-encoded, and prefixed with
    ``data:image/jpeg;base64,`` so it can be embedded directly in a request.
    The MIME type is always ``image/jpeg``; the file content is not inspected.
    """
    with open(input_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:image/jpeg;base64," + encoded


def extract_text_from_completion(comp_msg) -> str | None:
    """Return the assistant's textual reply independent of the format it came in.

    Args:
        comp_msg: A message object exposing a ``content`` attribute.

    Returns:
        • ``content`` unchanged if it is a non-blank string.
        • The stripped concatenation of all text parts if ``content`` is a list
          (dict parts with ``"type": "text"``, or objects exposing ``.text``).
        • ``None`` in every other case (blank string, no text parts, ``None``,
          unexpected type) – callers treat this as a signal to retry.
    """
    content = comp_msg.content

    # Case A – plain string: return it as-is unless it is blank.
    if isinstance(content, str):
        return content if content.strip() else None

    # Anything that is neither a string nor a list – treat as failure.
    if not isinstance(content, list):
        return None

    # Case B – rich-content list (e.g. [{"type":"text", ...}, {"type":"image_url", ...}])
    pieces: list[str] = []
    for part in content:
        if isinstance(part, dict):
            # Dict-style part: only "text" parts carry a textual payload.
            text = part.get("text") if part.get("type") == "text" else None
        else:
            # Object-style part that may expose .text directly.
            text = getattr(part, "text", None)
        # A part may carry text=None (or a non-string); skipping it keeps the
        # join below from raising TypeError and lets the caller retry instead.
        if isinstance(text, str):
            pieces.append(text)

    joined = "".join(pieces).strip()
    return joined or None


# System prompt: describes the extraction task and the schema of each article.
# The top-level shape is a JSON *object* with a single "articles" key. This
# matches both the user prompt and the request's
# response_format={"type": "json_object"}; asking for a bare top-level array
# here would give the model contradictory instructions.
system_prompt = (
    "You are an expert in careful and accurate extraction of text from historical newspapers. \n\n"
    "Extract individual articles from the given page of a newspaper issue.\n\n"
    "## Output Format\n"
    "Return a JSON object with a single key \"articles\", whose value is an array containing objects for each extracted article. Each article object must include the following fields:\n"
    "- \"id\": a unique identifier for the article (e.g., sequential number or generated ID)\n"
    "- \"title\": the article's title (string, or null if not available)\n"
    "- \"author\": the author of the article (string, or null if not available)\n"
    "- \"date\": date of publication (string in YYYY-MM-DD format, or null if not available)\n"
    "- \"content\": full extracted text of the article\n"
    "- \"continues-to\": no, or page and column number on which the article continues.\n"
    "- \"continues-from\": no, or page and column number from where the article continues.\n"
    "- \"metadata\": an object with any additional extracted metadata (may be empty)\n"
    "- \"errors\": an array of issues or ambiguities if any occurred during extraction (empty array if none)\n\n"
    "If an article cannot be confidently separated, or if OCR/extraction failures occur, include an \"errors\" array within the relevant article object, listing the nature of ambiguities or extraction issues.\n\n"
    "Example output:\n"
    "{\n"
    "  \"articles\": [\n"
    "    {\n"
    "      \"id\": \"1\",\n"
    "      \"title\": \"Breaking News in Boston\",\n"
    "      \"author\": \"Jane Doe\",\n"
    "      \"date\": \"1912-07-14\",\n"
    "      \"content\": \"Full article text goes here...\",\n"
    "      \"continues-to\": \"no\",\n"
    "      \"continues-from\": \"no\",\n"
    "      \"metadata\": {\"section\": \"Front Page\"},\n"
    "      \"errors\": []\n"
    "    },\n"
    "    {\n"
    "      \"id\": \"2\",\n"
    "      \"title\": null,\n"
    "      \"author\": null,\n"
    "      \"date\": null,\n"
    "      \"content\": \"Article text mixed with an image and difficult to separate...\",\n"
    "      \"continues-to\": \"page 3, column 2\",\n"
    "      \"continues-from\": \"page 1, column 1\",\n"
    "      \"metadata\": {},\n"
    "      \"errors\": [\"Unable to confidently extract boundaries due to OCR artifacts.\"]\n"
    "    }\n"
    "  ]\n"
    "}"
)

# User prompt sent alongside each page image. It is built from adjacent string
# literals (one per sentence) purely for readability; the resulting text is
# exactly the same single-line prompt, wording and spelling untouched, because
# this string is sent to the model at runtime.
user_prompt = (
    "You are analyzing a historical newspaper called Pi. "
    "It is scanned page comes from the newspaper issue as a JPG. "
    "Each image contains multiple articles that span multi-colum layout. "
    "Your task is to extract each of the individual articles from the JPG file. "
    "Some articles may continue to other columns make sure the text that spans multiple columns but belongs to the same article is extracted as a single article. "
    "Indicate which article continues on another page, if this is seen. "
    "Return a JSON object with a single key 'articles', whose value is an array of article objects. "
    "All articles must be extracted from each page. "
    "Only read the content from the newspaper page as is, and do not add or embelish the content."
)


# ----------------------------------------------------------------------
# Core routine
# ----------------------------------------------------------------------
def process_image(image_path: Path, input_root: Path, output_root: Path, max_retries: int = 2) -> None:
    """Send image_path to the model and save JSON into a mirrored path under output_root.

    Args:
        image_path: Image file to process; must be located under ``input_root``.
        input_root: Root of the input tree, used to compute the relative path.
        output_root: Root of the output tree; sub-folders of the input are mirrored.
        max_retries: Extra attempts after the first one when the reply contains
            no text (``max_retries + 1`` requests at most).

    Side effects:
        On success writes ``<output_root>/<relative path>.json`` and removes a
        leftover ``.error.txt`` for the same page. After the last failed
        attempt writes ``<output_root>/<relative path>.error.txt`` instead.

    Raises:
        ValueError: if ``image_path`` is not inside ``input_root``
            (raised by ``Path.relative_to``).
        Errors from reading the image or from the API client are not caught
        here and propagate to the caller.
    """
    # Compute mirrored output locations (preserve subfolders)
    rel = image_path.relative_to(input_root)
    json_out = (output_root / rel).with_suffix(".json")
    err_out = (output_root / rel).with_suffix(".error.txt")
    json_out.parent.mkdir(parents=True, exist_ok=True)

    # The request payload is identical for every attempt, so read and
    # Base64-encode the image once instead of once per retry.
    data_url = encode_image(image_path)
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": user_prompt},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]},
    ]

    for attempt in range(1, max_retries + 2):  # first try + retries
        completion = client.chat.completions.create(
            model="o4-mini-2025-04-16",
            response_format={"type": "json_object"},
            messages=messages,
        )

        msg = completion.choices[0].message
        json_content = extract_text_from_completion(msg)

        if json_content:
            json_out.write_text(json_content, encoding="utf-8")
            # Drop an error marker left by an earlier failed run so the output
            # folder never shows both a result and a failure for one page.
            err_out.unlink(missing_ok=True)
            print(f"✅  Saved extraction to {json_out}")
            return

        # Retry logic
        if attempt <= max_retries:
            print(f"⚠️  No textual content for {image_path} (attempt {attempt}). Retrying …")
            time.sleep(1.5)  # short pause before asking again
        else:
            err_out.write_text(
                "Assistant returned no textual parts after multiple attempts.",
                encoding="utf-8",
            )
            print(f"❌  Failed after {max_retries + 1} attempts. Details → {err_out}")

# ----------------------------------------------------------------------
# Batch runner
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Walk every *.jpg below the input root in sorted (deterministic) order,
    # process the pages one at a time, and report the wall-clock time of each.
    for img_path in sorted(input_path.rglob("*.jpg")):
        started = time.perf_counter()
        print(f"----- Processing {img_path} -----")
        process_image(img_path, input_path, output_path)
        elapsed = time.perf_counter() - started
        print(f"Processing time  {img_path}: {elapsed:0.4f} seconds")

----- Processing pages/Pi-Newspaper-1978-1.jpg -----
✅  Saved extraction to extractions/Pi-Newspaper-1978-1.json
Processing time  pages/Pi-Newspaper-1978-1.jpg: 40.4992 seconds
