In [4]:
import os
import base64
from pathlib import Path
from mistralai import Mistral
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast when the key is missing or empty instead of letting the API call
# fail later with a less obvious authentication error.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
if not MISTRAL_API_KEY:
    raise ValueError(
        "MISTRAL_API_KEY not found. Set it as an environment variable "
        "or add it to a .env file."
    )

# PDF that will be sent through OCR.
pdf_path = "../data/input-docs/split-pdfs/USCIS Chapter 1.pdf"

def pdf_to_markdown(
    pdf_path: str,
    api_key: str,
    output_dir: str = "../results/mistral_ocr/",
) -> str:
    """Convert a PDF to Markdown with Mistral OCR and save the result to disk.

    The PDF is base64-encoded and sent inline (as a ``data:`` URI) to the
    ``mistral-ocr-latest`` model. Each image returned for a page is decoded
    into ``<output_dir>/images/`` and the page Markdown is rewritten to link
    to that file. All pages are then concatenated into
    ``<output_dir>/<pdf stem>.md``.

    Args:
        pdf_path: Path of the PDF file to process.
        api_key: Mistral API key used to build the client.
        output_dir: Directory receiving the Markdown file and the ``images``
            sub-directory. It is created if it does not exist.

    Returns:
        The path of the written Markdown file, as a string.

    Raises:
        OSError: If the PDF cannot be read or the outputs cannot be written.
    """
    import re  # Local import: only the image-link rewrite below needs it.

    out_dir = Path(output_dir)
    images_dir = out_dir / "images"
    # Create the output tree up front; writing images/Markdown would otherwise
    # raise FileNotFoundError on a fresh checkout.
    images_dir.mkdir(parents=True, exist_ok=True)

    print(f"\nProcessing: {pdf_path}")
    pdf_base64 = base64.b64encode(Path(pdf_path).read_bytes()).decode("utf-8")

    client = Mistral(api_key=api_key)
    print("Extracting text and images...")
    response = client.ocr.process(
        model="mistral-ocr-latest",
        document={
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{pdf_base64}",
        },
        include_image_base64=True,
    )

    title = Path(pdf_path).stem
    markdown = [f"# {title}\n\n---\n\n"]
    image_counter = 0  # Numbering is global across pages, not per page.

    for page in response.pages:
        page_md = page.markdown or ""

        for img in getattr(page, "images", []) or []:
            payload = img.image_base64
            if not payload:
                # No image data returned for this entry: nothing to save,
                # so leave the Markdown reference untouched.
                continue

            image_counter += 1

            # Payload may be a full data URI ("data:image/jpeg;base64,....")
            # or a bare base64 string; default to PNG when no MIME is given.
            fmt = "png"
            if payload.startswith("data:image/"):
                fmt = payload.split(";")[0].split("/")[1]
            encoded = payload.split("base64,", 1)[-1]

            img_file = f"image_{image_counter}.{fmt}"
            (images_dir / img_file).write_bytes(base64.b64decode(encoded))

            # Rewrite every Markdown image link that targets this image id,
            # whatever its alt text is. NOTE(review): OCR output appears to
            # use "![<id>](<id>)", which an empty-alt match would miss.
            link_pattern = re.compile(r"!\[[^\]]*\]\(" + re.escape(img.id) + r"\)")
            new_link = f"![Image {image_counter}](images/{img_file})"
            # A callable replacement avoids backslash/group interpretation.
            page_md = link_pattern.sub(lambda _match: new_link, page_md)

        markdown.append(f"## Page {page.index + 1}\n\n{page_md}\n\n---\n\n")

    md_file = out_dir / f"{title}.md"
    md_file.write_text("".join(markdown), encoding="utf-8")

    print(f"\nDone. Markdown saved to {md_file}")
    return str(md_file)

# Run the conversion. As the last expression in the cell, the returned
# Markdown path is also echoed as the cell's output.
pdf_to_markdown(pdf_path, MISTRAL_API_KEY)


Processing: ../data/input-docs/split-pdfs/USCIS Chapter 1.pdf
Extracting text and images...

Done. Markdown saved to ../results/mistral_ocr/USCIS Chapter 1.md


'../results/mistral_ocr/USCIS Chapter 1.md'