In [12]:
from typing import TypedDict, Literal

from langgraph.graph import StateGraph, END
import os, io, base64, mimetypes
from typing import Optional, TypedDict

from PIL import Image
from litellm import completion

from langgraph.graph import StateGraph, START, END
from langgraph.types import Command, interrupt
from langgraph.checkpoint.memory import InMemorySaver

from dotenv import load_dotenv
load_dotenv()

True

In [13]:
# ---------- State ----------
from typing import Optional

from pydantic import BaseModel, Field


# ---------- State ----------
class State(BaseModel):
    image_path: Optional[str] = Field(default=None)
    image_bytes: Optional[bytes] = Field(default=None)
    ocr_text: Optional[str] = Field(default=None)
    final_text: Optional[str] = Field(default=None)
    approved: Optional[bool] = Field(default=None)
    corrected_text: Optional[str] = Field(default=None)


In [14]:
# Helpers

def _load_image_bytes(state: State) -> tuple[bytes,str]:
    if state.image_bytes:
        return state.image_bytes, "image/png"
    if state.image_path:
        with open(state.image_path, "rb") as f:
            b = f.read()
        mime = mimetypes.guess_type(state.image_path)[0] or "image/png"
        return b, mime
    raise ValueError("Provide 'image_path' or 'image_bytes' in the state.")

def _to_data_url(b: bytes, mime: str) -> str:
    b64 = base64.b64encode(b).decode("utf-8")
    return f"data:{mime};base64,{b64}"

In [15]:
# ---------- Nodes ----------
def ocr_with_gpt4o_via_litellm(state: State) -> State:
    """
    Uses GPT-4o (via LiteLLM) to extract text from an image.
    Model can be controlled with env LITELLM_MODEL (defaults to 'gpt-4o').
    """
    print("IN FUNCTION: ocr_with_gpt4o_via_litellm")
    img_bytes, mime = _load_image_bytes(state)

    # Fail fast on invalid image
    _ = Image.open(io.BytesIO(img_bytes))

    data_url = _to_data_url(img_bytes, mime)
    model_name = os.getenv("LITELLM_MODEL", "gpt-4o")

    system_prompt = (
        "You are an OCR assistant. Extract ONLY the visible text from the image. "
        "Preserve line breaks where helpful. Do not add commentary."
    )

    resp = completion(
        model=model_name,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Read the text from this image, verbatim:"},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            },
        ],
    )

    # LiteLLM returns OpenAI-compatible objects
    ocr_text = resp.choices[0].message.content or ""
    return State(ocr_text=ocr_text.strip())

In [16]:
def human_review(state: State) -> State:
    """
    Pause for human validation.
    Human resumes with either:
      {"approve": true}
    or
      {"approve": false, "corrected_text": "..."}
    """
    print("IN FUNCTION: human_review")
    paylod = {
        "question": "Is the extracted text correct? If not, please correct it and approve.",
        "ocr_text_sample": state.ocr_text,
        "expected_input_schema": {
            "approve": "bool (required)",
            "corrected_text": "string (optional when approve=true)"
        },
        "instructions": 'Reply JSON, e.g. {"approve": true} or {"approve": false, "corrected_text": "..."}',
    }
    fb = interrupt(payload)  # <-- graph pauses until resumed with Command(resume=...)
    approved = bool(fb.get("approve"))
    corrected = fb.get("corrected_text") or state.ocr_text
    return State(approved=approved,corrected_text=corrected)

def finalize(state: State) -> State:
    return State(final_text=state.corrected_text)

def route_after_review(state: State) -> str:
    return "approved" if state.approved else "needs_fix"

In [17]:
builder = StateGraph(State)
builder.add_node("ocr", ocr_with_gpt4o_via_litellm)
builder.add_node("human_review", human_review)
builder.add_node("finalize", finalize)

builder.add_edge(START, "ocr")
builder.add_edge(start_key="ocr", end_key="human_review")
builder.add_conditional_edges("human_review", route_after_review, {"approved": "finalize", "needs_fix": "human_review"})

builder.add_edge("finalize", END)

memory = InMemorySaver()
app = builder.compile(checkpointer=memory)

In [19]:
from IPython.display import Image, display
display(Image(app.get_graph().draw_mermaid_png()))


ValueError: Failed to reach https://mermaid.ink/ API while trying to render your graph after 1 retries. To resolve this issue:
1. Check your internet connection and try again
2. Try with higher retry settings: `draw_mermaid_png(..., max_retries=5, retry_delay=2.0)`
3. Use the Pyppeteer rendering method which will render your graph locally in a browser: `draw_mermaid_png(..., draw_method=MermaidDrawMethod.PYPPETEER)`

In [None]:
# Use a stable thread_id so you can resume across processes/UI callbacks
from langchain_core.runnables import RunnableConfig
config: RunnableConfig = {
    "configurable": {
        "thread_id": "ocr-hitl-litellm-1"
    }
}
# 1) Start — pauses at interrupt()
for event in app.stream(State(image_path="/Users/ifilimon/Documents/PERSONAL_PROJECTS/AGENTIC-KG/142-basic-radar-chart.png"), config, stream_mode="updates"):
    print(f"EVENT: ${event}")

# 2) What to show the human (interrupt payload)
state = app.get_state(config)
pending = state.tasks[0].interrupts[0].value
print("\n--- Show this to the human reviewer ---")
#print(pending)

# 3) Human rejects first (provides corrections)
for event in app.stream(
    Command(resume={"approve": False, "corrected_text": "Corrected OCR text here."}),
    config,
    stream_mode="updates",
):  
    print("HUMAN REJECTS")
    print(event)

# 4) Human approves on next pass
for event in app.stream(
    Command(resume={"approve": True}),
    config,
    stream_mode="updates",
):
    print("HUMAN APPROVE")
    print(event)

# 5) Final approved text
final = app.get_state(config).values["final_text"]
print("\nFINAL APPROVED TEXT:\n", final)

IN FUNCTION: ocr_with_gpt4o_via_litellm
EVENT: ${'ocr': {'ocr_text': 'math  \nsport  \nstatistic  \nphysic  \nfrench  \ndata-viz  \nR-coding  \nmusic  \nbiology  \nenglish'}}
IN FUNCTION: human_review
EVENT: ${'__interrupt__': (Interrupt(value={'question': 'Is the extracted text correct? If not, please correct it and approve.', 'ocr_text_sample': 'math  \nsport  \nstatistic  \nphysic  \nfrench  \ndata-viz  \nR-coding  \nmusic  \nbiology  \nenglish', 'expected_input_schema': {'approve': 'bool (required)', 'corrected_text': 'string (optional when approve=true)'}, 'instructions': 'Reply JSON, e.g. {"approve": true} or {"approve": false, "corrected_text": "..."}'}, id='9ed89ab1086c87964c5215c160cbd8f2'),)}

--- Show this to the human reviewer ---
IN FUNCTION: human_review
HUMAN REJECTS
{'human_review': {'approved': False, 'corrected_text': 'Corrected OCR text here.'}}
IN FUNCTION: human_review
HUMAN REJECTS
{'__interrupt__': (Interrupt(value={'question': 'Is the extracted text correct? If 