In [1]:
# --------------------------------------------------------------
# Cell 1️⃣ – Install / upgrade all required packages (clean‑up + install)
# --------------------------------------------------------------
# 1️⃣  Remove any old Optimum wheels (both the core package and the
#     separate `optimum‑onnx` wrapper).  This eliminates the
#     “Multiple distributions found for package optimum” warning.
# 2️⃣  Install Optimum (≥ 1.20) with the ONNX‑Runtime extra.  Note that on
#     Optimum 2.x this extra resolves to the separate `optimum‑onnx`
#     distribution again (see the pip log below), so it is re‑installed cleanly.
# 3️⃣  Install the remaining notebook dependencies.
# 4️⃣  Invalidate the import‑system caches and report the installed
#     versions so you can verify that everything is correct.
# --------------------------------------------------------------

# ------------------------------------------------------------------
# 1️⃣  Clean‑up old Optimum installations (core + wrapper)
# ------------------------------------------------------------------
# The `-q` flag silences pip’s output; we also redirect stdout/
# stderr to /dev/null because the messages are not needed in the notebook.
!pip uninstall -y optimum optimum-onnx > /dev/null 2>&1

# ------------------------------------------------------------------
# 2️⃣  Install the unified Optimum package (with ONNX support)
# ------------------------------------------------------------------
# `>=1.20` guarantees that `ORTModelForVision2Seq` exists.
%pip install -U "optimum[onnxruntime]>=1.20"

# ------------------------------------------------------------------
# 3️⃣  Install the remaining dependencies used by the notebook
# ------------------------------------------------------------------
%pip install -q "transformers[onnx]" \
               huggingface_hub \
               pillow \
               torch   # let pip pick the latest compatible torch version

# ------------------------------------------------------------------
# 4️⃣  Refresh Python’s import‑system caches and show versions
# ------------------------------------------------------------------
import importlib, importlib.metadata

# Force the import machinery to re‑scan the site‑packages directory.
importlib.invalidate_caches()

def _print_version(pkg_name: str, import_name: str = None):
    """Print the installed version of a package (metadata first, then __version__)."""
    import_name = import_name or pkg_name
    try:
        # Prefer the canonical metadata version – works even if the module
        # does not expose a __version__ attribute.
        ver = importlib.metadata.version(import_name)
    except Exception:
        try:
            mod = importlib.import_module(import_name)
            ver = getattr(mod, "__version__", "unknown")
        except Exception as e:  # pragma: no cover
            ver = f"NOT INSTALLED ({e})"
    print(f"{pkg_name:<20} {ver}")

# Version report: one line per dependency, then the restart reminder.
print("\n🔎 Installed package versions:")
for _dist_name, _module_name in (
    ("optimum", None),           # unified optimum package
    ("transformers", None),
    ("torch", None),
    ("huggingface_hub", None),
    ("pillow", "PIL"),           # Pillow is imported as `PIL`
    ("onnxruntime", None),
):
    _print_version(_dist_name, _module_name)

print("\n✅ Packages installed (Optimum ≥ 1.20).")
print("⚠️  No runtime restart is needed **as long as** no earlier cell imported any of these packages.")

Collecting optimum>=1.20 (from optimum[onnxruntime]>=1.20)
  Downloading optimum-2.0.0-py3-none-any.whl.metadata (14 kB)
Collecting optimum-onnx[onnxruntime] (from optimum[onnxruntime]>=1.20)
  Downloading optimum_onnx-0.0.3-py3-none-any.whl.metadata (4.6 kB)
Collecting transformers>=4.29 (from optimum>=1.20->optimum[onnxruntime]>=1.20)
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting onnx (from optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime]>=1.20)
  Downloading onnx-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (7.0 kB)
Collecting onnxruntime>=1.18.0 (from optimum-onnx[onnxruntime]; extra == "onnxruntime"->optimum[onnxruntime]>=1.20)
  Downloading onnxruntime-1.23.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting tokenizers<0.22,>=0.21 (f

In [2]:
# --------------------------------------------------------------
# Cell 2️⃣ – Import all Python modules we’ll need later.
# --------------------------------------------------------------
import os                                 # File‑system utilities
from pathlib import Path                  # Convenient path handling
import numpy as np                       # Numerical arrays (used by ONNX‑Runtime)
from PIL import Image                    # Image loading / conversion (Pillow)

# Hugging‑Face utilities
from huggingface_hub import hf_hub_download, login

# Processor that knows how to build multimodal prompts
from transformers import AutoProcessor

# ------------------------------------------------------------------
# ONNX Runtime – we will create three separate InferenceSession objects
# (vision encoder, token embedder, decoder) manually, as shown in the
# model‑card example.
# ------------------------------------------------------------------
import onnxruntime as ort

# ------------------------------------------------------------------
# Helper to load images (optional – you can also use PIL directly)
# ------------------------------------------------------------------
from transformers.image_utils import load_image

In [3]:
# --------------------------------------------------------------
# Cell 3️⃣ – Specify the Hugging Face repository that hosts the
#            ONNX‑converted Granite‑Docling model.
# --------------------------------------------------------------
# Hub repository id used by every download / loading cell below.
# The repo contains three ONNX graph files under the `onnx/` sub‑folder
# (vision encoder, token embedder, merged decoder).
model_id = "onnx-community/granite-docling-258M-ONNX"

# Echo the choice so the active repository is visible in the cell output.
print(f"✅ Model repository set to: {model_id}")

✅ Model repository set to: onnx-community/granite-docling-258M-ONNX


In [4]:

# --------------------------------------------------------------
# Cell 4️⃣ – Download the three ONNX model files to the local
#            Colab filesystem.
# --------------------------------------------------------------
# 1️⃣ vision_encoder.onnx   – image encoder (SigLIP2)
# 2️⃣ embed_tokens.onnx     – token embedding layer (Granite‑165M)
# 3️⃣ decoder_model_merged.onnx – text decoder (Idefics3‑style)
# --------------------------------------------------------------

import getpass
from huggingface_hub import hf_hub_download, login

# ① request HF token – optional: press Enter to continue anonymously
HF_TOKEN = getpass.getpass('🔑 Enter your Hugging Face token (will not be echoed): ').strip()
if HF_TOKEN:
    # `login` has to be *called*; a bare `login` expression does nothing.
    login(token=HF_TOKEN)

# An empty string is not a valid token – pass None so the Hub client falls
# back to anonymous access.
_hub_token = HF_TOKEN or None

# ② download each ONNX graph file.  Only the `.onnx` file named here is
#    fetched; the external‑weights companion (`<name>.onnx_data`) is a separate
#    repo file and is handled by the next cell.
vision_path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/vision_encoder.onnx",
    token=_hub_token,
)

embed_path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/embed_tokens.onnx",
    token=_hub_token,
)

decoder_path = hf_hub_download(
    repo_id=model_id,
    filename="onnx/decoder_model_merged.onnx",
    token=_hub_token,
)

print("✅ ONNX files downloaded:")
print(f"   Vision encoder  → {vision_path}")
print(f"   Token embedder  → {embed_path}")
print(f"   Decoder (LLM)  → {decoder_path}")

🔑 Enter your Hugging Face token (will not be echoed): ··········


onnx/vision_encoder.onnx:   0%|          | 0.00/297k [00:00<?, ?B/s]

onnx/embed_tokens.onnx:   0%|          | 0.00/434 [00:00<?, ?B/s]

onnx/decoder_model_merged.onnx:   0%|          | 0.00/203k [00:00<?, ?B/s]

✅ ONNX files downloaded:
   Vision encoder  → /root/.cache/huggingface/hub/models--onnx-community--granite-docling-258M-ONNX/snapshots/e8602580df77443fc3421cf3bae0601da601e5c6/onnx/vision_encoder.onnx
   Token embedder  → /root/.cache/huggingface/hub/models--onnx-community--granite-docling-258M-ONNX/snapshots/e8602580df77443fc3421cf3bae0601da601e5c6/onnx/embed_tokens.onnx
   Decoder (LLM)  → /root/.cache/huggingface/hub/models--onnx-community--granite-docling-258M-ONNX/snapshots/e8602580df77443fc3421cf3bae0601da601e5c6/onnx/decoder_model_merged.onnx


In [5]:
# hf_hub_download fetches only the file it is asked for, so the companion
# .onnx_data weights must be checked for and downloaded explicitly if missing:

from huggingface_hub import hf_hub_download, snapshot_download
import os, pathlib

def download_onnx(name):
    """Download ``onnx/<name>.onnx`` plus its ``.onnx_data`` weights if missing.

    Args:
        name: File stem inside the repo's ``onnx/`` folder,
            e.g. ``"vision_encoder"``.

    Returns:
        Local cache path of the ``.onnx`` graph file.

    Reads the notebook globals ``model_id`` and ``HF_TOKEN``.
    """
    # An empty token string must not be sent to the Hub; None means anonymous.
    token = HF_TOKEN or None

    # This call downloads the graph file only – not the external weights.
    onnx_path = hf_hub_download(
        repo_id=model_id,
        filename=f"onnx/{name}.onnx",
        token=token,
    )

    # The external weights are expected next to the graph in the same snapshot.
    data_path = pathlib.Path(onnx_path).with_name(f"{name}.onnx_data")
    if not data_path.is_file():
        # Fetch just this model's two files (graph + weights), not the whole repo.
        snapshot_download(
            repo_id=model_id,
            allow_patterns=[f"onnx/{name}.onnx", f"onnx/{name}.onnx_data"],
            token=token,
        )
    return onnx_path

vision_path = download_onnx("vision_encoder")
embed_path  = download_onnx("embed_tokens")
decoder_path = download_onnx("decoder_model_merged")

# Only claim success after checking that every graph has its weights beside it
# ("x.onnx" -> "x.onnx_data").
_missing_weights = [
    p for p in (vision_path, embed_path, decoder_path)
    if not pathlib.Path(p).with_suffix(".onnx_data").is_file()
]
if _missing_weights:
    print(f"⚠️  Missing .onnx_data weights for: {_missing_weights}")
else:
    print("✅ All ONNX files and their .onnx_data weights are present.")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

onnx/vision_encoder.onnx_data:   0%|          | 0.00/374M [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

onnx/embed_tokens.onnx_data:   0%|          | 0.00/231M [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

onnx/decoder_model_merged.onnx_data:   0%|          | 0.00/658M [00:00<?, ?B/s]

✅ All ONNX files and their .onnx_data weights are present.


In [6]:
# --------------------------------------------------------------
# Cell 5️⃣ – Load the Granite‑Docling model (manual ONNXRuntime sessions)
# --------------------------------------------------------------
def get_providers() -> list:
    """Pick the ONNX Runtime execution providers to request.

    Returns ``["CUDAExecutionProvider", "CPUExecutionProvider"]`` when torch
    imports and reports a usable CUDA device; otherwise (torch missing, no GPU,
    or any error during the probe) returns ``["CPUExecutionProvider"]``.
    """
    cpu_only = ["CPUExecutionProvider"]
    try:
        import torch
        cuda_ready = torch.cuda.is_available()
    except Exception:
        # Best-effort probe: any failure simply means "use the CPU".
        cuda_ready = False
    if cuda_ready:
        return ["CUDAExecutionProvider"] + cpu_only
    return cpu_only

import onnxruntime as ort

# Start from the torch-based guess, then fall back to CPU when this
# onnxruntime build does not offer the preferred provider.
providers = get_providers()
providers = (
    providers
    if providers[0] in ort.get_available_providers()
    else ["CPUExecutionProvider"]
)
print(f"🔧 ONNX Runtime providers: {providers}")

# --------------------------------------------------------------
# Processor (same as in the Space app)
# --------------------------------------------------------------
processor = AutoProcessor.from_pretrained(
    model_id,
    token=HF_TOKEN or None,
    trust_remote_code=True,  # NOTE(review): allows repo-provided code to run – confirm it is needed
)

# --------------------------------------------------------------
# Load the model configuration (needed for KV‑cache dimensions)
# --------------------------------------------------------------
from transformers import AutoConfig
# Pass the same (optional) token as the processor call so both loads
# authenticate identically.
config = AutoConfig.from_pretrained(model_id, token=HF_TOKEN or None)

# Fields required to build the past‑key‑value cache.  They describe the
# language model, so they are read from the `text_config` sub‑object rather
# than from the top‑level config.
num_key_value_heads = config.text_config.num_key_value_heads
head_dim            = config.text_config.head_dim
num_hidden_layers   = config.text_config.num_hidden_layers
eos_token_id       = config.text_config.eos_token_id
image_token_id      = config.image_token_id

# --------------------------------------------------------------
# Download the three ONNX files (if not already cached)
# --------------------------------------------------------------
# These calls return the cached paths when the files were already downloaded.
# The optional token is passed for consistency with the other Hub calls.
# NOTE: only the `.onnx` graphs are requested here; the `.onnx_data` weight
# files must already sit next to them (the earlier download cell ensures this).
vision_path  = hf_hub_download(
    repo_id=model_id,
    subfolder="onnx",
    filename="vision_encoder.onnx",
    token=HF_TOKEN or None,
)
embed_path   = hf_hub_download(
    repo_id=model_id,
    subfolder="onnx",
    filename="embed_tokens.onnx",
    token=HF_TOKEN or None,
)
decoder_path = hf_hub_download(
    repo_id=model_id,
    subfolder="onnx",
    filename="decoder_model_merged.onnx",
    token=HF_TOKEN or None,
)

# --------------------------------------------------------------
# Create ONNX Runtime inference sessions
# --------------------------------------------------------------
# Hand over the whole ordered provider list (not just its first entry) so the
# CPU provider remains an explicit fallback when CUDA was selected.
vision_session  = ort.InferenceSession(vision_path,  providers=providers)
embed_session   = ort.InferenceSession(embed_path,   providers=providers)
decoder_session = ort.InferenceSession(decoder_path, providers=providers)

print("✅ ONNX Runtime sessions (vision, embed, decoder) created successfully.")

🔧 ONNX Runtime providers: ['CPUExecutionProvider']


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/588 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/486 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

✅ ONNX Runtime sessions (vision, embed, decoder) created successfully.


In [11]:
# --------------------------------------------------------------
# Cell 6️⃣ – Vision → embed → decoder (image‑conditioned caption)
# --------------------------------------------------------------

# --------------------------------------------------------------
# 0️⃣  Imports & optional Hugging Face token prompt
# --------------------------------------------------------------
import urllib.request, os
import numpy as np
from PIL import Image
import torchvision.transforms as T
from transformers import AutoTokenizer
import getpass
from huggingface_hub import login

# Reuse a token that an earlier cell or the environment already provides, and
# prompt only when none is available, so a "Run All" is not interrupted by
# repeated prompts for the same secret.
HF_TOKEN = (
    globals().get("HF_TOKEN")
    or os.environ.get("HF_TOKEN")
    or getpass.getpass('🔑 Enter your Hugging Face token (leave empty to skip): ')
).strip()
if HF_TOKEN:
    login(token=HF_TOKEN)
    # Export it so libraries that read the environment see the same token.
    os.environ["HF_TOKEN"] = HF_TOKEN

# --------------------------------------------------------------
# 1️⃣  ONNX Runtime sessions – must already exist (created in Cell 5)
# --------------------------------------------------------------
#   vision_session, embed_session, decoder_session
#   num_hidden_layers, num_key_value_heads, head_dim
# If you run the notebook from the top, Cell 5 will have instantiated these.

# --------------------------------------------------------------
# 2️⃣  Helper – display model I/O (useful for debugging)
# --------------------------------------------------------------
def show_inputs(sess, name: str):
    """List every input of an ONNX Runtime session on stdout.

    Args:
        sess: Object exposing ``get_inputs()``; each returned item must have
            ``name``, ``shape`` and ``type`` attributes.
        name: Human-readable label printed in the heading line.
    """
    print(f"\n{name} inputs:")
    input_specs = sess.get_inputs()
    for spec in input_specs:
        print(f"  • {spec.name}  shape={spec.shape}  type={spec.type}")

# Dump the input signature of each session, in pipeline order.
for _session, _label in (
    (vision_session, "Vision encoder"),
    (embed_session, "Token embedder"),
    (decoder_session, "Decoder"),
):
    show_inputs(_session, _label)

# --------------------------------------------------------------
# 3️⃣  Image preprocessing – 512×512, bool mask
# --------------------------------------------------------------
# Per-channel statistics applied after scaling pixels to [0, 1].
# NOTE(review): hand-written preprocessing – confirm that these statistics and
# the fixed 512×512 resize match the model's own preprocessor configuration
# (an AutoProcessor is loaded in an earlier cell but is not used here).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

preprocess = T.Compose(
    [
        T.Resize((512, 512)),
        T.ToTensor(),
        T.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

def prepare_image(pil_img: Image.Image) -> dict:
    """
    Build the vision-encoder feed dict for a single PIL image.

    The image goes through the module-level ``preprocess`` pipeline, then
    gains a leading batch axis and an image-count axis.

    Returns a dict with:
        - pixel_values: float32 array, (1, 1, C, H, W)
        - pixel_attention_mask: bool array of ones, (1, 1, H, W)
    """
    chw = preprocess(pil_img).numpy().astype(np.float32)      # (C, H, W)
    pixel_values = chw[np.newaxis, np.newaxis, ...]            # (1, 1, C, H, W)
    batch, num_images, _channels, height, width = pixel_values.shape
    # Every pixel is real content (no padding), so the mask is all True.
    mask = np.ones((batch, num_images, height, width), dtype=np.bool_)
    return {"pixel_values": pixel_values, "pixel_attention_mask": mask}

# --------------------------------------------------------------
# 4️⃣  Helper – create an empty KV‑cache for the first decoder step
# --------------------------------------------------------------
def empty_past(num_layers: int, batch: int, heads: int, head_dim: int) -> dict:
    """
    Create the zero-length KV-cache feed for the first decoder call.

    Returns a dict with ``past_key_values.<i>.key`` and
    ``past_key_values.<i>.value`` for every layer ``i`` (in that order), each
    mapped to one shared float32 array of shape (batch, heads, 0, head_dim).
    """
    blank = np.empty((batch, heads, 0, head_dim), dtype=np.float32)
    feed_names = (
        f"past_key_values.{layer}.{slot}"
        for layer in range(num_layers)
        for slot in ("key", "value")
    )
    # All entries deliberately reference the same zero-sized array.
    return dict.fromkeys(feed_names, blank)

# --------------------------------------------------------------
# 5️⃣  Generation option A – Greedy (original) – kept for reference
# --------------------------------------------------------------
def generate_caption_greedy(pil_img: Image.Image, max_len: int = 64) -> str:
    """Greedy (arg-max) decoding conditioned on one image.

    Args:
        pil_img: Input image; converted with ``prepare_image``.
        max_len: Maximum number of tokens to generate.

    Returns:
        A space‑separated string of the generated token IDs (the EOS id is
        included when it was produced).

    KV‑cache handling: the first decoder call receives the whole prompt with an
    empty cache.  Every later call receives *only* the embedding of the newest
    token together with the cache returned by the previous call, while the
    attention mask spans cached + new positions (the decoder declares it as
    ``total_sequence_length``).  Re-feeding the full sequence alongside a
    populated cache would present every earlier position twice.

    Relies on notebook globals: ``vision_session``, ``embed_session``,
    ``decoder_session``, ``num_hidden_layers``, ``num_key_value_heads``,
    ``head_dim`` and ``model_id``.
    """
    def _embed(token_id: int):
        # Look up the embedding of a single token id, fed as a (1, 1) batch.
        return embed_session.run(
            None,
            {"input_ids": np.array([[token_id]], dtype=np.int64)},
        )[0]

    # Vision encoder -------------------------------------------------
    img_emb = vision_session.run(None, prepare_image(pil_img))[0]
    if img_emb.ndim == 4:
        # assumes axis 2 is a singleton here – TODO confirm against the
        # encoder's real output shape
        img_emb = np.squeeze(img_emb, axis=2)

    # Prompt = image embedding(s) followed by the token with id 0 ----
    step_embeds = np.concatenate([img_emb, _embed(0)], axis=1)
    total_len = step_embeds.shape[1]          # cached + current positions
    past = empty_past(num_hidden_layers, 1, num_key_value_heads, head_dim)

    # Load the tokenizer from the same repo the sessions came from.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    eos_id = tokenizer.eos_token_id

    generated_ids = []

    for _ in range(max_len):
        attn_mask = np.ones((1, total_len), dtype=np.int64)

        out = decoder_session.run(
            None,
            {"inputs_embeds": step_embeds,
             "attention_mask": attn_mask,
             **past},
        )
        logits = out[0][0, -1, :]                     # (vocab,)

        next_id = int(np.argmax(logits))
        generated_ids.append(next_id)

        if next_id == eos_id:
            break

        # Carry the returned cache forward.  Assumes outputs are ordered
        # [logits, key_0, value_0, key_1, value_1, ...] – TODO confirm.
        past = {}
        for i in range(num_hidden_layers):
            past[f"past_key_values.{i}.key"]   = out[1 + i * 2]
            past[f"past_key_values.{i}.value"] = out[2 + i * 2]

        # Next step feeds only the new token; the cache covers the rest.
        step_embeds = _embed(next_id)
        total_len += 1

    return " ".join(map(str, generated_ids))

# --------------------------------------------------------------
# 6️⃣  Generation option B – Sampling (top‑k / nucleus)
# --------------------------------------------------------------
def _sample_top_k_top_p(logits, top_k: int, top_p: float) -> int:
    """Draw one token id from ``logits`` after top‑k, then nucleus filtering."""
    logits = np.asarray(logits, dtype=np.float64)

    # ---------- top‑k filtering ----------
    if top_k > 0:
        # Clamp: np.partition raises when k exceeds the vocabulary size.
        k = min(top_k, logits.size)
        kth_min = np.partition(logits, -k)[-k:].min()
        logits = np.where(logits >= kth_min, logits, -np.inf)

    # ---------- nucleus (top‑p) filtering ----------
    order = np.argsort(-logits)                    # descending order
    ranked = logits[order]
    probs = np.exp(ranked - np.max(ranked))        # filtered entries become 0
    probs /= probs.sum()
    over = np.cumsum(probs) > top_p
    if over.any():
        # Keep everything up to and including the first entry that pushes the
        # cumulative mass past top_p, then renormalise.
        last = int(np.argmax(over))
        order = order[: last + 1]
        probs = probs[: last + 1]
        probs /= probs.sum()

    # ---------- sample ----------
    return int(np.random.choice(order, p=probs))


def generate_caption_sampling(pil_img: Image.Image,
                              max_len: int = 64,
                              top_k: int = 50,
                              top_p: float = 0.9) -> str:
    """Stochastic decoding using top‑k and/or nucleus (top‑p) sampling.

    Args:
        pil_img: Input image; converted with ``prepare_image``.
        max_len: Maximum number of tokens to generate.
        top_k: Keep only the ``top_k`` highest logits (``<= 0`` disables it;
            values above the vocabulary size are clamped).
        top_p: Cumulative-probability threshold for nucleus filtering.

    Returns:
        A space‑separated string of the generated token IDs.

    Sampling draws from NumPy's global random state, so results vary between
    runs unless the caller seeds it.

    KV‑cache handling: the first decoder call receives the whole prompt with an
    empty cache.  Later calls receive only the newest token's embedding plus
    the cache from the previous call, with an attention mask spanning cached +
    new positions (the decoder declares it as ``total_sequence_length``).

    Relies on notebook globals: ``vision_session``, ``embed_session``,
    ``decoder_session``, ``num_hidden_layers``, ``num_key_value_heads``,
    ``head_dim`` and ``model_id``.
    """
    def _embed(token_id: int):
        # Look up the embedding of a single token id, fed as a (1, 1) batch.
        return embed_session.run(
            None,
            {"input_ids": np.array([[token_id]], dtype=np.int64)},
        )[0]

    # Vision encoder -------------------------------------------------
    img_emb = vision_session.run(None, prepare_image(pil_img))[0]
    if img_emb.ndim == 4:
        # assumes axis 2 is a singleton here – TODO confirm
        img_emb = np.squeeze(img_emb, axis=2)

    # Prompt = image embedding(s) followed by the token with id 0 ----
    step_embeds = np.concatenate([img_emb, _embed(0)], axis=1)
    total_len = step_embeds.shape[1]          # cached + current positions
    past = empty_past(num_hidden_layers, 1, num_key_value_heads, head_dim)

    # Load the tokenizer from the same repo the sessions came from.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    eos_id = tokenizer.eos_token_id

    generated_ids = []

    for _ in range(max_len):
        attn_mask = np.ones((1, total_len), dtype=np.int64)

        out = decoder_session.run(
            None,
            {"inputs_embeds": step_embeds,
             "attention_mask": attn_mask,
             **past},
        )
        logits = out[0][0, -1, :]                     # (vocab,)

        next_id = _sample_top_k_top_p(logits, top_k, top_p)
        generated_ids.append(next_id)

        if next_id == eos_id:
            break

        # Carry the returned cache forward.  Assumes outputs are ordered
        # [logits, key_0, value_0, key_1, value_1, ...] – TODO confirm.
        past = {}
        for i in range(num_hidden_layers):
            past[f"past_key_values.{i}.key"]   = out[1 + i * 2]
            past[f"past_key_values.{i}.value"] = out[2 + i * 2]

        # Next step feeds only the sampled token; the cache covers the rest.
        step_embeds = _embed(next_id)
        total_len += 1

    return " ".join(map(str, generated_ids))

# --------------------------------------------------------------
# 7️⃣  Generation option C – Full‑sequence **without** KV‑cache
# --------------------------------------------------------------
def generate_caption_no_cache(pil_img: Image.Image, max_len: int = 64) -> str:
    """Greedy decoding that re-feeds the whole sequence on every step.

    The decoder is never given a populated KV‑cache.  Because its
    past‑key‑value inputs are still fed on every call, zero‑length tensors are
    supplied for them each time, and the full embedding history is passed as
    ``inputs_embeds``.

    Returns a space‑separated string of the generated token IDs.
    """
    def _embed(token_id):
        # Embedding of one token id, fed as a (1, 1) int64 batch.
        feed = {"input_ids": np.array([[token_id]], dtype=np.int64)}
        return embed_session.run(None, feed)[0]

    # Vision encoder -------------------------------------------------
    image_embeds = vision_session.run(None, prepare_image(pil_img))[0]
    if image_embeds.ndim == 4:
        image_embeds = np.squeeze(image_embeds, axis=2)

    # Sequence starts as: image embedding(s) + embedding of token id 0
    history = np.concatenate([image_embeds, _embed(0)], axis=1)

    eos_id = AutoTokenizer.from_pretrained(
        "onnx-community/granite-docling-258M-ONNX"
    ).eos_token_id

    token_ids = []

    for _step in range(max_len):
        feeds = {
            "inputs_embeds": history,
            "attention_mask": np.ones((1, history.shape[1]), dtype=np.int64),
        }
        # Zero-length cache tensors, rebuilt for every call.
        feeds.update(
            empty_past(num_hidden_layers, 1, num_key_value_heads, head_dim)
        )

        last_logits = decoder_session.run(None, feeds)[0][0, -1, :]   # (vocab,)

        chosen = int(np.argmax(last_logits))
        token_ids.append(chosen)
        if chosen == eos_id:
            break

        # Grow the history with the embedding of the token just produced.
        history = np.concatenate([history, _embed(chosen)], axis=1)

    return " ".join(str(tok) for tok in token_ids)

# --------------------------------------------------------------
# 8️⃣  Generation option D – Correct BOS/EOS IDs from tokenizer
# --------------------------------------------------------------
def generate_caption_correct_ids(pil_img: Image.Image, max_len: int = 64) -> str:
    """Greedy decoding that takes the BOS and EOS ids from the tokenizer.

    Args:
        pil_img: Input image; converted with ``prepare_image``.
        max_len: Maximum number of tokens to generate.

    Returns:
        A space‑separated string of the generated token IDs.

    The BOS id falls back to 0 when the tokenizer defines none.

    KV‑cache handling: the first decoder call receives the whole prompt with an
    empty cache.  Later calls receive only the newest token's embedding plus
    the cache from the previous call, with an attention mask spanning cached +
    new positions (the decoder declares it as ``total_sequence_length``).

    Relies on notebook globals: ``vision_session``, ``embed_session``,
    ``decoder_session``, ``num_hidden_layers``, ``num_key_value_heads``,
    ``head_dim`` and ``model_id``.
    """
    def _embed(token_id: int):
        # Look up the embedding of a single token id, fed as a (1, 1) batch.
        return embed_session.run(
            None,
            {"input_ids": np.array([[token_id]], dtype=np.int64)},
        )[0]

    # Load the tokenizer from the same repo the sessions came from.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    bos_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else 0
    eos_id = tokenizer.eos_token_id

    # Vision encoder -------------------------------------------------
    img_emb = vision_session.run(None, prepare_image(pil_img))[0]
    if img_emb.ndim == 4:
        # assumes axis 2 is a singleton here – TODO confirm
        img_emb = np.squeeze(img_emb, axis=2)

    # Prompt = image embedding(s) followed by the BOS embedding ------
    step_embeds = np.concatenate([img_emb, _embed(bos_id)], axis=1)
    total_len = step_embeds.shape[1]          # cached + current positions
    past = empty_past(num_hidden_layers, 1, num_key_value_heads, head_dim)

    generated_ids = []

    for _ in range(max_len):
        attn_mask = np.ones((1, total_len), dtype=np.int64)

        out = decoder_session.run(
            None,
            {"inputs_embeds": step_embeds,
             "attention_mask": attn_mask,
             **past},
        )
        logits = out[0][0, -1, :]                     # (vocab,)

        next_id = int(np.argmax(logits))
        generated_ids.append(next_id)

        if next_id == eos_id:
            break

        # Carry the returned cache forward.  Assumes outputs are ordered
        # [logits, key_0, value_0, key_1, value_1, ...] – TODO confirm.
        past = {}
        for i in range(num_hidden_layers):
            past[f"past_key_values.{i}.key"]   = out[1 + i * 2]
            past[f"past_key_values.{i}.value"] = out[2 + i * 2]

        # Next step feeds only the new token; the cache covers the rest.
        step_embeds = _embed(next_id)
        total_len += 1

    return " ".join(map(str, generated_ids))

# --------------------------------------------------------------
# 9️⃣  Helper – decode token IDs to readable text
# --------------------------------------------------------------
def decode_ids(token_ids_str: str) -> str:
    """Turn a whitespace‑separated string of token IDs back into text.

    The tokenizer is loaded on every call; special tokens are dropped from
    the decoded output.
    """
    tok = AutoTokenizer.from_pretrained("onnx-community/granite-docling-258M-ONNX")
    id_list = list(map(int, token_ids_str.split()))
    return tok.decode(id_list, skip_special_tokens=True)

# --------------------------------------------------------------
# 🔟  Load a sample image (you can replace the URL with any image)
# --------------------------------------------------------------
img_url = (
    "https://huggingface.co/spaces/ibm-granite/granite-docling-258m-demo/resolve/main/data/images/"
    "lake-zurich-switzerland-view-nature-landscapes-7bbda4-1024.jpg"
)
img_path = "/tmp/sample.jpg"
urllib.request.urlretrieve(img_url, img_path)

# Open inside a context manager so the file handle is closed right away;
# `convert` returns an independent in-memory RGB copy that stays usable.
with Image.open(img_path) as _img_file:
    sample_image = _img_file.convert("RGB")

# --------------------------------------------------------------
# 1️⃣1️⃣  Run each generation variant and print results
# --------------------------------------------------------------
def _run_variant(banner: str, generate) -> str:
    """Print the banner, run one generation callable, then show IDs and caption."""
    print(banner)
    ids = generate()
    print("Token IDs :", ids)
    print("Caption   :", decode_ids(ids))
    return ids

# Each result keeps its own top-level name so it can be inspected afterwards.
ids_greedy = _run_variant(
    "\n=== Greedy (original) ===",
    lambda: generate_caption_greedy(sample_image),
)
ids_sampling = _run_variant(
    "\n=== Sampling (top‑k/​top‑p) ===",
    lambda: generate_caption_sampling(sample_image, top_k=50, top_p=0.9),
)
ids_no_cache = _run_variant(
    "\n=== No‑cache (full‑sequence only) ===",
    lambda: generate_caption_no_cache(sample_image),
)
ids_correct_ids = _run_variant(
    "\n=== Correct BOS/EOS IDs ===",
    lambda: generate_caption_correct_ids(sample_image),
)

🔑 Enter your Hugging Face token (leave empty to skip): ··········


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.



Vision encoder inputs:
  • pixel_values  shape=['batch_size', 'num_images', 3, 512, 512]  type=tensor(float)
  • pixel_attention_mask  shape=['batch_size', 'num_images', 512, 512]  type=tensor(bool)

Token embedder inputs:
  • input_ids  shape=['batch_size', 'sequence_length']  type=tensor(int64)

Decoder inputs:
  • inputs_embeds  shape=['batch_size', 'sequence_length', 576]  type=tensor(float)
  • attention_mask  shape=['batch_size', 'total_sequence_length']  type=tensor(int64)
  • past_key_values.0.key  shape=['batch_size', 3, 'past_sequence_length', 64]  type=tensor(float)
  • past_key_values.0.value  shape=['batch_size', 3, 'past_sequence_length', 64]  type=tensor(float)
  • past_key_values.1.key  shape=['batch_size', 3, 'past_sequence_length', 64]  type=tensor(float)
  • past_key_values.1.value  shape=['batch_size', 3, 'past_sequence_length', 64]  type=tensor(float)
  • past_key_values.2.key  shape=['batch_size', 3, 'past_sequence_length', 64]  type=tensor(float)
  • past_key_va