In [1]:
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-printed')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-printed')

# pixel_values = processor(images=image, return_tensors="pt").pixel_values
# generated_ids = model.generate(pixel_values)
# generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-printed and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# ── 1. tell the model where decoding starts ───────────────────────────────
# Roberta-based TrOCR uses <s> as BOS.
if processor.tokenizer.bos_token_id is None:
    processor.tokenizer.add_special_tokens({"bos_token": "<s>"})
    model.decoder.resize_token_embeddings(len(processor.tokenizer))
model.config.decoder_start_token_id = processor.tokenizer.bos_token_id

# ── 2. the other two “must-haves” ─────────────────────────────────────────
model.config.eos_token_id  = processor.tokenizer.eos_token_id            # = 2
model.config.pad_token_id  = processor.tokenizer.pad_token_id            # = 1

# (optional but convenient for generate() and Trainer)
model.config.vocab_size    = model.config.decoder.vocab_size
model.config.max_length    = 128
model.config.num_beams     = 4

In [3]:
import os
from PIL import Image
from torch.utils.data import Dataset


class ImageTextDataset(Dataset):
    def __init__(self, data_dir, max_length=128):
        self.data_dir = data_dir
        self.max_length = max_length

        # find all indexes by looking for .png files
        self.ids = sorted([
            os.path.splitext(fn)[0]
            for fn in os.listdir(data_dir)
            if fn.endswith(".png")
        ])

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        idx_str = self.ids[idx].zfill(len(self) - 1)

        img_path = os.path.join(self.data_dir, f"{idx_str}.png")
        image = Image.open(img_path).convert("RGB")

        pixel_values = processor(images=image, return_tensors="pt").pixel_values.squeeze(0)

        txt_path = os.path.join(self.data_dir, f"{idx_str}.txt")
        caption = open(txt_path, "r", encoding="utf-8").read().strip()

        tokens = processor.tokenizer(
            caption,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        ).input_ids.squeeze(0)  # → (max_length,)
        # tokens[tokens == tokenizer.pad_token_id] = -100

        return {"pixel_values": pixel_values, "labels": tokens}

train_dataset = ImageTextDataset(
    'data', max_length=128
)
validation_dataset = ImageTextDataset(
    'valid', max_length=128
)

In [4]:
import numpy as np
import evaluate
from transformers import (
    TrOCRProcessor,
    VisionEncoderDecoderModel,
    TrainingArguments,
    Trainer,
    default_data_collator,
)

cer_metric = evaluate.load("cer")          # use the right metric

def compute_metrics(eval_pred):
    preds, labels = eval_pred          # preds may be tuple OR ndarray

    # ── 1️⃣  unwrap if we received a tuple ───────────────────────────
    if isinstance(preds, tuple):       # case A → logits are preds[0]
        preds = preds[0]

    # ── 2️⃣  if still 3-D, collapse to token-ids with argmax ─────────
    if preds.ndim == 3:                # (batch, seq_len, vocab)
        pred_ids = np.argmax(preds, axis=-1)
    else:                              # already (batch, seq_len)
        pred_ids = preds

    # ── 3️⃣  put PAD tokens back into labels so they decode cleanly ─
    labels = np.where(labels == -100,
                      processor.tokenizer.pad_token_id,
                      labels)

    # ── 4️⃣  decode and compute CER ─────────────────────────────────
    pred_str  = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(labels,   skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str,
                             references=label_str)

    return {"cer": cer}

# 5. Define training args
training_args = TrainingArguments(
    output_dir="./trocr-finetuned",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,
    # predict_with_generate=True,
)

# 6. Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=default_data_collator,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,
)

# 7. Start training
trainer.train()

# 8. (Optional) Save the fine‐tuned model
trainer.save_model("./trocr-finetuned-final")

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss


FileNotFoundError: [Errno 2] No such file or directory: 'valid\\000.png'