### Avaliação TrOCR no IAM baixado via KaggleHub

1. Instalar dependências:

In [1]:
!pip -q install kagglehub transformers accelerate evaluate jiwer pillow pandas tqdm rapidfuzz

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m41.1 MB/s[0m eta [36m0:00:00[0m
[?25h

2. Baixar o dataset:

In [2]:
import kagglehub, sys
# Report which interpreter and which kagglehub release this kernel is using.
for label, value in (("python", sys.executable), ("kagglehub", kagglehub.__version__)):
    print(f"{label}: {value}")

python: /usr/bin/python3
kagglehub: 0.3.13


In [3]:
from pathlib import Path
import kagglehub

# dataset_download returns the local directory of the dataset; the files used below live in its "IAM" subfolder.
dataset_root = kagglehub.dataset_download("changheonkim/iam-trocr")
path = Path(dataset_root) / "IAM"
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/changheonkim/iam-trocr?dataset_version_number=1...


100%|██████████| 90.1M/90.1M [00:00<00:00, 96.6MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM


3. Listar imagens:

In [4]:
import glob

# Collect every .jpg directly inside <dataset>/image, in sorted order so runs are reproducible.
image_directory = path / "image"
jpg_pattern = str(image_directory / "*.jpg")
image_paths = glob.glob(jpg_pattern)
image_paths.sort()

print(f"Found {len(image_paths)} images.")
print("First 5:", image_paths[:5])

Found 2915 images.
First 5: ['/root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-00.jpg', '/root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-01.jpg', '/root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-02.jpg', '/root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-03.jpg', '/root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-116-00.jpg']


4. Encontrar e carregar labels:

In [5]:
import os, json, re
import pandas as pd

def find_annotation_files(base: Path):
    """Recursively collect candidate annotation files below ``base``.

    A file is a candidate when its name ends (case-insensitively) with one of
    ``.csv``, ``.tsv``, ``.jsonl``, ``.json`` or ``.txt``.

    Args:
        base: Directory to walk.

    Returns:
        A sorted list of ``Path`` objects. ``os.walk`` yields entries in
        filesystem order, so the result is sorted to make the choice of
        "first matching file" by callers deterministic across machines.
        An unreadable or missing ``base`` yields an empty list.
    """
    exts = (".csv", ".tsv", ".jsonl", ".json", ".txt")
    files = [
        Path(root) / fname
        for root, _, fnames in os.walk(base)
        for fname in fnames
        if fname.lower().endswith(exts)
    ]
    files.sort()
    return files

def _labels_from_table(f: Path) -> dict:
    """Parse a CSV/TSV annotation file into ``{image stem: text}``; ``{}`` if no known columns."""
    sep = "\t" if f.suffix.lower() == ".tsv" else ","
    # dtype=str + keep_default_na=False keeps transcriptions such as "NA",
    # "null" or "nan" as literal text instead of turning them into NaN.
    df = pd.read_csv(f, sep=sep, dtype=str, keep_default_na=False)

    # Accepted (lower-cased) column names.
    img_candidates = ("image", "image_path", "filename", "file", "img", "path")
    txt_candidates = ("text", "gt", "label", "transcript", "annotation", "sentence")

    img_col = next((c for c in df.columns if str(c).lower() in img_candidates), None)
    txt_col = next((c for c in df.columns if str(c).lower() in txt_candidates), None)
    if img_col is None or txt_col is None:
        return {}
    # Rows without an image name cannot be matched to a file, so they are skipped.
    return {Path(img).stem: txt for img, txt in zip(df[img_col], df[txt_col]) if img}


def _labels_from_jsonl(f: Path) -> dict:
    """Parse a JSON-lines annotation file into ``{image stem: text}``."""
    img_keys = ("image", "image_path", "filename", "file")
    txt_keys = ("text", "gt", "label", "transcript")
    labels = {}
    with open(f, "r", encoding="utf-8") as fh:
        for line in fh:
            if not line.strip():
                continue  # tolerate blank lines instead of failing the whole file
            obj = json.loads(line)
            img = next((obj[k] for k in img_keys if obj.get(k)), None)
            # First key that is present (not None): an empty transcription is a valid label.
            txt = next((obj[k] for k in txt_keys if obj.get(k) is not None), None)
            if img and txt is not None:
                labels[Path(img).stem] = str(txt)
    return labels


def _labels_from_txt(f: Path) -> dict:
    """Parse a ``filename<sep>text`` file (tab, or 2+ whitespace chars) into ``{image stem: text}``."""
    labels = {}
    with open(f, "r", encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            # Prefer a tab separator; otherwise split on the first run of 2+ whitespace characters.
            if "\t" in line:
                name, text = line.split("\t", 1)
            else:
                parts = re.split(r"\s{2,}", line, maxsplit=1)
                if len(parts) < 2:
                    continue
                name, text = parts
            labels[Path(name).stem] = text.strip()
    return labels


def build_labels_map(base: Path):
    """Locate a ground-truth file under ``base`` and load it as ``{image stem: text}``.

    Candidate files come from ``find_annotation_files``. They are tried in
    three passes, and the first file that yields at least one label wins:

    1. CSV/TSV with recognizable image and text column names,
    2. JSONL with recognizable keys,
    3. TXT with one ``filename<sep>text`` pair per line.

    ``.json`` candidates are listed but not parsed. A file that cannot be read
    or parsed is reported and skipped, so one broken file does not abort the
    search.

    Args:
        base: Root directory of the dataset.

    Returns:
        dict mapping the image file stem to its transcription.

    Raises:
        RuntimeError: if no candidate file produced any label.
    """
    ann_files = find_annotation_files(base)
    print("Annotation candidates:")
    for f in ann_files[:30]:
        print(" -", f.relative_to(base))
    if len(ann_files) > 30:
        print(f" ... +{len(ann_files)-30} more")

    passes = (
        ((".csv", ".tsv"), _labels_from_table),
        ((".jsonl",), _labels_from_jsonl),
        ((".txt",), _labels_from_txt),
    )
    for suffixes, parse in passes:
        for f in ann_files:
            if f.suffix.lower() not in suffixes:
                continue
            try:
                labels = parse(f)
            except Exception as exc:  # best effort: report and move on to the next candidate
                print(f"Skipping {f.relative_to(base)}: {exc!r}")
                continue
            if labels:
                print("Loaded labels from:", f.relative_to(base))
                return labels

    raise RuntimeError("Não achei um arquivo de labels em formato reconhecido dentro do IAM/.")

# Load the ground-truth transcriptions found under the dataset folder, keyed by image file stem.
labels_map = build_labels_map(base=path)
print(f"Labels loaded: {len(labels_map)}")

Annotation candidates:
 - gpt2.dict.txt
 - gt_test.txt
Loaded labels from: gt_test.txt
Labels loaded: 2915


5. Montar um DataFrame pareando cada imagem com a sua transcrição de referência (GT):

In [6]:
from pathlib import Path

# Pair every image with its transcription (looked up by file stem); images without a label are only counted.
rows = [
    {"id": stem, "image_path": img_path, "gt": labels_map[stem]}
    for img_path, stem in ((p, Path(p).stem) for p in image_paths)
    if labels_map.get(stem) is not None
]
missing = len(image_paths) - len(rows)

df = pd.DataFrame(rows)
print(f"Total images: {len(image_paths)}")
print(f"With labels: {len(df)}")
print(f"Missing labels: {missing}")
df.head()

Total images: 2915
With labels: 2915
Missing labels: 0


Unnamed: 0,id,image_path,gt
0,c04-110-00,/root/.cache/kagglehub/datasets/changheonkim/i...,Become a success with a disc and hey presto ! ...
1,c04-110-01,/root/.cache/kagglehub/datasets/changheonkim/i...,"assuredness "" Bella Bella Marie "" ( Parlophone..."
2,c04-110-02,/root/.cache/kagglehub/datasets/changheonkim/i...,I don't think he will storm the charts with th...
3,c04-110-03,/root/.cache/kagglehub/datasets/changheonkim/i...,"CHRIS CHARLES , 39 , who lives in Stockton-on-..."
4,c04-116-00,/root/.cache/kagglehub/datasets/changheonkim/i...,He is also a director of a couple of garages ....


6. Carregar e rodar o TrOCR original (handwritten):

In [7]:
import torch
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
from PIL import Image
from tqdm import tqdm
import pandas as pd
import time

# ======= Performance config =======
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cudnn.benchmark = True  # helps CNNs with similar inputs

checkpoint = "microsoft/trocr-base-handwritten"
processor = TrOCRProcessor.from_pretrained(checkpoint)
model = VisionEncoderDecoderModel.from_pretrained(checkpoint).to(device)
model.eval()

# Half precision on GPU (large speedup)
use_fp16 = (device == "cuda")
if use_fp16:
    model = model.half()

# ======= Controls =======
BATCH_SIZE = 16 if device == "cuda" else 2
MAX_NEW_TOKENS = 64          # 128 is usually overkill on IAM
NUM_BEAMS = 1               # beam search makes it much slower
SUBSET_N = None             # e.g. 1000 for a quick test; None = everything

# Work on a fresh frame with a clean 0..N-1 index so `df` itself is never mutated.
if SUBSET_N is not None:
    # Clamp the sample size: DataFrame.sample raises when asked for more rows
    # than the frame has (without replacement).
    df_run = df.sample(min(SUBSET_N, len(df)), random_state=42).reset_index(drop=True)
else:
    df_run = df.copy().reset_index(drop=True)

def load_images(paths):
    """Open each path with PIL and return the images converted to RGB.

    Args:
        paths: Iterable of image file paths.

    Returns:
        List of RGB images, in the same order as ``paths``.
    """
    imgs = []
    for p in paths:
        # convert() hands back a separate image object, so the underlying file
        # can be closed right away instead of staying open until garbage
        # collection (one leaked handle per image otherwise).
        with Image.open(p) as im:
            imgs.append(im.convert("RGB"))
    return imgs

preds = []
lat_ms = []


def _generate_ids(images, max_new_tokens, num_beams):
    """Preprocess a list of PIL images and return the generated token ids."""
    inputs = processor(images=images, return_tensors="pt", padding=True).to(device)
    return model.generate(**inputs, max_new_tokens=max_new_tokens, num_beams=num_beams)


on_cuda = (device == "cuda")

# Small warmup (avoids a slow first pass being counted in the latencies)
if on_cuda and len(df_run) > 0:
    warm_paths = df_run["image_path"].iloc[:min(2, len(df_run))].tolist()
    warm_imgs = load_images(warm_paths)
    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16):
        _ = _generate_ids(warm_imgs, 16, 1)

# Batched inference
for i in tqdm(range(0, len(df_run), BATCH_SIZE)):
    batch_paths = df_run["image_path"].iloc[i:i+BATCH_SIZE].tolist()
    imgs = load_images(batch_paths)

    # CUDA kernels run asynchronously: synchronize before starting and before
    # stopping the clock so the measured time covers the GPU work of this batch.
    if on_cuda:
        torch.cuda.synchronize()
    t0 = time.perf_counter()
    with torch.no_grad():
        if on_cuda:
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                out = _generate_ids(imgs, MAX_NEW_TOKENS, NUM_BEAMS)
        else:
            out = _generate_ids(imgs, MAX_NEW_TOKENS, NUM_BEAMS)
    if on_cuda:
        torch.cuda.synchronize()

    # Timed span = preprocessing + generation (image loading and decoding are excluded).
    dt = (time.perf_counter() - t0) * 1000.0
    batch_pred = processor.batch_decode(out, skip_special_tokens=True)

    preds.extend(batch_pred)
    # mean latency per sample within the batch (good enough for a report)
    per_item = dt / max(1, len(batch_pred))
    lat_ms.extend([per_item] * len(batch_pred))

df_run["pred_raw"] = preds
df_run["latency_ms"] = lat_ms

df_run.head()

preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

The image processor of type `ViTImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/478 [00:00<?, ?it/s]

VisionEncoderDecoderModel LOAD REPORT from: microsoft/trocr-base-handwritten
Key                         | Status  | 
----------------------------+---------+-
encoder.pooler.dense.weight | MISSING | 
encoder.pooler.dense.bias   | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

100%|██████████| 183/183 [02:42<00:00,  1.13it/s]


Unnamed: 0,id,image_path,gt,pred_raw,latency_ms
0,c04-110-00,/root/.cache/kagglehub/datasets/changheonkim/i...,Become a success with a disc and hey presto ! ...,Become a success with a disc and key presto ! ...,96.76693
1,c04-110-01,/root/.cache/kagglehub/datasets/changheonkim/i...,"assuredness "" Bella Bella Marie "" ( Parlophone...","assuredness . "" Bella Bella Marie "" ( Parlopho...",96.76693
2,c04-110-02,/root/.cache/kagglehub/datasets/changheonkim/i...,I don't think he will storm the charts with th...,I don't think he will storm the charts with th...,96.76693
3,c04-110-03,/root/.cache/kagglehub/datasets/changheonkim/i...,"CHRIS CHARLES , 39 , who lives in Stockton-on-...","CHRIS CHARLES , 39 , who lives in Stockton - o...",96.76693
4,c04-116-00,/root/.cache/kagglehub/datasets/changheonkim/i...,He is also a director of a couple of garages ....,He is also a director of a couple of garages ....,96.76693


7. Avaliar o modelo com as métricas CER, WER e Exact Match (texto bruto e texto normalizado) + latência:

In [8]:
import evaluate, re

# Character- and word-error-rate metric objects loaded through `evaluate`.
cer, wer = evaluate.load("cer"), evaluate.load("wer")

def normalize_text(s: str) -> str:
    """Return ``s`` trimmed, lower-cased and with every whitespace run collapsed to one space.

    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not s:
        return ""
    lowered = s.strip().lower()
    return re.sub(r"\s+", " ", lowered)

# References and predictions as plain string lists, in raw and normalized form.
gt_raw = df_run["gt"].astype(str).tolist()
pred_raw = df_run["pred_raw"].astype(str).tolist()

gt_norm = list(map(normalize_text, gt_raw))
pred_norm = list(map(normalize_text, pred_raw))

# Same three quality metrics for each text variant, followed by the latency summary.
metrics = {}
for tag, hyps, refs in (("raw", pred_raw, gt_raw), ("norm", pred_norm, gt_norm)):
    metrics[f"CER_{tag}"] = cer.compute(predictions=hyps, references=refs)
    metrics[f"WER_{tag}"] = wer.compute(predictions=hyps, references=refs)
    hits = sum(1 for h, r in zip(hyps, refs) if h == r)
    metrics[f"ExactMatch_{tag}"] = hits / len(refs)

latencies = df_run["latency_ms"]
metrics["Latency_ms_mean"] = float(latencies.mean())
metrics["Latency_ms_p95"] = float(latencies.quantile(0.95))
metrics

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

{'CER_raw': 0.03555548472436929,
 'WER_raw': 0.08938395970554049,
 'ExactMatch_raw': 0.5385934819897084,
 'CER_norm': 0.034025531101088496,
 'WER_norm': 0.08434715226656335,
 'ExactMatch_norm': 0.5591766723842195,
 'Latency_ms_mean': 54.20740955540304,
 'Latency_ms_p95': 72.95335056249996}

8. Análise de erros:

In [9]:
import numpy as np

def char_error_rate_simple(ref, hyp):
    """Character error rate: Levenshtein distance between ``ref`` and ``hyp`` divided by ``len(ref)``.

    Args:
        ref: Reference text; ``None``/empty is treated as ``""``.
        hyp: Hypothesis text; ``None``/empty is treated as ``""``.

    Returns:
        ``distance / max(1, len(ref))`` as a Python float. The value can exceed
        1.0 when the hypothesis is much longer than the reference; with an
        empty reference it equals ``len(hyp)``.
    """
    ref, hyp = ref or "", hyp or ""
    n, m = len(ref), len(hyp)
    # Two-row dynamic programming: only the previous row is needed, which
    # avoids allocating and element-indexing a full (n+1) x (m+1) matrix.
    prev = list(range(m + 1))
    for i in range(1, n + 1):
        curr = [i] + [0] * m
        ref_ch = ref[i - 1]
        for j in range(1, m + 1):
            cost = 0 if ref_ch == hyp[j - 1] else 1
            curr[j] = min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost)
        prev = curr
    return prev[m] / max(1, n)

# Per-sample CER on the normalized text, then the 20 worst and 20 best samples for inspection.
report_cols = ["id","image_path","gt","pred_raw","cer_simple_norm","latency_ms"]
df_run["cer_simple_norm"] = list(map(char_error_rate_simple, gt_norm, pred_norm))

by_cer_desc = df_run.sort_values(by="cer_simple_norm", ascending=False)
by_cer_asc = df_run.sort_values(by="cer_simple_norm", ascending=True)
worst = by_cer_desc[report_cols].head(20)
best = by_cer_asc[report_cols].head(20)

worst.head(10)

Unnamed: 0,id,image_path,gt,pred_raw,cer_simple_norm,latency_ms
2654,p02-109-01,/root/.cache/kagglehub/datasets/changheonkim/i...,----------------------------------------------...,when ence a u case,1.0,46.92405
2653,p02-109-00,/root/.cache/kagglehub/datasets/changheonkim/i...,Sentence Database P02-109,c.t.0t.bb.oct.,0.88,46.92405
1699,m03-013-06,/root/.cache/kagglehub/datasets/changheonkim/i...,head-shrinkers .,head - strict-shrangers,0.8125,61.075699
1819,m04-107-10,/root/.cache/kagglehub/datasets/changheonkim/i...,weep ...,WLYP.00,0.625,44.128452
803,f04-064-08,/root/.cache/kagglehub/datasets/changheonkim/i...,darkness .,dazzkeeper .,0.6,50.697426
2840,p06-047-07,/root/.cache/kagglehub/datasets/changheonkim/i...,"his # prescription pad , and","his preference script composed , and",0.571429,42.003755
447,d06-060-09,/root/.cache/kagglehub/datasets/changheonkim/i...,century .,clinker y .,0.555556,51.494655
2147,n02-114-09,/root/.cache/kagglehub/datasets/changheonkim/i...,courage !,Cocesage D,0.444444,49.820859
2895,p06-096-10,/root/.cache/kagglehub/datasets/changheonkim/i...,Philip's spirits soared .,Philip's girlfriend .,0.44,38.875344
2262,n03-120-09,/root/.cache/kagglehub/datasets/changheonkim/i...,"huh ? """,hah 2 1,0.428571,62.04726


9. Exportação (CSV + resumo TXT):

In [10]:
OUT_CSV = "iam_trocr_eval_results.csv"
OUT_TXT = "iam_trocr_eval_summary.txt"

# Per-sample results go to the CSV.
df_run.to_csv(OUT_CSV, index=False)

# The text summary holds the aggregate metrics followed by the worst/best tables.
summary_parts = ["=== TrOCR IAM Evaluation Summary ===\n"]
summary_parts.extend(f"{k}: {v}\n" for k, v in metrics.items())
summary_parts += [
    "\n--- Worst 20 (by CER simple norm) ---\n",
    worst.to_string(index=False),
    "\n\n--- Best 20 (by CER simple norm) ---\n",
    best.to_string(index=False),
]
with open(OUT_TXT, "w", encoding="utf-8") as f:
    f.writelines(summary_parts)

print("Saved:", OUT_CSV, OUT_TXT)

Saved: iam_trocr_eval_results.csv iam_trocr_eval_summary.txt
