In [1]:
from pathlib import Path
import kagglehub

# Download latest version
path = kagglehub.dataset_download("changheonkim/iam-trocr")
path = Path(path)/"IAM"
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'iam-trocr' dataset.
Path to dataset files: /kaggle/input/iam-trocr/IAM


In [2]:
import os

# Assuming 'path' variable holds the base directory from kagglehub.dataset_download
# If not, please replace 'path' with the correct directory string, e.g., '/content/IAM'
if 'path' in globals():
    print(f"Listing directories in: {path}")
    # Use a shell command to list only directories recursively, and sort them
    !ls {path/"image"}

Listing directories in: /kaggle/input/iam-trocr/IAM
c04-110-00.jpg	e06-070-02.jpg	 g07-000b-00.jpg  n02-157-05.jpg
c04-110-01.jpg	e06-070-03.jpg	 g07-000b-01.jpg  n02-157-06.jpg
c04-110-02.jpg	e06-070-04.jpg	 g07-000b-02.jpg  n02-157-07.jpg
c04-110-03.jpg	e06-070-05.jpg	 g07-000b-03.jpg  n02-157-08.jpg
c04-116-00.jpg	e06-070-06.jpg	 g07-000b-04.jpg  n03-038-00.jpg
c04-116-01.jpg	e06-070-07.jpg	 g07-000b-05.jpg  n03-038-01.jpg
c04-116-02.jpg	e06-070-08.jpg	 g07-000b-06.jpg  n03-038-02.jpg
c04-116-03.jpg	e06-070-09.jpg	 g07-000b-07.jpg  n03-038-03.jpg
c04-134-00.jpg	f04-032-00.jpg	 g07-000b-08.jpg  n03-038-04.jpg
c04-134-01.jpg	f04-032-01.jpg	 g07-000b-09.jpg  n03-038-05.jpg
c04-134-02.jpg	f04-032-02.jpg	 g07-079a-00.jpg  n03-038-06.jpg
c04-134-03.jpg	f04-032-03.jpg	 g07-079a-01.jpg  n03-064-00.jpg
c04-134-04.jpg	f04-032-04.jpg	 g07-079a-02.jpg  n03-064-01.jpg
c04-134-05.jpg	f04-032-05.jpg	 g07-079a-03.jpg  n03-064-02.jpg
c04-134-06.jpg	f04-032-06.jpg	 g07-079a-04.jpg  n03-064-03.jpg
c04

In [3]:
import glob

# Assuming 'path' is defined and points to the base directory of the dataset
# The images are located in the 'image' subdirectory relative to 'path'
image_directory = path / "image"

# Use glob to find all .jpg files in the image directory
image_paths = sorted(glob.glob(str(image_directory / "*.jpg")))

print(f"Found {len(image_paths)} images in the dataset.")
print("First 5 image paths:")
for i, img_path in enumerate(image_paths[:5]):
    print(f"  {i+1}: {img_path}")

Found 2915 images in the dataset.
First 5 image paths:
  1: /kaggle/input/iam-trocr/IAM/image/c04-110-00.jpg
  2: /kaggle/input/iam-trocr/IAM/image/c04-110-01.jpg
  3: /kaggle/input/iam-trocr/IAM/image/c04-110-02.jpg
  4: /kaggle/input/iam-trocr/IAM/image/c04-110-03.jpg
  5: /kaggle/input/iam-trocr/IAM/image/c04-116-00.jpg


In [4]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class IAMImageDataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB') # Load as RGB (or 'L' for grayscale if preferred)

        if self.transform:
            image = self.transform(image)

        # For now, we'll just return the image.
        # In a real scenario, you'd also load and return the corresponding label/text.
        return image

# Define transformations (you can customize these)
# Example: Resize to 224x224 and convert to tensor, then normalize
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # ImageNet normalization
])

# Instantiate your custom dataset
# 'image_paths' is assumed to be defined from a previous cell
iam_dataset = IAMImageDataset(image_paths=image_paths, transform=transform)

print(f"Number of samples in the dataset: {len(iam_dataset)}")

# To get a single image from the dataset:
single_image_tensor = iam_dataset[0]
print(f"Shape of a single image tensor: {single_image_tensor.shape}")

Number of samples in the dataset: 2915
Shape of a single image tensor: torch.Size([3, 224, 224])


In [5]:
from transformers import VisionEncoderDecoderModel, TrOCRProcessor
import torch

# Load the model
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/478 [00:00<?, ?it/s]

VisionEncoderDecoderModel LOAD REPORT from: microsoft/trocr-base-handwritten
Key                         | Status  | 
----------------------------+---------+-
encoder.pooler.dense.weight | MISSING | 
encoder.pooler.dense.bias   | MISSING | 

Notes:
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (i

1. Quantas camadas tem o modelo TrOCR?

In [6]:
for i, layer in enumerate(model.encoder.encoder.layer):
    print(f"Bloco {i} ->", type(layer))
for i, layer in enumerate(model.decoder.model.decoder.layers):
    print(f"Bloco {i} ->", type(layer))

Bloco 0 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 1 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 2 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 3 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 4 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 5 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 6 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 7 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 8 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 9 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 10 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 11 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
Bloco 0 -> <class 'transformers.models.trocr.modeling_trocr.TrOCRDecoderLayer'>
Bloco 1 -> <class 'transformers.models.trocr.modeling_trocr.TrOCRDecoderLayer'>
Bloco 2 -> <class 'transformers.mo

2. Quais os mÃ³dulos de todas as camadas do TrOCR?

In [7]:
for name, module in model.encoder.named_modules():
    print(name, "->", type(module))

 -> <class 'transformers.models.vit.modeling_vit.ViTModel'>
embeddings -> <class 'transformers.models.vit.modeling_vit.ViTEmbeddings'>
embeddings.patch_embeddings -> <class 'transformers.models.vit.modeling_vit.ViTPatchEmbeddings'>
embeddings.patch_embeddings.projection -> <class 'torch.nn.modules.conv.Conv2d'>
embeddings.dropout -> <class 'torch.nn.modules.dropout.Dropout'>
encoder -> <class 'transformers.models.vit.modeling_vit.ViTEncoder'>
encoder.layer -> <class 'torch.nn.modules.container.ModuleList'>
encoder.layer.0 -> <class 'transformers.models.vit.modeling_vit.ViTLayer'>
encoder.layer.0.attention -> <class 'transformers.models.vit.modeling_vit.ViTAttention'>
encoder.layer.0.attention.attention -> <class 'transformers.models.vit.modeling_vit.ViTSelfAttention'>
encoder.layer.0.attention.attention.query -> <class 'torch.nn.modules.linear.Linear'>
encoder.layer.0.attention.attention.key -> <class 'torch.nn.modules.linear.Linear'>
encoder.layer.0.attention.attention.value -> <class

In [8]:
for name, module in model.decoder.named_modules():
    print(name, "->", type(module))

 -> <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'>
model -> <class 'transformers.models.trocr.modeling_trocr.TrOCRDecoderWrapper'>
model.decoder -> <class 'transformers.models.trocr.modeling_trocr.TrOCRDecoder'>
model.decoder.embed_tokens -> <class 'transformers.models.trocr.modeling_trocr.TrOCRScaledWordEmbedding'>
model.decoder.embed_positions -> <class 'transformers.models.trocr.modeling_trocr.TrOCRLearnedPositionalEmbedding'>
model.decoder.layernorm_embedding -> <class 'torch.nn.modules.normalization.LayerNorm'>
model.decoder.layers -> <class 'torch.nn.modules.container.ModuleList'>
model.decoder.layers.0 -> <class 'transformers.models.trocr.modeling_trocr.TrOCRDecoderLayer'>
model.decoder.layers.0.self_attn -> <class 'transformers.models.trocr.modeling_trocr.TrOCRAttention'>
model.decoder.layers.0.self_attn.k_proj -> <class 'torch.nn.modules.linear.Linear'>
model.decoder.layers.0.self_attn.v_proj -> <class 'torch.nn.modules.linear.Linear'>
model.decoder.layers

As principais camadas do encoder(ViT) sÃ£o as camadas de Embedding, Self-Attention, Feed-Forward, Layer Norm e Residual Connection
As principais camadas do decoder(Texto) sÃ£o as camadas de Embedding, Self-Attention, Cross-Attention, Feed-Forward e Layer Norm

3. Quantos paramÃªtros tem o TrORC?

In [9]:
encoder_params = sum(p.numel() for p in model.encoder.parameters())
print("ParÃ¢metros do Encoder:", encoder_params)

decoder_params = sum(p.numel() for p in model.decoder.parameters())
print("ParÃ¢metros do Decoder:", decoder_params)

total_params = sum(p.numel() for p in model.parameters())
print("ParÃ¢metros Totais:", total_params)

ParÃ¢metros do Encoder: 86653440
ParÃ¢metros do Decoder: 247268352
ParÃ¢metros Totais: 333921792


4. Quais os paramÃªtros de cada camada?

In [10]:
for name, param in model.encoder.named_parameters():
    print(name, param.shape)

embeddings.cls_token torch.Size([1, 1, 768])
embeddings.position_embeddings torch.Size([1, 577, 768])
embeddings.patch_embeddings.projection.weight torch.Size([768, 3, 16, 16])
embeddings.patch_embeddings.projection.bias torch.Size([768])
encoder.layer.0.attention.attention.query.weight torch.Size([768, 768])
encoder.layer.0.attention.attention.key.weight torch.Size([768, 768])
encoder.layer.0.attention.attention.value.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias torch.Size([768])
encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
encoder.layer.0.intermediate.dense.bias torch.Size([3072])
encoder.layer.0.output.dense.weight torch.Size([768, 3072])
encoder.layer.0.output.dense.bias torch.Size([768])
encoder.layer.0.layernorm_before.weight torch.Size([768])
encoder.layer.0.layernorm_before.bias torch.Size([768])
encoder.layer.0.layernorm_after.weight torch.Size([768])
encoder.

In [11]:
for name, param in model.decoder.named_parameters():
    print(name, param.shape)

model.decoder.embed_tokens.weight torch.Size([50265, 1024])
model.decoder.embed_positions.weight torch.Size([514, 1024])
model.decoder.layernorm_embedding.weight torch.Size([1024])
model.decoder.layernorm_embedding.bias torch.Size([1024])
model.decoder.layers.0.self_attn.k_proj.weight torch.Size([1024, 1024])
model.decoder.layers.0.self_attn.k_proj.bias torch.Size([1024])
model.decoder.layers.0.self_attn.v_proj.weight torch.Size([1024, 1024])
model.decoder.layers.0.self_attn.v_proj.bias torch.Size([1024])
model.decoder.layers.0.self_attn.q_proj.weight torch.Size([1024, 1024])
model.decoder.layers.0.self_attn.q_proj.bias torch.Size([1024])
model.decoder.layers.0.self_attn.out_proj.weight torch.Size([1024, 1024])
model.decoder.layers.0.self_attn.out_proj.bias torch.Size([1024])
model.decoder.layers.0.self_attn_layer_norm.weight torch.Size([1024])
model.decoder.layers.0.self_attn_layer_norm.bias torch.Size([1024])
model.decoder.layers.0.encoder_attn.k_proj.weight torch.Size([1024, 768])
m

RESUMO GERAL:

Self-Attention Ã© um mecanismo que permite que cada token preste atenÃ§Ã£o nos outros tokens da mesma sequÃªncia para entender melhor o contexto.

q(Query) = "O que eu quero buscar?"
k(Key) = "O que eu ofereÃ§o?"
v(Value) = "O que eu passo adiante?"

Cross-Attention Ã© o mecanismo que permite que o decoder olhe para a saÃ­da do encoder e use essa informaÃ§Ã£o para gerar o prÃ³ximo token corretamente.

O Feed-Forward processa cada token de forma independente, transformando o vetor de atenÃ§Ã£o em algo mais rico e nÃ£o-linear.

LayerNorm mantÃ©m os vetores dos tokens estÃ¡veis e equilibrados em todas as subcamadas do Transformer.




