In [1]:
from pathlib import Path
import kagglehub

# Download the latest dataset version and point `path` at its "IAM" subfolder.
path = Path(kagglehub.dataset_download("changheonkim/iam-trocr")) / "IAM"
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/changheonkim/iam-trocr?dataset_version_number=1...


100%|██████████| 90.1M/90.1M [00:01<00:00, 92.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM


In [2]:
import os

# Preview the contents of the dataset's "image" subfolder.
# Relies on the global `path` assigned in an earlier cell (the kagglehub download
# directory joined with "IAM"). When `path` is not defined, this cell silently
# does nothing; define it yourself first, e.g. for a dataset under '/content/IAM'.
# NOTE(review): `path` has to support the `/` operator (e.g. pathlib.Path) — a
# plain string would presumably break the `!ls` line below; confirm before relying on it.
if 'path' in globals():
    # The message shows the dataset root, while the listing below targets its "image" subfolder.
    print(f"Listing directories in: {path}")
    # IPython shell escape: the `{...}` expression is evaluated as Python and
    # substituted into the command, so this runs a plain (non-recursive) `ls`
    # on <path>/image.
    !ls {path/"image"}




Listing directories in: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM
c04-110-00.jpg	e06-070-02.jpg	 g07-000b-00.jpg  n02-157-05.jpg
c04-110-01.jpg	e06-070-03.jpg	 g07-000b-01.jpg  n02-157-06.jpg
c04-110-02.jpg	e06-070-04.jpg	 g07-000b-02.jpg  n02-157-07.jpg
c04-110-03.jpg	e06-070-05.jpg	 g07-000b-03.jpg  n02-157-08.jpg
c04-116-00.jpg	e06-070-06.jpg	 g07-000b-04.jpg  n03-038-00.jpg
c04-116-01.jpg	e06-070-07.jpg	 g07-000b-05.jpg  n03-038-01.jpg
c04-116-02.jpg	e06-070-08.jpg	 g07-000b-06.jpg  n03-038-02.jpg
c04-116-03.jpg	e06-070-09.jpg	 g07-000b-07.jpg  n03-038-03.jpg
c04-134-00.jpg	f04-032-00.jpg	 g07-000b-08.jpg  n03-038-04.jpg
c04-134-01.jpg	f04-032-01.jpg	 g07-000b-09.jpg  n03-038-05.jpg
c04-134-02.jpg	f04-032-02.jpg	 g07-079a-00.jpg  n03-038-06.jpg
c04-134-03.jpg	f04-032-03.jpg	 g07-079a-01.jpg  n03-064-00.jpg
c04-134-04.jpg	f04-032-04.jpg	 g07-079a-02.jpg  n03-064-01.jpg
c04-134-05.jpg	f04-032-05.jpg	 g07-079a-03.jpg  n03-064-02.jpg
c04-134-06.jpg	f04-032-0

In [3]:
import glob

# The JPEG files sit in the "image" folder directly under the dataset root `path`.
image_directory = path / "image"

# Collect every *.jpg match as a string, then order the list in place so
# indexing stays stable from run to run.
image_paths = glob.glob(str(image_directory / "*.jpg"))
image_paths.sort()

# Report the total and preview the first five entries.
print(f"Found {len(image_paths)} images in the dataset.")
print("First 5 image paths:")
for i, img_path in enumerate(image_paths[:5]):
    print(f"  {i+1}: {img_path}")

Found 2915 images in the dataset.
First 5 image paths:
  1: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-00.jpg
  2: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-01.jpg
  3: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-02.jpg
  4: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-110-03.jpg
  5: /root/.cache/kagglehub/datasets/changheonkim/iam-trocr/versions/1/IAM/image/c04-116-00.jpg


In [4]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import torchvision.transforms as transforms

class IAMImageDataset(Dataset):
    """Map-style dataset that serves IAM handwriting images loaded from disk.

    Only images are returned; no transcription/label is loaded.

    Args:
        image_paths: Iterable of image file paths. It is copied into a list,
            so one-shot iterables (generators, ``Path.glob`` results) work too.
        transform: Optional callable applied to each loaded PIL image
            (e.g. a ``torchvision.transforms.Compose``). ``None`` disables it.
        mode: PIL mode every image is converted to. Defaults to ``'RGB'``;
            pass ``'L'`` for single-channel grayscale.
    """

    def __init__(self, image_paths, transform=None, mode='RGB'):
        # Materialise once so __len__ and integer indexing are always valid.
        self.image_paths = list(image_paths)
        self.transform = transform
        self.mode = mode

    def __len__(self):
        """Return the number of image paths held by the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the image at position ``idx``, convert it, and transform it.

        Returns:
            The transformed image, or the converted PIL image when no
            transform was supplied.
        """
        img_path = self.image_paths[idx]
        # convert() yields an independent in-memory copy, so the file handle
        # can be released as soon as the ``with`` block exits.
        with Image.open(img_path) as source:
            image = source.convert(self.mode)

        # Compare against None: a callable that defines __len__/__bool__ could be falsy.
        if self.transform is not None:
            image = self.transform(image)

        # For now only the image is returned; a full pipeline would also
        # return the matching label/text.
        return image

# Preprocessing pipeline (adjust to taste): resize every image to 224x224,
# convert it to a tensor, then normalise with the ImageNet channel statistics.
transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Wrap the image paths collected in an earlier cell in the custom dataset.
iam_dataset = IAMImageDataset(image_paths, transform=transform)

print(f"Number of samples in the dataset: {len(iam_dataset)}")

# Sanity check: fetch the first sample and inspect its tensor shape.
single_image_tensor = iam_dataset[0]
print(f"Shape of a single image tensor: {single_image_tensor.shape}")

Number of samples in the dataset: 2915
Shape of a single image tensor: torch.Size([3, 224, 224])


After creating your `Dataset` object, you would typically use a `DataLoader` to iterate over batches of data, shuffle them, and load them in parallel. This is especially useful for training deep learning models.