In [1]:
import torch
import torchvision
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
from torchvision import models, transforms as T
import numpy as np
import os
from patchify import patchify, unpatchify
import matplotlib.pyplot as plt

In [2]:
voc_dir = 'C:/Users/Hayden/Machine Learning/d2l/d2l-en/pytorch/chapter_computer-vision/data/VOCdevkit/VOC2012/'

In [None]:
import os
from PIL import Image

def get_image_dimensions(folder_path):
    min_width = float('inf')
    max_width = float('-inf')
    min_height = float('inf')
    max_height = float('-inf')
    total_width = 0
    total_height = 0
    total_images = 0
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp'))]

    if not image_files:
        print("No image files found in the folder.")
        return None

    for file_name in image_files:
        file_path = os.path.join(folder_path, file_name)
        try:
            with Image.open(file_path) as img:
                width, height = img.size
                min_width = min(min_width, width)
                max_width = max(max_width, width)
                min_height = min(min_height, height)
                max_height = max(max_height, height)
                total_width = total_width + width
                total_height = total_height + height
                total_images = total_images + 1
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    return {
        'min_width': min_width,
        'max_width': max_width,
        'min_height': min_height,
        'max_height': max_height,
        'total_width': total_width,
        'total_height': total_height,
        'total_images': total_images
    }

# Example usage
result = get_image_dimensions(os.path.join(voc_dir, 'SegmentationClass'))
if result:
    print("Image Dimension Summary:")
    print(f"Min Width: {result['min_width']}")
    print(f"Max Width: {result['max_width']}")
    print(f"Min Height: {result['min_height']}")
    print(f"Max Height: {result['max_height']}")
    print(f"Average Width: {result['total_width'] / result['total_images']}")
    print(f"Average Height: {result['total_height'] / result['total_images']}")


In [None]:
def read_images(data_dir, is_train):
    '''Reads and returns all images and their respective masks.'''
    annotations_file = os.path.join(data_dir, 'ImageSets', 'Segmentation', 'train.txt' if is_train else 'val.txt')
    rgb_mode = torchvision.io.image.ImageReadMode.RGB

    with open(annotations_file, 'r') as f:
        image_names = f.read().split()
    
    images, masks = [], []
    for file_name in image_names:
        images.append(
            torchvision.io.read_image(os.path.join(data_dir, 'JPEGImages', f'{file_name}.jpg'))
        )
        masks.append(
            torchvision.io.read_image(os.path.join(data_dir, 'SegmentationClass', f'{file_name}.png'), rgb_mode)
        )
    
    return images, masks, image_names

In [3]:
def patch_image(patch_size, img_dir):
    '''Converts an image into several patches.'''
    img = Image.open(img_dir).convert('RGB')

    # Compute padding needed to make dimensions divisible by patch_size
    pad_h = (patch_size - img.height % patch_size) % patch_size
    pad_w = (patch_size - img.width  % patch_size) % patch_size
    pad = T.Pad((0, 0, pad_w, pad_h))  # Pad right and bottom
    img_padded = pad(img)

    img_array = np.array(img_padded)
    patches = patchify(img_array, (patch_size, patch_size, 3), step=patch_size)

    # print(patches.shape)
    return patches

def read_and_patch_images(data_dir, is_train):
    annotations_file = os.path.join(data_dir, 'ImageSets', 'Segmentation', 'train.txt' if is_train else 'val.txt')

    with open(annotations_file, 'r') as f:
        image_names = f.read().split()
    
    images, masks = [], []
    for file_name in image_names:
        images.extend(
            patch_image(128, os.path.join(data_dir, 'JPEGImages', f'{file_name}.jpg'))
        )
        masks.extend(
            patch_image(128, os.path.join(data_dir, 'SegmentationClass', f'{file_name}.png'))
        )
    # print(len(images[0]), len(images[0][1]), len(images[0][1][2]), len(images[3]))
    images, masks = np.concatenate(images), np.concatenate(masks)
    # print(images.shape, masks.shape)
    # print('rapi:', images[0].shape)
    return images, masks

In [None]:
def display_patches(patches):
    '''Displays all patches in an image in their respective positions in the image.'''
    n_rows, n_cols = patches.shape[0], patches.shape[1]
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols*2, n_rows*2))

    for i in range(n_rows):
        for j in range(n_cols):
            patch = patches[i, j, 0]  # shape: (patch_size, patch_size, 3)
            axes[i, j].imshow(patch)
            axes[i, j].axis('off')

    plt.tight_layout()
    plt.show()

image = os.path.join(
    'C:/Users/Hayden/Machine Learning/d2l/d2l-en/pytorch/chapter_computer-vision/data/VOCdevkit/VOC2012/JPEGImages/',
    read_images(voc_dir, False)[2][10] + '.jpg'
)
display_patches(patch_image(128, image))
mask = os.path.join(
    'C:/Users/Hayden/Machine Learning/d2l/d2l-en/pytorch/chapter_computer-vision/data/VOCdevkit/VOC2012/SegmentationClass/',
    read_images(voc_dir, False)[2][10] + '.png'
)
display_patches(patch_image(128, mask))

patches1 = patch_image(128, 'C:/Users/Hayden/Machine Learning/d2l/d2l-en/pytorch/chapter_computer-vision/data/VOCdevkit/VOC2012/SegmentationClass/2007_000033.png')
display_patches(patches1)
display_patches(patch_image(128, 'C:/Users/Hayden/Machine Learning/d2l/d2l-en/pytorch/chapter_computer-vision/data/VOCdevkit/VOC2012/JPEGImages/2007_000033.jpg'))

In [None]:
class VocDataset(Dataset):
    '''Dataset that pre-loads the PASCAL2 VOC 2012 dataset into RAM. This implies that the DataLoader must have num_workers=0. The argument "transforms" must be a transformation ending with ToTensor.'''
    def __init__(self, data_dir, transforms, is_train):
        self.data_dir = data_dir
        self.filter = filter
        self.transforms = transforms
        self.is_train = is_train
        self.normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.images, self.masks = self.load_images()

    def load_images(self):
        images, masks = read_and_patch_images(self.data_dir, self.is_train)
        images = images.squeeze(1)
        masks = masks.squeeze(1)

        images = torch.stack([
            self.normalize(self.transforms(image).to(torch.float32))
            for image in images
        ])
        masks = torch.stack([
            self.transforms(mask).to(torch.float32)
            for mask in masks
        ])
        print('TENSOR', images.shape, type(images), type(images[0]), type(images[0][0]), type(images[0][0][0]))
        
        return images, masks

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        return self.images[index], self.masks[index]

transforms = torchvision.transforms.ToTensor()
train_set = VocDataset(voc_dir, transforms, True)

TENSOR torch.Size([17900, 3, 128, 128]) <class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>


In [5]:
train_iter = torch.utils.data.DataLoader(train_set, 256, shuffle=True, drop_last=True, num_workers=0)
print('Total number of batches:', len(train_iter))
for X, Y in train_iter:
    print('X shape:', X.shape)
    print('Y shape:', Y.shape)
    print(X)
    break

Total number of batches: 69
X shape: torch.Size([256, 3, 128, 128])
Y shape: torch.Size([256, 3, 128, 128])
tensor([[[[ 0.9646,  0.7248,  0.8961,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.9132,  1.1358,  1.2728,  ..., -2.1179, -2.1179, -2.1179],
          [ 1.3755,  1.4440,  1.3584,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [-1.3987, -1.7412, -1.7583,  ..., -2.1179, -2.1179, -2.1179],
          [-1.2445, -1.6384, -1.7412,  ..., -2.1179, -2.1179, -2.1179],
          [-0.6965, -1.1932, -1.0219,  ..., -2.1179, -2.1179, -2.1179]],

         [[ 1.0630,  0.7479,  0.7479,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.9755,  1.1331,  1.1506,  ..., -2.0357, -2.0357, -2.0357],
          [ 1.4132,  1.3957,  1.2381,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [-1.0378, -1.3880, -1.4230,  ..., -2.0357, -2.0357, -2.0357],
          [-0.9853, -1.4405, -1.4930,  ..., -2.0357, -2.0357, -2.0357],
          [-0.5126, -1.0553, -0.8452,  ..., -2.0357, -2.0357, -2.035