# Efficient Data Handling

### Working with `torch.utils.data.Dataset`

In [1]:
import torch
from torch.utils.data import Dataset
import numpy as np

class SimpleCustomDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label."""

    def __init__(self, features, labels):
        """
        Keep references to the feature and label containers.

        Args:
            features (list or np.array): Per-sample features.
            labels (list or np.array): Per-sample labels, aligned with `features`.
        """
        # Every feature entry needs exactly one matching label
        assert len(features) == len(labels), "Features and labels must have the same length."
        self.features, self.labels = features, labels

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.features)

    def __getitem__(self, idx):
        """
        Fetch a single sample as a pair of tensors.

        Args:
            idx (int): Position of the requested sample.

        Returns:
            tuple: (feature, label) -- the feature as a float32 tensor and
                the label as a long tensor.
        """
        # The stored data may not be tensors yet, so they are built on access
        feature_tensor = torch.tensor(self.features[idx], dtype=torch.float32)
        # long dtype: the label is assumed to be a classification class index
        label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
        return feature_tensor, label_tensor

In [2]:
# --- Example Usage ---
# Synthetic stand-in data (swap in your real features and labels)
num_samples, num_features = 100, 10
features_data = np.random.randn(num_samples, num_features)
labels_data = np.random.randint(low=0, high=5, size=num_samples)  # 5 classes: labels 0..4

# Wrap the arrays in the custom dataset
my_dataset = SimpleCustomDataset(features_data, labels_data)

# len() goes through SimpleCustomDataset.__len__
print(f"Dataset size: {len(my_dataset)}")

# Indexing goes through __getitem__ and returns a (feature, label) pair
first_sample = my_dataset[0]
feature_sample, label_sample = first_sample
print(f"\nFirst sample features:\n{feature_sample}")
print(f"First sample shape: {feature_sample.shape}")
print(f"First sample label: {label_sample}")

# Index 9 is the tenth sample; element 1 of the pair is its label
tenth_sample = my_dataset[9]
print(f"\nTenth sample label: {tenth_sample[1]}")

Dataset size: 100

First sample features:
tensor([-0.6250,  0.5624, -0.0776,  1.0968,  0.5423, -0.1851,  1.3632, -0.9676,
         0.1250, -1.5552])
First sample shape: torch.Size([10])
First sample label: 0

Tenth sample label: 4


In [3]:
import torch
from torch.utils.data import Dataset
from PIL import Image # Python Imaging Library for image loading
import pandas as pd
import os

class ImageFilelistDataset(Dataset):
    """Dataset for loading image paths and labels from a CSV file.

    Rows are read positionally: the first CSV column is the image path
    relative to `root_dir`, the second column is the label.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
                               Assumes columns: 'image_path', 'label'
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                                           on the loaded image.
        """
        self.annotations = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Returns the number of annotation rows."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """
        Loads the image and label for one annotation row.

        Args:
            idx (int): Positional row index into the annotations.

        Returns:
            tuple: (image, label). `image` is whatever `transform` returns if
                that is a tensor; otherwise a float32 tensor in channel-first
                layout scaled to [0, 1]. `label` is a long tensor.
                Returns (None, None) when the image file does not exist.
        """
        # Get image path relative to root_dir from the CSV (first column)
        img_rel_path = self.annotations.iloc[idx, 0]
        img_full_path = os.path.join(self.root_dir, img_rel_path)

        try:
            # The context manager closes the underlying file handle right away;
            # convert() returns a separate in-memory image that stays valid.
            with Image.open(img_full_path) as img_file:
                image = img_file.convert('RGB')  # Ensure 3 channels
        except FileNotFoundError:
            print(f"Error: Image not found at {img_full_path}")
            # Deliberate best-effort for this example: signal the missing file
            # with None values. The default collate_fn cannot batch None, so
            # use a collate_fn that filters these out, or (better) clean the
            # CSV beforehand.
            return None, None

        # Get the label from the CSV (second column)
        label = self.annotations.iloc[idx, 1]
        label = torch.tensor(int(label), dtype=torch.long)

        # Compare against None instead of truthiness so that a callable which
        # happens to be falsy (e.g. an empty container module) is still applied
        if self.transform is not None:
            image = self.transform(image)  # Transforms usually convert PIL Image to Tensor

        # If nothing converted the image to a tensor yet, do it manually:
        # HWC uint8 array -> CHW float32 tensor in [0, 1]
        if not isinstance(image, torch.Tensor):
            image = torch.tensor(np.array(image), dtype=torch.float32).permute(2, 0, 1) / 255.0

        return image, label

### Built-in Datasets (e.g., TorchVision)

In [4]:
import torchvision
import torchvision.transforms as transforms

# Single-step pipeline: ToTensor converts each image to a PyTorch tensor
transform = transforms.Compose([transforms.ToTensor()])

# Training split
#   root      -> directory where the data is stored/found
#   train     -> True selects the training set
#   download  -> fetch the data if it is not found locally
#   transform -> applied to every image on access
train_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)

# Test split: same arguments, but train=False
test_dataset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)

print(f"CIFAR-10 training dataset size: {len(train_dataset)}")
print(f"CIFAR-10 test dataset size: {len(test_dataset)}")

# Indexing the dataset yields one (image, label) pair
img, label = train_dataset[0]
print(f"Image shape: {img.shape}")
print(f"Label: {label}")

100%|██████████| 170M/170M [00:01<00:00, 106MB/s]  


CIFAR-10 training dataset size: 50000
CIFAR-10 test dataset size: 10000
Image shape: torch.Size([3, 32, 32])
Label: 6


### Data Transformations (`torchvision.transforms`)

In [1]:
import torchvision.transforms as transforms

# Training pipeline: resize, random augmentation, tensor conversion, normalization
train_transform = transforms.Compose([
    # Smaller edge is resized to 256
    transforms.Resize(size=256),
    # Random 224x224 patch
    transforms.RandomCrop(size=224),
    # Random horizontal flip (p=0.5 is the default probability)
    transforms.RandomHorizontalFlip(p=0.5),
    # PIL Image -> tensor in the 0-1 range
    transforms.ToTensor(),
    # Per-channel normalization with ImageNet statistics
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [2]:
# Validation/testing pipeline: same preprocessing, but no random augmentation
test_transform = transforms.Compose([
    transforms.Resize(size=256),
    # Center crop to 224x224 instead of a random crop
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

In [3]:
# Show each pipeline under its heading, training first
for heading, pipeline in (
    ("Training Transforms:", train_transform),
    ("\nTesting Transforms:", test_transform),
):
    print(heading)
    print(pipeline)

Training Transforms:
Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=True)
    RandomCrop(size=(224, 224), padding=None)
    RandomHorizontalFlip(p=0.5)
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)

Testing Transforms:
Compose(
    Resize(size=256, interpolation=bilinear, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
)


### Using `torch.utils.data.DataLoader`

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader

# For demonstration, let's create a simple dummy dataset:
class DummyDataset(Dataset):
    """Synthetic dataset: random 10-dimensional features with binary labels."""

    def __init__(self, num_samples=100):
        """
        Args:
            num_samples (int): Number of random samples to generate.
        """
        self.num_samples = num_samples
        # Features are drawn first, then labels
        self.features = torch.randn(num_samples, 10)  # shape: (num_samples, 10)
        self.labels = torch.randint(low=0, high=2, size=(num_samples,))  # values 0 or 1

    def __len__(self):
        """Number of generated samples."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return the (features, label) pair stored at position `idx`."""
        sample = (self.features[idx], self.labels[idx])
        return sample

In [2]:
# Build the dataset; 105 is not a multiple of 32, so the final batch is smaller
dataset = DummyDataset(num_samples=105)

# Wrap it in a DataLoader
#   batch_size -> number of samples per batch
#   shuffle    -> True reshuffles the data every epoch (important for training)
train_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
)

In [3]:
# Report the dataset size and the loader's configured batch size
print(
    f"Dataset size: {len(dataset)}",
    f"DataLoader batch size: {train_loader.batch_size}",
    sep="\n",
)

Dataset size: 105
DataLoader batch size: 32


In [4]:
for epoch in range(1):  # a single epoch is enough for this example
    print(f"\n--- Epoch {epoch+1} ---")
    # Each item the DataLoader yields unpacks into a features tensor and a
    # labels tensor; batches are numbered from 1 for display.
    for batch_number, (features, labels) in enumerate(train_loader, start=1):
        print(f"Batch {batch_number}: Features shape={features.shape}, Labels shape={labels.shape}")


--- Epoch 1 ---
Batch 1: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])
Batch 2: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])
Batch 3: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])
Batch 4: Features shape=torch.Size([9, 10]), Labels shape=torch.Size([9])


In [5]:
# drop_last=True discards the last, smaller batch when the dataset size is not
# divisible by the batch size, so every batch has exactly batch_size samples.
train_loader_drop_last = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    drop_last=True,
)

print("\n--- DataLoader with drop_last=True ---")
for batch_number, (features, labels) in enumerate(train_loader_drop_last, start=1):
    print(f"Batch {batch_number}: Features shape={features.shape}, Labels shape={labels.shape}")


--- DataLoader with drop_last=True ---
Batch 1: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])
Batch 2: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])
Batch 3: Features shape=torch.Size([32, 10]), Labels shape=torch.Size([32])


In [6]:
# Multi-process loading: any num_workers > 0 enables worker processes; here 4.
# num_workers = 4 * num_gpus is a common starting point, but the best value
# depends on the system (CPU cores, disk speed) and the batch size, so expect
# to experiment.
fast_loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=True,
    num_workers=4,
)

# The loop is written exactly as before; loading happens in background processes
# for features, labels in fast_loader:
#     # Training steps...
#     pass



_The `DataLoader` wraps the `Dataset` and, if `num_workers` `> 0`, uses worker processes to fetch and collate samples into batches, which are then consumed by the training loop._

In [7]:
# pin_memory=True turns on pinned memory for faster CPU-to-GPU transfers
gpu_optimized_loader = DataLoader(
    dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True
)

Setting `pin_memory=True` instructs the `DataLoader` to copy each batch's tensors into "pinned" (page-locked) memory on the CPU side before returning them. 
Transfers from pinned CPU memory to GPU memory are generally faster than from standard pageable CPU memory. This is most effective when used in conjunction with `num_workers > 0`. 

**Note:** Using pinned memory consumes more CPU RAM, because page-locked memory cannot be swapped out by the operating system.

### Customizing DataLoader Behavior

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

# Assume 'dataset' is your torch.utils.data.Dataset instance
# Assume 'targets' is a list or tensor containing the integer class label for each sample
# e.g., targets = [0, 0, 1, 0, ..., 1, 0]

# as_tensor accepts a list or an existing tensor; unlike torch.tensor it does
# not copy (and warn about) an input that is already a tensor
targets_tensor = torch.as_tensor(targets)

# Count how many samples belong to each class
class_counts = torch.bincount(targets_tensor) # Counts per class: e.g., [900, 100]
num_samples = len(targets) # Total samples: 1000

# Weight for each sample is 1 / (number of samples in its class).
# Indexing the counts with the label tensor looks up every sample's class
# count in one vectorized step instead of a Python loop.
sample_weights = 1.0 / class_counts[targets_tensor.long()].float()

# Create the sampler; replacement=True lets samples be drawn more than once
sampler = WeightedRandomSampler(weights=sample_weights, num_samples=num_samples, replacement=True)

# Create the DataLoader using the custom sampler
# Note: do not pass shuffle=True together with a sampler
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler)

# Now, batches drawn from this dataloader will have a more balanced
# representation of classes over time.
# for batch_features, batch_labels in dataloader:
#     # Training steps...
#     pass

A custom `collate_fn` can pad the sequences within each batch to the maximum length in that batch.

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Example Dataset returning variable-length tensors
class VariableSequenceDataset(Dataset):
    """Dataset over a list of variable-length tensors, labelled by their length."""

    def __init__(self, data):
        """
        Args:
            data (list): Tensors of differing lengths,
                e.g. [torch.randn(5), torch.randn(8), ...].
        """
        self.data = data

    def __len__(self):
        """Number of stored sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return (sequence, label) where the label is the sequence's length."""
        item = self.data[idx]
        # The length serves as a simple stand-in label
        return item, len(item)

# Custom collate function
def pad_collate(batch):
    """Collate (sequence, label) pairs into one zero-padded batch.

    Args:
        batch (list): Samples shaped like
            [(sequence1, label1), (sequence2, label2), ...].

    Returns:
        tuple: (padded_sequences, labels). Sequences are padded with 0.0 to
            the longest length in the batch and stacked batch-first, i.e.
            (batch_size, max_seq_len, ...). Labels are gathered into a single
            tensor (they are assumed to be simple scalars).
    """
    # Split the pairs into parallel lists, keeping the incoming batch order.
    # Sorting by length (longest first) is sometimes done for RNN efficiency,
    # but padding does not need it.
    seqs, targets = [], []
    for item in batch:
        seqs.append(item[0])
        targets.append(item[1])

    # batch_first=True puts the batch dimension in front of the time dimension
    padded = pad_sequence(seqs, batch_first=True, padding_value=0.0)

    return padded, torch.tensor(targets)

# Build 100 one-dimensional sequences with random lengths in [5, 15)
sequences = []
for sample_index in range(100):
    seq_len = torch.randint(5, 15, (1,)).item()
    sequences.append(torch.randn(seq_len))
dataset = VariableSequenceDataset(sequences)

# pad_collate pads each batch to the longest sequence in that batch
dataloader = DataLoader(dataset, batch_size=4, collate_fn=pad_collate)

# Iterate through the dataloader
# for padded_batch, label_batch in dataloader:
#     # padded_batch shape: (4, max_len_in_this_batch) since the sequences are 1D
#     # label_batch shape: (4,)
#     # Model processing...
#     pass

This custom `collate_fn` uses `torch.nn.utils.rnn.pad_sequence` to handle the padding, ensuring all sequences in the batch have the same length, making them suitable for processing by models like RNNs.