Experimenting with, and learning, the PyTorch Dataset/DataLoader API for feeding data to different models

In [None]:
import os

import torch
import torch.utils as utils
from torch.utils.data import DataLoader
from torch.utils.data import random_split
from torchvision import datasets, transforms

In [None]:
# https://www.scottcondron.com/jupyter/visualisation/audio/2020/12/02/dataloaders-samplers-collate.html

In [None]:
# for xb, yb in dataloader:
#   ....


In [None]:
# toy inputs (0..9) and targets (10..19): each target is its input shifted by 10
xs = [value for value in range(10)]
ys = [value + 10 for value in xs]
print('xs values: ', xs)
print('ys values: ', ys)

xs values:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
ys values:  [10, 11, 12, 13, 14, 15, 16, 17, 18, 19]


In [None]:
# What a dataset has to provide here:
#   * indexing by a single value -> dataset[i]
#   * a length                   -> len(dataset)
# Every item of this one is an (x, y) tuple, i.e. a value paired with its label.
dataset = [(x, y) for x, y in zip(xs, ys)]
dataset[0]  # -> (xs[0], ys[0]); not displayed, only the cell's last expression is
len(dataset)

10

In [None]:
# A dataset can be wrapped in a DataLoader: every item comes back as tensors, and
# shuffling, batching and multi-process loading become available as options.
# With the default settings the items are yielded one at a time, in their original order.
# (DataLoader is imported in the import cell at the top of the notebook.)
for x, y in DataLoader(dataset):
    print(x, y)

tensor([0]) tensor([10])
tensor([1]) tensor([11])
tensor([2]) tensor([12])
tensor([3]) tensor([13])
tensor([4]) tensor([14])
tensor([5]) tensor([15])
tensor([6]) tensor([16])
tensor([7]) tensor([17])
tensor([8]) tensor([18])
tensor([9]) tensor([19])


In [None]:
# batch_size=2 -> consecutive items are collated into batches of two
pairs_loader = DataLoader(dataset, batch_size=2)
for x, y in pairs_loader:
    print(x, y)

tensor([0, 1]) tensor([10, 11])
tensor([2, 3]) tensor([12, 13])
tensor([4, 5]) tensor([14, 15])
tensor([6, 7]) tensor([16, 17])
tensor([8, 9]) tensor([18, 19])


In [None]:
# shuffle=True: the items are shuffled first and only THEN grouped into batches.
# The order is drawn from a seeded generator so that re-running the notebook
# prints the same batches every time.
shuffle_rng = torch.Generator().manual_seed(42)
for x, y in DataLoader(dataset, batch_size=2, shuffle=True, generator=shuffle_rng):
    print(x, y)

tensor([4, 3]) tensor([14, 13])
tensor([0, 7]) tensor([10, 17])
tensor([6, 8]) tensor([16, 18])
tensor([1, 5]) tensor([11, 15])
tensor([2, 9]) tensor([12, 19])


In [None]:
# Training split of CIFAR-10; downloaded into ./CIFAR-10 if it is not available there yet.
# CIFAR-10 has 60000 images: 50000 are for training and 10000 for testing.
train_data = datasets.CIFAR10(
    root="CIFAR-10",
    train=True,
    download=True,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to CIFAR-10/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting CIFAR-10/cifar-10-python.tar.gz to CIFAR-10


In [None]:
# number of samples in the CIFAR-10 training split
len(train_data)

50000

In [None]:
# Test split of CIFAR-10. It uses a different root than the training split, so the very
# same archive is fetched a second time; pointing both splits at one root would
# probably make a single download enough.
test_data = datasets.CIFAR10(
    root="CIFAR-10-test",
    train=False,
    download=True,
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to CIFAR-10-test/cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]

Extracting CIFAR-10-test/cifar-10-python.tar.gz to CIFAR-10-test


In [None]:
# number of samples in the CIFAR-10 test split
len(test_data)

10000

In [None]:
# Indexing the dataset returns an (image, label) tuple.
# No transform was passed, so the image (x) is still a PIL image; its label (y) is 6.
train_data[0]

(<PIL.Image.Image image mode=RGB size=32x32 at 0x7F4D59D48750>, 6)

In [None]:
# from pytorch docs
# s1, s2 =random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))
# len(s1) == 3
# len(s2) == 7

In [None]:
# random_split returns Subset objects. This is shown on a tiny throw-away split so the
# cell does not depend on variables that are only created further down the notebook.
demo_subset, demo_rest = random_split(range(10), [3, 7], generator=torch.Generator().manual_seed(42))
type(demo_subset)

torch.utils.data.dataset.Subset

In [None]:
# https://www.geeksforgeeks.org/training-neural-networks-with-validation-using-pytorch/
# General idea: data is already partitioned

# https://www.binarystudy.com/2021/04/how-to-calculate-mean-standard-deviation-images-pytorch.html

# https://stackoverflow.com/questions/62549990/what-does-next-and-iter-do-in-pytorchs-dataloader

# http://www.bikashsantra.byethost7.com/pyTorch/4_cifar10_tutorial.html?i=1
# THIS gives the correct solution from what I can tell

# Order matters: Resize is applied to the PIL image, ToTensor converts it to a tensor,
# and Normalize only accepts tensors, so it has to come after ToTensor.
img_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# No download=True here: the files are expected to already exist under CIFAR-10-test.
# The full 50000-image training set keeps its own name so that re-running this cell
# never feeds an already-split `train` back into random_split.
train_full = datasets.CIFAR10(root="CIFAR-10-test", train=True, transform=img_transform)

# Fixed seed -> the same 40000 / 10000 train/validation partition on every run.
train, validation = random_split(train_full, [40000, 10000], generator=torch.Generator().manual_seed(42))
print("Size of Training Set: {0}".format(len(train)))
print("Size of Validation Set: {0}".format(len(validation)))

test = datasets.CIFAR10(root="CIFAR-10-test", train=False, transform=img_transform)
print("Size of Test Set: {0}".format(len(test)))

Size of Training Set: 40000
Size of Validation Set: 10000
Size of Test Set: 10000


In [None]:
# `train` came out of random_split; `.dataset` is the full dataset that was split
train.dataset

Dataset CIFAR10
    Number of datapoints: 50000
    Root location: CIFAR-10-test
    Split: Train
    StandardTransform
Transform: Compose(
               Resize(size=224, interpolation=bilinear, max_size=None, antialias=None)
               ToTensor()
           )

In [None]:
# apply transforms and loaders
# RandomResizedCrop -> crop a random part of the image and then resize it
# RandomHorizontalFlip -> randomly flips an image horizontally
# ToTensor -> converts the image to a tensor
# Normalize -> normalizes with the given mean and std for EACH channel
# NOTE: The following values were derived from Imagenet based stuff, may be
#       worth normalizing for each set
#data_transforms = {
#    'train': transforms.Compose([
#        transforms.RandomResizedCrop(input_size),
#        transforms.RandomHorizontalFlip(),
#        transforms.ToTensor(),
#        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
#    ]),
#    'val': transforms.Compose([
#        transforms.Resize(input_size),
#        transforms.CenterCrop(input_size),
#        transforms.ToTensor(),
#        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
#    ]),
#}

In [None]:
# https://deeplizard.com/learn/video/lu7TCu7HeYc
# Mean and standard deviation over every value of the training split.
#
# The split is NOT loaded as one batch (batch_size=len(train)): 40000 images resized to
# 3 x 224 x 224 float32 values are roughly 24 GB, which has to be collated in memory at once.
# Instead the sum and the sum of squares are accumulated batch by batch.
STATS_BATCH_SIZE = 256
stats_workers = min(4, os.cpu_count() or 1)  # never request more workers than there are CPUs
loader = DataLoader(train, batch_size=STATS_BATCH_SIZE, num_workers=stats_workers)

n_values = 0
value_sum = torch.zeros((), dtype=torch.float64)     # float64 keeps the long running sums accurate
value_sq_sum = torch.zeros((), dtype=torch.float64)
for images, _labels in loader:
    images = images.double()
    n_values += images.numel()
    value_sum += images.sum()
    value_sq_sum += (images * images).sum()

mean = value_sum / n_values
# divide by (n - 1): the same unbiased estimator Tensor.std() uses by default
std = torch.sqrt((value_sq_sum - n_values * mean * mean) / (n_values - 1))
mean.float(), std.float()

  cpuset_checked))


RuntimeError: ignored