Data set and data loader classes

Unlike the earlier linear and logistic regression modules, where training happened on all the data at once in every epoch, real data sets are typically too large to process in one shot, so we rarely compute the loss across all the data. Instead, we divide the data set into batches, compute the loss per batch, and do a gradient descent step per batch.



PyTorch's `Dataset` and `DataLoader` classes make batch training easier.

In [3]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

In [4]:
## implement own custom dataset

In [7]:
class WineDataset(Dataset):
    """Wine data held fully in memory as torch tensors.

    ``wine.txt`` is read as comma-separated float32 values with its first
    row skipped; column 0 is used as the label and the remaining columns
    as the features.
    """

    def __init__(self):
        table = np.loadtxt('wine.txt', delimiter=",", dtype=np.float32, skiprows=1)
        self.n_samples = table.shape[0]
        ## indexing with the list [0] keeps the label column 2-D: n_samples*1
        self.y = torch.from_numpy(table[:, [0]])
        ## everything after the first column is a feature
        self.x = torch.from_numpy(table[:, 1:])

    def __len__(self):
        """Return the number of rows loaded from the file."""
        return self.n_samples

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features, label = self.x[index], self.y[index]
        return features, label
        
        
        

In [8]:
dataset = WineDataset()

In [9]:
first_data = dataset[0]

In [10]:
features, labels = first_data

In [13]:
features

tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03])

In [None]:
batch_size = 4

In [20]:
## shuffle=True draws the samples in a random order; num_workers=0 loads the batches in the main process
dataloader = DataLoader(dataset = dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [21]:
dataiter = iter(dataloader)


In [23]:
data = next(dataiter)

In [26]:
data[0].shape

torch.Size([4, 13])

In [27]:
features, labels = data

In [28]:
print(features) ## see 4 samples features as batch size is 4

tensor([[1.2520e+01, 2.4300e+00, 2.1700e+00, 2.1000e+01, 8.8000e+01, 2.5500e+00,
         2.2700e+00, 2.6000e-01, 1.2200e+00, 2.0000e+00, 9.0000e-01, 2.7800e+00,
         3.2500e+02],
        [1.1840e+01, 2.8900e+00, 2.2300e+00, 1.8000e+01, 1.1200e+02, 1.7200e+00,
         1.3200e+00, 4.3000e-01, 9.5000e-01, 2.6500e+00, 9.6000e-01, 2.5200e+00,
         5.0000e+02],
        [1.2530e+01, 5.5100e+00, 2.6400e+00, 2.5000e+01, 9.6000e+01, 1.7900e+00,
         6.0000e-01, 6.3000e-01, 1.1000e+00, 5.0000e+00, 8.2000e-01, 1.6900e+00,
         5.1500e+02],
        [1.2330e+01, 9.9000e-01, 1.9500e+00, 1.4800e+01, 1.3600e+02, 1.9000e+00,
         1.8500e+00, 3.5000e-01, 2.7600e+00, 3.4000e+00, 1.0600e+00, 2.3100e+00,
         7.5000e+02]])


In [29]:
## dummy training loop

In [32]:
num_epochs = 2
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/batch_size) ## batches per epoch = ceil(total samples / batch size); the last batch may be smaller

In [33]:
print(total_samples, n_iterations)

178 45


In [34]:
for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        inputs, labels = batch

        ## forward , backward pass would go here

        ## report progress on every 5th step only
        if (i+1) % 5 != 0:
            continue
        print(f'epoch : {epoch + 1}/{num_epochs}, step {i+1}/{n_iterations}, inputs {inputs.shape}')
    

epoch : 1/2, step 5/45, inputs torch.Size([4, 13])
epoch : 1/2, step 10/45, inputs torch.Size([4, 13])
epoch : 1/2, step 15/45, inputs torch.Size([4, 13])
epoch : 1/2, step 20/45, inputs torch.Size([4, 13])
epoch : 1/2, step 25/45, inputs torch.Size([4, 13])
epoch : 1/2, step 30/45, inputs torch.Size([4, 13])
epoch : 1/2, step 35/45, inputs torch.Size([4, 13])
epoch : 1/2, step 40/45, inputs torch.Size([4, 13])
epoch : 1/2, step 45/45, inputs torch.Size([2, 13])
epoch : 2/2, step 5/45, inputs torch.Size([4, 13])
epoch : 2/2, step 10/45, inputs torch.Size([4, 13])
epoch : 2/2, step 15/45, inputs torch.Size([4, 13])
epoch : 2/2, step 20/45, inputs torch.Size([4, 13])
epoch : 2/2, step 25/45, inputs torch.Size([4, 13])
epoch : 2/2, step 30/45, inputs torch.Size([4, 13])
epoch : 2/2, step 35/45, inputs torch.Size([4, 13])
epoch : 2/2, step 40/45, inputs torch.Size([4, 13])
epoch : 2/2, step 45/45, inputs torch.Size([2, 13])


## Transforms

Transforms give us the ability to apply transformations to the samples of a dataset.

Very relevant for images (cropping, rotation etc) but also for tensors in general

You can also compose multiple transforms

Pytorch natively supports a lot of transforms https://pytorch.org/tutorials/beginner/basics/transforms_tutorial.html

In addition, we can write custom transforms like below

Let's take the same wine dataset class as before, but comment out the torch conversion steps; we will do that in a transform instead. We also add a `transform` argument to the constructor.

In [51]:
class WineDataset(Dataset):
    """Wine dataset that stores NumPy arrays and applies an optional transform.

    ``wine.txt`` is read as comma-separated float32 values with its first
    row skipped; column 0 is used as the label and the remaining columns
    as the features.

    Args:
        transform: optional callable that takes and returns an
            ``(inputs, target)`` pair. When ``None`` (the default) samples
            are returned unchanged, as NumPy arrays.
    """

    def __init__(self, transform=None):
        ## data loading
        xy = np.loadtxt('wine.txt', delimiter=",", dtype=np.float32, skiprows=1)
        ## the torch conversion is deliberately left out here; a transform does it per sample
        ##self.x = torch.from_numpy(xy[:, 1: ])
        ##self.y = torch.from_numpy(xy[:, [0]])
        self.x = xy[:, 1:]  ## all columns except first column
        self.y = xy[:, [0]]  ## n_samples*1
        self.n_samples = xy.shape[0]
        self.transform = transform

    def __getitem__(self, index):
        """Return the sample at ``index``, transformed if a transform was given."""
        sample = self.x[index], self.y[index]
        ## calling None would raise TypeError, so only transform when one was supplied
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

    def __len__(self):
        """Return the number of rows loaded from the file."""
        return self.n_samples

## define a custom transform class

In [52]:
class ToTensor:
    """Transform that converts an ``(inputs, targets)`` pair of NumPy arrays to tensors."""

    def __call__(self, sample):
        ## __call__ is a special method (like __init__) that makes an instance
        ## callable like a function: ToTensor()(sample)
        features, labels = sample
        converted = (torch.from_numpy(features), torch.from_numpy(labels))
        return converted
    

In [53]:
dataset = WineDataset(transform=ToTensor())

In [55]:
first_data = dataset[0]
features, labels = first_data
print(type(features))

<class 'torch.Tensor'>


## using compose

In [64]:
class MulTransform:
    """Transform that multiplies the inputs of a sample by a fixed factor.

    The target is passed through untouched.
    """

    def __init__(self, factor):
        self.factor = factor

    def __call__(self, sample):
        """Return ``(inputs * factor, target)`` for an ``(inputs, target)`` pair."""
        features, target = sample
        return features * self.factor, target

In [67]:
## baseline: only convert to tensors, so the first feature is printed unscaled
dataset = WineDataset(transform=ToTensor())
first_data = dataset[0]
features, labels = first_data
print(type(features))
print(features[0])


## Compose chains the transforms in list order: convert to tensors first, then multiply the inputs by 2
composed = torchvision.transforms.Compose([ToTensor(), MulTransform(2)])
dataset = WineDataset(transform = composed)
first_data = dataset[0]
features, labels = first_data
print(type(features))

## same first feature as above, now doubled by MulTransform(2)
print(features[0])

<class 'torch.Tensor'>
tensor(14.2300)
<class 'torch.Tensor'>
tensor(28.4600)


<class 'torch.Tensor'>


## References

https://www.youtube.com/watch?v=PXOzkkB5eH0&list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4&index=9

https://www.youtube.com/watch?v=X_QOZEko5uE&list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4&index=10