# 1. [PyTorch Tutorial 09 - Dataset and DataLoader - Batch Training](https://www.youtube.com/watch?v=PXOzkkB5eH0)

epoch = 1 forward and backward pass of ALL training samples

batch_size = number of training samples in one forward and backward pass

number of iterations = number of passes per epoch, each pass using [batch_size] samples

e.g. 100 samples, batch_size=20 --> 100/20 = 5 iterations for 1 epoch



In [1]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

In [4]:
#dataset available in google colab sample_data folder
class CaliforniaHousingDataset(Dataset):
  """Map-style dataset backed by a California-housing CSV file.

  Every column except the last is a feature; the last column is the target.
  """

  def __init__(self, csv_path='./sample_data/california_housing_test.csv'):
    """Read the CSV once and keep features and targets as float32 tensors.

    Args:
      csv_path: Comma-separated file with a single header row. The default is
        the file in the Google Colab sample_data folder.
    """
    # Everything is loaded eagerly, which is fine for a small table. For large
    # data (e.g. images) store only the file locations here and load each item
    # inside __getitem__ to avoid holding the whole dataset in memory.
    # ndmin=2 keeps a one-row file 2-D so the column slicing below still works.
    xy = np.loadtxt(csv_path, delimiter=',', dtype=np.float32, skiprows=1, ndmin=2)
    self.x = torch.from_numpy(xy[:, :-1])   # all columns but the last
    self.y = torch.from_numpy(xy[:, [-1]])  # last column, kept 2-D: (n_samples, 1)
    self.n_samples = xy.shape[0]

  def __getitem__(self, index):
    """Return the ``(features, target)`` tensors stored at ``index``."""
    return self.x[index], self.y[index]

  def __len__(self):
    """Return the number of data rows loaded from the CSV."""
    return self.n_samples

In [21]:
# Build the dataset and display its first sample as a (features, labels) pair.
dataset = CaliforniaHousingDataset()
features, labels = dataset[0]
features, labels


(tensor([-122.0500,   37.3700,   27.0000, 3885.0000,  661.0000, 1537.0000,
          606.0000,    6.6085]), tensor(344700.))

In [16]:
# TODO(review): the original note here just asked "datasampler?" - presumably
# whether a custom sampler is needed; confirm.
# Wrap the dataset in a DataLoader: batch_size=2, shuffle=True, num_workers=2.
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True, num_workers=2)

In [22]:
# Pull a single batch by hand. Use the built-in next(): the iterator's legacy
# .next() method is not available in current PyTorch releases.
dataiter = iter(dataloader)
data = next(dataiter)
features, labels = data
features, labels

(tensor([[-1.2132e+02,  3.8620e+01,  2.9000e+01,  2.4300e+03,  4.4800e+02,
           1.0870e+03,  3.9400e+02,  3.0864e+00],
         [-1.1729e+02,  3.4490e+01,  3.0000e+00,  7.6890e+03,  1.5450e+03,
           3.8040e+03,  1.3990e+03,  3.3871e+00]]), tensor([177900., 111800.]))

In [28]:
# dummy training loop
num_epochs = 2
total_samples = len(dataset)
# Steps per epoch = ceil(samples / batch size). Read the batch size from the
# loader instead of repeating the literal so the two cannot drift apart.
n_iterations = math.ceil(total_samples / dataloader.batch_size)
total_samples, n_iterations

(3000, 1500)

In [29]:
# Walk over every batch of every epoch and report progress on each 5th step.
for epoch in range(num_epochs):
  for i, (inputs, labels) in enumerate(dataloader):
    # forward, backward, update
    if (i+1) % 5 != 0:
      continue
    print(f'epoch {epoch+1}/{num_epochs}, step {i+1}/{n_iterations}, inputs {inputs.shape}')

epoch 1/2, step 5/1500, inputs torch.Size([2, 8])
epoch 1/2, step 10/1500, inputs torch.Size([2, 8])
epoch 1/2, step 15/1500, inputs torch.Size([2, 8])
epoch 1/2, step 20/1500, inputs torch.Size([2, 8])
epoch 1/2, step 25/1500, inputs torch.Size([2, 8])
epoch 1/2, step 30/1500, inputs torch.Size([2, 8])
epoch 1/2, step 35/1500, inputs torch.Size([2, 8])
epoch 1/2, step 40/1500, inputs torch.Size([2, 8])
epoch 1/2, step 45/1500, inputs torch.Size([2, 8])
epoch 1/2, step 50/1500, inputs torch.Size([2, 8])
epoch 1/2, step 55/1500, inputs torch.Size([2, 8])
epoch 1/2, step 60/1500, inputs torch.Size([2, 8])
epoch 1/2, step 65/1500, inputs torch.Size([2, 8])
epoch 1/2, step 70/1500, inputs torch.Size([2, 8])
epoch 1/2, step 75/1500, inputs torch.Size([2, 8])
epoch 1/2, step 80/1500, inputs torch.Size([2, 8])
epoch 1/2, step 85/1500, inputs torch.Size([2, 8])
epoch 1/2, step 90/1500, inputs torch.Size([2, 8])
epoch 1/2, step 95/1500, inputs torch.Size([2, 8])
epoch 1/2, step 100/1500, inputs

In [None]:
from torchvision.datasets import ...

# 2. [PyTorch Tutorial 10 - Dataset Transforms](https://www.youtube.com/watch?v=X_QOZEko5uE&list=PLqnslRFeH2UrcDBWF5mfPGpqQDSta6VK4&index=10)

In [74]:
class CaliforniaHousingDataset(Dataset):
  """California-housing CSV dataset with an optional per-sample transform.

  Features and targets are kept as NumPy arrays; converting them (e.g. to
  tensors) is left to ``transform``.
  """

  def __init__(self, transform=None, csv_path='./sample_data/california_housing_test.csv'):
    """Read the CSV once and remember the transform to apply per sample.

    Args:
      transform: Optional callable that receives the ``(inputs, target)`` tuple
        and returns the sample to hand out. ``None`` means no transformation.
      csv_path: Comma-separated file with a single header row. The default is
        the path this notebook reads from.
    """
    # Everything is loaded eagerly, which is fine for a small table. For large
    # data (e.g. images) store only the file locations here and load each item
    # inside __getitem__ to avoid holding the whole dataset in memory.
    # ndmin=2 keeps a one-row file 2-D so the column slicing below still works.
    xy = np.loadtxt(csv_path, delimiter=',', dtype=np.float32, skiprows=1, ndmin=2)
    self.x = xy[:, :-1]   # all columns but the last
    self.y = xy[:, [-1]]  # last column, kept 2-D: (n_samples, 1)
    self.n_samples = xy.shape[0]

    self.transform = transform

  def __getitem__(self, index):
    """Return the sample at ``index``, passed through ``transform`` if one is set.

    NOTE: with an integer ``index`` the inputs row is a view into the loaded
    array, so a transform that modifies it in place changes the stored data.
    """
    sample = self.x[index], self.y[index]
    # Compare against None explicitly: a callable that happens to be falsy
    # (e.g. one defining __len__ == 0) must still be applied.
    if self.transform is not None:
      sample = self.transform(sample)

    return sample

  def __len__(self):
    """Return the number of data rows loaded from the CSV."""
    return self.n_samples

In [75]:
#custom to tensor transform
class ToTensor:
  """Callable transform turning a pair of NumPy arrays into a pair of tensors."""

  def __call__(self, sample):
    """Return both arrays of ``sample`` converted with ``torch.from_numpy``."""
    features, labels = sample
    features_tensor = torch.from_numpy(features)
    labels_tensor = torch.from_numpy(labels)
    return features_tensor, labels_tensor

In [76]:
# Re-create the dataset so each sample is passed through ToTensor on access.
dataset = CaliforniaHousingDataset(transform=ToTensor())

In [77]:
# Fetch the first sample and display the types of its two elements
# to check that the transform was applied.
first_data = dataset[0]
inputs, labels = first_data
type(inputs), type(labels)

(torch.Tensor, torch.Tensor)

In [78]:
class MulTransform():
  """Transform that scales the inputs of a sample by a constant factor."""

  def __init__(self, factor):
    """Store the multiplier applied to the inputs on every call."""
    self.factor = factor

  def __call__(self, sample):
    """Return ``(inputs * factor, target)`` without modifying ``inputs``.

    Args:
      sample: An ``(inputs, target)`` pair.

    Returns:
      A new scaled inputs object together with the untouched ``target``.
    """
    inputs, target = sample
    # Deliberately not `inputs *= self.factor`: an in-place multiply writes
    # through to whatever storage `inputs` shares (e.g. a tensor created with
    # torch.from_numpy over the dataset's array), so the stored data would be
    # scaled again on every access.
    scaled = inputs * self.factor
    return scaled, target

In [79]:
# Chain the transforms in list order (ToTensor, then MulTransform(2))
# and rebuild the dataset with the composed transform.
composed = torchvision.transforms.Compose([ToTensor(), MulTransform(2)])
dataset = CaliforniaHousingDataset(transform=composed)

In [80]:
# Fetch the first sample through the composed transform and display the
# types of its two elements.
first_data = dataset[0]
inputs, labels = first_data
type(inputs), type(labels)

(torch.Tensor, torch.Tensor)