## PyTorch Tutorial 09 - Dataset and DataLoader - Batch Training

https://www.youtube.com/watch?v=PXOzkkB5eH0

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Fetch the wine CSV straight from the tutorial's GitHub repository.
# (For an offline copy, point `filepath` at "data/wine.csv" instead.)
filepath = "https://raw.githubusercontent.com/python-engineer/pytorchTutorial/master/data/wine/wine.csv"
# The first row of the file is the header, so it is skipped; values are comma-separated.
data = np.loadtxt(fname=filepath, delimiter=",", skiprows=1)

In [3]:
# Show the same CSV as a DataFrame so the column names are visible next to the values.
pd.read_csv(filepath)

Unnamed: 0,Wine,Alcohol,Malic.acid,Ash,Acl,Mg,Phenols,Flavanoids,Nonflavanoid.phenols,Proanth,Color.int,Hue,OD,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [4]:
# Every column except the first one (column 0 is the "Wine" class label).
data[:,1:]

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [5]:
# epoch               : 1 forward and backward pass of ALL training samples
# batch_size          : number of training samples in one forward and backward pass
# number of iterations: number of passes, each pass using [batch_size] number of samples
# e.g. 100 samples, batch_size = 20 --> 100/20 = 5 iterations for 1 epoch


import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

In [6]:
class WineDataset(Dataset):
    """Wine data exposed as a map-style PyTorch ``Dataset``.

    The CSV must have one header row followed by numeric rows whose first
    column is the class label and whose remaining columns are the features.

    Args:
        filepath: Path or URL of the CSV file. Defaults to the copy hosted in
            the tutorial's GitHub repository, so ``WineDataset()`` still works.

    Attributes:
        x: float32 tensor of shape (n_samples, n_features) holding the features.
        y: float32 tensor of shape (n_samples,) holding the class labels.
        n_samples: number of data rows read from the file.
    """

    DEFAULT_FILEPATH = "https://raw.githubusercontent.com/python-engineer/pytorchTutorial/master/data/wine/wine.csv"

    def __init__(self, filepath=DEFAULT_FILEPATH):
        # data loading
        # ndmin=2 keeps the array 2-D even when the file has a single data row;
        # without it the column slicing below raises on a 1-D array.
        xy = np.loadtxt(filepath, skiprows=1, delimiter=",", dtype=np.float32, ndmin=2)
        self.x = torch.from_numpy(xy[:, 1:])  # all columns except the first
        self.y = torch.from_numpy(xy[:, 0])   # first column
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index``, e.g. ``dataset[0]``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples, so that ``len(dataset)`` works."""
        return self.n_samples
        
# Build the dataset and look at its first sample.
dataset = WineDataset()

# Indexing goes through __getitem__ and yields a (features, label) pair.
first_data = dataset[0]
features, labels = first_data

print(features, labels, sep="\n")

tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03])
tensor(1.)


In [7]:
# Let's try using a DataLoader to draw mini-batches from the dataset.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,    # reshuffle the samples at every epoch
    num_workers=2,   # load batches in 2 worker subprocesses
)

# Pull a single batch. Use the built-in next(): DataLoader iterators in recent
# PyTorch releases do not provide a .next() method.
dataiter = iter(dataloader)
first_batch = next(dataiter)  # not named `data`, so the NumPy array loaded earlier is not overwritten
features, labels = first_batch
print(features)
print(labels)

tensor([[1.3560e+01, 1.7100e+00, 2.3100e+00, 1.6200e+01, 1.1700e+02, 3.1500e+00,
         3.2900e+00, 3.4000e-01, 2.3400e+00, 6.1300e+00, 9.5000e-01, 3.3800e+00,
         7.9500e+02],
        [1.2290e+01, 3.1700e+00, 2.2100e+00, 1.8000e+01, 8.8000e+01, 2.8500e+00,
         2.9900e+00, 4.5000e-01, 2.8100e+00, 2.3000e+00, 1.4200e+00, 2.8300e+00,
         4.0600e+02],
        [1.3860e+01, 1.5100e+00, 2.6700e+00, 2.5000e+01, 8.6000e+01, 2.9500e+00,
         2.8600e+00, 2.1000e-01, 1.8700e+00, 3.3800e+00, 1.3600e+00, 3.1600e+00,
         4.1000e+02],
        [1.4200e+01, 1.7600e+00, 2.4500e+00, 1.5200e+01, 1.1200e+02, 3.2700e+00,
         3.3900e+00, 3.4000e-01, 1.9700e+00, 6.7500e+00, 1.0500e+00, 2.8500e+00,
         1.4500e+03]])
tensor([1., 2., 2., 1.])


Since batch_size = 4, the batch above contains 4 feature vectors and 4 labels.

In [8]:
# training loop
num_epochs = 2
total_samples = len(dataset)
# Number of batches per epoch, read from the loader itself so it always
# matches the batch size the loader was built with.
n_iterations = len(dataloader)

print("total_samples:{} n_iterations:{} ".format(total_samples, n_iterations))

for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        # Each iteration yields one batch of up to dataloader.batch_size rows;
        # the last batch of an epoch can be smaller.

        # forward pass, backward pass and weight update would go here

        # Print progress at every 5th step
        if (i+1)%5 == 0:
            print(f'epoch {epoch + 1}/{num_epochs}  step {i+1}/{n_iterations}, inputs {inputs.shape} ')
        

total_samples:178 n_iterations:45 
epoch 1/2  step 5/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 10/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 15/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 20/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 25/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 30/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 35/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 40/45, inputs torch.Size([4, 13]) 
epoch 1/2  step 45/45, inputs torch.Size([2, 13]) 
epoch 2/2  step 5/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 10/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 15/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 20/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 25/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 30/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 35/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 40/45, inputs torch.Size([4, 13]) 
epoch 2/2  step 45/45, inputs torch.Size([2, 13]) 


Note that earlier we did not make batches. When we make batches (batch_size = 4 here), one epoch (i.e. one single pass through the data) consists of several gradient-descent steps instead of one: ceil(178 / 4) = 45 steps in this example. Also, since each step uses only a small, shuffled subset of the rows rather than the full data, it is a random (stochastic) step. So essentially there are 45 random steps in one full scan of the data set.


We can also use the well-known datasets that ship with torchvision, e.g. torchvision.datasets.MNIST(), CIFAR, etc. We do this in the next tutorial.