# 09 - Dataset and DataLoader - Batch Training

使用数据集所有samples做gradient descent需要花费非常多的时间。因此我们应该把数据集分为几个small batches

```python
# Training loop sketch (pseudo-code): the outer loop runs once per epoch,
# so n_iters here is the number of epochs, not the number of batch steps.
for epoch in range(n_iters):
    # Inner loop: visit every batch once within the current epoch.
    for i in range(total_batches):
        # Placeholder: fetch the i-th batch of features and labels.
        x_batch, y_batch = ...
```

## Terms
1. `epoch`: one forward and backward pass over <font size=3 color="red">ALL</font> training samples
2. `batch_size` = number of training samples in one forward & backward pass
3. `number of iterations`: number of passes per epoch, where each pass uses `batch_size` samples
 
e.g. 100 samples, batch_size=20 --> 100/20 = 5 iterations for one epoch

In [77]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

# 1. Implement Custom Dataset

In [78]:
class WineDataset(Dataset):
    """Map-style dataset backed by a comma-separated text file.

    The first row of the file is skipped as a header. In every remaining
    row, column 0 is used as the label and all other columns as features.

    Args:
        path: File to load. Defaults to ``'wine.txt'`` so that
            ``WineDataset()`` keeps working unchanged.

    Attributes:
        x: float32 tensor of features, shape ``(n_samples, n_columns - 1)``.
        y: float32 tensor of labels, shape ``(n_samples, 1)``.
        n_samples: number of data rows in the file.
    """

    def __init__(self, path='wine.txt'):
        # Load the whole file at once as float32, skipping the header row.
        # ndmin=2 keeps a file with a single data row two-dimensional, so
        # the column slicing below works for any number of rows >= 1.
        xy = np.loadtxt(path, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)
        self.x = torch.from_numpy(xy[:, 1:])
        # Indexing with the list [0] keeps the column axis: shape (n_samples, 1).
        self.y = torch.from_numpy(xy[:, [0]])
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair at ``index`` (enables ``dataset[0]``)."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of samples (enables ``len(dataset)``)."""
        return self.n_samples

In [81]:
# Build the dataset; the constructor loads the data file.
dataset = WineDataset()
# Index the first sample (goes through __getitem__); the result is not stored here.
dataset[0]

# Wrap the dataset in a DataLoader: batches of 4 samples, reshuffled,
# with num_workers=0 so loading happens in the main process.
dataloader = DataLoader(dataset=dataset, batch_size=4, shuffle=True, num_workers=0)

### `torch.utils.data.DataLoader(dataset, batch_size, shuffle, num_workers)`
### 1. Parameters
1. `dataset`(Dataset): 传入的数据集
2. `batch_size`(int, optional): 每个batch有多少样本
3. `shuffle`(bool, optional): 每个epoch开始，对数据进行重新排序
4. `num_workers`(int, optional): 这个参数决定了有几个进程来处理data loading。0意味着所有的数据都会被load进主进程。（默认为0）
### 2.Return
Returns an `iterable`

# 附加: `torch.utils.data.DataLoader()`: iterable-->iterator

迭代器：遵守迭代协议

In [85]:
# A DataLoader is an iterable; iter() turns it into an iterator over batches.
dataiter = iter(dataloader)
# Fetch one batch with the built-in next(). The iterator protocol only
# guarantees __next__, so calling a `.next()` method raises AttributeError
# on current PyTorch versions.
data = next(dataiter)
# Each batch is a (features, labels) pair, matching what __getitem__ returns.
features, label = data
print(features, label)

tensor([[1.3830e+01, 1.5700e+00, 2.6200e+00, 2.0000e+01, 1.1500e+02, 2.9500e+00,
         3.4000e+00, 4.0000e-01, 1.7200e+00, 6.6000e+00, 1.1300e+00, 2.5700e+00,
         1.1300e+03],
        [1.4830e+01, 1.6400e+00, 2.1700e+00, 1.4000e+01, 9.7000e+01, 2.8000e+00,
         2.9800e+00, 2.9000e-01, 1.9800e+00, 5.2000e+00, 1.0800e+00, 2.8500e+00,
         1.0450e+03],
        [1.2990e+01, 1.6700e+00, 2.6000e+00, 3.0000e+01, 1.3900e+02, 3.3000e+00,
         2.8900e+00, 2.1000e-01, 1.9600e+00, 3.3500e+00, 1.3100e+00, 3.5000e+00,
         9.8500e+02],
        [1.3290e+01, 1.9700e+00, 2.6800e+00, 1.6800e+01, 1.0200e+02, 3.0000e+00,
         3.2300e+00, 3.1000e-01, 1.6600e+00, 6.0000e+00, 1.0700e+00, 2.8400e+00,
         1.2700e+03]]) tensor([[1.],
        [1.],
        [2.],
        [1.]])


# 2. Neural Network Training Loop

In [89]:
num_epochs = 2
total_samples = len(dataset)
# Iterations per epoch = ceil(samples / batch size). Read the batch size from
# the loader instead of repeating the literal, so the two cannot drift apart.
n_iterations = math.ceil(total_samples / dataloader.batch_size)
print(total_samples, n_iterations)

178 45


In [None]:
# Dummy training loop: iterate over every batch once per epoch.
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        # forward and backward pass, update, zero grad
        # Report progress every 5th step. The four placeholders map, in order,
        # to: current epoch, total epochs, current step, steps per epoch.
        if (i+1) % 5 == 0:
            print("Epoch {0}/{1}, Step {2}/{3}".format(epoch+1, num_epochs, i+1, n_iterations))
            