In [1]:
# Derived and modified from
# Source: https://towardsdatascience.com/understanding-pytorch-with-an-example-a-step-by-step-tutorial-81fc5f8c4e8e
from torch.utils.data import Dataset, TensorDataset
import torch
import numpy as np
%config Completer.use_jedi = False

In [2]:
# Seed every random number generator this notebook draws from, so that a fresh
# "Restart Kernel -> Run All" reproduces the same synthetic data (NumPy) as well
# as the same torch-driven randomness further down (torch).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Size of the synthetic dataset and of its training share
N_SAMPLES = 100
N_TRAIN = 80

# Synthetic linear data: y = 1 + 2x + Gaussian noise scaled by 0.1
x = np.random.rand(N_SAMPLES, 1)
y = 1 + 2 * x + .1 * np.random.randn(N_SAMPLES, 1)

# Shuffles the indices
idx = np.arange(N_SAMPLES)
np.random.shuffle(idx)

# Uses the first N_TRAIN shuffled indices for train
train_idx = idx[:N_TRAIN]
# Uses the remaining indices for validation
val_idx = idx[N_TRAIN:]

# Generates train and validation sets
x_train, y_train = x[train_idx], y[train_idx]
x_val, y_val = x[val_idx], y[val_idx]

## Making a Custom Dataset by inheriting from the Dataset class

In [3]:
# Inherits from Dataset Class
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label entry."""

    def __init__(self, x_tensor, y_tensor):
        """
        Store the features and labels that make up the dataset.

        Normally you would preprocess your data here. The constructor takes
        whatever arguments are needed to build the (features, label) pairs.
        There is no need to load the whole dataset in the constructor;
        it is recommended to load samples on demand (whenever __getitem__ is called).

        Args:
            x_tensor: sized, indexable container of features, one entry per sample.
            y_tensor: sized, indexable container of labels, same length as x_tensor.

        Raises:
            ValueError: if x_tensor and y_tensor hold a different number of samples.
        """
        # Fail fast: without this check a mismatch would only surface later,
        # as an IndexError raised from __getitem__ in the middle of iteration.
        if len(x_tensor) != len(y_tensor):
            raise ValueError(
                "x_tensor and y_tensor must have the same length, got "
                f"{len(x_tensor)} and {len(y_tensor)}"
            )
        self.x = x_tensor
        self.y = y_tensor

    def __getitem__(self, index):
        """
        Return one sample from the dataset.

        This makes the dataset indexable, so it can be used like a list.
        Returns a tuple (features, label) for the requested data point.
        """
        return (self.x[index], self.y[index])

    def __len__(self):
        """
        Return the number of samples, so that whenever the dataset is sampled
        its indexing is limited to the actual size.
        """
        return len(self.x)

In [4]:
# The training tensors deliberately stay on the CPU (no .to(device) here):
# loading the whole training set into GPU tensors would take up space
# in the GPU's limited RAM.
x_train_tensor, y_train_tensor = (
    torch.from_numpy(array).float() for array in (x_train, y_train)
)

# First with the hand-written dataset class ...
train_data = CustomDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

# ... then with the built-in TensorDataset; both print the same first sample.
train_data = TensorDataset(x_train_tensor, y_train_tensor)
print(train_data[0])

(tensor([0.7713]), tensor([2.4745]))
(tensor([0.7713]), tensor([2.4745]))


## DataLoader --> mini-batch GD
The example above can only perform batch gradient descent.  
Building on the CustomDataset, a DataLoader lets us perform mini-batch gradient descent or SGD.

In [5]:
from torch.utils.data import DataLoader

In [6]:
# Serve the training set in shuffled mini-batches of 10 samples
train_loader = DataLoader(
    train_data,
    batch_size=10,
    shuffle=True,
)

In [7]:
# Inspect the loader object itself; the output below shows it is a DataLoader instance.
type(train_loader)

torch.utils.data.dataloader.DataLoader

In [8]:
# iter() on the loader returns a separate iterator object
# (the output below shows a single-process iterator class).
type(iter(train_loader))

torch.utils.data.dataloader._SingleProcessDataLoaderIter

In [9]:
# Fetch the first mini-batch from a freshly created iterator.
# The result is a list containing two tensors:
# one for the features, another for the labels,
# each with one row per sample in the batch (10 rows here, matching batch_size=10).
next(iter(train_loader))

[tensor([[0.3253],
         [0.3252],
         [0.1409],
         [0.2713],
         [0.7751],
         [0.7713],
         [0.9507],
         [0.9395],
         [0.0254],
         [0.1997]]),
 tensor([[1.8057],
         [1.7291],
         [1.1211],
         [1.5105],
         [2.4936],
         [2.4745],
         [2.8715],
         [2.8890],
         [1.0785],
         [1.3651]])]

In [10]:
# From 03-Models.ipynb
import torch.nn as nn

class MyLinearModel(nn.Module):
    """Linear regression model built from a single nn.Linear layer (1 input, 1 output)."""

    def __init__(self):
        super(MyLinearModel, self).__init__()
        # The attribute name `linear` determines the state_dict keys
        # ('linear.weight', 'linear.bias').
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Apply the linear layer to `x` and return its output."""
        prediction = self.linear(x)
        return prediction
    
def return_train_step_function(model, loss_function, optimizer):
    """
    Build a function that performs one optimization step on a mini-batch.

    Args:
        model: the module to train; called as model(x) to produce predictions.
        loss_function: callable invoked as loss_function(prediction, target),
            following the (input, target) argument order of PyTorch criteria.
        optimizer: optimizer whose step() / zero_grad() update and reset the
            model's parameters.

    Returns:
        A function train_step(x, y) that runs forward pass, backward pass and
        parameter update for one batch and returns the loss as a Python float.
    """
    # Build function that is to be returned and used in every epoch
    def train_step(x, y):
        # Put the model in training mode
        model.train()
        y_hat = model(x)
        # Prediction first, target second: the order only happens not to matter
        # for symmetric losses such as MSE, but it does for asymmetric criteria.
        loss = loss_function(y_hat, y)
        loss.backward()
        optimizer.step()
        # Clear the gradients so they do not accumulate into the next step
        optimizer.zero_grad()
        return loss.item()

    # returning a function
    return train_step

# Model lives on the selected device; loss, optimizer and step function are wired around it.
model = MyLinearModel().to(device)
mse_loss = nn.MSELoss(reduction='mean')
train_step_func = return_train_step_function(
    model,
    mse_loss,
    torch.optim.SGD(model.parameters(), lr=0.1),
)
# Collects the per-mini-batch training losses
losses = list()

In [11]:
"""
By using train_loader we've made, below can perform mini-batch GD
"""

for epoch in range(300):
    for x_batch, y_batch in train_loader:
        # The dataset lives on the CPU, and so do the mini-batches drawn from it,
        # so each mini-batch has to be sent to the device where the model lives.
        #
        # For bigger datasets, loading data sample by sample (into a CPU tensor)
        # through the Dataset's __getitem__ and then sending all samples of one
        # mini-batch to the GPU (device) at once is the way to make the best use
        # of the GPU's RAM.
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # train_step_func returns loss.item() for this mini-batch
        loss = train_step_func(x_batch, y_batch)
        losses.append(loss)

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9698]])), ('linear.bias', tensor([1.0257]))])


Rather than splitting the dataset into train and validation sets by hand as in the top cell, we can use `random_split()`. The resulting subsets can then be fed into a `DataLoader`.

In [12]:
from torch.utils.data.dataset import random_split

In [13]:
# Convert the full (unsplit) NumPy arrays to float tensors in one go
x_tensor, y_tensor = (torch.from_numpy(array).float() for array in (x, y))

# Pair features and labels sample by sample
dataset = TensorDataset(x_tensor, y_tensor)

In [14]:
# Randomly partition the dataset into 80 training and 20 validation samples.
# A dedicated, explicitly seeded generator keeps the train/validation membership
# identical across runs, independent of how much of the global torch RNG
# has been consumed by earlier cells.
train_dataset, val_dataset = random_split(
    dataset,
    [80, 20],
    generator=torch.Generator().manual_seed(42),
)

In [15]:
# Training batches are reshuffled every epoch, consistent with the earlier
# train_loader; validation only measures the loss, so its order is left as is.
train_loader = DataLoader(dataset=train_dataset, batch_size=10, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=10)

### Evaluation with validation set
The training loop can now be extended with a validation pass, as shown below:

In [16]:
# 1. Using the train_loader built above, the loop below performs mini-batch GD.
# 2. Both loaders wrap the subsets produced by random_split.
#
# NOTE: `model` and `train_step_func` are the objects created earlier in the
# notebook, so this loop continues training the already-fitted model rather
# than starting from a fresh initialisation.
losses = []
val_losses = []

for epoch in range(300):
    # ---- training pass ----
    for x_batch, y_batch in train_loader:
        # The dataset lives on the CPU, and so do the mini-batches drawn from it,
        # so each mini-batch has to be sent to the device where the model lives.
        #
        # For bigger datasets, loading data sample by sample (into a CPU tensor)
        # through the Dataset's __getitem__ and then sending all samples of one
        # mini-batch to the GPU (device) at once is the way to make the best use
        # of the GPU's RAM.
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step_func(x_batch, y_batch)  # returns loss.item() with mini-batch
        losses.append(loss)

    # ---- validation pass ----
    # Switch to evaluation mode once per epoch instead of once per batch;
    # train_step_func puts the model back into training mode on its next call.
    model.eval()
    with torch.no_grad():
        # Distinct loop names so the NumPy arrays x_val / y_val created in the
        # data-generation cell are not silently overwritten by tensor batches.
        for x_val_batch, y_val_batch in val_loader:
            # to(device) for the same reason as the training batches
            x_val_batch = x_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_hat = model(x_val_batch)
            # Prediction first, target second (PyTorch's criterion convention)
            val_loss = mse_loss(y_hat, y_val_batch)
            val_losses.append(val_loss.item())

print(model.state_dict())

OrderedDict([('linear.weight', tensor([[1.9894]])), ('linear.bias', tensor([0.9916]))])


## Advanced Example
## Custom Datasets, DataLoaders and Transforms
Derived and modified from the
source: https://pytorch.org/tutorials/beginner/data_loading_tutorial.html

In [17]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
# NOTE(review): the filter below is unconditional (no category or module given),
# so it hides every warning raised after this cell, including deprecation notices.
import warnings
warnings.filterwarnings("ignore")

plt.ion()   # interactive mode