# Activation Functions

In this tutorial, we will compare several commonly used activation functions: Sigmoid, Tanh, ReLU and Leaky ReLU.

In [None]:
import os

# Restrict and order the GPUs visible to CUDA before any CUDA library is loaded.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # so the IDs match nvidia-smi
    "CUDA_VISIBLE_DEVICES": "0",         # eg. "0, 1, 2" for multiple
})

# Notebook-wide settings: dataset location, preferred device and batch sizes.
DATA_ROOT, DEVICE = '/data1/mnist/', 'cuda:0'
BATCH_SIZE, TEST_BATCH_SIZE = 64, 256

In [None]:
# spells...
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

plt.style.reload_library()
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old names. Use whichever name this
# installation provides; if neither is listed, fall back to the original name
# so the failure mode is unchanged.
_whitegrid_style = next(
    (style for style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style in plt.style.available),
    'seaborn-whitegrid',
)
plt.style.use([_whitegrid_style])
plt.ion()    # interactive mode: on

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import tensorboardX

# Use the configured GPU (DEVICE) when CUDA is available; otherwise run on the CPU.
device = torch.device(DEVICE if torch.cuda.is_available() else "cpu")

We will use the handwritten digits dataset (MNIST) in the following experiments.

Each image in MNIST is a $1\times28\times28$ tensor. 

In [None]:
# Preprocessing shared by both splits: convert each image to a tensor, then
# normalize its single channel with mean 0.1307 and std 0.3081.
_mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training dataset (download is not requested, so the files are expected to
# already exist under DATA_ROOT); reshuffled every epoch.
_train_set = torchvision.datasets.MNIST(root=DATA_ROOT, train=True,
                                        transform=_mnist_transform)
train_loader = torch.utils.data.DataLoader(
    _train_set, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Test dataset; iterated in a fixed order.
_test_set = torchvision.datasets.MNIST(root=DATA_ROOT, train=False,
                                       transform=_mnist_transform)
test_loader = torch.utils.data.DataLoader(
    _test_set, batch_size=TEST_BATCH_SIZE, num_workers=4)

## Define an MLP with a specified activation


In [None]:
class MLP(nn.Module):
    """Multi-layer perceptron with a selectable activation function.

    The network is a stack of ``num_hidden_layers`` ``nn.Linear`` modules with
    the chosen activation between consecutive ones and no activation after the
    last. In ``self.layers`` the linear modules therefore sit at the even
    indices ``0, 2, ..., (num_hidden_layers - 1) * 2``.

    Args:
        in_size: number of input features.
        out_size: number of output features.
        hidden_size: width of the intermediate representations; either a
            single int used everywhere, or a list/tuple holding
            ``num_hidden_layers - 1`` widths.
        num_hidden_layers: number of ``nn.Linear`` modules (at least 2).
        activation: one of 'sigmoid', 'tanh', 'relu', 'leaky_relu'.
        leaky_a: negative slope passed to ``nn.LeakyReLU``.

    Raises:
        ValueError: if ``num_hidden_layers`` is below 2, or ``hidden_size`` is
            neither an int nor a list/tuple of the expected length.
        KeyError: if ``activation`` is not one of the supported names.
    """

    def __init__(self, in_size, out_size, hidden_size=64, num_hidden_layers=5, activation='sigmoid', leaky_a=0.01):
        super(MLP, self).__init__()

        # A first and a last linear layer are always built, so fewer than two
        # layers cannot be expressed (it used to surface as an IndexError).
        if num_hidden_layers < 2:
            raise ValueError('num_hidden_layers must be at least 2, got {}!'.format(num_hidden_layers))

        # select activation module by name
        activation_dict = {
            'sigmoid': nn.Sigmoid(),
            'tanh': nn.Tanh(),
            'relu': nn.ReLU(),
            'leaky_relu': nn.LeakyReLU(leaky_a),
        }
        self.activation = activation_dict[activation]

        self.num_hidden_layers = num_hidden_layers
        # number of intermediate representations between consecutive linear layers
        num_connections = num_hidden_layers - 1

        if isinstance(hidden_size, int):
            hidden_sizes = [hidden_size] * num_connections
        elif isinstance(hidden_size, (list, tuple)) and len(hidden_size) == num_connections:
            hidden_sizes = list(hidden_size)
        else:
            raise ValueError('hidden_size must be a integer or a list/tuple with size {}!'.format(num_connections))

        # the first layer
        layers = [nn.Linear(in_size, hidden_sizes[0]), self.activation]

        # the middle layers: map each hidden width to the next one
        for i in range(num_connections - 1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]))
            layers.append(self.activation)

        # the last layer (no activation: it produces the raw outputs)
        layers.append(nn.Linear(hidden_sizes[-1], out_size))

        # sequentialization; otherwise the module can not access the parameters
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return the result."""
        return self.layers(x)

In [None]:
# Loss function: cross-entropy, applied to (model outputs, labels) in the training loop
criterion = nn.CrossEntropyLoss()

In [None]:
# Evaluation
def evaluate_by_accuracy(model):
    """Return the fraction of test samples that ``model`` classifies correctly.

    Makes one pass over the module-level ``test_loader`` with gradients
    disabled. Each image batch is flattened to ``(-1, 28*28)`` and moved to
    ``device``; the predicted class is the index of the largest output.

    The model is switched to evaluation mode for the pass and its previous
    training/evaluation mode is restored afterwards, so calling this in the
    middle of training does not change how training continues.

    Args:
        model: module mapping a ``(batch, 28*28)`` tensor to per-class scores.

    Returns:
        Number of correct predictions divided by the number of samples seen.
    """
    was_training = model.training
    model.eval()

    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.view(-1, 28*28).to(device)
                labels = labels.to(device)

                outputs = model(images)
                # index of the maximum score along the class dimension
                _, predicted = torch.max(outputs, 1)

                total += labels.size(0)
                correct += int((predicted == labels).sum())
    finally:
        # put the model back into whichever mode the caller had it in
        model.train(was_training)

    return correct / total

In [None]:
# Training
def train_over_models(models_dict, writer, lr=0.01, epochs=30):
    """Train every model in ``models_dict`` and log its progress to ``writer``.

    Each model is trained on the module-level ``train_loader`` with SGD
    (momentum 0.9) and the module-level ``criterion``. After every epoch the
    following are logged at step ``epoch + 1``:

    * histograms of the weights, and of their gradients from the last batch,
      for the first and last linear layers ('weight/<name>/<k>' and
      'grad_weight/<name>/<k>');
    * test accuracy from ``evaluate_by_accuracy`` ('accuracy');
    * mean training loss over the epoch's batches ('loss').

    Args:
        models_dict: mapping from a display name to a model exposing
            ``layers`` and ``num_hidden_layers`` (as ``MLP`` does).
        writer: TensorBoard writer providing ``add_histogram`` and
            ``add_scalars``. It is not closed here.
        lr: learning rate for SGD.
        epochs: number of passes over ``train_loader`` per model.

    Raises:
        ValueError: if ``train_loader`` yields no batches.
    """
    for model_name, model in models_dict.items():
        optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
        # positions of the first and last linear modules inside model.layers
        logged_layers = (0, (model.num_hidden_layers - 1) * 2)

        for epoch in range(epochs):
            # one step value for everything logged this epoch, so histograms
            # and scalars line up in TensorBoard
            step = epoch + 1

            model.train()
            total_loss = 0.0
            num_batches = 0

            for inputs, labels in train_loader:
                inputs = inputs.view(-1, 28*28).to(device)
                labels = labels.to(device)

                optimizer.zero_grad()

                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += float(loss)
                num_batches += 1

            if num_batches == 0:
                raise ValueError('train_loader produced no batches')

            # add the weights and the gradients of them in the first and last layers to TensorBoard's Histogram
            for layer_no in logged_layers:
                weight = model.layers[layer_no].weight
                layer_tag = '{}/{}'.format(model_name, layer_no//2+1)
                writer.add_histogram('weight/' + layer_tag,
                                     weight.detach().cpu().numpy(), step)
                writer.add_histogram('grad_weight/' + layer_tag,
                                     weight.grad.detach().cpu().numpy(), step)

            # add evaluation on test set and loss on train set to TensorBoard's Scalar
            acc = evaluate_by_accuracy(model)
            writer.add_scalars('accuracy', {model_name: acc}, step)
            writer.add_scalars('loss', {model_name: total_loss / num_batches}, step)

    print('Finished Training')

In [None]:
def plot_activation_functions(functions_dict, figsize=(12, 6)):
    """Draw each activation function and its derivative in a single figure.

    Every function is evaluated on the grid from -5.0 up to (not including)
    5.0 in steps of 0.01. The derivative is obtained through autograd by
    back-propagating a tensor of ones. Functions are drawn as solid lines and
    derivatives as dashed lines.

    Args:
        functions_dict: mapping from a legend label to a callable that takes
            and returns a tensor.
        figsize: figure size passed to ``plt.figure``.
    """
    plt.figure(figsize=figsize)

    for label in functions_dict:
        activation = functions_dict[label]

        # a fresh leaf tensor per function, so gradients do not accumulate
        grid = torch.arange(-5.0, 5.0, 1e-2, requires_grad=True)
        values = activation(grid)
        values.backward(torch.ones(grid.size()))            # autograd

        grid_np = grid.data.numpy()
        plt.plot(grid_np, values.data.numpy(), '-', label=label, alpha=0.7)
        plt.plot(grid_np, grid.grad.data.numpy(), '--', label='derivative '+label, alpha=0.7)

    plt.legend()
    plt.show()

## Sigmoid vs. Hyperbolic Tangent (Tanh)


||<center><img width=200/>Sigmoid</center>|<center><img width=200/>Tanh</center>|
|-|-|-|
|Expression|$$f(x) = \frac{1}{1 + e^{-x}}$$|$$f(x)=\frac{2}{1+e^{-2x}} - 1$$|
|Range|(0, 1)|(-1, 1)|
|Passes Through the Origin|No|Yes|
|Symmetry|Asymmetric|Centrosymmetric|
|Derivative Function|$$f'(x) = f(x)\left(1-f(x)\right)$$ | $$ f'(x)=1-[f(x)]^2$$|
|Derivative Range|(0, 0.25]|(0, 1]|

The sigmoid function is especially useful for predicting probabilities, since its output lies in (0, 1).

The tanh function is also sigmoidal (S-shaped) and is mainly used for classification between two classes.

In [None]:
# Plot sigmoid and tanh, each together with its derivative, in one figure.
plot_activation_functions({'sigmoid': torch.sigmoid,
                           'tanh': torch.tanh})

Run the following cell and go to your TensorBoard to observe the change of accuracies on test set and losses on train set.

In [None]:
# One default-sized MLP per activation, moved to the selected device.
models_dict1 = {name: MLP(28*28, 10, activation=name).to(device) for name in ('sigmoid', 'tanh')}

# Close the writer once training ends (or fails) so buffered events are
# written to disk and the event file handle is released.
writer1 = tensorboardX.SummaryWriter('./runs/05_shallow_mlp')
try:
    train_over_models(models_dict1, writer1)
finally:
    writer1.close()

## Sigmoid vs. Hyperbolic Tangent (Tanh) vs. Rectified Linear Unit (ReLU) vs. Leaky ReLU


||<center><img width=200/>Sigmoid</center>|<center><img width=200/>Tanh</center>|<center><img width=200/>ReLU</center>|<center><img width=200/>Leaky ReLU</center>
|-|-|-|-|-|
|Expression|$$f(x) = \frac{1}{1 + e^{-x}}$$|$$f(x)=\frac{2}{1+e^{-2x}} - 1$$|$$f(x)=\max(0, x)$$|$$f(x)=\begin{cases} x & \text{if } x \geq 0 \\ 0.01x & \text{if } x \lt 0 \end{cases}$$|
|Range|$(0, 1)$|$(-1, 1)$|$[0,\infty)$|$(-\infty, \infty)$|
|Symmetry|Asymmetric|Centrosymmetric|Asymmetric|Asymmetric|
|Derivative Function|$$f'(x) = f(x)\left(1-f(x)\right)$$|$$f'(x)=1-[f(x)]^2$$|$$f'(x) = \begin{cases} 1 & \text{if } x \geq 0 \\ 0 & \text{if } x \lt 0 \end{cases}$$|$$f'(x) = \begin{cases} 1 & \text{if } x \geq 0 \\ 0.01 & \text{if } x \lt 0 \end{cases}$$|
|Derivative Range|$(0, 0.25]$|$(0, 1]$|$\{0, 1\}$|$\{0.01, 1\}$|
|Continuous Derivative|Yes|Yes|No|No|
|Vanishing Gradient|Yes|Yes|No?|Yes|
|Saturation|Yes|Yes|No|No|
|Dead Neurons|No|No|Yes|No|

ReLU is used in almost all convolutional neural networks and other deep learning models.

Since ReLU forces negative values to zero, the model may lose information from the data during training. Leaky ReLU addresses this by keeping a small non-zero slope for negative inputs.

In [None]:
# Plot ReLU together with its derivative.
plot_activation_functions({'relu': torch.relu})

In [None]:
# Plot Leaky ReLU (F.leaky_relu with its default negative slope) together with its derivative.
plot_activation_functions({'leaky_relu': F.leaky_relu})

Run the following cell and go to your TensorBoard to observe the difference of the histograms of gradients.

In [None]:
# One deeper, narrower MLP (10 linear layers, width 48) per activation.
models_dict2 = {name: MLP(28*28, 10, hidden_size=48, num_hidden_layers=10, activation=name).to(device)
                for name in ('sigmoid', 'tanh', 'relu', 'leaky_relu')}

# Close the writer once training ends (or fails) so buffered events are
# written to disk and the event file handle is released.
writer2 = tensorboardX.SummaryWriter('./runs/05_deeper_mlp')
try:
    train_over_models(models_dict2, writer2)
finally:
    writer2.close()

## Deeper is better

We can see that a deeper MLP model with ReLU will perform as well as a shallow one with Tanh, while a deeper one with Leaky ReLU will even work better than the others.

### Compare the number of parameters of the shallow and the deeper models

In [None]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Parameters whose ``requires_grad`` is False are not counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

In [None]:
# Report the trainable-parameter count of the sigmoid model from each experiment.
for label, trained_models in (('Shallow: ', models_dict1), ('Deeper: ', models_dict2)):
    print(label, count_parameters(trained_models['sigmoid']))

For information on other activation function candidates, please check [this site](https://dashee87.github.io/deep%20learning/visualising-activation-functions-in-neural-networks/).