# Hswish, swish and HSigmoid Experiments

In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

from collections import namedtuple
# from pyraul.nn import MLP
# from pyraul.pipeline import accuracy
# from pyraul.pipeline.train_step import train_step
from pyraul.tools.dataset import Dataset
from pyraul.tools.dumping import dump_weights
%matplotlib inline

## Activation functions

### Swish

In [45]:
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``, applied element-wise."""

    def __init__(self):
        super().__init__()
        # The gate is held as a submodule and reused on every forward call.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        gate = self.sigmoid(x)
        return x * gate

### Hard Swish

In [46]:
class HSwish(nn.Module):
    """Hard Swish activation: ``x * relu6(x + 3) / 6``, applied element-wise."""

    def forward(self, x):
        # relu6 works in place on the temporary ``x + 3``; ``x`` itself is not modified.
        gate = F.relu6(x + 3, inplace=True)
        return x * gate / 6

### Hard Sigmoid

In [47]:
class HSigmoid(nn.Module):
    """Hard Sigmoid activation: ``relu6(x + 3) / 6``, applied element-wise."""

    def forward(self, x):
        # The shifted copy is a temporary, so clamping it in place leaves ``x`` intact.
        shifted = x + 3
        return F.relu6(shifted, inplace=True) / 6

## Comparison: h-swish vs swish and ReLU, sigmoid vs h-sigmoid

#### Forward

In [None]:
# Sample every activation on a uniform float32 grid over [-8, 8) with step 0.5.
x = torch.arange(-8.0, 8.0, 0.5, dtype=torch.float32)
relu_result, swish_result, hswish_result, sigmoid_result, hsigmoid_result = (
    activation.eval()(x)
    for activation in (nn.ReLU(), Swish(), HSwish(), nn.Sigmoid(), HSigmoid())
)

In [None]:
# Forward curves of ReLU, Swish and h-Swish drawn on the same axes.
plt.title('Swish vs Hard Swish vs ReLU functions (forward)')
plt.xlabel('x')
plt.ylabel('F(x)')
plt.grid(True)
for _label, _curve in (("ReLU", relu_result), ("Swish", swish_result), ("h-Swish", hswish_result)):
    plt.plot(x, _curve, label=_label)
plt.legend(loc='lower right')

In [None]:
# Forward curves of Sigmoid and h-Sigmoid drawn on the same axes.
plt.title('Sigmoid vs Hard Sigmoid functions (forward)')
plt.xlabel('x')
plt.ylabel('F(x)')
plt.grid(True)
for _label, _curve in (("Sigmoid", sigmoid_result), ("h-Sigmoid", hsigmoid_result)):
    plt.plot(x, _curve, label=_label)
plt.legend(loc='lower right')

#### Backward

In [None]:
def get_grad(function, input_x):
    """Return d(sum(function(x)))/dx evaluated at ``input_x``.

    The input is cloned first, so ``input_x`` never receives a gradient.
    As a side effect ``function`` is switched to training mode.
    """
    probe = input_x.clone().requires_grad_(True)
    function.train()
    # Summing reduces the output to a scalar so backward() needs no explicit gradient.
    function(probe).sum().backward()
    return probe.grad

In [None]:
# Autograd gradients of every activation on a float32 grid over [-8, 8) with step 0.25.
x_grad = torch.arange(-8.0, 8.0, 0.25, dtype=torch.float32)
relu_grad_y, swish_grad_y, hswish_grad_y, sigmoid_grad_y, hsigmoid_grad_y = (
    get_grad(activation, x_grad)
    for activation in (nn.ReLU(), Swish(), HSwish(), nn.Sigmoid(), HSigmoid())
)

In [None]:
# Gradient curves of ReLU, Swish and h-Swish drawn on the same axes.
plt.title('Swish vs Hard Swish vs ReLU functions (backward)')
plt.xlabel('x')
plt.ylabel('grad F(x)')
plt.grid(True)
for _label, _curve in (("ReLU", relu_grad_y), ("Swish", swish_grad_y), ("h-Swish", hswish_grad_y)):
    plt.plot(x_grad, _curve, label=_label)
plt.legend(loc='lower right')

In [None]:
# Gradient curves of Sigmoid and h-Sigmoid drawn on the same axes.
plt.title('Sigmoid vs Hard Sigmoid functions (backward)')
plt.xlabel('x')
plt.ylabel('grad F(x)')
plt.grid(True)
for _label, _curve in (("Sigmoid", sigmoid_grad_y), ("h-Sigmoid", hsigmoid_grad_y)):
    plt.plot(x_grad, _curve, label=_label)
plt.legend(loc='lower right')

In [None]:
# Pair every sample point with the autograd gradient of HSigmoid for inspection.
list(zip(x_grad, hsigmoid_grad_y))

#### H-swish gradient formula

In [None]:
def hswish_grad(x, grad):
    """Closed-form gradient of h-swish at ``x`` scaled by the upstream ``grad``.

    Returns ``grad`` for ``x >= 3``, ``grad * (x / 3 + 0.5)`` for
    ``-3 < x < 3`` and ``0.0`` otherwise (including ``x == -3``).
    """
    if x >= 3.0:
        return grad
    if -3.0 < x < 3.0:
        return grad * (x / 3.0 + 0.5)
    return 0.0

In [None]:
# Evaluate the closed-form gradient at every sample point with an upstream gradient of 1.
formula_hswish_grad = [hswish_grad(x, 1) for x in x_grad]

In [None]:
# Tabulate (input, autograd gradient, formula gradient) triples for side-by-side comparison.
list(zip(x_grad, hswish_grad_y,formula_hswish_grad))

## Simple architecture

In [48]:
from enum import Enum

class NetType(Enum):
    """Activation variants a toy network can be built with.

    Every member carries a plain integer value, so lookup by value
    (``NetType(2)``) works uniformly for all members.
    """
    relu = 0
    swish = 1
    hswish = 2
    sigmoid = 3
    hsigmoid = 4
    
def trace_forward(name, tensor, batch=0, start=0, stop=-1):
    """Print a debugging slice of one row of ``tensor``.

    Writes a header with the name, shape and slice bounds, then the scalar
    values of ``tensor[batch][start:stop]`` separated by spaces, then a rule.
    With the default ``stop=-1`` the last element of the row is left out.
    """
    header = f"{name} ({tensor.shape}), #{batch}[{start}:{stop}]"
    print(header)
    row_slice = tensor[batch][start:stop]
    scalars = [element.item() for element in row_slice]
    print(*scalars)
    print("-----")
    
class Toy(nn.Module):
    """Two-layer perceptron: Linear -> activation -> Linear -> LogSoftmax(dim=1).

    :param activation: module applied between the two linear layers
    :param n_input: input feature count of the first linear layer
    :param n_hidden: width of the hidden layer
    :param n_output: output size of the second linear layer
    :param trace: when truthy, a dict of keyword arguments forwarded to
        ``trace_forward`` after every stage of the forward pass
    :param kwargs: accepted and ignored
    """

    def __init__(self, activation, n_input, n_hidden, n_output, trace = None, **kwargs):
        super().__init__()
        self.trace = trace

        self.fc1 = nn.Linear(n_input, n_hidden)
        self.activation = activation
        self.fc2 = nn.Linear(n_hidden, n_output)
        self.softmax = nn.LogSoftmax(dim=1)

    def _report(self, stage, tensor):
        """Dump ``tensor`` through ``trace_forward`` when tracing is enabled."""
        if self.trace:
            trace_forward(stage, tensor, **self.trace)

    def forward(self, x):
        self._report("data", x)
        out = x
        stages = (
            ("fc1", self.fc1),
            ("act", self.activation),
            ("fc2", self.fc2),
            ("softmax", self.softmax),
        )
        for stage, layer in stages:
            out = layer(out)
            self._report(stage, out)
        return out
    
def gen_net(net_type: NetType, net_config, device, trace = False):
    """Build a ``Toy`` network with the activation selected by ``net_type``.

    :param net_type: which activation to place between the linear layers
    :param net_config: keyword arguments forwarded to ``Toy``
    :param device: device the network is moved to
    :param trace: forwarded to ``Toy`` as its ``trace`` argument
    :raises NotImplementedError: if ``net_type`` matches no known variant
    """
    activation_factories = (
        (NetType.relu, nn.ReLU),
        (NetType.swish, Swish),
        (NetType.hswish, HSwish),
        (NetType.sigmoid, nn.Sigmoid),
        (NetType.hsigmoid, HSigmoid),
    )
    # Only the selected activation is instantiated.
    for kind, make_activation in activation_factories:
        if net_type == kind:
            return Toy(activation=make_activation(), trace=trace, **net_config).to(device)
    raise NotImplementedError("Unknown network type")

### MNIST Training

In [49]:
import torch
from torch.utils.data import DataLoader
from typing import Callable, Optional


def accuracy(
    model: torch.nn.Module,
    dataloader: DataLoader,
    preprocessor: Optional[Callable] = None,
    device: str = "cpu",
    squeeze_target: bool = False,
    **kwargs,
) -> float:
    """
    Return the classification accuracy of ``model`` in percent.

    Accuracy = 100 * correct answers / total answers

    The model is switched to evaluation mode and stays in that mode afterwards.

    :param model: Neural network model; its output is arg-maxed over dim 1
    :param dataloader: Iterable yielding ``(data, labels)`` batches
    :param preprocessor: Optional callable applied to each data batch before
        it is moved to ``device``
    :param device: Device the batches are moved to before the forward pass
    :param squeeze_target: If True, size-1 dimensions are squeezed out of the
        labels before they are compared with the predictions
    :param kwargs: Extra arguments are accepted and ignored, so a whole config
        dictionary can be splatted into the call
    :return: Accuracy in the range [0, 100]; 0.0 when the loader yields no samples
    """
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for data, labels in dataloader:
            if preprocessor:
                data = preprocessor(data)
            data = data.to(device)
            labels = labels.to(device)
            if squeeze_target:
                labels = labels.squeeze()
            outputs = model(data)
            predicted = outputs.argmax(dim=1)
            total += outputs.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        # Nothing was evaluated; report zero instead of dividing by zero.
        return 0.0
    return 100.0 * correct / total


In [54]:
import time 
from collections import namedtuple
from typing import Callable, Optional, List
from pyraul.tools.logging import get_fixedwide_str

class AverageMeter:
    """Running statistics of a stream of values.

    Keeps the latest value (``val``), the weighted sum (``sum``), the number
    of samples (``count``) and their mean (``avg``).  When created with
    ``history=True`` every value passed to ``update`` is also appended to
    ``history``.
    """

    def __init__(self, history: bool = False):
        self.use_history = history
        self.reset()

    def reset(self):
        """Clear all statistics (and the history, if one is kept)."""
        self.val = self.avg = self.sum = self.count = 0
        if self.use_history:
            self.history = []

    def update(self, val, n=1):
        """Record ``val`` as the mean of ``n`` new samples."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        if not self.use_history:
            return
        self.history.append(val)
            
            
def _leading_values(tensor, limit=10):
    """Return up to ``limit`` leading scalars of ``tensor`` as Python numbers.

    For tensors with more than one dimension the axes are reversed first and
    slice 0 is taken, i.e. for a 2-D weight the first column is reported.
    """
    values = tensor.detach().cpu()
    if values.dim() > 1:
        values = values.permute(*reversed(range(values.dim())))[0]
    # Flattening also covers 0-d tensors and slices that are still multi-dimensional.
    return values.reshape(-1)[:limit].tolist()


def show_params(model):
    """Print shape and leading values of every trainable parameter of ``model``.

    For each parameter with ``requires_grad`` set, the name, the shape and up
    to ten leading values are printed; if a gradient is present, the same is
    printed for the gradient.  Tensors are copied to the CPU first, so the
    function also works for parameters that live on another device.

    :param model: module whose ``named_parameters()`` are inspected
    """
    rule = "===================================="
    print(rule)
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        print(f"{name}, {param.data.shape}")
        print(_leading_values(param.data))
        if param.grad is not None:
            print(f"grad of {name}, {param.grad.shape}")
            print(_leading_values(param.grad))
    print(rule)
        
TrainStepResult = namedtuple("TrainStepResult", ["loss", "time_batch_load", "time_batch_full"])


def train_step(train_loader,
               model,
               criterion,
               optimizer,
               device,
               print_freq=1,
               verbose: bool = True,
               loss_history: bool = False,
               preprocessor: Optional[Callable] = None):
    """Run one pass over ``train_loader`` and update ``model`` after every batch.

    :param train_loader: sized iterable yielding ``(input, target)`` batches
    :param model: network to train; it is switched to training mode
    :param criterion: callable ``(output, target) -> loss``
    :param optimizer: optimizer stepped once per batch
    :param device: device the batches are moved to
    :param print_freq: a progress line is printed every ``print_freq`` batches
    :param verbose: enables the progress lines
    :param loss_history: keep every per-batch loss in the returned loss meter
    :param preprocessor: optional callable applied to each input batch
    :return: ``TrainStepResult`` with the loss, load-time and full-batch-time meters
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    loss_meter = AverageMeter(history=loss_history)

    model.train()

    total_steps = len(train_loader)
    index_width = len(str(total_steps))

    tick = time.time()
    for step, (batch, target) in enumerate(train_loader):
        if preprocessor:
            batch = preprocessor(batch)

        # Time since the previous batch finished: loading plus preprocessing.
        load_timer.update(time.time() - tick)

        target = target.to(device)
        prediction = model(batch.to(device))
        loss = criterion(prediction, target)

        # Back-propagate and apply one optimizer update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The batch loss is weighted by the batch size in the running average.
        loss_meter.update(loss.float().item(), batch.size(0))

        step_timer.update(time.time() - tick)
        tick = time.time()

        if not verbose or step % print_freq != 0:
            continue
        print(f"Step {get_fixedwide_str(str(step), index_width)}/{total_steps}\t"
              f"Loss: {loss_meter.val:.6f} ({loss_meter.avg:.6f})\t"
              f"Time.step: {step_timer.val:.3f} ({step_timer.avg:.3f})\t"
              f"Time.load: {load_timer.val:.3f} ({load_timer.avg:.3f})"
              )
    return TrainStepResult(loss=loss_meter, time_batch_load=load_timer, time_batch_full=step_timer)

In [56]:
from pyraul.tools.seed import set_seed

# Settings of the MNIST experiment. The whole dictionary is also splatted into
# Dataset(...) and accuracy(...) below.
config = {
    "batch_size": 50,
    "feature_space_dim": 784,  # 28 * 28, matches the reshape done by the preprocessor lambdas
    "hidden_layer_dim": 500,
    "classes_n": 10,
    "seed": 0,
    "device": "cuda",
    "epochs": 50,  # not read in this cell: only a single train_step pass is run
    "sgd": {"lr": 0.05}
}

# Constructor arguments of the Toy network, derived from the settings above.
net_config = {
    "n_input": config["feature_space_dim"], 
    "n_hidden": config["hidden_layer_dim"], 
    "n_output": config["classes_n"]
}

# Seed before the network is created so its initial weights are reproducible.
set_seed(config["seed"])

device = torch.device(config["device"])
model = gen_net(NetType.hsigmoid, net_config, device)

# dump_weights(model, "init.txt")

# pyraul dataset wrapper; only its train_loader / test_loader attributes are used here.
ds= Dataset("MNIST", **config)
optimizer = torch.optim.SGD(model.parameters(), lr=config["sgd"]["lr"])
# Toy ends in LogSoftmax, so the negative log-likelihood loss is applied to its output.
criterion = nn.NLLLoss(reduction="mean")

# Test accuracy of the untrained network, as a baseline.
accuracy_before = accuracy(
        model=model,
        dataloader=ds.test_loader,
        preprocessor=lambda images: images.reshape(-1, 28 * 28),
        **config,
)

print(accuracy_before)

# One pass over the training loader; per-batch losses are kept in loss.history.
loss, _, _ = train_step(
                    ds.train_loader, 
                    model,
                    criterion,
                    optimizer,
                    device,
                    print_freq=100,
                    verbose=True,
                    loss_history=True,
                    preprocessor=lambda images: images.reshape(-1, 28 * 28),
                )
# Test accuracy after that single training pass.
accuracy_after = accuracy(
    model=model,
    dataloader=ds.test_loader,
    preprocessor=lambda images: images.reshape(-1, 28 * 28),
    **config,
)
print(accuracy_after)

INFO: Loading MNIST dataset...


10.28
Step    0/1200	Loss: 2.352203 (2.352203)	Time.step: 0.013 (0.013)	Time.load: 0.009 (0.009)
Step  100/1200	Loss: 2.276294 (2.296440)	Time.step: 0.005 (0.006)	Time.load: 0.004 (0.005)
Step  200/1200	Loss: 2.057391 (2.238303)	Time.step: 0.006 (0.006)	Time.load: 0.005 (0.005)
Step  300/1200	Loss: 1.876074 (2.160662)	Time.step: 0.006 (0.006)	Time.load: 0.005 (0.005)
Step  400/1200	Loss: 1.765929 (2.063696)	Time.step: 0.005 (0.006)	Time.load: 0.004 (0.005)
Step  500/1200	Loss: 1.270914 (1.950968)	Time.step: 0.005 (0.006)	Time.load: 0.004 (0.005)
Step  600/1200	Loss: 1.210810 (1.835414)	Time.step: 0.005 (0.006)	Time.load: 0.004 (0.005)
Step  700/1200	Loss: 0.949828 (1.733999)	Time.step: 0.006 (0.006)	Time.load: 0.005 (0.005)
Step  800/1200	Loss: 0.816559 (1.635761)	Time.step: 0.007 (0.006)	Time.load: 0.005 (0.005)
Step  900/1200	Loss: 0.940067 (1.549612)	Time.step: 0.010 (0.006)	Time.load: 0.009 (0.005)
Step 1000/1200	Loss: 0.779427 (1.471990)	Time.step: 0.006 (0.006)	Time.load: 0.004 (

In [63]:
# Print every 100th recorded batch loss as a `raul::dtVec` initializer
# (presumably meant to be pasted into a C++ test of the raul library; confirm).
print(
    "const raul::dtVec idealLosses{", 
    ", ".join([f"{x}_dt" for x in loss.history[::100]]),
    "};")

const raul::dtVec idealLosses{ 2.352203369140625_dt, 2.2762935161590576_dt, 2.0573911666870117_dt, 1.876073956489563_dt, 1.7659292221069336_dt, 1.2709139585494995_dt, 1.210809588432312_dt, 0.9498279690742493_dt, 0.8165586590766907_dt, 0.9400674700737_dt, 0.7794268131256104_dt, 0.6358923316001892_dt };


### Train network with toy dataset (binary classification)

In [None]:
# Settings of the toy binary-classification experiment.
config = {
    # batches * batch_size fixes how many vectors generate_toy_dataset creates in total.
    "batches": 10,
    "batch_size": 4,
    # Dimensionality of every generated feature vector (and the network input size).
    "feature_space_dim": 16,
    "classes_n": 2,
    # One offset per class, added to that class's standard-normal samples.
    "dataset_offset": [0.0, 0.5],
    "hidden_layer_dim": 64,
    "seed": 0,
    "device": "cpu",
    # Number of passes over the training loader in the training cell.
    "epochs": 50,
    "sgd": {"lr": 0.05}
}

In [None]:
from pyraul.tools.seed import set_seed

def generate_toy_dataset(dataset_offset: list,
                         classes_n: int,
                         feature_space_dim: int,
                         batches: int,
                         batch_size: int,
                         seed: int,
                         device: str,
                         **kwargs):
    """Generate a synthetic classification dataset of shifted Gaussian blobs.

    Every class receives ``batch_size * batches // classes_n`` vectors drawn
    from a standard normal distribution and shifted by that class's offset.

    :param dataset_offset: one additive offset per class
    :param classes_n: number of classes; must equal ``len(dataset_offset)``
    :param feature_space_dim: length of every feature vector
    :param batches: number of batches the dataset should fill
    :param batch_size: number of vectors per batch
    :param seed: random seed applied before sampling
    :param device: device on which the tensors are created
    :param kwargs: extra configuration entries are accepted and ignored
    :return: ``(x, y)`` where ``x`` is a float tensor of shape
        ``(classes_n * per_class, feature_space_dim)`` and ``y`` a long tensor
        of shape ``(classes_n * per_class, 1)`` holding the class indices,
        both ordered class by class
    """
    # Honour the requested seed so the data is reproducible per configuration.
    set_seed(seed)
    assert classes_n > 0, "classes_n must be positive"
    assert len(dataset_offset) == classes_n, "dataset_offset needs one entry per class"

    vectors_per_class = batch_size * batches // classes_n
    assert vectors_per_class > 0, "batch_size * batches must provide at least one vector per class"

    # Samples are drawn class by class, so the random stream is consumed in class order.
    x = torch.cat(
        [
            torch.randn(vectors_per_class, feature_space_dim, device=device) + offset
            for offset in dataset_offset
        ],
        dim=0,
    )
    y = torch.cat(
        [
            torch.full((vectors_per_class, 1), label, dtype=torch.long, device=device)
            for label in range(classes_n)
        ],
        dim=0,
    )

    return x, y
        
x, y = generate_toy_dataset(**config)

# Scatter the first two feature dimensions, coloured by class label.
plt.title('Projection of the feature space')
# x[:, 0] is drawn along the horizontal axis and x[:, 1] along the vertical one,
# so the axis labels have to follow the same order.
plt.xlabel('x0')
plt.ylabel('x1')
plt.scatter(x[:,0], x[:, 1], c=y, alpha=0.5)
plt.show()

In [None]:
def get_ds(full_dataset):
    """Randomly split ``full_dataset`` into an 80 % train / 20 % test pair."""
    total = len(full_dataset)
    n_train = int(0.8 * total)
    # The test part takes the remainder, so both lengths always add up to ``total``.
    return torch.utils.data.random_split(full_dataset, [n_train, total - n_train])

In [None]:
from torch.utils.data import DataLoader, Dataset
device = torch.device(config["device"])

net_config = {
    "n_input": config["feature_space_dim"],
    "n_hidden": config["hidden_layer_dim"],
    "n_output": config["classes_n"],
}

# Container holding one freshly initialised Toy network per activation.
# It has to be defined before it is used, otherwise the call below is a NameError.
Nets = namedtuple("Nets", ["relu", "swish", "hswish", "sigmoid", "hsigmoid"])
nets = Nets(
    relu = Toy(activation=nn.ReLU(), **net_config).to(device),
    swish = Toy(activation=Swish(), **net_config).to(device),
    hswish = Toy(activation=HSwish(), **net_config).to(device),
    sigmoid = Toy(activation=nn.Sigmoid(), **net_config).to(device),
    hsigmoid = Toy(activation=HSigmoid(), **net_config).to(device)
)

model = nets.hsigmoid
optimizer = torch.optim.SGD(model.parameters(), lr=config["sgd"]["lr"])

# The loss module is created once and reused for every batch.
nll_loss = nn.NLLLoss()


def criterion(log_probs, target):
    """NLL loss for targets stored as a column of class indices."""
    return nll_loss(log_probs, target.squeeze())


train_ds, test_ds = get_ds(list(zip(x, y)))
ds_train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
ds_test_loader = DataLoader(test_ds, batch_size=config["batch_size"], shuffle=True)

# Accuracy of the untrained network, as a baseline.
print(accuracy(model, ds_test_loader, squeeze_target=True))

history_loss = []
history_acc = []
for epoch in range(config["epochs"]):
    loss, _, _ = train_step(
                        ds_train_loader,
                        model,
                        criterion,
                        optimizer,
                        config["device"],
                        print_freq=5,
                        verbose=False
                    )
    acc = accuracy(model, ds_test_loader, squeeze_target=True)
    history_acc.append(acc)
    # Keep the epoch's average loss (a float); the AverageMeter object itself cannot be plotted.
    history_loss.append(loss.avg)

In [None]:
# Test accuracy (left axis, green) and training loss (right axis, red) per epoch.
fig, ax1 = plt.subplots()

ax1.grid(True)
ax1.set_title('Training')

# A second y-axis shares the epoch axis so both curves fit in one figure.
ax2 = ax1.twinx()
ax1.plot(history_acc, 'g-')
ax2.plot(history_loss, 'r-')

ax1.set_xlabel('Epoch')
ax1.set_ylabel('Test accuracy', color='g')
ax2.set_ylabel('Train Loss avg', color='r')
plt.show()

### Converter prototype

In [None]:
from pyraul.tools.converter import cvt_model_to_raul
# Run the pyraul converter on the current `model`; what it produces is defined
# inside pyraul and is not visible in this notebook.
cvt_model_to_raul(model)

In [None]:
from pyraul.tools.converter import cvt_tensor_to_raul
# Run the pyraul tensor converter on the class labels, cast to float first.
cvt_tensor_to_raul(y.float())

## Watermark

In [None]:
%load_ext watermark
%watermark -d -u -v -iv