# Blocks

In [192]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from pyraul.tools.dumping import print_torch_tensor, DumpMode, gen_cpp_dtVec

# inplace=True lets relu6 overwrite the temporary (x + 3) instead of allocating another tensor; x itself is not modified
class hswish(nn.Module):
    """Hard-swish activation: ``x * relu6(x + 3) / 6``."""

    def forward(self, x):
        # relu6 runs in place on the freshly created (x + 3) tensor, so the
        # caller's x is left untouched.
        gate = F.relu6(x + 3, inplace=True)
        return x * gate / 6
    
# inplace=True lets relu6 overwrite the temporary (x + 3) instead of allocating another tensor; x itself is not modified
class hsigmoid(nn.Module):
    """Hard-sigmoid activation: ``relu6(x + 3) / 6``."""

    def forward(self, x):
        # The in-place relu6 only touches the temporary (x + 3).
        clipped = F.relu6(x + 3, inplace=True)
        return clipped / 6
    
class SeModule(nn.Module):
    """Squeeze-and-excitation gate that rescales the channels of its input.

    The gate is a pipeline of global average pooling, a 1x1 convolution that
    reduces the channel count to ``in_channels // reduction``, batch norm,
    ReLU, a 1x1 convolution back to ``in_channels``, batch norm and a
    hard-sigmoid. ``forward`` multiplies the input by that gate.
    """

    def __init__(self, in_channels, reduction=4, bias=False):
        super().__init__()
        squeezed = in_channels // reduction
        # Both convolutions are 1x1 and act as fully connected layers on the
        # pooled (1x1 spatial) features.
        pointwise = dict(kernel_size=1, stride=1, padding=0, bias=bias)
        stages = [
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(in_channels, squeezed, **pointwise),
            nn.BatchNorm2d(squeezed),
            nn.ReLU(inplace=True),
            nn.Conv2d(squeezed, in_channels, **pointwise),
            nn.BatchNorm2d(in_channels),
            hsigmoid(),
        ]
        self.se = nn.Sequential(*stages)

    def forward(self, x):
        gate = self.se(x)
        return x * gate
    
class Block(nn.Module):
    '''Bottleneck block: 1x1 expand + depthwise conv + 1x1 linear projection.

    Args:
        kernel_size: kernel size of the depthwise convolution (padded with
            ``kernel_size // 2``).
        in_channels: channels of the block input.
        expand_channels: channels after the 1x1 expansion.
        out_channels: channels of the block output.
        nolinear: activation module; the same instance is used after both
            the expansion and the depthwise stage.
        semodule: optional callable applied to the projected output, or
            ``None`` to skip it.
        stride: stride of the depthwise convolution. The residual connection
            is only added when ``stride == 1``.
        bias: whether the convolutions carry a bias term.
    '''
    def __init__(self,
                 kernel_size,
                 in_channels,
                 expand_channels,
                 out_channels,
                 nolinear,
                 semodule,
                 stride,
                 bias=False
                ):
        super().__init__()
        self.stride = stride
        self.se = semodule

        # 1x1 expansion followed by the non-linearity
        self.conv1 = nn.Conv2d(in_channels, expand_channels, kernel_size=1, stride=1, padding=0, bias=bias)
        self.bn1 = nn.BatchNorm2d(expand_channels)
        self.nolinear1 = nolinear

        # Depthwise convolution (one group per channel)
        self.conv2 = nn.Conv2d(expand_channels, expand_channels, kernel_size=kernel_size, stride=stride,
                               padding=kernel_size // 2, groups=expand_channels, bias=bias)
        self.bn2 = nn.BatchNorm2d(expand_channels)
        self.nolinear2 = nolinear

        # Linear 1x1 projection (no activation afterwards)
        self.conv3 = nn.Conv2d(expand_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias)
        self.bn3 = nn.BatchNorm2d(out_channels)

        if stride == 1 and in_channels != out_channels:
            # The residual needs a 1x1 conv + BN to match the output channels.
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=bias),
                nn.BatchNorm2d(out_channels),
            )
        else:
            # Identity shortcut; it is never used when stride != 1.
            self.shortcut = nn.Sequential()

    def forward(self, x):
        # 1x1 expansion + non-linearity
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.nolinear1(out)
        # Depthwise + non-linearity
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.nolinear2(out)
        # Linear projection
        out = self.conv3(out)
        out = self.bn3(out)
        # Identity check: avoids going through the object's rich comparison.
        if self.se is not None:
            out = self.se(out)
        # Residual connection only for blocks that keep the spatial size.
        if self.stride == 1:
            out = out + self.shortcut(x)
        return out

# Blocks under debug

In [193]:
def dump(name, tensor):
    """Record ``tensor`` under ``name`` and write two text traces of it.

    The tensor is detached, moved to the CPU and converted to a NumPy array,
    which is kept in ``dump.history[name]``. Two files are then written into
    the ``trace`` directory (created on demand):

    * ``trace/{name}_w.txt`` - the flattened first slice ``array[0]``;
    * ``trace/{name}_h.txt`` - the flattened first slice of the transposed
      array, ``array.T[0]``.

    Each file starts with the full array shape and the shape of the
    flattened slice, followed by the values as written by ``np.savetxt``.

    Args:
        name: key in ``dump.history`` and stem of the trace file names.
        tensor: torch tensor to record; it must have at least one dimension.
    """
    import os  # local import: keeps the fix self-contained within this cell

    if not hasattr(dump, "history"):
        dump.history = {}
    tensor = tensor.detach().cpu().numpy()
    dump.history[name] = tensor

    # Without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs('trace', exist_ok=True)

    views = (
        ('w', tensor[0]),
        ('h', tensor.T[0]),
    )
    for suffix, view in views:
        data = view.flatten()
        with open(f'trace/{name}_{suffix}.txt', 'w') as f:
            print(tensor.shape, file=f)
            print(data.shape, file=f)
            np.savetxt(f, data)
            
class MobileNetV3_Small(nn.Module):
    """MobileNetV3-Small style classifier instrumented for layer tracing.

    Every top-level stage is recorded in ``self.layers`` as a
    ``(name, callable)`` pair in execution order. ``forward`` walks that list
    and hands a copy of each stage's input and output to ``dump``.
    """

    def __init__(self, num_classes=1000, bias=False):
        super().__init__()
        self.layers = []

        # Stem
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1, bias=bias)
        self.bn1 = nn.BatchNorm2d(16)
        self.hs1 = hswish()
        self._track("conv1", "bn1", "hs1")

        # Bottleneck stack. Columns: kernel, in, expand, out, activation
        # factory, use SE (sized to the block's output channels), stride.
        relu = lambda: nn.ReLU(inplace=True)
        bneck_cfg = (
            (3, 16, 16, 16, relu, True, 2),
            (3, 16, 72, 24, relu, False, 2),
            (3, 24, 88, 24, relu, False, 1),
            (5, 24, 96, 40, hswish, True, 2),
            (5, 40, 240, 40, hswish, True, 1),
            (5, 40, 240, 40, hswish, True, 1),
            (5, 40, 120, 48, hswish, True, 1),
            (5, 48, 144, 48, hswish, True, 1),
            (5, 48, 288, 96, hswish, True, 2),
            (5, 96, 576, 96, hswish, True, 1),
            (5, 96, 576, 96, hswish, True, 1),
        )
        blocks = []
        for kernel, c_in, c_exp, c_out, make_act, use_se, stride in bneck_cfg:
            # Activation first, then the SE module, then the block itself, so
            # sub-modules are created in the same order as a literal listing.
            act = make_act()
            se = SeModule(c_out, bias=bias) if use_se else None
            blocks.append(Block(kernel, c_in, c_exp, c_out, act, se, stride, bias=bias))
        self.bneck = nn.Sequential(*blocks)
        self._track("bneck")

        # Final 1x1 convolution
        self.conv2 = nn.Conv2d(96, 576, kernel_size=1, stride=1, padding=0, bias=bias)
        self.bn2 = nn.BatchNorm2d(576)
        self.hs2 = hswish()
        self._track("conv2", "bn2", "hs2")

        # Parameter-free steps, kept as plain callables so they can be traced
        # like any other stage.
        self.pool = lambda x: F.avg_pool2d(x, 7)
        self.reshape = lambda x: x.view(x.size(0), -1)
        self._track("pool", "reshape")

        # Classifier head
        self.linear3 = nn.Linear(576, 1024)
        self.hs3 = hswish()
        self._track("linear3", "hs3")

        self.linear4 = nn.Linear(1024, num_classes)
        self._track("linear4")

        self.init_params()

    def _track(self, *names):
        """Append the attributes called ``names`` to the traced stage list."""
        for name in names:
            self.layers.append((name, getattr(self, name)))

    def init_params(self):
        """Re-initialise all convolution, batch-norm and linear parameters."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                init.kaiming_normal_(module.weight, mode='fan_out')
            elif isinstance(module, nn.BatchNorm2d):
                init.constant_(module.weight, 1)
            elif isinstance(module, nn.Linear):
                init.normal_(module.weight, std=0.001)
            else:
                continue
            # Every handled layer type gets a zero bias when it has one.
            if module.bias is not None:
                init.constant_(module.bias, 0)

    def forward(self, x):
        """Run all traced stages in order, dumping each input and output."""
        activation = x
        for position, (label, stage) in enumerate(self.layers):
            tag = f"{position:02}_{label}"
            dump(f"{tag}_in", activation.clone())
            activation = stage(activation)
            dump(f"{tag}_out", activation.clone())
        return activation
            
    
from enum import Enum

class ClassifierType(Enum):
    """Selects which backbone variant ``Classifier`` should build."""

    Small = 0
    Large = 1
    
class Classifier(nn.Module):
    """Backbone selected by ``model`` followed by a log-softmax over dim 1."""

    def __init__(self, model: ClassifierType, num_classes, bias=False):
        super().__init__()
        if model == ClassifierType.Small:
            backbone = MobileNetV3_Small(num_classes, bias=bias)
        elif model == ClassifierType.Large:
            # NOTE(review): MobileNetV3_Large is not defined in the visible
            # part of this file - confirm it exists before selecting Large.
            backbone = MobileNetV3_Large(num_classes, bias=bias)
        else:
            backbone = None
        self.mobilenetv3 = backbone
        # Fails when ``model`` matched neither variant.
        assert self.mobilenetv3
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        logits = self.mobilenetv3(x)
        return self.softmax(logits)

# CIFAR Train

In [194]:
import time 
from collections import namedtuple
from typing import Callable, Optional, List
from pyraul.tools.logging import get_fixedwide_str
import numpy as np
from pyraul.tools.seed import set_seed
import torchvision.transforms as transforms
from pyraul.tools.dataset import Dataset
from pyraul.tools.dumping import dump_weights
from pyraul.pipeline import accuracy

class AverageMeter:
    """Computes and stores the average and current value.

    Attributes:
        val: the most recent value passed to ``update``.
        sum: weighted sum of all values (each value times its ``n``).
        count: total weight seen so far.
        avg: ``sum / count``, or 0 while ``count`` is 0.
        history: list of every value passed to ``update``; only present when
            the meter was created with ``history=True``.
    """
    def __init__(self, history: bool = False):
        self.use_history = history
        self.reset()

    def reset(self):
        """Clear all statistics (and the history, when enabled)."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0
        if self.use_history:
            self.history = []

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        # A zero total weight (e.g. an empty batch as the first update) would
        # otherwise raise ZeroDivisionError.
        self.avg = self.sum / self.count if self.count else 0
        if self.use_history:
            self.history.append(val)
            
            
def show_params(model):
    """Print every trainable parameter of ``model`` between separator lines.

    Each parameter with ``requires_grad`` set is handed to
    ``print_torch_tensor`` together with ``slice_obj=slice(0,10)`` and
    ``grad=True``; the exact output format is up to that helper.
    """
    separator = "===================================="
    print(separator)
    trainable = (
        (name, param)
        for name, param in model.named_parameters()
        if param.requires_grad
    )
    for name, param in trainable:
        print_torch_tensor(name, param, slice_obj=slice(0,10), grad=True)
    print(separator)
        
# Return type of train_step: the loss meter plus the data-loading and full-batch timing meters.
TrainStepResult = namedtuple("TrainStepResult", ["loss", "time_batch_load", "time_batch_full"])

def train_step(train_loader,
               model,
               criterion,
               optimizer,
               device,
               print_freq=1,
               verbose: bool = True,
               loss_history: bool = False,
               preprocessor: Optional[Callable] = None):
    """Run the model in training mode over ``train_loader`` and report meters.

    NOTE: the loop currently stops after the first batch (debug limit below),
    so a call performs exactly one optimisation step on a non-empty loader.

    Args:
        train_loader: iterable of ``(input, target)`` batches that also
            supports ``len()``.
        model: network to train; switched to training mode.
        criterion: loss callable taking ``(output, target)``.
        optimizer: optimiser whose ``zero_grad``/``step`` are called.
        device: device the batch and target are moved to.
        print_freq: progress is printed every ``print_freq`` steps.
        verbose: disables the progress line when False.
        loss_history: keep every per-step loss in the returned loss meter.
        preprocessor: optional callable applied to the input batch before
            the data-loading time is recorded.

    Returns:
        ``TrainStepResult`` with the loss, data-loading-time and
        full-batch-time meters.
    """
    step_timer = AverageMeter()
    load_timer = AverageMeter()
    loss_meter = AverageMeter(history=loss_history)

    model.train()

    total_steps = len(train_loader)
    index_width = len(str(total_steps))

    # Debug limit: number of batches processed before leaving the loop.
    max_batches = 1

    tick = time.time()
    for step, (batch, target) in enumerate(train_loader):
        if preprocessor:
            batch = preprocessor(batch)

        # Time spent obtaining (and preprocessing) the batch
        load_timer.update(time.time() - tick)

        target = target.to(device)
        batch_on_device = batch.to(device)

        # Forward pass
        output = model(batch_on_device)
        loss = criterion(output, target)

        # Backward pass and optimiser update; the hook prints the gradient
        # that reaches the loss tensor during backward().
        optimizer.zero_grad()
        loss.register_hook(lambda grad: print("loss", grad))

        loss.backward()
        print_torch_tensor("loss", loss, slice_obj=slice(0,10))
        optimizer.step()

        output = output.float()
        loss = loss.float()

        loss_meter.update(loss.item(), batch.size(0))

        # Time for the whole step, data loading included
        step_timer.update(time.time() - tick)
        tick = time.time()

        if verbose and step % print_freq == 0:
            progress = (
                f"Step {get_fixedwide_str(str(step), index_width)}/{total_steps}\t"
                f"Loss: {loss_meter.val:.6f} ({loss_meter.avg:.6f})\t"
                f"Time.step: {step_timer.val:.3f} ({step_timer.avg:.3f})\t"
                f"Time.load: {load_timer.val:.3f} ({load_timer.avg:.3f})"
            )
            print(progress)
        if step + 1 == max_batches:
            break
    return TrainStepResult(loss=loss_meter, time_batch_load=load_timer, time_batch_full=step_timer)

# Delete me (from here)
from torch.utils.data import DataLoader
from typing import Callable, Optional

def accuracy(
    model: torch.nn.Module,
    dataloader: DataLoader,
    preprocessor: Optional[Callable] = None,
    device: str = "cpu",
    squeeze_target: bool = False,
    **kwargs,
) -> float:
    """Return the top-1 accuracy (in percent) on the first batch of ``dataloader``.

    The model is switched to eval mode and evaluated without gradients.
    Only the first batch is scored: the loop exits right after it.

    Args:
        model: network producing per-class scores along dim 1.
        dataloader: iterable of ``(data, labels)`` batches.
        preprocessor: optional callable applied to the data before inference.
        device: device the data and labels are moved to.
        squeeze_target: squeeze the labels before comparing them.
        **kwargs: accepted and ignored, so a whole config dict can be passed.
    """
    model.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
        for batch, targets in dataloader:
            batch = preprocessor(batch) if preprocessor else batch
            batch, targets = batch.to(device), targets.to(device)
            if squeeze_target:
                targets = targets.squeeze()
            scores = model(batch)
            winners = torch.max(scores.data, 1)[1]
            seen += scores.size(0)
            hits += (winners == targets).sum().item()
            # Debug shortcut: stop after the first batch.
            break
    return 100.0 * hits / seen
# Delete me (to here)

In [302]:
# Experiment configuration for the single-step CIFAR10 debug run below.
config = {
    "seed": 0,
    "classes": 10,
    "bias": True,
    "batch_size": 4,
#     "batch_size": 50,
    "device": "cuda",
#     "device": "cpu",
    "epochs": 1,
    "sgd": {"lr": 0.05}
}


# Seed before the model is built so that parameter initialisation is repeatable.
set_seed(config["seed"])

device = torch.device(config["device"])
model = Classifier(ClassifierType.Small, num_classes=config["classes"], bias=config["bias"])

# dump_weights(model, "init.txt", mode=DumpMode.flatten_transpose)
# dump_weights(model, "init.txt", mode=DumpMode.transpose_flatten, filter="linear")

model = model.to(device)

# Images are resized to 224 before being converted to tensors.
# NOTE(review): interpolation=0 is presumably PIL's NEAREST filter - confirm for the installed torchvision.
# NOTE(review): the whole config dict is forwarded to Dataset; which keys it consumes is not visible here.
ds= Dataset("CIFAR10",
            train_transform=[transforms.Resize(224, interpolation=0), transforms.ToTensor()],
            test_transform=[transforms.Resize(224, interpolation=0), transforms.ToTensor()],
            **config)
optimizer = torch.optim.SGD(model.parameters(), lr=config["sgd"]["lr"])
# NLLLoss is paired with the LogSoftmax that Classifier applies to its output.
criterion = nn.NLLLoss(reduction="mean")

# accuracy_before = accuracy(
#         model=model,
#         dataloader=ds.test_loader,
#         **config,
# )


# print(accuracy_before)


# ####################
# params = [(name, param) for name, param in model.named_parameters() if param.requires_grad]
# print(params[-2][0])
# debug_weights = params[-2][1].clone()
# ####################


# Only the loss meter of the returned TrainStepResult is kept; the two timing meters are discarded.
loss, _, _ = train_step(
                    ds.train_loader, 
                    model,
                    criterion,
                    optimizer,
                    device,
                    print_freq=100,
                    verbose=True,
                    loss_history=True
                )

# accuracy_after = accuracy(
#     model=model,
#     dataloader=ds.test_loader,
#     **config,
# )
# print(accuracy_after)

INFO: Loading CIFAR10 dataset...


Files already downloaded and verified
Files already downloaded and verified
loss tensor(1., device='cuda:0')
loss (torch.Size([])):
['2.30252886']
Step     0/12500	Loss: 2.302529 (2.302529)	Time.step: 3.034 (3.034)	Time.load: 0.003 (0.003)


In [213]:
# Replay the dumped bn1 input through a freshly constructed BatchNorm2d in
# training mode and print its state before and after the forward pass.
x = torch.tensor(dump.history["01_bn1_in"])
bn = nn.BatchNorm2d(16).train()
print(bn)

print("========================Before")
for key, value in bn.state_dict().items():
    # 0-d entries are shown as a scalar, everything else as a list of floats
    shown = [entry.item() for entry in value] if value.shape else value.item()
    print("----", key, value.shape, shown, sep="\n")

y = bn(x)
y_exp = dump.history["01_bn1_out"]

print("========================After")
for key, value in bn.state_dict().items():
    shown = [entry.item() for entry in value] if value.shape else value.item()
    print("----", key, value.shape, shown, sep="\n")

BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
----
weight
torch.Size([16])
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
----
bias
torch.Size([16])
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----
running_mean
torch.Size([16])
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----
running_var
torch.Size([16])
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
----
num_batches_tracked
torch.Size([])
0
----
weight
torch.Size([16])
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
----
bias
torch.Size([16])
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
----
running_mean
torch.Size([16])
[0.0033524830359965563, 0.02775103785097599, -0.039781078696250916, -0.008996550925076008, 0.01896393671631813, 0.007416348904371262, -0.007661657873541117, 0.05841472744941711

In [206]:
# Display the shape of the replayed bn1 input tensor
x.shape

torch.Size([4, 16, 112, 112])

In [207]:
# Save the flattened BN input and the expected BN output, one value per line.
# flatten() already yields 1-D data, so no transpose is needed: on a 1-D torch
# tensor `.T` changes nothing and is deprecated for non-2-D tensors.
with open('x.txt', 'w') as f:
    np.savetxt(f, x.flatten())
with open('y.txt', 'w') as f:
    np.savetxt(f, y_exp.flatten())

In [299]:
def bn_manual(tensor, momentum=0.1, epsilon=1e-5, unbiased=False):
    """Normalise ``tensor`` channel by channel using batch statistics.

    For every index ``c`` along dim 1 the mean and variance of
    ``tensor[:, c]`` are computed over all remaining dimensions, printed with
    eight decimals, and used to standardise that channel. No affine
    scale/shift is applied and no running statistics are produced.

    Args:
        tensor: input with the channel axis at dim 1, e.g. (N, C, H, W) or
            (N, C). It is not modified.
        momentum: accepted for interface compatibility; currently unused.
        epsilon: value added to the variance before taking the square root.
        unbiased: passed to ``Tensor.var``; False selects the biased estimate.

    Returns:
        ``(normalized, shifted)``: the standardised tensor and the
        mean-subtracted tensor, both with the shape of ``tensor``.
    """
    result = tensor.clone()
    shifted_input = result.clone()

    for c in range(result.shape[1]):
        # Indexing with [:, c] (no trailing slice) also accepts (N, C) inputs.
        channel = result[:, c]
        channel_mean = channel.mean()
        channel_var = channel.var(unbiased=unbiased)
        print(f"{channel_mean:.8f}", f"{channel_var:.8f}")

        # `shifted` is a new tensor, so overwriting result[:, c] below is safe.
        shifted = channel - channel_mean
        shifted_input[:, c] = shifted
        result[:, c] = shifted / (channel_var + epsilon) ** 0.5
    return result, shifted_input
        
            
# y_man_unbiased, _ = bn_manual(x, unbiased=True)
# print("==")
# Normalise the replayed bn1 input by hand, using the biased variance estimate.
y_man, y_man_shifted = bn_manual(x, unbiased=False)

0.03352483 0.00767649
0.27751040 0.01930936
-0.39781079 0.04446352
-0.08996550 0.00249623
0.18963939 0.00902082
0.07416350 0.00329357
-0.07661659 0.00273430
0.58414733 0.13616510
-0.00015173 0.00263250
0.37472659 0.05073946
0.08732253 0.00551090
-0.32286820 0.02658235
0.27576125 0.04402352
0.35050261 0.04262232
-0.15507863 0.00594011
-0.23937045 0.02738193


In [295]:
# Compare one row of the BatchNorm2d output (y), the dumped output (e), the
# manual normalisation (m), the input (x) and the mean-shifted input (s).
i, j, k = 0, 0, 0
rows = (y[i][j][k], y_exp[i][j][k], y_man[i][j][k], x[i][j][k], y_man_shifted[i][j][k])
for torch_val, dumped_val, manual_val, input_val, shifted_val in zip(*rows):
    # Absolute deviation of the dumped / manual value from the BatchNorm2d one
    err_dumped = abs(torch_val - dumped_val).item()
    err_manual = abs(torch_val - manual_val).item()
    print(f"y={torch_val.item():.7f}",
          f"e={dumped_val.item():.7f}",
          f"m={manual_val.item():.7f}",
          f"x={input_val.item():.7f}",
          f"s={shifted_val.item():.7f}",
          f"de={err_dumped}",
          f"dm={err_manual}",
          "----", sep = "\n")

y=-0.0145845
e=-0.0145845
m=-0.0145846
x=0.0322462
s=-0.0012787
de=0.0
dm=4.284083843231201e-08
----
y=-1.1202892
e=-1.1202892
m=-1.1202893
x=-0.0646939
s=-0.0982187
de=0.0
dm=1.1920928955078125e-07
----
y=-1.1202892
e=-1.1202892
m=-1.1202893
x=-0.0646939
s=-0.0982187
de=0.0
dm=1.1920928955078125e-07
----
y=-0.9679800
e=-0.9679800
m=-0.9679801
x=-0.0513405
s=-0.0848654
de=0.0
dm=1.7881393432617188e-07
----
y=-0.9414218
e=-0.9414218
m=-0.9414219
x=-0.0490121
s=-0.0825369
de=0.0
dm=5.960464477539063e-08
----
y=-0.9414218
e=-0.9414218
m=-0.9414219
x=-0.0490121
s=-0.0825369
de=0.0
dm=5.960464477539063e-08
----
y=-0.9414218
e=-0.9414218
m=-0.9414219
x=-0.0490121
s=-0.0825369
de=0.0
dm=5.960464477539063e-08
----
y=-1.0109986
e=-1.0109986
m=-1.0109987
x=-0.0551121
s=-0.0886369
de=0.0
dm=1.1920928955078125e-07
----
y=-1.1017083
e=-1.1017083
m=-1.1017083
x=-0.0630648
s=-0.0965897
de=0.0
dm=0.0
----
y=-1.1017083
e=-1.1017083
m=-1.1017083
x=-0.0630648
s=-0.0965897
de=0.0
dm=0.0
----
y=-1.1135340
