## Gradient Accumulation ON - FP32

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai import *
from fastai.vision import *
import os

In [3]:
gpu_device = 1
defaults.device = torch.device(f'cuda:{gpu_device}')
torch.cuda.set_device(gpu_device)

In [4]:
BS = 2
N_STEP = 16  # grad accumulation for n steps

In [5]:
path = untar_data(URLs.PETS); path

PosixPath('/home/haider/.fastai/data/oxford-iiit-pet')

In [6]:
path.ls()

[PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/annotations'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/models'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images')]

In [7]:
path_anno = path/'annotations'
path_img = path/'images'

In [8]:
fnames = get_image_files(path_img)
fnames[:5]

[PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images/leonberger_84.jpg'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images/american_pit_bull_terrier_78.jpg'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images/newfoundland_13.jpg'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images/english_setter_63.jpg'),
 PosixPath('/home/haider/.fastai/data/oxford-iiit-pet/images/Persian_38.jpg')]

In [9]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [10]:
class AccumulateBatchNorm(nn.Module):
    
    def __init__(self, bn):
        super().__init__()
        self.bn,self.num_features = bn,bn.num_features
        self.track_running_stats,self.momentum = bn.track_running_stats,bn.momentum
        self.running_mean,self.running_square,self.iterations = None,None,None
    
    def reset_running_stats(self):
        self.running_mean,self.running_square,self.iterations = None,None,None
        self.bn.reset_running_stats()
    
    def update_stats(self):
        if self.training and self.track_running_stats:
            self.bn.num_batches_tracked += 1
            eaf = 1.0 / float(self.bn.num_batches_tracked) if self.bn.momentum is None else self.bn.momentum
            self.bn.running_mean = self.bn.running_mean * (1-eaf) + self.running_mean * eaf / self.iterations
            var = self.running_square/self.iterations - (self.running_mean/self.iterations).pow(2)
            self.bn.running_var  = self.bn.running_var  * (1-eaf) + var  * eaf
            self.running_mean,self.running_square,self.iterations = None,None,None
    
    def reset_parameters(self):
        self.bn.reset_parameters()
        
    def forward(self, input):
        self.bn._check_input_dim(input)
        if self.track_running_stats:
            if self.iterations is None:
                self.running_mean   = self.bn.weight.new_zeros(self.num_features)
                self.running_square = self.bn.weight.new_zeros(self.num_features)
                self.iterations   = 0
            self.running_mean += input.view(input.size(0), input.size(1), -1).mean(2).sum(0)
            self.running_square += input.view(input.size(0), input.size(1), -1).pow(2).mean(2).sum(0)
            self.iterations += input.size(0)
        return torch.batch_norm(input, self.bn.weight, self.bn.bias, self.bn.running_mean, self.bn.running_var, 
            False, 0., self.bn.eps, torch.backends.cudnn.enabled) 

In [11]:
def change_all_BN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,AccumulateBatchNorm(getattr(module,atr)))


def wrap_BN(model):
    for i in range(len(model)):
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] = AccumulateBatchNorm(model[i][j])
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] = AccumulateBatchNorm(model[i][j][k])
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        change_all_BN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] = AccumulateBatchNorm(model[i][j][k].downsample[l])
                               

In [12]:
class AccumulateOptimWrapper(OptimWrapper):
    def step(self):          pass
    def zero_grad(self):      pass
    def real_step(self):      super().step()
    def real_zero_grad(self): super().zero_grad()
        
def acc_create_opt(self, lr:Floats, wd:Floats=0.):
        "Create optimizer with `lr` learning rate and `wd` weight decay."
        self.opt = AccumulateOptimWrapper.create(self.opt_func, lr, self.layer_groups,
                                         wd=wd, true_wd=self.true_wd, bn_wd=self.bn_wd)
        
@dataclass
class AccumulateStep(LearnerCallback):
    """
    Does accumlated step every nth step by accumulating gradients
    """
    def __init__(self, learn:Learner, n_step:int = 1):
        super().__init__(learn)
        self.n_step = n_step
 
    def on_train_begin(self, **kwargs):
        "check if loss is reduction"
        if self.loss_func.reduction == "mean":
             print("For better gradients consider 'reduction=sum'")
        
    def on_epoch_begin(self, **kwargs):
        "init samples and batches, change optimizer"
        self.acc_samples = 0
        self.acc_batches = 0
        
    def on_batch_begin(self, last_input, last_target, **kwargs):
        "accumulate samples and batches"
        self.acc_samples += last_input.shape[0]
        self.acc_batches += 1
        print(f"At batch {self.acc_batches}")
        
    def on_backward_end(self, **kwargs):
        "step if number of desired batches accumulated, reset samples"
        if (self.acc_batches % self.n_step) == 0:
            for p in (self.learn.model.parameters()):
                if p.requires_grad: p.grad.div_(self.acc_samples)
    
            print(f"Stepping at batch: {self.acc_batches}")
            self.learn.opt.real_step()
            self.learn.opt.real_zero_grad()
            self.acc_samples = 0
    
    def on_epoch_end(self, **kwargs):
        "step the rest of the accumulated grads"
        self.learn.opt.real_step()
        self.learn.opt.real_zero_grad()

In [13]:
original_create_opt = Learner.create_opt
def turn_off_accumulation(): Learner.create_opt = original_create_opt
def turn_on_accumulation(): Learner.create_opt = acc_create_opt

### **batch size=32 , no accum.**

In [24]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=32
                                  ).normalize(imagenet_stats)

In [25]:
def get_learner():
    turn_off_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate)
    return learn

In [26]:
learn = get_learner() 

In [27]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,0.681628,0.276923,0.079161


In [28]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,0.435896,0.242077,0.070365


### **batch size=2 , no accum.**

In [29]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [30]:
def get_learner():
    turn_off_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate)
    return learn

In [31]:
learn = get_learner() 

In [32]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,2.421090,0.856192,0.236806


In [33]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,2.291767,0.865607,0.225981


## **Training: resnet34 - grad. accum. - effective batch size=32: 2bs x 16 step**

In [21]:
seed_everything(2)

In [22]:
pat = re.compile(r'/([^/]+)_\d+.jpg$')

In [23]:
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [24]:
def get_learner():
    turn_on_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
                       callback_fns=[partial(AccumulateStep, n_step=N_STEP)])
    learn.loss_func = CrossEntropyFlat(reduction="sum")
    return learn

In [25]:
learn = get_learner() 

In [26]:
wrap_BN(learn.model)

In [28]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

RuntimeError: CUDA out of memory. Tried to allocate 1024.00 KiB (GPU 1; 10.91 GiB total capacity; 8.94 GiB already allocated; 15.88 MiB free; 894.58 MiB cached)

In [21]:
learn.layer_groups

[Sequential(
   (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
   (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (2): ReLU(inplace)
   (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
   (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (6): ReLU(inplace)
   (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (8): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (9): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (10): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (11): ReLU(inplace)
   (12): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (13): BatchNorm2d(64, eps=1e-05, momentum=0.1, affi

In [40]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,5.052551,2.621395,0.302436


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **effective batch size=32: 2bs x 16 step - BNFreeze**

In [41]:
from fastai.train import BnFreeze

In [42]:
class BnFreeze(LearnerCallback):
    "Freeze moving average statistics in all non-trainable batchnorm layers."
    def on_train_begin(self, **kwargs:Any)->None:
        "Put bn layers in eval mode just after `model.train()`."
        set_bn_eval(self.learn.model)

In [43]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [44]:
def get_learner():
    turn_on_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
                       callback_fns=[partial(AccumulateStep, n_step=N_STEP), BnFreeze])
    learn.loss_func = CrossEntropyFlat(reduction="sum")
    return learn

In [45]:
learn = get_learner() 

In [46]:
# freeze bn layers
for g in learn.layer_groups[:-1]:
    for l in g:
        if isinstance(l, bn_types): requires_grad(l, False)

In [47]:
learn.loss_func = CrossEntropyFlat(reduction='sum')

In [48]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,5.230990,2.585538,0.315968


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

In [49]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,4.880914,2.586400,0.317997


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **effective batch size=32: 2bs x 16 step - BNFreeze  + More momentum**

In [50]:
from fastai.train import BnFreeze

In [51]:
class BnFreeze(LearnerCallback):
    "Freeze moving average statistics in all non-trainable batchnorm layers."
    def on_train_begin(self, **kwargs:Any)->None:
        "Put bn layers in eval mode just after `model.train()`."
        set_bn_eval(self.learn.model)

In [52]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [53]:
def get_learner():
    turn_on_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
                       callback_fns=[partial(AccumulateStep, n_step=N_STEP), BnFreeze])
    learn.loss_func = CrossEntropyFlat(reduction="sum")
    return learn

In [54]:
learn = get_learner() 

In [55]:
# # freeze bn layers
# for g in learn.layer_groups[:-1]:
#     for l in g:
#         if isinstance(l, bn_types): requires_grad(l, False)
for g in learn.layer_groups:
    for l in g:
        if isinstance(l, bn_types): l.momentum = 0.9

In [56]:
learn.loss_func = CrossEntropyFlat(reduction='sum')

In [57]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,5.437956,14.748205,0.747632


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

In [58]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,5.052551,11.831019,0.679296


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **effective batch size=32: 2bs x 16 step - AccumulateBatchNorm**

In [59]:
# seed_everything(2)
# pat = re.compile(r'/([^/]+)_\d+.jpg$')
# data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
#                                   ).normalize(imagenet_stats)

In [60]:
# def get_learner():
#     turn_on_accumulation()
#     learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
#                        callback_fns=[partial(AccumulateStep, n_step=N_STEP)])
#     learn.loss_func = CrossEntropyFlat(reduction="sum")
#     return learn

In [61]:
# learn = get_learner() 

In [62]:
# learn.model

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (rel

In [63]:
# # # freeze bn layers
# # for g in learn.layer_groups[:-1]:
# #     for l in g:
# #         if isinstance(l, bn_types): requires_grad(l, False)

# for i, g in enumerate(learn.layer_groups):
#     for k, l in enumerate(g):
#         if isinstance(l, nn.modules.batchnorm.BatchNorm1d): 
#             learn.layer_groups[i][k]=AccumulateBatchNorm(nn.modules.batchnorm.BatchNorm1d,num_features=l.num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
#         if isinstance(l, nn.modules.batchnorm.BatchNorm2d): 
#             learn.layer_groups[i][k]=AccumulateBatchNorm(nn.modules.batchnorm.BatchNorm2d,num_features=l.num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
            

In [64]:
# learn.layer_groups

[Sequential(
   (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
   (1): AccumulateBatchNorm(
     (bn): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
   )
   (2): ReLU(inplace)
   (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
   (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (5): AccumulateBatchNorm(
     (bn): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
   )
   (6): ReLU(inplace)
   (7): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (8): AccumulateBatchNorm(
     (bn): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
   )
   (9): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (10): AccumulateBatchNorm(
     (bn): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
   )
   (11): ReLU(in

In [65]:
# learn.model

Sequential(
  (0): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (rel

In [66]:
# learn.loss_func = CrossEntropyFlat(reduction='sum')

In [67]:
# learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,5.218886,2.734461,0.343031


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

In [68]:
# learn.unfreeze()
# learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,5.144168,2.727975,0.359269


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **effective batch size=32: 2bs x 16 step - Instance Normalization**

In [44]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [45]:
def get_learner():
    turn_on_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
                       callback_fns=[partial(AccumulateStep, n_step=N_STEP)])
    learn.loss_func = CrossEntropyFlat(reduction="sum")
    return learn

In [46]:
gr = 16 # GroupNorm number of groups

def BN2IN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,nn.modules.InstanceNorm2d(num_features=getattr(module,atr).num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda())


def BN2IN_model(model):
    for i in range(0):#len(model)):  # only 2d BN changed
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] =nn.modules.InstanceNorm2d(num_features=model[i][j].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] =nn.modules.InstanceNorm2d(num_features=model[i][j][k].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        BN2IN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] =nn.modules.InstanceNorm2d(num_features=model[i][j][k].downsample[l].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
                               

In [47]:
learn = get_learner() 

In [48]:
BN2IN_model(learn.model[0])

In [49]:
learn.loss_func = CrossEntropyFlat(reduction='sum')

In [50]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,5.323525,2.499947,0.279432


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

In [51]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,4.803073,2.472720,0.278078


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **effective batch size=32: 2bs x 16 step - Group Normalization**

In [52]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [53]:
def get_learner():
    turn_on_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate,
                       callback_fns=[partial(AccumulateStep, n_step=N_STEP)])
    learn.loss_func = CrossEntropyFlat(reduction="sum")
    return learn

In [54]:
gr = 16 # GroupNorm number of groups

def BN2GN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,nn.modules.GroupNorm(gr,getattr(module,atr).num_features, eps=1e-05, affine=True).cuda())


def BN2GN_model(model):
    for i in range(len(model)):
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] =nn.modules.GroupNorm(gr,model[i][j].num_features, eps=1e-05, affine=True).cuda()
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] =nn.modules.GroupNorm(gr,model[i][j][k].num_features, eps=1e-05, affine=True).cuda()
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        BN2GN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] =nn.modules.GroupNorm(gr,model[i][j][k].downsample[l].num_features, eps=1e-05, affine=True).cuda()
                               

In [55]:
learn = get_learner() 

In [57]:
BN2GN_model(learn.model)

In [58]:
learn.loss_func = CrossEntropyFlat(reduction='sum')

In [59]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,7.783026,7.265579,0.972260


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

In [60]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,7.774247,7.228300,0.975643


At batch 1
At batch 2
At batch 3
At batch 4
At batch 5
At batch 6
At batch 7
At batch 8
At batch 9
At batch 10
At batch 11
At batch 12
At batch 13
At batch 14
At batch 15
At batch 16
Stepping at batch: 16
At batch 17
At batch 18
At batch 19
At batch 20
At batch 21
At batch 22
At batch 23
At batch 24
At batch 25
At batch 26
At batch 27
At batch 28
At batch 29
At batch 30
At batch 31
At batch 32
Stepping at batch: 32
At batch 33
At batch 34
At batch 35
At batch 36
At batch 37
At batch 38
At batch 39
At batch 40
At batch 41
At batch 42
At batch 43
At batch 44
At batch 45
At batch 46
At batch 47
At batch 48
Stepping at batch: 48
At batch 49
At batch 50
At batch 51
At batch 52
At batch 53
At batch 54
At batch 55
At batch 56
At batch 57
At batch 58
At batch 59
At batch 60
At batch 61
At batch 62
At batch 63
At batch 64
Stepping at batch: 64
At batch 65
At batch 66
At batch 67
At batch 68
At batch 69
At batch 70
At batch 71
At batch 72
At batch 73
At batch 74
At batch 75
At batch 76
At batch 

### **batch size=2 , no accum. - instance Norm**

In [33]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [34]:
def get_learner():
    turn_off_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate)
    return learn

In [40]:
gr = 16 # GroupNorm number of groups

def BN2IN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,nn.modules.InstanceNorm2d(num_features=getattr(module,atr).num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda())


def BN2IN_model(model):
    for i in range(0):#len(model)):  # only 2d BN changed
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] =nn.modules.InstanceNorm2d(num_features=model[i][j].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] =nn.modules.InstanceNorm2d(num_features=model[i][j][k].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        BN2IN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] =nn.modules.InstanceNorm2d(num_features=model[i][j][k].downsample[l].num_features, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True).cuda()
                               

In [41]:
learn = get_learner() 

In [42]:
BN2IN_model(learn.model[0])

In [43]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,2.444106,0.867396,0.216509


In [None]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

### **batch size=2 , no accum. - group Norm**

In [14]:
seed_everything(2)
pat = re.compile(r'/([^/]+)_\d+.jpg$')
data = ImageDataBunch.from_name_re(path_img, fnames, pat, ds_tfms=get_transforms(), size=224, bs=BS
                                  ).normalize(imagenet_stats)

In [15]:
def get_learner():
    turn_off_accumulation()
    learn = create_cnn(data=data, arch=models.resnet34, metrics=error_rate)
    return learn

In [16]:
gr = 32# GroupNorm number of groups

def BN2GN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,nn.modules.GroupNorm(gr,getattr(module,atr).num_features).cuda())


def BN2GN_model(model):
    for i in range(len(model)):
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] =nn.modules.GroupNorm(gr,model[i][j].num_features).cuda()
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] =nn.modules.GroupNorm(gr,model[i][j][k].num_features).cuda()
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        BN2GN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] =nn.modules.GroupNorm(gr,model[i][j][k].downsample[l].num_features, eps=1e-05, affine=True).cuda()
                               

In [63]:
gr = 32# GroupNorm number of groups

def BN2GN(module):
    for i in range(5):
        atr = 'bn'+str(i)
        if hasattr(module, atr):
            setattr(module,atr,nn.modules.GroupNorm(gr,getattr(module,atr).num_features, eps=1e-05, affine=True).cuda())


def BN2GN_model(model):
    for i in range(len(model)):
        for j in range(len(model[i])):
            if isinstance(model[i][j], bn_types):
                model[i][j] =nn.modules.GroupNorm(gr,model[i][j].num_features, eps=1e-05, affine=True).cuda()
            elif model[i][j].__class__.__name__ == "Sequential":
                for k in range(len(model[i][j])):
                    if isinstance(model[i][j][k], bn_types):
                        model[i][j][k] =nn.modules.GroupNorm(gr,model[i][j][k].num_features, eps=1e-05, affine=True).cuda()
                    elif model[i][j][k].__class__.__name__ == "BasicBlock":
                        BN2GN(model[i][j][k])
                        if hasattr(model[i][j][k],'downsample'):
                            if model[i][j][k].downsample is not None:
                                for l in range(len(model[i][j][k].downsample)):
                                     if isinstance(model[i][j][k].downsample[l], bn_types):
                                        model[i][j][k].downsample[l] =nn.modules.GroupNorm(gr,model[i][j][k].downsample[l].num_features, eps=1e-05, affine=True).cuda()
                               

In [17]:
learn = get_learner() 

In [18]:
BN2GN_model(learn.model)

In [19]:
learn.fit_one_cycle(1)

epoch,train_loss,valid_loss,error_rate
1,4.006924,3.648457,0.981055


In [67]:
learn.unfreeze()
learn.fit_one_cycle(1, max_lr=slice(1e-6,1e-4))

epoch,train_loss,valid_loss,error_rate
1,3.636762,3.619229,0.972260
