In [32]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
#export
from fastprogress import *
from exp.nb_10c import *

## Imagenet(te) training

In [56]:
class dp_sched(Callback):
    
    def __init__(self, layer, dropout_final=.35, batch_size=202, num_epochs=5):
        self.layer = layer
        self.dp_final = dropout_final
        self.batch_size=0  #len(self.data.train_dl) causes recursive err(?)
        self.total_iterations = 0 
        self.warmup_sets = 0
        self.full_dropout_sets =0
        self.middle_sets =0
        self.current_epoch=0
        self.num_epochs= num_epochs
        self.mycount=0
        
        #print("batch_size ",self.iterations)
        
    
        
    def begin_fit(self, **kwargs):
        print("begin_fit")
        
        self.mycount=0
        self.batch_size = len(self.data.train_dl)
        print(self.batch_size," batch size****")
        self.n_epochs = max(1,self.num_epochs)   # min(1,kwargs['n_epochs']) #avoid 0
        self.total_iterations = (self.batch_size * self.n_epochs)
        #main calculations for when to apply dropout %
        self.warmup_sets = int(self.total_iterations * .1)
        self.full_dropout_sets = (self.warmup_sets *2)
        self.middle_sets = self.total_iterations - (self.warmup_sets + self.full_dropout_sets)
        print("breakout of sets: warmup ", self.warmup_sets," middle ",self.middle_sets," final ",self.full_dropout_sets)
        self.start_full_dropout = self.warmup_sets + self.middle_sets
        
    def begin_epoch(self):
        print("begin epoch - dp sched")
        self.current_epoch +=1
        print(self.current_epoch, " current epoch - dp sched")
    
    def begin_batch(self):
        #hardcoded batch 
        self.mycount+=1
        #print("iteration = ",self.iter)
        if self.mycount < self.warmup_sets:
            self.layer.p =0
        elif self.mycount > self.start_full_dropout:
            self.layer.p = self.dp_final
        else:
            i = self.mycount - self.warmup_sets
            print(i," i val")
            pct = round(i / self.middle_sets,2)
            print(pct," pct%")
            dp_pct = 1-  round(1 * (1/self.middle_sets)**pct,2)
            print(dp_pct," drop pct")
            new_drop = round(dp_pct * self.dp_final,2)
            print("iter ", self.iter,"mycount ",self.mycount, " dp_pct ", new_drop)
            self.layer.p = new_drop
            
        #new_dp = self.curve(1, self.total, self.iter)
        #self.layer.p = new_dp
        #self.total_iterations+=1
        #print("total iter ",self.total_iterations)
        
    def after_batch(self):
        pass#print("after batch")


In [35]:
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)

In [36]:
size = 128
tfms = [make_rgb, RandomResizedCrop(size, scale=(0.35,1)), np_to_float, PilRandomFlip()]

bs = 20

il = ImageList.from_files(path, tfms=tfms)
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())

ll.valid.x.tfms = [make_rgb, CenterCrop(size), np_to_float]

data = ll.to_databunch(bs, c_in=3, c_out=10, num_workers=0)

## XResNet

In [37]:
#export
def noop(x): return x

class Flatten(nn.Module):
    def forward(self, x): return x.view(x.size(0), -1)

def conv(ni, nf, ks=3, stride=1, bias=False):
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=bias)

In [38]:
#export
act_fn = nn.ReLU(inplace=True)

def init_cnn(m):
    if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0)
    if isinstance(m, (nn.Conv2d,nn.Linear)): nn.init.kaiming_normal_(m.weight)
    for l in m.children(): init_cnn(l)

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    bn = nn.BatchNorm2d(nf)
    nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    layers = [conv(ni, nf, ks, stride=stride), bn]
    if act: layers.append(act_fn)
    return nn.Sequential(*layers)

In [39]:
#export
class ResBlock(nn.Module):
    def __init__(self, expansion, ni, nh, stride=1):
        super().__init__()
        nf,ni = nh*expansion,ni*expansion
        layers  = [conv_layer(ni, nh, 3, stride=stride),
                   conv_layer(nh, nf, 3, zero_bn=True, act=False)
        ] if expansion == 1 else [
                   conv_layer(ni, nh, 1),
                   conv_layer(nh, nh, 3, stride=stride),
                   conv_layer(nh, nf, 1, zero_bn=True, act=False)
        ]
        self.convs = nn.Sequential(*layers)
        self.idconv = noop if ni==nf else conv_layer(ni, nf, 1, act=False)
        self.pool = noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x): return act_fn(self.convs(x) + self.idconv(self.pool(x)))

In [40]:
#export
class XResNet(nn.Sequential):
    @classmethod
    def create(cls, expansion, layers, c_in=3, c_out=1000):
        nfs = [c_in, (c_in+1)*8, 64, 64]
        stem = [conv_layer(nfs[i], nfs[i+1], stride=2 if i==0 else 1)
            for i in range(3)]

        nfs = [64//expansion,64,128,256,512]
        res_layers = [cls._make_layer(expansion, nfs[i], nfs[i+1],
                                      n_blocks=l, stride=1 if i==0 else 2)
                  for i,l in enumerate(layers)]
        res = cls(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *res_layers,
            nn.Dropout(),
            nn.AdaptiveAvgPool2d(1), Flatten(),
            
            nn.Linear(nfs[-1]*expansion, c_out),
        )
        init_cnn(res)
        return res

    @staticmethod
    def _make_layer(expansion, ni, nf, n_blocks, stride):
        return nn.Sequential(
            *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1)
              for i in range(n_blocks)])

In [42]:
#export
def xresnet18 (**kwargs): return XResNet.create(1, [2, 2,  2, 2], **kwargs)
def xresnet34 (**kwargs): return XResNet.create(1, [3, 4,  6, 3], **kwargs)
def xresnet50 (**kwargs): return XResNet.create(4, [3, 4,  6, 3], **kwargs)
def xresnet101(**kwargs): return XResNet.create(4, [3, 4, 23, 3], **kwargs)
def xresnet152(**kwargs): return XResNet.create(4, [3, 8, 36, 3], **kwargs)

In [21]:
class sdrop(Callback):
    def __init__(self):
        self.learn = 0
        print("dp init")
        
    def begin_epoch(self, **kwargs):
        print ("dp begin epoch")
    def end_epoch(self, **kwargs):
        print("dp end epoch")

## Train

In [58]:
cbfs = [partial(AvgStatsCallback,accuracy), ProgressCallback, CudaCallback,
        partial(BatchTransformXCallback, norm_imagenette),
        partial(dp_sched, layer = learn.model[8])
#         partial(MixUp, alpha=0.2)
       ]

In [59]:
loss_func = LabelSmoothingCrossEntropy()
arch = partial(xresnet18, c_out=10)
opt_func = adam_opt(mom=0.9, mom_sqr=0.99, eps=1e-6, wd=1e-2)

In [57]:
learn.model

XResNet(
  (0): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (1): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (2): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): ResBlock(
      (convs): Sequential(
        (0): Sequential(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affin

In [50]:
#export
def get_batch(dl, learn):
    learn.xb,learn.yb = next(iter(dl))
    learn.do_begin_fit(0)
    learn('begin_batch')
    learn('after_fit')
    return learn.xb,learn.yb

We need to replace the old `model_summary` since it used to take a `Runner`.

In [51]:
# export
def model_summary(model, data, find_all=False, print_mod=False):
    xb,yb = get_batch(data.valid_dl, learn)
    mods = find_modules(model, is_lin_layer) if find_all else model.children()
    f = lambda hook,mod,inp,out: print(f"====\n{mod}\n" if print_mod else "", out.shape)
    with Hooks(mods, f) as hooks: learn.model(xb)

In [25]:
learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

dp init


In [65]:
learn.model

XResNet(
  (0): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (1): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (2): Sequential(
    (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace)
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): ResBlock(
      (convs): Sequential(
        (0): Sequential(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affin

In [None]:
#learn.model = learn.model.cuda()
#model_summary(learn.model, data, print_mod=False)

print(learn.model[8].p)

In [52]:
print(len(data.train_dl))

645


In [None]:
class dpsched(Callback):
    __init__()

In [62]:
arch = partial(xresnet34, c_out=10)

In [63]:
learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

In [66]:
learn.fit(1, cbs=[LR_Find(), Recorder()])

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


begin_fit
645  batch size****
breakout of sets: warmup  64  middle  453  final  128
begin epoch - dp sched
3  current epoch - dp sched
begin epoch - dp sched
4  current epoch - dp sched
0  i val
0.0  pct%
0.0  drop pct
iter  64  dp_pct  0.0
1  i val
0.0  pct%
0.0  drop pct
iter  65  dp_pct  0.0
2  i val
0.0  pct%
0.0  drop pct
iter  66  dp_pct  0.0
3  i val
0.01  pct%
0.06000000000000005  drop pct
iter  67  dp_pct  0.03
4  i val
0.01  pct%
0.06000000000000005  drop pct
iter  68  dp_pct  0.03
5  i val
0.01  pct%
0.06000000000000005  drop pct
iter  69  dp_pct  0.03
6  i val
0.01  pct%
0.06000000000000005  drop pct
iter  70  dp_pct  0.03
7  i val
0.02  pct%
0.12  drop pct
iter  71  dp_pct  0.06
8  i val
0.02  pct%
0.12  drop pct
iter  72  dp_pct  0.06
9  i val
0.02  pct%
0.12  drop pct
iter  73  dp_pct  0.06
10  i val
0.02  pct%
0.12  drop pct
iter  74  dp_pct  0.06
11  i val
0.02  pct%
0.12  drop pct
iter  75  dp_pct  0.06
12  i val
0.03  pct%
0.17000000000000004  drop pct
iter  76  dp_p

In [None]:
learn.recorder.plot(3)

In [27]:
#export
def create_phases(phases):
    phases = listify(phases)
    return phases + [1-sum(phases)]

In [28]:
print(create_phases(0.3))
print(create_phases([0.3,0.2]))

[0.3, 0.7]
[0.3, 0.2, 0.5]


In [29]:
lr = 1e-2
pct_start = 0.5
phases = create_phases(pct_start)
sched_lr  = combine_scheds(phases, cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds(phases, cos_1cycle_anneal(0.95, 0.85, 0.95))

In [30]:
cbsched = [
    ParamScheduler('lr', sched_lr),
    ParamScheduler('mom', sched_mom)]

In [None]:
learn = Learner(arch(), data, loss_func, lr=lr, cb_funcs=cbfs, opt_func=opt_func)

In [31]:
learn.fit(5, cbs=cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.765889,0.462075,2.044573,0.428,03:33


dp begin epoch
dp begin epoch
dp begin epoch
dp begin epoch


KeyboardInterrupt: 

## cnn_learner

In [None]:
#export
def cnn_learner(arch, data, loss_func, opt_func, c_in=None, c_out=None,
                lr=1e-2, cuda=True, norm=None, progress=True, mixup=0, xtra_cb=None, **kwargs):
    cbfs = [partial(AvgStatsCallback,accuracy)]+listify(xtra_cb)
    if progress: cbfs.append(ProgressCallback)
    if cuda:     cbfs.append(CudaCallback)
    if norm:     cbfs.append(partial(BatchTransformXCallback, norm))
    if mixup:    cbfs.append(partial(MixUp, mixup))
    arch_args = {}
    if not c_in : c_in  = data.c_in
    if not c_out: c_out = data.c_out
    if c_in:  arch_args['c_in' ]=c_in
    if c_out: arch_args['c_out']=c_out
    return Learner(arch(**arch_args), data, loss_func, opt_func=opt_func, lr=lr, cb_funcs=cbfs, **kwargs)

In [None]:
learn = cnn_learner(xresnet34, data, loss_func, opt_func, norm=norm_imagenette)

In [None]:
learn.fit(5, cbsched)

## Imagenet

You can see all this put together in the fastai [imagenet training script](https://github.com/fastai/fastai/blob/master/examples/train_imagenet.py). It's the same as what we've seen so far, except it also handles multi-GPU training. So how well does this work?

We trained for 60 epochs, and got an error of 5.9%, compared to the official PyTorch resnet which gets 7.5% error in 90 epochs! Our xresnet 50 training even surpasses standard resnet 152, which trains for 50% more epochs and has 3x as many layers.

## Export

In [None]:
!./notebook2script.py 11_train_imagenette.ipynb