In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import apex.fp16_utils as fp16

In [3]:
from fastprogress import *

In [4]:
#export
from exp.nb_10c import *

In [21]:
import torch
import torch.nn as nn
from functools import partial
import sys


In [6]:
#effNets are built with varying coefficients...use effNet_Type[0...7] to load params

#enet type / width expansion / depth expansion / dropout rate / resolution
effNet_Type = [
    [ 0, 1.0, 1.0, 0.2, 224],
    [ 1, 1.0, 1.1, 0.2, 240],
    [ 2, 1.1, 1.2, 0.3, 260],
    [ 3, 1.2, 1.4, 0.3, 300],
    [ 4, 1.4, 1.8, 0.4, 380],
    [ 5, 1.6, 2.2, 0.4, 456],
    [ 6, 1.8, 2.6, 0.5, 528],
    [ 7, 2.0, 3.1, 0.5, 600],
]



In [7]:
print(effNet_params[0])

[0, 1.0, 1.0, 0.2]


In [None]:
class Swish(nn.Module):
    def forward(self, x):
        x = x * torch.sigmoid(x)  #nn.functional.sigmoid is deprecated, use torch.sigmoid instead
        return x

In [8]:
class Flatten(nn.Module):
    def forward(self, x): 
        return x.view(x.size(0), -1)

In [14]:
def init_cnn(m):
    """init cnn with kaiming weights.  Recurses through model layer by layer"""
    if getattr(m,'bias',None) is not None:
        nn.init.constant_(m.bias,0)
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_uniform_(m.weight)
        
    for l in m.children():
        init_cnn(l)
    

In [8]:
def econv(ni, nf, ks=3, stride=1, groups=1, bias=False):
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, groups= groups, bias=bias)


def econv_layer(ni, nf, ks=3, stride=1, groups=1, zero_bn=False, act=True, eps=1e-03, momentum=0.01):
    
    bn = nn.BatchNorm2d(nf, eps=eps, momentum=momentum)
    nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    
    layers = [econv(ni, nf, ks, stride=stride, groups=groups), bn]
    
    if act: 
        layers.append(act_fn)
        
    return nn.Sequential(*layers)

In [10]:
#drop connect.  Two implementations, use second one due to fp16 training issue per Seb
# not compatible with fp16 training  

class Drop_Connect(nn.Module):
    """create a tensor mask and apply to inputs, for removing drop_ratio % of weights"""
    def __init__(self, drop_ratio=0):
        super().__init__()
        self.keep_percent = 1.0 - drop_ratio

    def forward(self, x):
        if not self.training:
            return x

        batch_size = x.size(0)
        random_tensor = self.keep_percent
        random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=x.dtype,device=x.device)   #dtype is causing issues with fp16 training
        binary_tensor = torch.floor(random_tensor)
        output = x / self.keep_percent * binary_tensor

        return output
    
    
def edrop_connect(inputs, p, training):
    """ Drop connect. """
    if not training: return inputs
    batch_size = inputs.shape[0]
    keep_prob = 1 - p
    random_tensor = keep_prob
    random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype,device=inputs.device)  # uniform [0,1)
    binary_tensor = torch.floor(random_tensor)
    output = inputs / keep_prob * binary_tensor
    return output


In [12]:
#squeeze and excite block:
class eSqueeze(nn.Module):
    def __init__(self, ni, reduce_ratio=.25):
        super().__init__()
        
        reduced_channels = max(1, int(ni * reduce_ratio))
        #print("reduced = ",reduced_channels)
        
        layers = [nn.AdaptiveAvgPool2d(1),
                      econv(ni, reduced_channels, ks=1, bias=True),  # in TF code, padding = 'same', ?? should be zero here
                      act_fn,
                      econv(reduced_channels, ni, ks=1, bias=True),
                      nn.Sigmoid()]
        
        self.layers = nn.Sequential(*layers)
        
    def forward(self,x):
        return x * self.layers(x)

In [17]:
class eMBConvBlock(nn.Module):
    def __init__(self, ni, nf, expansion=1, 
                 ks=3, stride=2, skip=True,
                 squeeze_ratio=.25, drop_connect_ratio=.2):
        super().__init__()

        nh = ni * expansion  #how much expansion from input count to middle/hidden count

        #1st layer, expansion
        if expansion !=1:
            self.expansion = econv_layer(ni, nh, ks=1, bias=False)
        else:
            self.expansion = nn.Identity()  #identity=no-op, forward(x)

        #2nd layer, depthwise conv
        self.depthwise = econv_layer(nh, nh, ks=ks, stride=stride, groups=nh, bias=False)
    

        #3rd layer
        self.sqex = eSqueeze(nh, squeeze_ratio) if squeeze_ratio >0 else nn.Identity()

        #4th layer
        self.projection = econv_layer(nh, nf, ks=1, stride=1, bias=False)
        

        self.skip = skip and (stride==1) and (ni==nf)
        if self.skip:
            self.dropconnect = partial(drop_connect(p=drop_connect_ratio, training=self.training))
        else:
            self.dropconnect=nn.Identity()

    def forward(self, inputs):
        expand = self.expansion(inputs)
        dwise = self.depthwise(expand)
        se = self.sqex(dwise)
        x = self.projection(se)
        if self.skip:
            x = x+ self.dropconnect(inputs)
        return x
    
    

    

In [18]:
def round_filters(filters, depth_multi, divisor=8, min_depth=None):
    
    """Round number of filters based on depth multiplier.
    see: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py"""
    
    orig_f = filters
    
    if not depth_multi:
        return filters

    filters = [f*depth_multi for f in filters]
    min_depth = min_depth or divisor
    new_filters = [max(min_depth, int(f + divisor / 2) // divisor * divisor) for f in filters]
    # prevent rounding by more than 10%
    new_filters = [new_filters[i] + (new_filters[i] < 0.9 * filters[i])* divisor for i in range(len(new_filters))]
    new_filters = [int(f) for f in new_filters]
    #print('round_filter input={} output={}'.format(orig_f, new_filters))
    return new_filters


def round_repeats(repeats, global_params):
    
    """Round number of filters based on depth multiplier."""
    multiplier = global_params.depth_coefficient
    if not multiplier:
        return repeats
    return int(math.ceil(multiplier * repeats))


In [28]:
class effNet(nn.Sequential):
    def __init__(self, channels, repeat, 
                 ks, stride, expand, width_multi=1.0, depth_multi=1.0, 
                 se = None, drop_connect_rate = None,dropout_rate= None, 
                 c_in=3, c_out=10):

        
        repeat = [int(math.ceil(r*depth_multi)) for r in repeat]
        channels = round_filters(channels, width_multi)
        
        stem = [econv_layer(c_in, channels[0], ks=3 ,stride=2)]

        blocks = []
        #The first block needs to take care of stride and filter size increase.

        for i in range(len(repeat)):
            blocks+= [eMBConvBlock(channels[i], channels[i+1], expand[i], ks=ks[i], 
                                   stride=stride[i], se = se, drop_connect_rate=drop_connect_rate)]
            
            blocks+= [eMBConvBlock(channels[i+1], channels[i+1], expand[i], ks=ks[i], 
                                   stride=1, se = se, drop_connect_rate=drop_connect_rate)] *(repeat[i]-1)

        dropout = nn.Dropout(p=dropout_rate) if dropout_rate else nn.Identity()

        head = [conv_layer(channels[-2], channels[-1], ks=1 ,stride=1), 
                nn.AdaptiveAvgPool2d(1), Flatten(), dropout, 
                nn.Linear(channels[-1], c_out)]


        super().__init__(*stem, *blocks, *head)
                      
        init_cnn(self)
        
        


In [27]:
me = sys.modules[__name__]
channels = [32,16,24,40,80,112,192,320,1280]  #9 count
repeat = [1,2,2,3,3,4,1]
ks = [3,3,5,3,5,5,3]
stride = [1,2,2,2,1,2,1]
exp = [1,6,6,6,6,6,6]
se = 0.25
do = 0.2
dc=0.2


# base without multipliers and dropout
setattr(me, 'effnet', partial(effNet, channels=channels, repeat=repeat, ks=ks, stride=stride, 
                                    expand=exp, se=se, drop_connect_rate=dc))

# (number, width_coefficient, depth_coefficient, dropout_rate) 
for n, wm, dm, do in [
    [ 0, 1.0, 1.0, 0.2],
    [ 1, 1.0, 1.1, 0.2],
    [ 2, 1.1, 1.2, 0.3],
    [ 3, 1.2, 1.4, 0.3],
    [ 4, 1.4, 1.8, 0.4],
    [ 5, 1.6, 2.2, 0.4],
    [ 6, 1.8, 2.6, 0.5],
    [ 7, 2.0, 3.1, 0.5],
]:
    name = f'effNetB{n}'
    setattr(me, name, partial(effnet, depth_multi=dm, width_multi=wm, dropout_rate=do))

## Imagenet(te) training

In [49]:
class LightRelu(nn.Module):
    #.46 was found to shift the mean to 0 on a random distribution test
    # maxv of 7.5 was from initial testing on MNIST.  
    #Important - cut your learning rates in half with this...
    
    def __init__(self,sub=.2,maxv=None):
        super().__init__()
        self.sub=sub
        self.maxv=maxv
    
    def forward(self,x):
        #change to lisht
        
        x = x *torch.tanh(x)
        
        if self.sub is not None:
            x.sub_(self.sub)
        if self.maxv is not None: 
            x.clamp_max_(self.maxv)
        return x

In [6]:
path = datasets.untar_data(datasets.URLs.IMAGENETTE_160)

In [7]:
size = 128
tfms = [make_rgb, RandomResizedCrop(128,scale=(0.35,1)), np_to_float, PilRandomFlip()]

bs = 24

il = ImageList.from_files(path, tfms=tfms)
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())

ll.valid.x.tfms = [make_rgb, CenterCrop(size), np_to_float]

data = ll.to_databunch(bs, c_in=3, c_out=10, num_workers=0)

## XResNet

In [8]:
#export
def noop(x): return x

class Flatten(nn.Module):
    def forward(self, x): return x.view(x.size(0), -1)

def conv(ni, nf, ks=3, stride=1, bias=False):
    return nn.Conv2d(ni, nf, kernel_size=ks, stride=stride, padding=ks//2, bias=bias)

In [9]:
def noop(x): return x

In [10]:
class Flatten(nn.Module):
    def forward(self,x): return x.view(x.size(0),-1)

In [11]:
#act_fn = LightRelu()

In [12]:
class GeneralRelu(nn.Module):
    def __init__(self, leak=.2, sub=.3, maxv=12):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x): 
        x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
        if self.sub is not None: x.sub_(self.sub)
        if self.maxv is not None: x.clamp_max_(self.maxv)
        return x

In [13]:
class FTSwish2(nn.Module):
    def __init__(self, threshold=-.25):
        super().__init__()
        self.threshold = threshold #,self.sub,self.maxv = threshold,sub,maxv

    def forward(self, x): 
        print(x)
        #if x > 0:
        x = (x*torch.sigmoid(x)) + threshold
       # else:
       #     x = threshold
            
        #if self.sub is not None: x.sub_(self.sub)
       # if self.maxv is not None: x.clamp_max_(self.maxv)
        return x

In [14]:
#act_fn = LightRelu()

In [15]:
#export
#act_fn = nn.ReLU(inplace=True)

def init_cnn(m):
    if getattr(m, 'bias', None) is not None: nn.init.constant_(m.bias, 0)
    if isinstance(m, (nn.Conv2d,nn.Linear)): nn.init.kaiming_normal_(m.weight)
    for l in m.children(): init_cnn(l)

def conv_layer(ni, nf, ks=3, stride=1, zero_bn=False, act=True):
    bn = nn.BatchNorm2d(nf)
    nn.init.constant_(bn.weight, 0. if zero_bn else 1.)
    layers = [conv(ni, nf, ks, stride=stride), bn]
    if act: layers.append(act_fn)
    return nn.Sequential(*layers)

In [16]:
#act_fn = nn.ReLU(inplace=True)

def init_cnn2(m):
    if getattr(m, 'bias',None) is not None:  nn.init.constant_(m.bias,0)
    if isinstance(m, (nn.Conv2d, nn.Linear)): nn.init.kaiming_normal_(m.weight)
    for l in m.children(): init_cnn(l)
        
def conv_layer2(ni, nf, ks=3, stride = 1, zero_bn=False, act=True):
    bn = nn.BatchNorm2d(nf)
    nn.init.constant_(bn.weight,0. if zero_bn else 1.)
    layers = [conv(ni,nf,ks,stride=stride),bn]
    if act: layers.append(act_fn)
    return nn.Sequential(*layers)

In [17]:
#export
class ResBlock(nn.Module):
    def __init__(self, expansion, ni, nh, stride=1):
        super().__init__()
        nf,ni = nh*expansion,ni*expansion
        layers  = [conv_layer(ni, nh, 1)]
        layers += [
            conv_layer(nh, nf, 3, stride=stride, zero_bn=True, act=False)
        ] if expansion==1 else [
            conv_layer(nh, nh, 3, stride=stride),
            conv_layer(nh, nf, 1, zero_bn=True, act=False)
        ]
        self.convs = nn.Sequential(*layers)
        self.idconv = noop if ni==nf else conv_layer(ni, nf, 1, act=False)
        self.pool = noop if stride==1 else nn.AvgPool2d(2)

    def forward(self, x): return act_fn(self.convs(x) + self.idconv(self.pool(x)))

In [18]:
class ResBlock2(nn.Module):
    def __init__(self, expansion, ni,nh, stride=1):
        super().__init__()
        nf, ni = nh*expansion, ni*expansion  #number of filters/fields, number of inputs
        layers = [conv_layer(ni,nh,1)]  #base layer
        layers += [
            conv_layer(nh,nh,3,stride=stride, zero_bn=True, act=False) #add new conv layer if expansion =1 else
        ] if expansion==1 else [
            conv_layer(nh,nh,3,stride=stride),
            conv_layer(nh,nf,1, zero_bn=True, act=False) #add two conv layerss
        ]
        self.convs = nn.Sequential(*layers)  #wrap it in a sequential
        self.idconv = noop if ni==nf else conv_layer(ni,nf,1,act=False)  # add id layer
        self.pool = noop if stride==1 else nn.AvgPool2d(2) # add pool layer
        
    def forward(self,x): return act_fn(self.convs(x)+ self.idconv(self.pool(x))) #wrap block in relu

In [19]:
#export
class XResNet(nn.Sequential):
    @classmethod
    def create(cls, expansion, layers, c_in=3, c_out=1000):
        nfs = [c_in, (c_in+1)*8, 64, 64]
        stem = [conv_layer(nfs[i], nfs[i+1], stride=2 if i==0 else 1)
            for i in range(3)]

        nfs = [64//expansion,64,128,256,512]
        res_layers = [cls._make_layer(expansion, nfs[i], nfs[i+1],
                                      n_blocks=l, stride=1 if i==0 else 2)
                  for i,l in enumerate(layers)]
        res = cls(
            *stem,
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            *res_layers,
            nn.AdaptiveAvgPool2d(1), Flatten(),
            nn.Linear(nfs[-1]*expansion, c_out),
        )
        init_cnn(res)
        return res

    @staticmethod
    def _make_layer(expansion, ni, nf, n_blocks, stride):
        return nn.Sequential(
            *[ResBlock(expansion, ni if i==0 else nf, nf, stride if i==0 else 1)
              for i in range(n_blocks)])

In [20]:
#export
def xresnet18 (**kwargs): return XResNet.create(1, [2, 2, 2, 2], **kwargs)
def xresnet34 (**kwargs): return XResNet.create(1, [3, 4, 6, 3], **kwargs)
def xresnet50 (**kwargs): return XResNet.create(4, [3, 4, 6, 3], **kwargs)
def xresnet101(**kwargs): return XResNet.create(4, [3, 4, 23, 3], **kwargs)
def xresnet152(**kwargs): return XResNet.create(4, [3, 8, 36, 3], **kwargs)

## Train

In [21]:
cbfs = [partial(AvgStatsCallback,accuracy), ProgressCallback, CudaCallback,
        partial(BatchTransformXCallback, norm_imagenette),
#         partial(MixUp, alpha=0.2)
       ]

In [22]:
loss_func = LabelSmoothingCrossEntropy()
arch = partial(xresnet18, c_out=10)
opt_func = adam_opt(mom=0.9, mom_sqr=0.99, eps=1e-6, wd=1e-2)

In [23]:
#export
def get_batch(dl, learn):
    learn.xb,learn.yb = next(iter(dl))
    learn.do_begin_fit(0)
    learn('begin_batch')
    learn('after_fit')
    return learn.xb,learn.yb

We need to replace the old `model_summary` since it used to take a `Runner`.

In [24]:
# export
def model_summary(model, find_all=False, print_mod=False):
    xb,yb = get_batch(data.valid_dl, learn)
    mods = find_modules(model, is_lin_layer) if find_all else model.children()
    f = lambda hook,mod,inp,out: print(f"====\n{mod}\n" if print_mod else "", out.shape)
    with Hooks(mods, f) as hooks: learn.model(xb)

In [25]:
#learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

In [26]:
#learn.model = learn.model.cuda()
#model_summary(learn.model, print_mod=False)

In [27]:
#arch = partial(xresnet50, c_out=10)

In [28]:
#learn = Learner(arch(), data, loss_func, lr=1, cb_funcs=cbfs, opt_func=opt_func)

In [29]:
#learn.fit(1, cbs=[LR_Find(), Recorder()])

In [30]:
#learn.recorder.plot(3)

In [31]:
#learn.recorder.plot_loss

In [32]:
#export
def create_phases(phases):
    phases = listify(phases)
    return phases + [1-sum(phases)]

In [33]:
print(create_phases(0.3))
print(create_phases([0.3,0.2]))

[0.3, 0.7]
[0.3, 0.2, 0.5]


In [34]:
lr = 1e-2
pct_start = 0.5
phases = create_phases(pct_start)
sched_lr  = combine_scheds(phases, cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds(phases, cos_1cycle_anneal(0.95,0.85, 0.95))

In [35]:
print(1e-15)

1e-15


In [36]:
cbsched = [
    ParamScheduler('lr', sched_lr),
    ParamScheduler('mom', sched_mom)]

In [37]:
#learn = Learner(arch(), data, loss_func, lr=lr, cb_funcs=cbfs, opt_func=opt_func)

In [38]:
#learn.fit(1, cbs=cbsched)

## cnn_learner

In [39]:
#export
def cnn_learner(arch, data, loss_func, opt_func, c_in=None, c_out=None,
                lr=1e-1, cuda=True, norm=None, progress=True, mixup=0, xtra_cb=None, **kwargs):
    cbfs = [partial(AvgStatsCallback,accuracy)]+listify(xtra_cb)
    if progress: cbfs.append(ProgressCallback)
    if cuda:     cbfs.append(CudaCallback)
    if norm:     cbfs.append(partial(BatchTransformXCallback, norm))
    if mixup:    cbfs.append(partial(MixUp, mixup))
    arch_args = {}
    if not c_in : c_in  = data.c_in
    if not c_out: c_out = data.c_out
    if c_in:  arch_args['c_in' ]=c_in
    if c_out: arch_args['c_out']=c_out
    return Learner(arch(**arch_args), data, loss_func, opt_func=opt_func, lr=lr, cb_funcs=cbfs, **kwargs)

In [40]:
#import torch.nn.functional as F  (uncomment if needed)

class FTSwish(nn.Module):
    def __init__(self, threshold=-.25, mean_shift=-.1):
        super().__init__()
        self.threshold = threshold
        self.mean_shift = mean_shift
        #warning - does not handle multi-gpu case below
        #self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 


    def forward(self, x): 
        
        x = F.relu(x) * torch.sigmoid(x) + self.threshold
        
        #note on above: ("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
        
        
        #FTSwish+ for positive values
        #pos_value = (x*torch.sigmoid(x)) + self.threshold
        
        #FTSwish+ for negative values
        #tval = torch.tensor([self.threshold],device=self.device)
        
        #apply to x tensor based on positive or negative value
        #x = torch.where(x>=0, pos_value, tval)
        
        
        #apply mean shift to drive mean to 0. -.1 was tested as optimal for kaiming init
        if self.mean_shift is not None:
            x.sub_(self.mean_shift)

        return x

In [41]:
#import torch.nn.functional as F  (uncomment if needed,but you likely already have it)

class FTSwishPlus(nn.Module):
    def __init__(self, threshold=-.25, mean_shift=-.1):
        super().__init__()
        self.threshold = threshold
        self.mean_shift = mean_shift

    def forward(self, x): 
        
        x = F.relu(x) * torch.sigmoid(x) + self.threshold        
        #note on above - why not F.sigmoid?: 
        #PyTorch docs - ("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
        
        #apply mean shift to drive mean to 0. -.1 was tested as optimal for kaiming init
        if self.mean_shift is not None:
            x.sub_(self.mean_shift)

        return x

In [42]:
class TRelu(nn.Module):
    def __init__(self, threshold= - .25, mean_shift=-.03):
        super().__init__()
        self.threshold = threshold
        self.mean_shift = mean_shift
    
    def forward(self,x):
        x = F.relu(x)+self.threshold
        
        if self.mean_shift is not None:
            x.sub_(self.mean_shift)
            
        return x
    

In [43]:
act_fn = ReluT()

In [44]:
#print(FTSwish.forward(5))

In [45]:
learn = cnn_learner(xresnet34, data, loss_func, opt_func, norm=norm_imagenette)

In [46]:
learn.fit(12, cbsched) 

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.708927,0.47526,1.455793,0.612,11:57
1,1.380989,0.635567,1.319252,0.676,09:32
2,1.329379,0.66054,1.855722,0.558,09:59
3,1.308054,0.666822,1.320109,0.674,10:26
4,1.275966,0.68497,1.510024,0.626,09:44
5,1.232606,0.706685,1.111155,0.756,10:33
6,1.167539,0.733209,1.037894,0.79,11:03
7,1.093275,0.758803,1.066288,0.788,12:21
8,1.015583,0.792772,0.855244,0.866,09:55
9,0.935219,0.826198,0.846282,0.87,17:09


In [47]:
learn.fit(5,cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.856373,0.861176,0.874657,0.86,13:55
1,0.9507,0.820149,0.956492,0.824,16:06
2,1.022369,0.792229,1.068875,0.768,15:00
3,0.942604,0.827749,0.840393,0.876,07:13
4,0.836331,0.87374,0.750392,0.904,05:29


## Imagenet

You can see all this put together in the fastai [imagenet training script](https://github.com/fastai/fastai/blob/master/examples/train_imagenet.py). It's the same as what we've seen so far, except it also handles multi-GPU training. So how well does this work?

We trained for 60 epochs, and got an error of 5.9%, compared to the official PyTorch resnet which gets 7.5% error in 90 epochs! Our xresnet 50 training even surpasses standard resnet 152, which trains for 50% more epochs and has 3x as many layers.

## Export

In [48]:
##!./notebook2script.py 11_train_imagenette.ipynb