In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from exp.nb_11 import *

## Serializing the model

## Train ImageWoof model (10 dog breeds) from scratch

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=2920)

In [3]:
path = datasets.untar_data(datasets.URLs.IMAGEWOOF_160)

In [4]:
# Crop size and batch size; `bs`, `tfms` and `val_tfms` are reused later for the Pets data.
size = 128
bs = 64

# Training transforms include the random crop and random flip.
tfms = [make_rgb, RandomResizedCrop(size, scale=(0.35,1)), np_to_float, PilRandomFlip()]
# Validation transforms use a center crop and no random transforms.
val_tfms = [make_rgb, CenterCrop(size), np_to_float]
il = ImageList.from_files(path, tfms=tfms)
# presumably splits on the grandparent folder name, with 'val' marking validation items -- TODO confirm
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='val'))
# presumably labels each item by its parent folder name -- TODO confirm against `parent_labeler`
ll = label_by_func(sd, parent_labeler, proc_y=CategoryProcessor())
# The list was built with the training `tfms`; swap in `val_tfms` for the validation items.
ll.valid.x.tfms = val_tfms
# c_out=10: ImageWoof has 10 dog breeds.
data = ll.to_databunch(bs, c_in=3, c_out=10, num_workers=8)

In [5]:
len(il)

12954

In [6]:
loss_func = LabelSmoothingCrossEntropy()
opt_func = adam_opt(mom=0.9, mom_sqr=0.99, eps=1e-6, wd=1e-2)

In [7]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func, norm=norm_imagenette)

In [8]:
def sched_1cycle(lr, pct_start=0.3, mom_start=0.95, mom_mid=0.85, mom_end=0.95):
    """Build the two `ParamScheduler` callbacks of a one-cycle policy for a single `lr`.

    The learning-rate schedule is built from `cos_1cycle_anneal(lr/10, lr, lr/1e5)` and the
    momentum schedule from `cos_1cycle_anneal(mom_start, mom_mid, mom_end)`; both are
    combined over the phases given by `create_phases(pct_start)`.

    Returns `[ParamScheduler('lr', ...), ParamScheduler('mom', ...)]`.
    NB: a later cell in this notebook redefines `sched_1cycle` to take a list of lrs.
    """
    phases = create_phases(pct_start)
    lr_anneal = cos_1cycle_anneal(lr/10., lr, lr/1e5)
    lr_sched = combine_scheds(phases, lr_anneal)
    mom_anneal = cos_1cycle_anneal(mom_start, mom_mid, mom_end)
    mom_sched = combine_scheds(phases, mom_anneal)
    # dict preserves insertion order, so 'lr' still comes before 'mom'
    schedules = {'lr': lr_sched, 'mom': mom_sched}
    return [ParamScheduler(pname, sched) for pname, sched in schedules.items()]

In [9]:
lr = 3e-3
pct_start = 0.5
cbsched = sched_1cycle(lr, pct_start)

In [None]:
# note: on Jeremy's 1080ti, each epoch takes 10 sec, compared to 90 sec on my RTX-2070 on Windows 10
#      got 84% accuracy
learn.fit(40, cbsched)

### save the model

In [None]:
st = learn.model.state_dict()

In [None]:
type(st)

In [None]:
', '.join(st.keys())

In [None]:
st['10.bias']

In [31]:
mdl_path = path/'models'
mdl_path.mkdir(exist_ok=True)

It's also possible to save the whole model, including the architecture, but it gets quite fiddly and we don't recommend it. Instead, just save the parameters, and recreate the model directly.

In [None]:
torch.save(st, mdl_path/'iw5')

## Train a Pets model from scratch (37 dog and cat breeds)

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=3127)

In [10]:
pets = datasets.untar_data(datasets.URLs.PETS)

In [11]:
pets.ls()

[WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/annotations'),
 WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images')]

In [12]:
pets_path = pets/'images'

In [13]:
il = ImageList.from_files(pets_path, tfms=tfms)

In [14]:
il

ImageList (7390 items)
[WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_1.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_10.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_100.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_101.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_102.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_103.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_104.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_105.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_106.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_107.jpg')...]
Path: C:\Users\cross-entropy\.fastai

In [15]:
#export
def random_splitter(fn, p_valid):
    "Return True with probability `p_valid` (one draw from `random.random()`); `fn` is ignored."
    draw = random.random()
    return draw < p_valid

In [16]:
random.seed(42)

In [17]:
sd = SplitData.split_by_func(il, partial(random_splitter, p_valid=0.1))

In [18]:
sd

SplitData
Train: ImageList (6667 items)
[WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_1.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_100.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_101.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_102.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_103.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_104.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_106.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_108.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_109.jpg'), WindowsPath('C:/Users/cross-entropy/.fastai/data/oxford-iiit-pet/images/Abyssinian_110.jpg')...]
Path: C:\Users\cro

In [19]:
n = il.items[0].name; n

'Abyssinian_1.jpg'

In [20]:
# the dot before `jpg` is escaped so it only matches a literal '.'
re.findall(r'^(.*)_\d+\.jpg$', n)[0]

'Abyssinian'

In [21]:
def pet_labeler(fn):
    """Return the breed label encoded in a Pets file name.

    `fn` is a path-like object with a `.name` such as 'Abyssinian_1.jpg'; the label is
    everything before the final '_<digits>.jpg' ('Abyssinian' here).
    Raises `IndexError` if the name does not follow that pattern.
    """
    # `\.` matches a literal dot; an unescaped `.` would also accept e.g. 'Abyssinian_1xjpg'
    return re.findall(r'^(.*)_\d+\.jpg$', fn.name)[0]

In [22]:
proc = CategoryProcessor()

In [23]:
ll = label_by_func(sd, pet_labeler, proc_y=proc)

In [24]:
', '.join(proc.vocab)

'Abyssinian, american_bulldog, american_pit_bull_terrier, basset_hound, beagle, Bengal, Birman, Bombay, boxer, British_Shorthair, chihuahua, Egyptian_Mau, english_cocker_spaniel, english_setter, german_shorthaired, great_pyrenees, havanese, japanese_chin, keeshond, leonberger, Maine_Coon, miniature_pinscher, newfoundland, Persian, pomeranian, pug, Ragdoll, Russian_Blue, saint_bernard, samoyed, scottish_terrier, shiba_inu, Siamese, Sphynx, staffordshire_bull_terrier, wheaten_terrier, yorkshire_terrier'

In [25]:
ll.valid.x.tfms = val_tfms

In [26]:
c_out = len(proc.vocab)

In [27]:
data = ll.to_databunch(bs, c_in=3, c_out=c_out, num_workers=8)

In [28]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func, norm=norm_imagenette)

In [None]:
# 40% accuracy -- not a great result
learn.fit(5, cbsched)

## Apply transfer learning from ImageWoof model to Pets model

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=3265)

In [29]:
# make a learner for pets data bunch, with 10 outputs like the imagewoof model 
learn = cnn_learner(xresnet18, data, loss_func, opt_func, c_out=10, norm=norm_imagenette)

In [32]:
# load the pretrained imagewoof model
st = torch.load(mdl_path/'iw5')

In [33]:
m = learn.model

In [34]:
m.load_state_dict(st)

<All keys matched successfully>

In [35]:
# remove the final layer, and create a new model from the body
# everything before the AdaptiveAvgPool2d layer
cut = next(i for i,o in enumerate(m.children()) if isinstance(o,nn.AdaptiveAvgPool2d))
m_cut = m[:cut]

###  determine how many outputs from the body

In [36]:
xb,yb = get_batch(data.valid_dl, learn)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


In [37]:
pred = m_cut(xb)

In [38]:
pred.shape

torch.Size([128, 512, 4, 4])

In [39]:
ni = pred.shape[1]

### create the penultimate pooling layers

In [40]:
#export
class AdaptiveConcatPool2d(nn.Module):
    "Concatenate adaptive max pooling and adaptive average pooling (in that order) along dim 1."
    def __init__(self, sz=1):
        super().__init__()
        self.output_size = sz
        self.ap = nn.AdaptiveAvgPool2d(sz)
        self.mp = nn.AdaptiveMaxPool2d(sz)

    def forward(self, x):
        max_pooled = self.mp(x)
        avg_pooled = self.ap(x)
        # channel count doubles: max-pooled features first, then average-pooled
        return torch.cat([max_pooled, avg_pooled], dim=1)

### create the new transfer learning model by combining the body with the new head

In [41]:
# nh = 40 ????? nh does not seem to be used anywhere in this code ?????

m_new = nn.Sequential(
    m_cut, AdaptiveConcatPool2d(), Flatten(),
    nn.Linear(ni*2, data.c_out))

In [42]:
learn.model = m_new

In [43]:
# transfer learning result #1
# sometimes get CUDA out of memory error
#      try reducing batch size to 32, or using mixed precision
#      got 73.4% accuracy
learn.fit(5, cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.90001,0.283936,2.179667,0.48686,01:21
1,1.969682,0.545523,2.208702,0.473029,01:21
2,1.726557,0.634918,1.854407,0.593361,01:22
3,1.48531,0.721914,1.536633,0.706777,01:22
4,1.303791,0.79586,1.471868,0.73444,01:24


## method to take a Learner, and adapt it to another model

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=3483)

In [44]:
def adapt_model(learn, data):
    """Replace the head of `learn.model` so that it outputs `data.c_out` activations.

    The model is cut at its first top-level `nn.AdaptiveAvgPool2d`; the layers before the
    cut are kept as the body. One batch from `data.valid_dl` is pushed through the body to
    read its channel count, then a new head (`AdaptiveConcatPool2d` -> `Flatten` ->
    `nn.Linear`) is attached. `learn.model` is replaced in place; nothing is returned.

    Raises `ValueError` if the model has no top-level `nn.AdaptiveAvgPool2d` child.
    """
    cut = next((i for i,o in enumerate(learn.model.children())
                if isinstance(o,nn.AdaptiveAvgPool2d)), None)
    if cut is None:
        raise ValueError("adapt_model: model has no nn.AdaptiveAvgPool2d layer to cut at")
    m_cut = learn.model[:cut]
    xb,_ = get_batch(data.valid_dl, learn)
    # Shape probe only: no_grad avoids building an autograd graph for this forward pass
    with torch.no_grad(): pred = m_cut(xb)
    ni = pred.shape[1]
    # The concat pool stacks max- and avg-pooled features, so the linear layer sees ni*2 inputs
    m_new = nn.Sequential(
        m_cut, AdaptiveConcatPool2d(), Flatten(),
        nn.Linear(ni*2, data.c_out))
    learn.model = m_new

In [45]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func, c_out=10, norm=norm_imagenette)
learn.model.load_state_dict(torch.load(mdl_path/'iw5'))

<All keys matched successfully>

In [46]:
adapt_model(learn, data)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


In [47]:
# freeze body parameters, i.e. except the head
for p in learn.model[0].parameters(): p.requires_grad_(False)

In [48]:
# train just the head
learn.fit(3, sched_1cycle(1e-2, 0.5))

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.774446,0.308085,2.573833,0.373444,01:13
1,2.428881,0.417729,2.267052,0.459198,01:16
2,2.047926,0.530523,2.087088,0.506224,01:15


In [49]:
# unfreeze body parameters
for p in learn.model[0].parameters(): p.requires_grad_(True)

In [50]:
# transfer learning result #2, slightly worse than result #1
# train some more 
#      what does reset_opt do, and why do we need to set it to True?
#      I think reset_opt = True zeros all the gradients
learn.fit(5, cbsched, reset_opt=True)
# got 72.5%

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.842971,0.59487,1.884222,0.590595,01:21
1,1.727079,0.637618,1.871323,0.586445,01:23
2,1.634446,0.672566,1.764705,0.625173,01:25
3,1.453012,0.747113,1.588173,0.680498,01:22
4,1.260233,0.820609,1.501467,0.724758,01:21


## What happened here?
Most likely this is due to batchnorm, because "batchnorm makes everything weird".
The result is better than training from scratch, but worse than naive fine-tuning, where we didn't freeze/unfreeze.
The frozen part of the model was trained on ImageWoof, so its batchnorm layers are tuned to the means and standard deviations of the ImageWoof activations, but the Pets dataset produces different means and standard deviations inside the model, i.e. a different set of batchnorm statistics. It would be interesting to see what's really going on using histograms, as we did earlier in the course.
The good news is that it's easily fixed.

## Batch norm transfer
The trick is: don't freeze all of the body parameters, but freeze only the body parameters that are not in the batchnorm layers

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=3567)

In [51]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func, c_out=10, norm=norm_imagenette)
learn.model.load_state_dict(torch.load(mdl_path/'iw5'))
adapt_model(learn, data)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


In [53]:
def apply_mod(m, f):
    "Call `f` on `m`, then recursively on each of its children (parent first, depth-first)."
    f(m)
    for child in m.children():
        apply_mod(child, f)

def set_grad(m, b):
    """Set `requires_grad` to `b` on the parameters of `m`, unless `m` is a layer to skip.

    `nn.Linear` and `nn.BatchNorm2d` modules are left untouched, and so is any module
    without a `weight` attribute (e.g. a container).
    """
    skip = isinstance(m, (nn.Linear,nn.BatchNorm2d))
    if not skip and hasattr(m, 'weight'):
        for param in m.parameters():
            param.requires_grad_(b)

In [54]:
# freeze body parameters that are not in linear or batchnorm layers
apply_mod(learn.model, partial(set_grad, b=False))

In [55]:
learn.fit(3, sched_1cycle(1e-2, 0.5))

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.644171,0.335383,2.253329,0.460581,01:16
1,2.065932,0.513124,2.003434,0.567082,01:17
2,1.821127,0.60582,1.806843,0.607192,01:19


In [56]:
# unfreeze body parameters that were just frozen
apply_mod(learn.model, partial(set_grad, b=True))

In [57]:
# transfer learning result #3
# got 73.7%, not a whole lot better than result #1
learn.fit(5, cbsched, reset_opt=True)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.697546,0.654867,1.794863,0.625173,01:18
1,1.6531,0.664917,1.944319,0.572614,01:18
2,1.605481,0.682316,1.769455,0.615491,01:18
3,1.439478,0.750262,1.538387,0.716459,01:18
4,1.270474,0.819109,1.47768,0.737206,01:20


PyTorch's `nn.Module` already has an `apply` method that accomplishes the same task as our own `apply_mod`. The cell below is equivalent to:

`apply_mod(learn.model, partial(set_grad, b=False))`

In [None]:
learn.model.apply(partial(set_grad, b=False));

## Discriminative LR and param groups
could set learning rate to zero for some layers

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=3799)

In [58]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func, c_out=10, norm=norm_imagenette)

In [59]:
learn.model.load_state_dict(torch.load(mdl_path/'iw5'))
adapt_model(learn, data)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


In [60]:
# split the model's parameters into two groups:
#   group 1: parameters of the non-batchnorm layers of the body (m[0])
#   group 2: batchnorm parameters of the body, plus everything after the body (m[1:])
def bn_splitter(m):
    "Return `(g1, g2)` parameter lists for `m`: body non-batchnorm params, then batchnorm + `m[1:]` params."
    body_params, bn_and_rest = [], []

    def _collect(layer):
        # batchnorm layers go to the second group; any other layer with a `weight` to the first
        if isinstance(layer, nn.BatchNorm2d):
            bn_and_rest.extend(layer.parameters())
        elif hasattr(layer, 'weight'):
            body_params.extend(layer.parameters())
        for child in layer.children():
            _collect(child)

    _collect(m[0])
    bn_and_rest.extend(m[1:].parameters())
    return body_params, bn_and_rest

In [61]:
a,b = bn_splitter(learn.model)

In [62]:
# compare against the model that was actually split; `m` refers to an older learner's model
test_eq(len(a)+len(b), len(list(learn.model.parameters())))

In [63]:
print(len(a),len(b))

22 46


In [120]:
{o:o for o in Learner.ALL_CBS}

{'begin_validate': 'begin_validate',
 'after_epoch': 'after_epoch',
 'after_cancel_batch': 'after_cancel_batch',
 'after_cancel_train': 'after_cancel_train',
 'after_fit': 'after_fit',
 'after_batch': 'after_batch',
 'after_pred': 'after_pred',
 'begin_fit': 'begin_fit',
 'begin_epoch': 'begin_epoch',
 'after_loss': 'after_loss',
 'after_backward': 'after_backward',
 'begin_batch': 'begin_batch',
 'after_step': 'after_step',
 'after_cancel_epoch': 'after_cancel_epoch'}

In [65]:
#export
from types import SimpleNamespace
cb_types = SimpleNamespace(**{o:o for o in Learner.ALL_CBS})

In [66]:
cb_types

namespace(after_backward='after_backward', after_batch='after_batch', after_cancel_batch='after_cancel_batch', after_cancel_epoch='after_cancel_epoch', after_cancel_train='after_cancel_train', after_epoch='after_epoch', after_fit='after_fit', after_loss='after_loss', after_pred='after_pred', after_step='after_step', begin_batch='begin_batch', begin_epoch='begin_epoch', begin_fit='begin_fit', begin_validate='begin_validate')

In [74]:
cb_types.after_backward

'after_backward'

In [94]:
#export
class DebugCallback(Callback):
    "When event `cb_name` fires, call `f(self.run)` if `f` was given, otherwise call `set_trace()`."
    _order = 999
    def __init__(self, cb_name, f=None):
        self.cb_name = cb_name
        self.f = f

    def __call__(self, cb_name):
        if cb_name != self.cb_name: return
        if self.f:
            self.f(self.run)
        else:
            # NOTE(review): `set_trace` is not imported in this notebook; presumably it comes in
            # through `from exp.nb_11 import *` -- confirm
            set_trace()

In [96]:
#export
# create the schedules we will use
def sched_1cycle(lrs, pct_start=0.3, mom_start=0.95, mom_mid=0.85, mom_end=0.95):
    """Build the two `ParamScheduler` callbacks of a one-cycle policy.

    `lrs` is an iterable of learning rates; each gets its own schedule built from
    `cos_1cycle_anneal(lr/10, lr, lr/1e5)`. A single number is also accepted and produces
    one schedule (not wrapped in a list), exactly like the single-lr `sched_1cycle` defined
    earlier in this notebook, so calls such as `sched_1cycle(1e-2, 0.5)` keep working after
    this definition shadows it. One momentum schedule is built from
    `cos_1cycle_anneal(mom_start, mom_mid, mom_end)`.

    Returns `[ParamScheduler('lr', ...), ParamScheduler('mom', ...)]`.
    """
    phases = create_phases(pct_start)
    def _lr_sched(lr): return combine_scheds(phases, cos_1cycle_anneal(lr/10., lr, lr/1e5))
    # A non-iterable `lrs` (plain number) raises TypeError in `list()`; treat it as one lr
    try: lr_list = list(lrs)
    except TypeError: sched_lr = _lr_sched(lrs)
    else: sched_lr = [_lr_sched(lr) for lr in lr_list]
    sched_mom = combine_scheds(phases, cos_1cycle_anneal(mom_start, mom_mid, mom_end))
    return [ParamScheduler('lr', sched_lr),
            ParamScheduler('mom', sched_mom)]

In [97]:
disc_lr_sched = sched_1cycle([0,3e-2], 0.5) # (lrs, pct_start)???

In [105]:
disc_lr_sched

[<exp.nb_09.ParamScheduler at 0x21a20fd0710>,
 <exp.nb_09.ParamScheduler at 0x21a20635a58>]

In [104]:
disc_lr_sched[0]

<exp.nb_09.ParamScheduler at 0x21a20fd0710>

In [106]:
learn = cnn_learner(xresnet18, data, loss_func, opt_func,
                    c_out=10, norm=norm_imagenette, splitter=bn_splitter)

learn.model.load_state_dict(torch.load(mdl_path/'iw5'))
adapt_model(learn, data)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


In [107]:
def _print_det(o):
    "Print the number of optimizer param groups and their hyper-parameters, then cancel training."
    n_groups, hypers = len(o.opt.param_groups), o.opt.hypers
    print(n_groups, hypers)
    raise CancelTrainException()

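# the lrs printed are [0., 3e-3] rather than [0., 3e-2]: this is the first batch, and each lr schedule
# starts at lr/10 (see `cos_1cycle_anneal(lr/10., lr, lr/1e5)` in `sched_1cycle`)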
learn.fit(1, disc_lr_sched + [DebugCallback(cb_types.after_batch, _print_det)])

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time


2 [{'mom': 0.9499999999999997, 'mom_sqr': 0.99, 'eps': 1e-06, 'wd': 0.01, 'lr': 0.0, 'sqr_mom': 0.99}, {'mom': 0.9499999999999997, 'mom_sqr': 0.99, 'eps': 1e-06, 'wd': 0.01, 'lr': 0.0030000000000000512, 'sqr_mom': 0.99}]


In [108]:
learn.fit(3, disc_lr_sched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,2.514204,0.372731,2.259981,0.450899,01:18
1,2.208817,0.468427,2.318595,0.434302,01:20
2,1.958383,0.549123,1.882161,0.571231,01:21


In [118]:
disc_lr_sched = sched_1cycle([1e-3,1e-2], 0.3) # (lrs, pct_start)???

In [119]:
learn.fit(5, disc_lr_sched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,1.79417,0.60972,1.995908,0.546335,01:18
1,1.789848,0.60912,1.888118,0.589212,01:20
2,1.658342,0.657867,1.705317,0.647303,01:19
3,1.523009,0.710664,1.635593,0.679115,01:19
4,1.40839,0.757012,1.591546,0.694329,01:19


## Export

In [136]:
# !./notebook2script.py 11a_transfer_learning.ipynb
!python notebook2script.py 11a_transfer_learning.ipynb

Converted 11a_transfer_learning.ipynb to exp\nb_11a.py
