In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from lib.nb_07 import *

### Basic CNN's

In [3]:
x_train, y_train, x_valid, y_valid = get_mnist()

In [4]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

In [5]:
normalize

<function lib.nb_01.normalize(x, m, s)>

In [6]:
#export
def normalize_to(train, valid):
    m,s = train.mean(),train.std()
    return normalize(train, m, s), normalize(valid, m, s)

In [7]:
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train), Dataset(x_valid, y_valid)

In [8]:
nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy

data = DataBunch(*get_dls(train_ds, valid_ds, bs), c_out=c)

In [9]:
#export
class Lambda(nn.Module):
    
    def __init__(self, func):
        super().__init__()
        self.func = func
        
    def forward(self,x):
        return self.func(x)
    
def flatten(x): return x.view(x.shape[0], -1)

In [10]:
#export
def mnist_resize(x): return x.view(-1, 1, 28, 28)

In [11]:
def get_cnn_model(data):
    return nn.Sequential(
        Lambda(mnist_resize),
        nn.Conv2d( 1, 8, 5, padding=2,stride=2), nn.ReLU(), #14
        nn.Conv2d( 8,16, 3, padding=1,stride=2), nn.ReLU(), # 7
        nn.Conv2d(16,32, 3, padding=1,stride=2), nn.ReLU(), # 4
        nn.Conv2d(32,32, 3, padding=1,stride=2), nn.ReLU(), # 2
        nn.AdaptiveAvgPool2d(1),
        Lambda(flatten),
        nn.Linear(32,data.c_out)
    )

In [12]:
#export
def cos_1cycle_anneal(start, high, end):
    return [sched_cos(start, high), sched_cos(high, end)]

In [13]:
model = get_cnn_model(data)


cbs = [partial(AvgStatsCallback,accuracy),
       partial(CudaCallback, get_device()), 
       Recorder,
       #partial(SaveModelCallback, every="improvement", savename="basic_seq2seq_model"),
       #partial(GradientClipping, clip=0.1),
       ProgressCallback]

lr = 1e-2

sched_lr  = combine_scheds([0.3,0.7], cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds([0.3,0.7], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lr) , ParamScheduler('mom', sched_mom)]


learn = Learner(model, data, loss_func=loss_func, cb_funcs=cbs, opt_func=adam_opt())

In [14]:
learn.fit(3, cbs=cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.864158,0.70622,0.204743,0.936,00:03
1,0.14604,0.95582,0.103797,0.9687,00:03
2,0.079235,0.9763,0.073493,0.9784,00:04


#### Refactor Model

In [15]:
#export
def conv_layer(ni, nf, ks=3, stride=2):
    return nn.Sequential(nn.Conv2d(ni,nf,ks, padding=ks//2, stride=stride), nn.ReLU())

Our model should be independent of dataset specific resizing etc, we can do resizing in a callback before feeding in the data to the model.

In [16]:
#export
class BatchTransformXCallback(Callback):
    _order = 2 #important to keep track of orders as time goes on and we create more callbacks
    
    def __init__(self, tfm): self.tfm = tfm
    def begin_batch(self): self.run.xb = self.tfm(self.run.xb)
        
def view_tfm(*size):
    def _inner(x): return x.view(*((-1,)+size))
    return _inner

In [17]:
mnist_view = view_tfm(1,28,28)
cbs.append(partial(BatchTransformXCallback, mnist_view))

In [18]:
nfs = [8,16,32, 32]

In [19]:
def get_cnn_layers(data, nfs, conv_layer=conv_layer):
    nfs = [1] + nfs
    return [conv_layer(nfs[i],nfs[i+1], 5 if i==0 else 3) for i in range(len(nfs)-1)] + [
        nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c_out)
    ]

def get_cnn_model(data, nfs): return nn.Sequential(*get_cnn_layers(data,nfs))

In [20]:
get_cnn_model(data,nfs)

Sequential(
  (0): Sequential(
    (0): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (1): ReLU()
  )
  (1): Sequential(
    (0): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (3): Sequential(
    (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
  )
  (4): AdaptiveAvgPool2d(output_size=1)
  (5): Lambda()
  (6): Linear(in_features=32, out_features=10, bias=True)
)

In [21]:
model = get_cnn_model(data, nfs, conv_layer=conv_layer)


cbs = [partial(AvgStatsCallback,accuracy),
       partial(CudaCallback, get_device()), 
       Recorder,
       partial(BatchTransformXCallback, mnist_resize),
       #partial(SaveModelCallback, every="improvement", savename="basic_seq2seq_model"),
       #partial(GradientClipping, clip=0.1),
       ProgressCallback]

lr = 1e-2

sched_lr  = combine_scheds([0.3,0.7], cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds([0.3,0.7], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lr) , ParamScheduler('mom', sched_mom)]


learn = Learner(model, data, loss_func=loss_func, cb_funcs=cbs, opt_func=adam_opt())

TypeError: get_cnn_model() got an unexpected keyword argument 'conv_layer'

In [22]:
learn.fit(3, cbs = cbsched)

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.096784,0.97018,0.101155,0.9697,00:04
1,0.072458,0.97762,0.077027,0.9774,00:04
2,0.039524,0.98846,0.055958,0.9847,00:04


In [None]:
#export
def get_basic_cnn_learner(model, data, loss_func, opt_func=adam_opt(), lr=1e-2):
    cbs = [partial(AvgStatsCallback,accuracy),
       partial(CudaCallback, get_device()), 
       Recorder,
       partial(BatchTransformXCallback, mnist_resize),
       #partial(SaveModelCallback, every="improvement", savename="basic_seq2seq_model"),
       #partial(GradientClipping, clip=0.1),
       ProgressCallback]
    
    lr = lr

    sched_lr  = combine_scheds([0.3,0.7], cos_1cycle_anneal(lr/10., lr, lr/1e5))
    sched_mom = combine_scheds([0.3,0.7], cos_1cycle_anneal(0.8, 0.7, 0.8))
    cbsched = [ParamScheduler('lr', sched_lr) , ParamScheduler('mom', sched_mom)]
    
    return Learner(model, data, loss_func=loss_func, cb_funcs=cbs, opt_func=adam_opt()), cbsched

In [None]:
model = get_cnn_model(data, nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=1e-2)

In [None]:
learn.fit(3, cbs=cbsched)

### Hooks

We want to do some telemetry, and want the mean and standard deviation of each activations in the model.

Hooks are PyTorch object you can add to any nn.Module. A hook will be called when a layer, it is registered to, is executed during the forward pass (forward hook) or the backward pass (backward hook).

Hooks don't require us to rewrite the model.

In [None]:
model = get_cnn_model(data,nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=1e-2)

In [None]:
act_means = [[] for _ in model]
act_stds = [[] for _ in model]

In [None]:
def append_stats(i, mod, inp, outp):
    if mod.training:
        act_means[i].append(outp.data.mean())
        act_stds[i].append(outp.data.std())

In [None]:
for i,m in enumerate(model): m.register_forward_hook(partial(append_stats, i))

In [None]:
learn.fit(1, cbs=cbsched)

In [None]:
for o in act_means: plt.plot(o)
plt.legend(range(5));

In [None]:
len(act_means)

#### Hooks Class

In [None]:
#export
def children(m):
    return list(m.children())

In [None]:
#export
class Hook():
    def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()

In [None]:
def append_stats(hook, mod, inp, outp):
    if not hasattr(hook, 'stats'): hook.stats = ([],[])
    means, stds = hook.stats
    if mod.training:
        means.append(outp.data.mean())
        stds.append(outp.data.std())

In [None]:
model = get_cnn_model(data,nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=0.8)

In [None]:
hooks = [Hook(l, append_stats) for l in children(model[:4])]

In [None]:
learn.fit(1)

In [None]:
for h in hooks:
    plt.plot(h.stats[1])
    h.remove()
plt.legend(range(4));

In [None]:
#export
class Hooks(ListContainer):
    
    def __init__(self, ms, f): super().__init__([Hook(m,f) for m in ms])
    def __enter__(self, *args): return self
    def __exit__(self, *args): self.remove()
    def __del__(self): self.remove()
    
    def __delitem__(self,i):
        self[i].remove()
        super().__delitem__(i)
        
    def remove(self):
        for h in self: h.remove()

In [None]:
model = get_cnn_model(data,nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=0.8)

with `__enter__` and `__exit__`, we can usee `Hooks` within a context so that python takes care of removing the unused hooks after we are done.

In [None]:
with Hooks(learn.model, append_stats) as hooks:
    
    learn.fit(2)
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
    plt.legend(range(6));
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(6));

In [None]:
model = get_cnn_model(data,nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=0.8)

In [None]:
#export
from torch.nn import init

In [None]:
for l in model:
    if isinstance(l, nn.Sequential):
        init.kaiming_normal_(l[0].weight)
        l[0].bias.data.zero_()

In [None]:
with Hooks(learn.model, append_stats) as hooks:
    
    learn.fit(2)
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
    plt.legend(range(6));
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(6));

In [None]:
def append_stats(hook, mod, inp, outp):
    if not hasattr(hook,'stats'): hook.stats = ([],[],[])
    means,stds,hists = hook.stats
    if mod.training:
        means.append(outp.data.mean().cpu())
        stds .append(outp.data.std().cpu())
        hists.append(outp.data.cpu().histc(40,0,10)) #histc isn't implemented on the GPU

In [None]:
model = get_cnn_model(data,nfs, conv_layer=conv_layer)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=0.8)

In [None]:
for l in model:
    if isinstance(l, nn.Sequential):
        init.kaiming_normal_(l[0].weight)
        l[0].bias.data.zero_()

In [None]:
with Hooks(learn.model, append_stats) as hooks: learn.fit(2)

In [None]:
import matplotlib as mpl
mpl.rcParams["image.cmap"] = "viridis"

In [None]:
# Thanks to @ste for initial version of histogram plotting code
def get_hist(h): return torch.stack(h.stats[2]).t().float().log1p()

In [None]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.imshow(get_hist(h), origin='lower')
    ax.axis('off')
plt.tight_layout()

In [None]:
def get_min(h):
    h1 = torch.stack(h.stats[2]).t().float()
    return h1[:2].sum(0)/h1.sum(0)

In [None]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.plot(get_min(h))
    ax.set_ylim(0,1)
plt.tight_layout()

In [None]:
#export
def get_cnn_layers(data, nfs, layer, **kwargs):
    nfs = [1] + nfs
    return [layer(nfs[i], nfs[i+1], 5 if i==0 else 3, **kwargs)
            for i in range(len(nfs)-1)] + [
        nn.AdaptiveAvgPool2d(1), Lambda(flatten), nn.Linear(nfs[-1], data.c_out)]

def conv_layer(ni, nf, ks=3, stride=2, **kwargs):
    return nn.Sequential(
        nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride), GeneralRelu(**kwargs))

class GeneralRelu(nn.Module):
    def __init__(self, leak=None, sub=None, maxv=None):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x): 
        x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
        if self.sub is not None: x.sub_(self.sub)
        if self.maxv is not None: x.clamp_max_(self.maxv)
        return x

def init_cnn(m, uniform=False):
    f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
    for l in m:
        if isinstance(l, nn.Sequential):
            f(l[0].weight, a=0.1)
            l[0].bias.data.zero_()

def get_cnn_model(data, nfs, layer, **kwargs):
    return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))

In [None]:
#export
def append_stats(hook, mod, inp, outp):
    if not hasattr(hook,'stats'): hook.stats = ([],[],[])
    means,stds,hists = hook.stats
    if mod.training:
        means.append(outp.data.mean().cpu())
        stds .append(outp.data.std().cpu())
        hists.append(outp.data.cpu().histc(40,-7,7))

In [None]:
model =  get_cnn_model(data, nfs, conv_layer, leak=0.1, sub=0.4, maxv=6.)
init_cnn(model, uniform=True)
learn, cbsched = get_basic_cnn_learner(model, data, F.cross_entropy, lr=0.8)

In [None]:
with Hooks(model, append_stats) as hooks:
    learn.fit(1)
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss,hi = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
    plt.legend(range(5));
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks:
        ms,ss,hi = h.stats
        ax0.plot(ms)
        ax1.plot(ss)
    plt.legend(range(5));

In [None]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.imshow(get_hist(h), origin='lower')
    ax.axis('off')
plt.tight_layout()

In [None]:
def get_min(h):
    h1 = torch.stack(h.stats[2]).t().float()
    return h1[19:22].sum(0)/h1.sum(0)

In [None]:
fig,axes = plt.subplots(2,2, figsize=(15,6))
for ax,h in zip(axes.flatten(), hooks[:4]):
    ax.plot(get_min(h))
    ax.set_ylim(0,1)
plt.tight_layout()

In [None]:
!python3 notebook2script.py 08a_cnn_hooks.ipynb