# ULMFit

In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [3]:
#export
from exp.nb_12a import *

## Get the IMDb Data

We load the IMDb data prepared in notebook 12a. If you don't have that file yet, the instructions for creating it are in that notebook.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=7459)

In [4]:
path = datasets.untar_data(datasets.URLs.IMDB)

In [4]:
# Load the pickled language-model data (`ll_lm.pkl`, created in notebook 12a).
# The `with` block closes the file handle as soon as the object is read.
# NOTE: only unpickle files you created yourself -- pickle can run arbitrary code.
with open(path/'ll_lm.pkl', 'rb') as f:
    ll = pickle.load(f)

### Create databunch

In [5]:
# bs,bptt = 128,70
bs,bptt = 64,70
data = lm_databunchify(ll, bs, bptt)

### Get the IMDb vocabulary

In [9]:
# proc_x has the Tokenize and Numericalize processors
ll.train.proc_x

[<exp.nb_12.TokenizeProcessor at 0x2086f4e1278>,
 <exp.nb_12.NumericalizeProcessor at 0x208a24007b8>]

In [141]:
#vocab is a property of the NumericalizeProcessor
vocab = ll.train.proc_x[1].vocab 
print(len(vocab))

60003


In [143]:
print(type(vocab))

<class 'list'>


In [144]:
dir(vocab)

['__add__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__iadd__',
 '__imul__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__reversed__',
 '__rmul__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'append',
 'clear',
 'copy',
 'count',
 'extend',
 'index',
 'insert',
 'pop',
 'remove',
 'reverse',
 'sort']

### vocab.index is a method that maps words, tokens, alphanumeric characters, and punctuation characters to non-negative integers

In [98]:
print(vocab.index('the'))
print(vocab.index(UNK))
print(vocab.index(PAD))
print(vocab.index(BOS))
print(vocab.index('p'))
print(vocab.index('100'))
print(vocab.index('p'))
print(vocab.index(';'))

8
0
1
2
6773
1350
6773
134


### Upper case letters are not mapped

In [113]:
vocab.index('B')

ValueError: 'B' is not in list

### Some repeated-letter tokens (e.g. 'zz') are mapped

In [112]:
vocab.index('zz')

50612

### Some numbers are mapped (e.g. '2025'), others are not (e.g. '2026')

In [140]:
print(vocab.index('2025'))
print(vocab.index('2026'))

58372


ValueError: '2026' is not in list

### Random strings are not mapped

In [114]:
vocab.index('Btfsplk')

ValueError: 'Btfsplk' is not in list

### Build the IMDb language model
#### syntax of a call to get_language_model, for reference:
def get_language_model(vocab_sz, emb_sz, n_hid, n_layers, pad_token, output_p=0.4, hidden_p=0.2, input_p=0.6, 
                       embed_p=0.1, weight_p=0.5, tie_weights=True, bias=True):

In [145]:
# specify inputs
# output_p, hidden_p, input_p, embed_p, weight_p
dps = tensor([0.1, 0.15, 0.25, 0.02, 0.2]) * 0.5
tok_pad = vocab.index(PAD)

In [146]:
# embedding size, number of nodes in hidden layer, number of layers
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, *dps)

## Finetuning the Language Model

Before tackling the classification task, we have to finetune our language model to the IMDB corpus.

We have pretrained a small model on [wikitext 103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) that you can download by uncommenting the following cell.

In [7]:
# Linux
# wget http://files.fast.ai/models/wt103_tiny.tgz -P {path}
# ! tar xf {path}/wt103_tiny.tgz -C {path}

# Windows 10 
# Install wget and 7-zip 
# Go to directory ~/.fastai/data, then
#      wget http://files.fast.ai/models/wt103_tiny.tgz -o wt103_tiny.tgz
#      extract into IMDb folder with 7-zip

### Get the pretrained weights and vocab from WT103 model
Created with 12b_lm_pretrain.ipynb

In [12]:
# Pretrained WT103 state dict and the vocabulary it was trained with.
old_wgts  = torch.load(path/'pretrained'/'pretrained.pth')
# Use a context manager so the vocab file handle is closed after reading.
# NOTE: only unpickle files from a source you trust -- pickle can run arbitrary code.
with open(path/'pretrained'/'vocab.pkl', 'rb') as f:
    old_vocab = pickle.load(f)

In [177]:
print(len(old_wgts))
print(old_wgts.keys())

14
odict_keys(['0.emb.weight', '0.emb_dp.emb.weight', '0.rnns.0.weight_hh_l0_raw', '0.rnns.0.module.weight_ih_l0', '0.rnns.0.module.weight_hh_l0', '0.rnns.0.module.bias_ih_l0', '0.rnns.0.module.bias_hh_l0', '0.rnns.1.weight_hh_l0_raw', '0.rnns.1.module.weight_ih_l0', '0.rnns.1.module.weight_hh_l0', '0.rnns.1.module.bias_ih_l0', '0.rnns.1.module.bias_hh_l0', '1.decoder.weight', '1.decoder.bias'])


In [185]:
print(len(old_vocab))
print(old_vocab[:10])

60002
['xxbos', 'xxunk', 'xxpad', 'xxrep', 'xxwrep', 'xxup', 'xxmaj', 'the', ',', '.']


The ids in our current vocabulary are very unlikely to correspond to the ids in the vocabulary used to train the pretrained model. The tokens are sorted by frequency (apart from the special tokens, which all come first), so that order is specific to the corpus used. For instance, the word 'house' has a different index in our current vocab from its index in the vocab of the pretrained model.

In [148]:
idx_house_new, idx_house_old = vocab.index('house'),old_vocab.index('house')
print(idx_house_new, idx_house_old )

344 230


We somehow need to match our pretrained weights to the new vocabulary. This is done on the embeddings and the decoder (since the weights between embeddings and decoders are tied) by putting the rows of the embedding matrix (or decoder bias) in the right order.

It may also happen that we have words that aren't in the pretrained vocab; in this case, we use the mean of the pretrained embedding weights/decoder bias.

In [150]:
house_wgt  = old_wgts['0.emb.weight'][idx_house_old]
house_bias = old_wgts['1.decoder.bias'][idx_house_old] 

In [186]:
# dictionary mapping tokens to numerical indexes in old vocab
otoi = {v:k for k,v in enumerate(old_vocab)}
print(otoi['dinosaur'])

7206


In [15]:
# if word in new_vocab is found in old_vocab, 
#      use its weights and bias from old_vocab,
# otherwise, use mean weight and mean bias
def match_embeds(old_wgts, old_vocab, new_vocab):
    """Re-index the pretrained embedding/decoder rows so they follow `new_vocab`.

    For every token of `new_vocab` that also appears in `old_vocab`, its row of
    `'0.emb.weight'` and its entry of `'1.decoder.bias'` are copied over; tokens
    unknown to the pretrained model get the mean embedding row / mean bias.

    Args:
        old_wgts: state dict holding at least `'0.emb.weight'` and `'1.decoder.bias'`.
        old_vocab: list of tokens the pretrained weights are indexed by.
        new_vocab: list of tokens the returned weights should be indexed by.

    Returns:
        A shallow copy of `old_wgts` in which `'0.emb.weight'`,
        `'0.emb_dp.emb.weight'` and `'1.decoder.weight'` all point to the same
        re-indexed matrix and `'1.decoder.bias'` is the re-indexed bias.
        `old_wgts` itself is left untouched, so calling this function again
        (e.g. re-running the notebook cell) gives the same result instead of
        re-indexing already re-indexed weights.
    """
    wgts = old_wgts['0.emb.weight']
    bias = old_wgts['1.decoder.bias']
    wgts_m,bias_m = wgts.mean(dim=0),bias.mean()
    new_wgts = wgts.new_zeros(len(new_vocab), wgts.size(1))
    new_bias = bias.new_zeros(len(new_vocab))
    # token -> row index in the pretrained vocab
    otoi = {tok:idx for idx,tok in enumerate(old_vocab)}
    for i,tok in enumerate(new_vocab):
        idx = otoi.get(tok)
        if idx is None: new_wgts[i],new_bias[i] = wgts_m,bias_m
        else:           new_wgts[i],new_bias[i] = wgts[idx],bias[idx]
    # Work on a copy: overwriting the caller's dict made the function non-idempotent.
    matched = old_wgts.copy()
    matched['0.emb.weight']        = new_wgts
    matched['0.emb_dp.emb.weight'] = new_wgts
    matched['1.decoder.weight']    = new_wgts
    matched['1.decoder.bias']      = new_bias
    return matched

In [16]:
wgts = match_embeds(old_wgts, old_vocab, vocab)

Now let's check that the word "*house*" was properly converted.

In [17]:
test_near(wgts['0.emb.weight'][idx_house_new],house_wgt)
test_near(wgts['1.decoder.bias'][idx_house_new],house_bias)

We can load the pretrained weights in our model before beginning training.

In [18]:
model.load_state_dict(wgts)

<All keys matched successfully>

If we want to apply discriminative learning rates, we need to split our model in different layer groups. Let's have a look at our model.

In [187]:
model

SequentialRNN(
  (0): AWD_LSTM(
    (emb): Embedding(60003, 300, padding_idx=1)
    (emb_dp): EmbeddingDropout(
      (emb): Embedding(60003, 300, padding_idx=1)
    )
    (rnns): ModuleList(
      (0): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
      (1): WeightDropout(
        (module): LSTM(300, 300, batch_first=True)
      )
    )
    (input_dp): RNNDropout()
    (hidden_dps): ModuleList(
      (0): RNNDropout()
      (1): RNNDropout()
    )
  )
  (1): LinearDecoder(
    (output_dp): RNNDropout()
    (decoder): Linear(in_features=300, out_features=60003, bias=True)
  )
)

Then we split the model into one group for each RNN and its corresponding dropout, then one last group that contains the embeddings/decoder. This last group is the one that needs to be trained the most, as we may have new embedding vectors.

In [20]:
def lm_splitter(m):
    "Parameter groups of language model `m`: one per RNN (with its hidden dropout), then embeddings + decoder."
    encoder = m[0]
    rnn_groups = [nn.Sequential(encoder.rnns[i], encoder.hidden_dps[i])
                  for i in range(len(encoder.rnns))]
    emb_group = nn.Sequential(encoder.emb, encoder.emb_dp, encoder.input_dp, m[1])
    return [list(group.parameters()) for group in rnn_groups + [emb_group]]

### First train with the RNNs frozen

In [21]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(False)

In [22]:
cbs = [partial(AvgStatsCallback,accuracy_flat),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, α=2., β=1.),
       ProgressCallback]

In [23]:
learn = Learner(model, data, cross_entropy_flat, opt_func=adam_opt(),
                cb_funcs=cbs, splitter=lm_splitter)

In [24]:
lr = 2e-2
cbsched = sched_1cycle([lr], pct_start=0.5, mom_start=0.8, mom_mid=0.7, mom_end=0.8)

In [23]:
learn.fit(1, cbs=cbsched) # 18 minutes

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,4.49257,0.246581,4.284493,0.262987,17:43


### Then unfreeze and train the whole model with discriminative learning rates

In [26]:
for rnn in model[0].rnns:
    for p in rnn.parameters(): p.requires_grad_(True)

In [27]:
lr = 2e-3
cbsched = sched_1cycle([lr/2., lr/2., lr], pct_start=0.5, mom_start=0.8, mom_mid=0.7, mom_end=0.8)

In [28]:
# about 3.5 hours on my laptop in Windows 10, apparently not using the GPU
learn.fit(10, cbs=cbsched)

epoch,train_loss,train_accuracy_flat,valid_loss,valid_accuracy_flat,time
0,4.262889,0.262824,4.217347,0.270023,21:08
1,4.201804,0.269074,4.175172,0.274362,21:08
2,4.153314,0.273996,4.139237,0.278281,23:17
3,4.11373,0.277834,4.11244,0.280702,21:05
4,4.079678,0.280953,4.089384,0.28319,21:05
5,4.050331,0.28363,4.068882,0.285225,31:33
6,4.022424,0.286078,4.050799,0.287069,21:07
7,3.997256,0.288302,4.038588,0.288394,21:05
8,3.977549,0.290029,4.03091,0.2892,21:05
9,3.96638,0.290964,4.029355,0.289425,21:08


### Save the encoder and the vocab

We only need to save the encoder (first part of the model) for the classification, as well as the vocabulary used (we will need to use the same in the classification task).

In [30]:
torch.save(learn.model[0].state_dict(), path/'finetuned_enc.pth')

In [31]:
# Save the vocabulary; the `with` block flushes and closes the file
# so it can be safely re-read later in the notebook.
with open(path/'vocab_lm.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [32]:
torch.save(learn.model.state_dict(), path/'finetuned.pth')

## Classifier

We have to process the data again otherwise pickle will complain. We also have to use the same vocab as the language model.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=7554)

In [25]:
# Reload the language-model vocabulary (closing the file handle afterwards)
# and build the processors so the classifier uses exactly the same token ids.
with open(path/'vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)
proc_tok,proc_num,proc_cat = TokenizeProcessor(),NumericalizeProcessor(vocab=vocab),CategoryProcessor()

In [26]:
il = TextList.from_files(path, include=['train', 'test'])
sd = SplitData.split_by_func(il, partial(grandparent_splitter, valid_name='test'))
ll = label_by_func(sd, parent_labeler, proc_x = [proc_tok, proc_num], proc_y=proc_cat)

In [27]:
# Cache the processed classification data; `with` guarantees the file is flushed and closed.
with open(path/'ll_clas.pkl', 'wb') as f:
    pickle.dump(ll, f)

In [5]:
# Reload the cached classification data and the language-model vocabulary.
# Context managers close both file handles once the objects are read.
# NOTE: only unpickle files you created yourself -- pickle can run arbitrary code.
with open(path/'ll_clas.pkl', 'rb') as f:
    ll = pickle.load(f)
with open(path/'vocab_lm.pkl', 'rb') as f:
    vocab = pickle.load(f)

In [6]:
bs,bptt = 64,70
data = clas_databunchify(ll, bs)

### Ignore padding

We will use these two utility functions from PyTorch to ignore the padding in the inputs.

In [7]:
#export
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

Let's see how this works: first we grab a batch of the training set.

In [8]:
x,y = next(iter(data.train_dl))

In [9]:
x.size()

torch.Size([64, 3311])

In [10]:
x[1,0:5]

tensor([  2,   7,   8, 861,  13])

In [11]:
pp=x==1
print(pp.size())
print(pp.sum(1))
print(len(pp.sum(1)))

torch.Size([64, 3311])
tensor([   0, 1825, 1853, 1921, 1937, 1964, 1980, 1992, 1993, 2000, 2008, 2013,
        2018, 2032, 2032, 2044, 2044, 2053, 2059, 2074, 2090, 2099, 2100, 2122,
        2126, 2126, 2135, 2141, 2142, 2144, 2159, 2166, 2168, 2169, 2177, 2182,
        2200, 2202, 2203, 2213, 2213, 2217, 2223, 2227, 2228, 2232, 2234, 2238,
        2241, 2241, 2242, 2243, 2249, 2249, 2249, 2253, 2255, 2256, 2263, 2265,
        2270, 2271, 2278, 2285])
64


We need to pass the lengths of our sentences to the utility functions because they are applied after the embedding, where we can't see the padding anymore.

In [12]:
# lengths of unpadded sequences
#      number of non-PAD tokens in each element of the batch
lengths = x.size(1) - (x == 1).sum(1)
lengths[:5]


tensor([3311, 1486, 1458, 1390, 1374])

In [13]:
tst_emb = nn.Embedding(len(vocab), 300)

In [14]:
tst_emb(x).shape

torch.Size([64, 3311, 300])

In [15]:
print(128*70)
print(64*70)

8960
4480


We create a `PackedSequence` object that contains all of our unpadded sequences

In [16]:
packed = pack_padded_sequence(tst_emb(x), lengths, batch_first=True)

In [17]:
packed

PackedSequence(data=tensor([[-1.0515, -1.5225, -1.2218,  ..., -0.6652,  1.3230, -0.1875],
        [-1.0515, -1.5225, -1.2218,  ..., -0.6652,  1.3230, -0.1875],
        [-1.0515, -1.5225, -1.2218,  ..., -0.6652,  1.3230, -0.1875],
        ...,
        [ 1.1188, -0.1941,  0.6471,  ...,  0.5068, -0.3716, -1.2098],
        [-1.0620,  0.6845,  1.2418,  ...,  1.3682, -0.0763,  0.2287],
        [-0.2644, -0.9391, -0.0437,  ...,  0.4308,  0.6607, -0.3444]],
       grad_fn=<PackPaddedSequenceBackward>), batch_sizes=tensor([64, 64, 64,  ...,  1,  1,  1]), sorted_indices=None, unsorted_indices=None)

In [18]:
packed.data.shape

torch.Size([77076, 300])

In [19]:
packed.data.grad_fn

<PackPaddedSequenceBackward at 0x1ff10502208>

In [20]:
len(packed.batch_sizes)

3311

In [21]:
print(8960//70)
print(4480//70)

128
64


This `PackedSequence` object can be passed to any RNN directly while retaining the speed of CuDNN.

In [22]:
tst = nn.LSTM(300, 300, 2)
print(tst)

LSTM(300, 300, num_layers=2)


In [23]:
y,h = tst(packed)

In [24]:
# the output is another PackedSequence object
y

PackedSequence(data=tensor([[-0.0076,  0.0092, -0.0186,  ..., -0.0423,  0.0113,  0.0092],
        [-0.0076,  0.0092, -0.0186,  ..., -0.0423,  0.0113,  0.0092],
        [-0.0076,  0.0092, -0.0186,  ..., -0.0423,  0.0113,  0.0092],
        ...,
        [ 0.0406, -0.0156,  0.0022,  ...,  0.0264, -0.0469, -0.0250],
        [ 0.0472,  0.0007, -0.0026,  ...,  0.0129, -0.0285, -0.0199],
        [ 0.0496, -0.0073, -0.0125,  ..., -0.0131, -0.0170, -0.0057]],
       grad_fn=<CatBackward>), batch_sizes=tensor([64, 64, 64,  ...,  1,  1,  1]), sorted_indices=None, unsorted_indices=None)

In [25]:
y.data.shape

torch.Size([77076, 300])

In [26]:
h[0].shape

torch.Size([2, 64, 300])

In [27]:
h[1].shape

torch.Size([2, 64, 300])

Then we can unpad it with the following function for other modules:

In [28]:
unpack = pad_packed_sequence(y, batch_first=True)

In [29]:
pad_packed_sequence??

In [30]:
unpack[0].shape

torch.Size([64, 3311, 300])

In [31]:
unpack[1]

tensor([3311, 1486, 1458, 1390, 1374, 1347, 1331, 1319, 1318, 1311, 1303, 1298,
        1293, 1279, 1279, 1267, 1267, 1258, 1252, 1237, 1221, 1212, 1211, 1189,
        1185, 1185, 1176, 1170, 1169, 1167, 1152, 1145, 1143, 1142, 1134, 1129,
        1111, 1109, 1108, 1098, 1098, 1094, 1088, 1084, 1083, 1079, 1077, 1073,
        1070, 1070, 1069, 1068, 1062, 1062, 1062, 1058, 1056, 1055, 1048, 1046,
        1041, 1040, 1033, 1026])

### Update AWD-LSTM

We need to change our model a little bit to use this.

In [32]:
#export
class AWD_LSTM1(nn.Module):
    """AWD-LSTM encoder (inspired by https://arxiv.org/abs/1708.02182) that skips padding.

    The input is a 2-D tensor of token ids in which `pad_token` marks padding.
    Sequences are packed before each LSTM layer so the RNN never sees padding.
    `reset()` must be called (after setting `self.bs`) before the first `forward`,
    because `self.hidden` is only created there.

    The batch is expected to be sorted by decreasing length: rows that are empty
    in the current chunk are dropped from the *end* of the batch, and
    `pack_padded_sequence` is called with its default sorted-input requirement.
    """
    initrange=0.1

    def __init__(self, vocab_sz, emb_sz, n_hid, n_layers, pad_token,
                 hidden_p=0.2, input_p=0.6, embed_p=0.1, weight_p=0.5):
        super().__init__()
        self.bs,self.emb_sz,self.n_hid,self.n_layers,self.pad_token = 1,emb_sz,n_hid,n_layers,pad_token
        self.emb = nn.Embedding(vocab_sz, emb_sz, padding_idx=pad_token)
        self.emb_dp = EmbeddingDropout(self.emb, embed_p)
        # First layer reads embeddings, last layer outputs `emb_sz` features again.
        self.rnns = [nn.LSTM(emb_sz if l == 0 else n_hid, (n_hid if l != n_layers - 1 else emb_sz), 1,
                             batch_first=True) for l in range(n_layers)]
        self.rnns = nn.ModuleList([WeightDropout(rnn, weight_p) for rnn in self.rnns])
        self.emb.weight.data.uniform_(-self.initrange, self.initrange)
        self.input_dp = RNNDropout(input_p)
        self.hidden_dps = nn.ModuleList([RNNDropout(hidden_p) for l in range(n_layers)])

    def forward(self, input):
        "Return `(raw_outputs, outputs, mask)`; `mask` is True where `input` is padding."
        bs,sl = input.size()
        mask = (input == self.pad_token)
        lengths = sl - mask.long().sum(1)
        n_empty = (lengths == 0).sum()
        if n_empty > 0:
            # Rows that are pure padding in this chunk cannot be packed: drop them
            # (they sit at the end of the batch) together with their hidden state.
            input = input[:-n_empty]
            lengths = lengths[:-n_empty]
            self.hidden = [(h[0][:,:input.size(0)], h[1][:,:input.size(0)]) for h in self.hidden]
        # `pack_padded_sequence` requires the lengths tensor to live on the CPU,
        # even when the data itself is on the GPU.
        lengths = lengths.cpu()
        raw_output = self.input_dp(self.emb_dp(input))
        new_hidden,raw_outputs,outputs = [],[],[]
        for l, (rnn,hid_dp) in enumerate(zip(self.rnns, self.hidden_dps)):
            raw_output = pack_padded_sequence(raw_output, lengths, batch_first=True)
            raw_output, new_h = rnn(raw_output, self.hidden[l])
            # `total_length=sl` keeps the time dimension equal to the input chunk,
            # so the outputs always line up with `mask`.
            raw_output = pad_packed_sequence(raw_output, batch_first=True, total_length=sl)[0]
            raw_outputs.append(raw_output)
            # No hidden dropout after the last layer.
            if l != self.n_layers - 1: raw_output = hid_dp(raw_output)
            outputs.append(raw_output)
            new_hidden.append(new_h)
        # Detach so gradients do not flow back into previous chunks.
        self.hidden = to_detach(new_hidden)
        return raw_outputs, outputs, mask

    def _one_hidden(self, l):
        "Return one zeroed hidden state of shape `(1, self.bs, n_features)` for layer `l`."
        nh = self.n_hid if l != self.n_layers - 1 else self.emb_sz
        return next(self.parameters()).new(1, self.bs, nh).zero_()

    def reset(self):
        "Reset the hidden states to zeros for a batch of size `self.bs`."
        self.hidden = [(self._one_hidden(l), self._one_hidden(l)) for l in range(self.n_layers)]

### Concat pooling

We will use three things for the classification head of the model: the last hidden state, the average of all the hidden states and the maximum of all the hidden states. The trick is just to, once again, ignore the padding in the last element/average/maximum.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=7604)

In [33]:
class Pooling(nn.Module):
    "Concat pooling over the last layer's outputs, ignoring padded positions."
    def forward(self, input):
        "Take `(raw_outputs, outputs, mask)`; return the last layer's output and `[last, max, avg]` pooled features."
        _, outputs, mask = input
        last_layer = outputs[-1]
        pad = mask[:,:,None]
        n_tokens = last_layer.size(1) - mask.long().sum(dim=1)
        # Average over real tokens only: zero the padding, sum, divide by the true length.
        mean_feats = last_layer.masked_fill(pad, 0).sum(dim=1)
        mean_feats.div_(n_tokens.type(mean_feats.dtype)[:,None])
        # Padding is set to -inf so it can never win the max.
        max_feats = last_layer.masked_fill(pad, -float('inf')).max(dim=1)[0]
        # Output at the last real (non-padded) position of every row.
        rows = torch.arange(0, last_layer.size(0))
        last_feats = last_layer[rows, n_tokens-1]
        pooled = torch.cat([last_feats, max_feats, mean_feats], 1)
        return last_layer,pooled

In [34]:
emb_sz, nh, nl = 300, 300, 2
tok_pad = vocab.index(PAD)

In [35]:
enc = AWD_LSTM1(len(vocab), emb_sz, n_hid=nh, n_layers=nl, pad_token=tok_pad)
pool = Pooling()
enc.bs = bs
enc.reset()

In [36]:
x,y = next(iter(data.train_dl))
output,c = pool(enc(x))

We can check we have padding with 1s at the end of each text (except the first which is the longest).

In [37]:
x.size()

torch.Size([64, 3311])

In [38]:
x

tensor([[    2,     7,  1163,  ..., 16264,    24,     3],
        [    2,   194,    50,  ...,     1,     1,     1],
        [    2,     7,  1799,  ...,     1,     1,     1],
        ...,
        [    2,     7,    28,  ...,     1,     1,     1],
        [    2,     7, 17930,  ...,     1,     1,     1],
        [    2,    12,   395,  ...,     1,     1,     1]])

PyTorch puts 0s everywhere we had padding in the `output` when unpacking.

In [39]:
test_near((output.sum(dim=2) == 0).float(), (x==tok_pad).float())

So the last hidden state isn't the last element of `output`. Let's check we got everything right. 

In [40]:
# Recompute the three pooled pieces by hand for every row, after stripping the padding.
# The pad id and the feature width are derived rather than hard-coded as 1 / 300 / 600.
n_feat = output.size(2)
for i in range(x.size(0)):
    length = x.size(1) - (x[i]==tok_pad).long().sum()
    out_unpad = output[i,:length]
    test_near(out_unpad[-1], c[i,:n_feat])                # last real hidden state
    test_near(out_unpad.max(0)[0], c[i,n_feat:2*n_feat])  # max pool
    test_near(out_unpad.mean(0), c[i,2*n_feat:])          # average pool

Our pooling layer properly ignored the padding, so now let's group it with a classifier.

In [41]:
def bn_drop_lin(n_in, n_out, bn=True, p=0., actn=None):
    "Return the list `[BatchNorm1d?, Dropout?, Linear, actn?]`; optional parts are left out when disabled."
    norm = [nn.BatchNorm1d(n_in)] if bn else []
    drop = [nn.Dropout(p)] if p != 0 else []
    act  = [] if actn is None else [actn]
    return norm + drop + [nn.Linear(n_in, n_out)] + act

In [42]:
class PoolingLinearClassifier(nn.Module):
    "Concat-pooling followed by a stack of `bn_drop_lin` blocks."

    def __init__(self, layers, drops):
        "`layers` lists the feature sizes (input first, output last); `drops` gives the dropout for each linear block."
        super().__init__()
        # One ReLU instance is reused after every block except the last, which has no activation.
        relu = nn.ReLU(inplace=True)
        activations = [relu] * (len(layers) - 2) + [None]
        blocks = []
        for n_in, n_out, p, actn in zip(layers[:-1], layers[1:], drops, activations):
            blocks.extend(bn_drop_lin(n_in, n_out, p=p, actn=actn))
        self.layers = nn.Sequential(*blocks)

    def forward(self, input):
        "Pool the last layer's outputs from `(raw_outputs, outputs, mask)` and classify them."
        _, outputs, mask = input
        last_layer = outputs[-1]
        pad = mask[:,:,None]
        n_tokens = last_layer.size(1) - mask.long().sum(dim=1)
        # Average over real tokens only.
        mean_feats = last_layer.masked_fill(pad, 0).sum(dim=1)
        mean_feats.div_(n_tokens.type(mean_feats.dtype)[:,None])
        # Padding becomes -inf so it never wins the max.
        max_feats = last_layer.masked_fill(pad, -float('inf')).max(dim=1)[0]
        # Output at the last real (non-padded) position of every row.
        rows = torch.arange(0, last_layer.size(0))
        last_feats = last_layer[rows, n_tokens-1]
        pooled = torch.cat([last_feats, max_feats, mean_feats], 1)
        return self.layers(pooled)

Then we just have to feed our texts to those two blocks. However, we can't give them all at once to the AWD_LSTM or we might get an OOM error: we'll go for chunks of bptt length to regularly detach the history of our hidden states.

In [43]:
def pad_tensor(t, bs, val=0.):
    """Pad `t` along dim 0 with rows filled with `val` until it has `bs` rows.

    `t` is returned unchanged when it already has at least `bs` rows.
    The padding is created with `t.new_full`, so it keeps the dtype and device
    of `t`. This matters for boolean/integer masks: `val + t.new_zeros(...)`
    would promote the padding (and hence the concatenated result) to another dtype.
    """
    n_missing = bs - t.size(0)
    if n_missing <= 0: return t
    filler = t.new_full((n_missing, *t.shape[1:]), val)
    return torch.cat([t, filler])

In [44]:
class SentenceEncoder(nn.Module):
    "Run `module` over a long input in chunks of `bptt` tokens and stitch the results back together."
    def __init__(self, module, bptt, pad_idx=1):
        super().__init__()
        self.bptt = bptt
        self.module = module
        self.pad_idx = pad_idx

    def concat(self, arrs, bs):
        "For each layer, pad every chunk to `bs` rows and concatenate the chunks along the sequence dim."
        merged = []
        for layer in range(len(arrs[0])):
            chunks = [pad_tensor(chunk[layer], bs) for chunk in arrs]
            merged.append(torch.cat(chunks, dim=1))
        return merged
    
    def forward(self, input):
        "Return `(raw_outputs, outputs, mask)` for the whole sequence."
        bs,sl = input.size()
        # Start every batch from a fresh hidden state of the right batch size.
        self.module.bs = bs
        self.module.reset()
        raw_chunks,out_chunks,mask_chunks = [],[],[]
        for start in range(0, sl, self.bptt):
            stop = min(start + self.bptt, sl)
            raw,out,mask = self.module(input[:,start:stop])
            mask_chunks.append(pad_tensor(mask, bs, 1))
            raw_chunks.append(raw)
            out_chunks.append(out)
        return self.concat(raw_chunks, bs),self.concat(out_chunks, bs),torch.cat(mask_chunks,dim=1)

In [45]:
def get_text_classifier(vocab_sz, emb_sz, n_hid, n_layers, n_out, pad_token, bptt, output_p=0.4, hidden_p=0.2, 
                        input_p=0.6, embed_p=0.1, weight_p=0.5, layers=None, drops=None):
    "Build a text classifier: an `AWD_LSTM1` wrapped in a `SentenceEncoder`, followed by a `PoolingLinearClassifier`."
    encoder = SentenceEncoder(
        AWD_LSTM1(vocab_sz, emb_sz, n_hid=n_hid, n_layers=n_layers, pad_token=pad_token,
                  hidden_p=hidden_p, input_p=input_p, embed_p=embed_p, weight_p=weight_p),
        bptt)
    # Defaults: a single hidden layer of 50 units with dropout 0.1.
    hidden_sizes = [50] if layers is None else layers
    hidden_drops = [0.1] * len(hidden_sizes) if drops is None else drops
    # The head receives the concat-pooled features (last, max, avg), hence 3 * emb_sz inputs.
    head_sizes = [3 * emb_sz] + hidden_sizes + [n_out]
    head_drops = [output_p] + hidden_drops
    return SequentialRNN(encoder, PoolingLinearClassifier(head_sizes, head_drops))

In [46]:
emb_sz, nh, nl = 300, 300, 2
dps = tensor([0.4, 0.3, 0.4, 0.05, 0.5]) * 0.25
model = get_text_classifier(len(vocab), emb_sz, nh, nl, 2, 1, bptt, *dps)

### Training

We load our pretrained encoder and freeze it.

[Jump_to lesson 12 video](https://course.fast.ai/videos/?lesson=12&t=7684)

In [47]:
def class_splitter(m):
    "Parameter groups of classifier `m`: embeddings, then one group per RNN (with its hidden dropout), then the head."
    enc = m[0].module
    modules = [nn.Sequential(enc.emb, enc.emb_dp, enc.input_dp)]
    modules.extend(nn.Sequential(enc.rnns[i], enc.hidden_dps[i]) for i in range(len(enc.rnns)))
    modules.append(m[1])
    return [list(mod.parameters()) for mod in modules]

In [48]:
# freeze model and train
for p in model[0].parameters(): p.requires_grad_(False)

In [49]:
cbs = [partial(AvgStatsCallback,accuracy),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       ProgressCallback]

In [50]:
model[0].module.load_state_dict(torch.load(path/'finetuned_enc.pth'))

<All keys matched successfully>

In [51]:
#learn = Learner(model, data, F.cross_entropy, opt_func=adam_opt(), cb_funcs=cbs, splitter=class_splitter)
learn = Learner(model, data, nn.CrossEntropyLoss(), opt_func=adam_opt(), cb_funcs=cbs, splitter=class_splitter)

In [52]:
lr = 1e-2
cbsched = sched_1cycle([lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)

In [53]:
learn.fit(1, cbs=cbsched) # 1 minute

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.34236,0.85116,0.268267,0.8872,01:00


In [54]:
# unfreeze last RNN and train
for p in model[0].module.rnns[-1].parameters(): p.requires_grad_(True)

In [55]:
lr = 5e-3
cbsched = sched_1cycle([lr/2., lr/2., lr/2., lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)

In [56]:
sched_1cycle??

In [57]:
learn.fit(1, cbs=cbsched) # 1 minute

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.271199,0.8868,0.216596,0.91272,01:15


In [58]:
# unfreeze model and train
for p in model[0].parameters(): p.requires_grad_(True)

In [59]:
lr = 1e-3
cbsched = sched_1cycle([lr/8., lr/4., lr/2., lr], mom_start=0.8, mom_mid=0.7, mom_end=0.8)

In [60]:
learn.fit(2, cbs=cbsched) # 3 minutes

epoch,train_loss,train_accuracy,valid_loss,valid_accuracy,time
0,0.220967,0.91176,0.194504,0.92456,01:39
1,0.192296,0.92264,0.19029,0.92564,01:39


Predicting on the padded batch or on the individual unpadded samples gives the same results.

In [61]:
x,y = next(iter(data.valid_dl))

In [62]:
pred_batch = learn.model.eval()(x.cuda())

In [63]:
pred_batch

tensor([[-1.8668,  2.1056],
        [ 1.5000, -1.4432],
        [-0.2132,  0.7959],
        [ 0.3505, -0.1847],
        [-1.2108,  1.8571],
        [-1.8568,  2.1503],
        [-1.9673,  1.7746],
        [-0.4664,  0.4735],
        [-1.4230,  1.6201],
        [-1.5784,  2.1498],
        [ 2.1697, -2.3700],
        [-0.1503,  0.8790],
        [ 0.3477,  0.1478],
        [ 1.5043, -1.3131],
        [ 2.3103, -2.1698],
        [-0.1884,  0.6231],
        [ 2.6010, -3.4679],
        [-0.6307,  1.1816],
        [-1.9308,  2.8115],
        [-0.7908,  1.4284],
        [ 0.8175, -0.4698],
        [ 0.7287, -0.5317],
        [-4.2118,  4.6036],
        [-0.3462,  0.5605],
        [-0.1406,  0.7347],
        [ 2.7930, -2.9855],
        [-2.2191,  2.5226],
        [-3.8778,  4.4286],
        [-0.1234,  0.6817],
        [-3.8765,  4.3171],
        [-0.1342,  0.7933],
        [-0.7702,  1.6737],
        [ 2.2459, -2.6084],
        [ 1.4513, -1.3273],
        [-1.0434,  1.2116],
        [-0.6567,  1

In [67]:
pred_ind = []
for inp in x:
    length = x.size(1) - (inp == 1).long().sum()
    inp = inp[:length]
    pred_ind.append(learn.model.eval()(inp[None].cuda()))

In [68]:
assert near(pred_batch, torch.cat(pred_ind))

In [69]:
!python notebook2script.py 12c_ulmfit.ipynb

Converted 12c_ulmfit.ipynb to exp\nb_12c.py
