In [145]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

## Setup

We're going to download the collected works of Nietzsche to use as our data for this class.

In [2]:
PATH='data/nietzsche/'

In [3]:
get_data("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')
# Read through a context manager so the file handle is closed promptly, and pin
# the encoding so the character vocabulary does not depend on the platform locale.
with open(f'{PATH}nietzsche.txt', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [4]:
print(text[:400])

PREFACE


SUPPOSING that Truth is a woman--what then? Is there not ground
for suspecting that all philosophers, in so far as they have been
dogmatists, have failed to understand women--that the terrible
seriousness and clumsy importunity with which they have usually paid
their addresses to Truth, have been unskilled and unseemly methods for
winning a woman? Certainly she has never allowed herself 


In [5]:
# Distinct characters of the corpus, in sorted order.
chars = sorted(set(text))
# Count one extra slot on top of the characters actually seen in the text.
vocab_size = 1 + len(chars)
print('total chars:', vocab_size)

total chars: 85


Sometimes it's useful to have a zero value in the dataset, e.g. for padding

In [6]:
# Reserve index 0 for a null character. The guard keeps this cell idempotent:
# re-running it must not insert a second "\0" and shift every character index.
if not chars or chars[0] != "\0":
    chars.insert(0, "\0")

''.join(chars[1:-6])

'\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxy'

Map from chars to indices and back again

In [7]:
# Forward lookup: character -> integer index.
char_indices = {ch: pos for pos, ch in enumerate(chars)}
# Reverse lookup: integer index -> character.
indices_char = dict(enumerate(chars))

*idx* will be the data we use from now on - it simply converts all the characters to their index (based on the mapping above)

In [8]:
# Encode the whole corpus as integer indices through the lookup table.
idx = list(map(char_indices.__getitem__, text))

idx[:10]

[40, 42, 29, 30, 25, 27, 29, 1, 1, 1]

In [9]:
''.join(indices_char[i] for i in idx[:70])

'PREFACE\n\n\nSUPPOSING that Truth is a woman--what then? Is there not gro'

## Three char model

### Create inputs

Create four lists, each taking every 3rd character of the text (the step is `cs=3`), starting at the 0th, 1st, 2nd, and 3rd characters respectively

In [10]:
cs=3
# Start offsets of the windows; window k-th element is idx[start + k].
starts = range(0, len(idx)-cs, cs)
# Offsets 0..2 are the three inputs, offset 3 is the character to predict.
c1_dat, c2_dat, c3_dat, c4_dat = [[idx[s+k] for s in starts] for k in range(4)]

Our inputs

In [11]:
# Turn the three input lists into arrays, one per character position.
x1, x2, x3 = (np.stack(col) for col in (c1_dat, c2_dat, c3_dat))

Our output

In [12]:
y = np.stack(c4_dat)

The first 4 inputs and outputs

In [13]:
x1[:4], x2[:4], x3[:4]

(array([40, 30, 29,  1]), array([42, 25,  1, 43]), array([29, 27,  1, 45]))

In [14]:
# Decode the first four (input, input, input, target) index tuples back to characters.
for quad in zip(x1[:4], x2[:4], x3[:4], y[:4]):
    print([indices_char[j] for j in quad])

['P', 'R', 'E', 'F']
['F', 'A', 'C', 'E']
['E', '\n', '\n', '\n']
['\n', 'S', 'U', 'P']


In [15]:
y[:4]

array([30, 29,  1, 40])

In [16]:
x1.shape, y.shape

((200297,), (200297,))

### Create and train model

Pick a size for our hidden state

In [17]:
n_hidden = 256

The number of latent factors to create (i.e. the size of the embedding matrix)

In [18]:
n_fac = 42

In [19]:
class Char3Model(nn.Module):
    """Predict the next character from three input characters.

    The same input layer and the same hidden layer are applied to each of the
    three characters in turn, i.e. this is a recurrent network unrolled by hand.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)

        # The 'green arrow' from our diagram - the layer operation from input to hidden
        self.l_in = nn.Linear(n_fac, n_hidden)

        # The 'orange arrow' from our diagram - the layer operation from hidden to hidden.
        # It maps n_hidden -> n_hidden (a square weight matrix) so that the same
        # layer can be applied again to its own output at every step.
        self.l_hidden = nn.Linear(n_hidden, n_hidden)

        # The 'blue arrow' from our diagram - the layer operation from hidden to output
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, c1, c2, c3):
        """Return log-probabilities over the vocabulary for the character after c1, c2, c3."""
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))

        # Start from an all-zero hidden state with the same size as one input activation.
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h + in1))
        h = F.tanh(self.l_hidden(h + in2))
        h = F.tanh(self.l_hidden(h + in3))

        # Normalise over the vocabulary axis explicitly, as the other models in this
        # notebook do; calling log_softmax without `dim` is deprecated.
        return F.log_softmax(self.l_out(h), dim=-1)

In [20]:
# Stack the three input columns side by side so each row is one training example.
# NOTE(review): the second argument appears to be the validation row indices
# (later cells pass get_cv_idxs(...) in the same position). With [-1] the
# validation set is presumably a single row, so the val_loss reported for this
# model would not be a meaningful estimate - confirm.
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [21]:
m = Char3Model(vocab_size, n_fac).cuda()

In [22]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [23]:
t

Variable containing:
-4.3779 -4.6781 -4.1744  ...  -4.3739 -4.3233 -4.5350
-4.5156 -4.5990 -4.4144  ...  -4.2717 -4.1752 -4.4010
-4.3515 -4.4772 -4.0797  ...  -4.4186 -4.6633 -4.3745
          ...             ⋱             ...          
-4.3389 -4.4867 -4.1964  ...  -4.3225 -4.4095 -4.5396
-4.3831 -4.6158 -4.1612  ...  -4.3198 -4.2612 -4.4629
-4.5942 -4.4059 -4.2882  ...  -4.1692 -4.2491 -4.4723
[torch.cuda.FloatTensor of size 512x85 (GPU 0)]

In [24]:
opt = optim.Adam(m.parameters(), 1e-2)

In [25]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.099717   0.530562  



[array([0.53056])]

In [26]:
set_lrs(opt, 0.001)

In [27]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.822495   0.482929  



[array([0.48293])]

### Test model

In [28]:
def get_next(inp):
    """Return the character that the global model `m` scores highest after `inp`.

    `inp` is a string; each character is looked up in `char_indices` and the
    resulting indices are unpacked as separate positional arguments of `m`.
    """
    codes = np.array([char_indices[ch] for ch in inp])
    log_probs = m(*VV(T(codes)))
    best = np.argmax(to_np(log_probs))
    return chars[best]

In [29]:
get_next('y. ')

'T'

In [30]:
get_next('ppl')

'e'

In [31]:
get_next(' th')

'e'

In [32]:
get_next('and')

' '

## Our first RNN!

### Create inputs

This is the size of our unrolled RNN.

In [33]:
cs=8

For every position in the text, take the 8 consecutive characters starting there (overlapping windows that advance one character at a time). These will be the 8 inputs to our model.

In [34]:
# One window of cs consecutive indices per start position j (windows overlap).
c_in_dat = [idx[j:j+cs] for j in range(len(idx)-cs)]

Then create a list of the next character in each of these series. This will be the labels for our model.

In [35]:
# The label of the window starting at j is the index at j+cs, i.e. everything from cs onwards.
c_out_dat = idx[cs:]

In [36]:
xs = np.stack(c_in_dat, axis=0)

In [37]:
xs.shape

(600885, 8)

In [38]:
y = np.stack(c_out_dat)

So each row below is one series of 8 characters from the text.

In [39]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [42, 29, 30, 25, 27, 29,  1,  1],
       [29, 30, 25, 27, 29,  1,  1,  1],
       [30, 25, 27, 29,  1,  1,  1, 43],
       [25, 27, 29,  1,  1,  1, 43, 45],
       [27, 29,  1,  1,  1, 43, 45, 40],
       [29,  1,  1,  1, 43, 45, 40, 40],
       [ 1,  1,  1, 43, 45, 40, 40, 39]])

...and this is the next character after each sequence.

In [40]:
y[:cs]

array([ 1,  1, 43, 45, 40, 40, 39, 43])

### Create and train model

In [41]:
val_idx = get_cv_idxs(len(idx)-cs-1)

In [42]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [43]:
class CharLoopModel(nn.Module):
    """Hand-written RNN: the same layers are applied to every input character in a loop.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        # The hidden state starts at zero for every call.
        state = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_idx in cs:
            embedded = self.e(char_idx)
            step_in = F.relu(self.l_in(embedded))
            # The new input is added to the running state before the hidden layer.
            state = F.tanh(self.l_hidden(state + step_in))

        logits = self.l_out(state)
        return F.log_softmax(logits, dim=-1)

In [44]:
m = CharLoopModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [45]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      2.026757   2.005648  



[array([2.00565])]

In [46]:
set_lrs(opt, 0.001)

In [47]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.711572   1.711258  



[array([1.71126])]

In [48]:
class CharLoopConcatModel(nn.Module):
    """Looped character RNN that concatenates the hidden state with each input embedding.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        # The input layer receives the hidden state and the embedding side by side.
        self.l_in = nn.Linear(n_fac + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        state = V(torch.zeros(batch_size, n_hidden).cuda())
        for char_idx in cs:
            # Join state and embedding along dim 1 instead of adding them.
            joined = torch.cat((state, self.e(char_idx)), 1)
            state = F.tanh(self.l_hidden(F.relu(self.l_in(joined))))

        return F.log_softmax(self.l_out(state), dim=-1)

In [49]:
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [50]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [51]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.853491   1.832071  



[array([1.83207])]

In [52]:
set_lrs(opt, 1e-4)

In [53]:
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.745975   1.749316  



[array([1.74932])]

### Test model

In [54]:
def get_next(inp):
    """Return the most likely next character after the string `inp`.

    Uses whatever model the global name `m` currently refers to.
    """
    encoded = T(np.array([char_indices[ch] for ch in inp]))
    scores = to_np(m(*VV(encoded)))
    return chars[np.argmax(scores)]

In [55]:
get_next('for thos')

'e'

In [56]:
get_next('part of ')

't'

In [57]:
get_next('queens a')

'n'

## RNN with pytorch

In [58]:
class CharRnn(nn.Module):
    """Next-character model that delegates the recurrent loop to `nn.RNN`.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for the character following the inputs `cs`."""
        batch_size = cs[0].size(0)
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        # Stack the per-character index tensors into one tensor before embedding.
        embedded = self.e(torch.stack(cs))
        activations, _ = self.rnn(embedded, h0)

        # The RNN returns activations for every step; only the last one is scored.
        last_step = activations[-1]
        return F.log_softmax(self.l_out(last_step), dim=-1)

In [59]:
m = CharRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [60]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [61]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([8, 512, 42])

In [62]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([8, 512, 256]), torch.Size([1, 512, 256]))

In [63]:
t = m(*V(xs)); t.size()

torch.Size([512, 85])

In [64]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.862186   1.84228   
    1      1.671269   1.663392                               
    2      1.583219   1.589438                               
    3      1.516936   1.547038                               



[array([1.54704])]

In [65]:
set_lrs(opt, 1e-4)

In [66]:
fit(m, md, 2, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                               
    0      1.462939   1.506864  
    1      1.466983   1.502253                               



[array([1.50225])]

### Test model

In [67]:
def get_next(inp):
    """Return the highest-scoring next character for the string `inp`.

    The prediction comes from the model currently bound to the global `m`.
    """
    char_codes = [char_indices[ch] for ch in inp]
    model_out = m(*VV(T(np.array(char_codes))))
    winner = np.argmax(to_np(model_out))
    return chars[winner]

In [68]:
get_next('for thos')

'e'

In [69]:
def get_next_n(inp, n):
    """Extend `inp` by `n` predicted characters and return the full string.

    Each prediction comes from `get_next` on a sliding window: after every step
    the window drops its first character and gains the newly predicted one.
    """
    pieces = [inp]
    window = inp
    for _ in range(n):
        nxt = get_next(window)
        pieces.append(nxt)
        window = window[1:] + nxt
    return ''.join(pieces)

In [70]:
get_next_n('for thos', 40)

'for those of the same to the same to the same to'

## Multi-output model

### Setup

Let's take non-overlapping sets of characters this time

In [71]:
# Non-overlapping windows: start positions advance by cs, each window holds cs indices.
c_in_dat = [idx[j:j+cs] for j in range(0, len(idx)-cs-1, cs)]

Then create the exact same thing, offset by 1, as our labels

In [72]:
# Same windows shifted one position to the right: the target for each input character.
c_out_dat = [idx[j:j+cs] for j in range(1, len(idx)-cs, cs)]

In [73]:
xs = np.stack(c_in_dat)
xs.shape

(75111, 8)

In [74]:
ys = np.stack(c_out_dat)
ys.shape

(75111, 8)

In [75]:
xs[:cs,:cs]

array([[40, 42, 29, 30, 25, 27, 29,  1],
       [ 1,  1, 43, 45, 40, 40, 39, 43],
       [33, 38, 31,  2, 73, 61, 54, 73],
       [ 2, 44, 71, 74, 73, 61,  2, 62],
       [72,  2, 54,  2, 76, 68, 66, 54],
       [67,  9,  9, 76, 61, 54, 73,  2],
       [73, 61, 58, 67, 24,  2, 33, 72],
       [ 2, 73, 61, 58, 71, 58,  2, 67]])

In [76]:
ys[:cs,:cs]

array([[42, 29, 30, 25, 27, 29,  1,  1],
       [ 1, 43, 45, 40, 40, 39, 43, 33],
       [38, 31,  2, 73, 61, 54, 73,  2],
       [44, 71, 74, 73, 61,  2, 62, 72],
       [ 2, 54,  2, 76, 68, 66, 54, 67],
       [ 9,  9, 76, 61, 54, 73,  2, 73],
       [61, 58, 67, 24,  2, 33, 72,  2],
       [73, 61, 58, 71, 58,  2, 67, 68]])

### Create and train model

In [77]:
# NOTE(review): xs holds one row per non-overlapping sequence here, so the
# "-cs-1" looks like a leftover from the earlier get_cv_idxs(len(idx)-cs-1) call;
# it only makes the number passed to get_cv_idxs smaller by cs+1 - confirm
# whether len(xs) was intended.
val_idx = get_cv_idxs(len(xs)-cs-1)

In [78]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [79]:
class CharSeqRnn(nn.Module):
    """RNN that emits a prediction after every input character, not just the last.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

    def forward(self, *cs):
        """Return log-probabilities for every step of the stacked inputs `cs`."""
        batch_size = cs[0].size(0)
        h0 = V(torch.zeros(1, batch_size, n_hidden))
        embedded = self.e(torch.stack(cs))
        activations, _ = self.rnn(embedded, h0)
        # Unlike CharRnn, the output layer is applied to all steps.
        logits = self.l_out(activations)
        return F.log_softmax(logits, dim=-1)

In [80]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [81]:
it = iter(md.trn_dl)
*xst,yt = next(it)

In [82]:
yt


   64    72     2  ...     68    67     2
   62    67     2  ...     58     2    73
   65    59     8  ...     67    65    78
       ...          ⋱          ...       
   73    61    58  ...     57    58    56
   74    66    55  ...      2    54    66
   65    54    71  ...      2    62    72
[torch.cuda.LongTensor of size 512x8 (GPU 0)]

In [83]:
# F.nll_loss is called on a flat batch, so the sequence outputs and targets are flattened first.
def nll_loss_seq(inp, targ):
    """Negative log-likelihood over every step of every sequence.

    Args:
        inp: 3-D tensor of log-probabilities; its sizes are unpacked as
            (sequence length, batch size, number of classes).
        targ: 2-D tensor of class indices whose first two axes are swapped
            relative to `inp`; it is transposed before flattening so that its
            order matches the flattened `inp`.

    Returns:
        The result of `F.nll_loss` on the flattened tensors.
    """
    seq_len, batch_size, n_classes = inp.size()
    flat_targ = targ.transpose(0, 1).contiguous().view(-1)
    flat_inp = inp.view(-1, n_classes)
    return F.nll_loss(flat_inp, flat_targ)

In [84]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.580465   2.391005  
    1      2.273378   2.192098                              
    2      2.12944    2.080841                              
    3      2.041413   2.008582                              



[array([2.00858])]

In [85]:
set_lrs(opt, 1e-4)

In [86]:
fit(m, md, 1, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.991227   1.994015  



[array([1.99402])]

### Identity init!

In [87]:
m = CharSeqRnn(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [88]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [89]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.385673   2.219812  
    1      2.124478   2.067521                              
    2      2.023408   1.997629                              
    3      1.974427   1.972308                              



[array([1.97231])]

In [90]:
set_lrs(opt, 1e-3)

In [91]:
fit(m, md, 4, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.885166   1.893995  
    1      1.873737   1.885771                              
    2      1.867197   1.882912                              
    3      1.859289   1.877603                              



[array([1.8776])]

## Stateful model

### Setup

In [146]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

PATH='data/nietzsche/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

In [131]:
# Split the corpus 80/20 and write the two halves to the trn/ and val/ folders.
len_trn = int(0.8 * len(text))
print('Total length:', len(text))

# Create the target folders up front so the cell also works on a fresh data directory.
os.makedirs(TRN, exist_ok=True)
os.makedirs(VAL, exist_ok=True)

text_trn = text[:len_trn]
print('Train length:', len(text_trn))
# Context managers make sure each file is flushed and closed even if the write
# fails; the explicit encoding keeps the non-ASCII characters portable.
with open(f'{TRN}train.txt', 'w', encoding='utf-8') as f:
    f.write(text_trn)

text_val = text[len_trn:]
print('Val length:', len(text_val))
with open(f'{VAL}val.txt', 'w', encoding='utf-8') as f:
    f.write(text_val)

In [132]:
# Note: the previous cell writes the two files used for language modelling:
# - trn/train.txt (first 80% of nietzsche.txt)
# - val/val.txt (last 20% of nietzsche.txt)
# A student who wants to practice shell skills can prepare the same split by hand instead.

%ls {PATH}

[0m[01;34mmodels[0m/  nietzsche.txt  [01;34mtrn[0m/  [01;34mval[0m/


In [133]:
%ls {PATH}trn

train.txt


In [134]:
list('abc')

['a', 'b', 'c']

In [135]:
# Character-level field: tokenize=list splits a string into single characters
# (as list('abc') shows above); lower=True presumably lower-cases the text - confirm.
TEXT = data.Field(lower=True, tokenize=list)
# Settings for the stateful models below: batch size, sequence length per batch (bptt),
# embedding width and hidden width.
bs=64; bptt=8; n_fac=42; n_hidden=256

# The validation folder is passed for both the validation and the test split.
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

# Inspect the loader/dataset sizes; md.nt is used below as the vocabulary size of the models.
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(922, 55, 1, 472943)

In [140]:
# Only after calling LanguageModelData...
print(TEXT.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7f895f23aae8>, {'<unk>': 0, '<pad>': 1, ' ': 2, 'e': 3, 't': 4, 'i': 5, 'a': 6, 'o': 7, 'n': 8, 's': 9, 'r': 10, 'h': 11, 'l': 12, 'd': 13, 'c': 14, 'u': 15, 'f': 16, 'm': 17, 'p': 18, 'g': 19, ',': 20, 'y': 21, 'w': 22, 'b': 23, 'v': 24, '-': 25, '.': 26, '"': 27, 'k': 28, 'x': 29, ';': 30, ':': 31, 'q': 32, 'j': 33, '!': 34, '?': 35, '(': 36, ')': 37, "'": 38, 'z': 39, '1': 40, '2': 41, '=': 42, '_': 43, '3': 44, '[': 45, ']': 46, '4': 47, '5': 48, '6': 49, '8': 50, '7': 51, '9': 52, '0': 53, 'ä': 54, 'æ': 0, 'ë': 0, '<eos>': 0, 'é': 0})


### RNN

In [141]:
class CharSeqStatefulRnn(nn.Module):
    """Sequence RNN that keeps its hidden state between calls to `forward`.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        # Initialise nn.Module before assigning any attribute, matching the
        # other model classes in this notebook.
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities reshaped to (-1, vocab_size) for the batch `cs`."""
        bs = cs[0].size(0)
        if self.h.size(1) != bs: self.init_hidden(bs)  # For the last minibatch size (might be less than original bs)
        outp, h = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(h)  # Get rid of gradient history (but not current state) at the end of each minibatch
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [101]:
??repackage_var

In [142]:
# Allocate the initial hidden state with the loader's batch size (bs) instead of an
# unrelated literal, so forward() does not have to rebuild it on the first batch.
m = CharSeqStatefulRnn(md.nt, n_fac, bs).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [143]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.883695   1.862004  
    1      1.699903   1.722749                                
    2      1.610066   1.642219                                
    3      1.549901   1.609302                                


[array([1.6093])]

In [104]:
set_lrs(opt, 1e-4)

fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.495288   1.567378  
    1      1.488274   1.560906                                
    2      1.484037   1.557597                                
    3      1.482368   1.553953                                



[array([1.55395])]

### RNN loop

In [105]:
# From the pytorch source

# def RNNCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
#     return F.tanh(F.linear(input, w_ih, b_ih) + F.linear(hidden, w_hh, b_hh))

In [106]:
class CharSeqStatefulRnn2(nn.Module):
    """Stateful sequence RNN with the time loop written out using `nn.RNNCell`.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNNCell(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities reshaped to (-1, vocab_size) for the batch `cs`."""
        bs = cs[0].size(0)
        # nn.RNNCell works on a 2-D (batch, hidden) state, so the batch axis is dim 0.
        if self.h.size(0) != bs: self.init_hidden(bs)
        outp = []
        o = self.h
        for c in cs:
            # One cell step per element along the first axis of cs.
            o = self.rnn(self.e(c), o)
            outp.append(o)
        outp = self.l_out(torch.stack(outp))
        # Keep the final state for the next call, detached via repackage_var.
        self.h = repackage_var(o)
        return F.log_softmax(outp, dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros of shape (bs, n_hidden)."""
        self.h = V(torch.zeros(bs, n_hidden))

In [107]:
# Allocate the initial hidden state with the loader's batch size (bs) instead of an
# unrelated literal, so forward() does not have to rebuild it on the first batch.
m = CharSeqStatefulRnn2(md.nt, n_fac, bs).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [108]:
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.886529   1.88835   
    1      1.700856   1.724678                                
    2      1.611395   1.650444                                
    3      1.555188   1.613619                                



[array([1.61362])]

### GRU

In [148]:
class CharSeqStatefulGRU(nn.Module):
    """Stateful sequence model built on `nn.GRU`.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial hidden state.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs):
        super().__init__()
        self.vocab_size = vocab_size
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities reshaped to (-1, vocab_size) for the batch `cs`."""
        batch_size = cs[0].size(0)
        # Rebuild the state when the incoming batch size differs from the stored one.
        if batch_size != self.h.size(1):
            self.init_hidden(batch_size)
        activations, new_h = self.rnn(self.e(cs), self.h)
        # Carry the state over to the next call after passing it through repackage_var.
        self.h = repackage_var(new_h)
        log_probs = F.log_softmax(self.l_out(activations), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored hidden state to zeros for a batch of size `bs`."""
        self.h = V(torch.zeros(1, bs, n_hidden))

In [110]:
# # From the pytorch source code - for reference

# def GRUCell(input, hidden, w_ih, w_hh, b_ih, b_hh):
#     gi = F.linear(input, w_ih, b_ih)
#     gh = F.linear(hidden, w_hh, b_hh)
#     i_r, i_i, i_n = gi.chunk(3, 1)
#     h_r, h_i, h_n = gh.chunk(3, 1)

#     resetgate = F.sigmoid(i_r + h_r)
#     inputgate = F.sigmoid(i_i + h_i)
#     newgate = F.tanh(i_n + resetgate * h_n)
#     return newgate + inputgate * (hidden - newgate)

In [149]:
# Allocate the initial hidden state with the loader's batch size (bs) instead of an
# unrelated literal, so forward() does not have to rebuild it on the first batch.
m = CharSeqStatefulGRU(md.nt, n_fac, bs).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

In [150]:
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.762655   1.7576    
    1      1.57734    1.602599                                
    2      1.482431   1.534302                                
    3      1.431464   1.509331                                
    4      1.390271   1.475652                                
    5      1.363522   1.475028                                


[array([1.47503])]

In [151]:
set_lrs(opt, 1e-4)

In [152]:
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.278715   1.434132  
    1      1.278393   1.430864                                
    2      1.271969   1.429157                                


[array([1.42916])]

### Putting it all together: LSTM

In [153]:
from fastai import sgdr

n_hidden=512

In [154]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful sequence model built on a multi-layer `nn.LSTM` with dropout 0.5.

    Args:
        vocab_size: number of rows of the embedding and size of the output layer.
        n_fac: width of each character embedding.
        bs: batch size used to allocate the initial state.
        nl: number of LSTM layers.

    The hidden width is read from the module-level `n_hidden`.
    """
    def __init__(self, vocab_size, n_fac, bs, nl):
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, cs):
        """Return log-probabilities reshaped to (-1, vocab_size) for the batch `cs`."""
        batch_size = cs[0].size(0)
        # The state is a pair of tensors; the first one is used for the size check.
        if batch_size != self.h[0].size(1):
            self.init_hidden(batch_size)
        activations, new_state = self.rnn(self.e(cs), self.h)
        self.h = repackage_var(new_state)
        log_probs = F.log_softmax(self.l_out(activations), dim=-1)
        return log_probs.view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the stored state to two zero tensors of shape (nl, bs, n_hidden)."""
        state_shape = (self.nl, bs, n_hidden)
        self.h = (V(torch.zeros(*state_shape)),
                  V(torch.zeros(*state_shape)))

In [164]:
# Allocate the initial state with the loader's batch size (bs) instead of an
# unrelated literal, so forward() does not have to rebuild it on the first batch.
m = CharSeqStatefulLSTM(md.nt, n_fac, bs, 2).cuda()  # Standard pytorch model
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-4)  # Layer optimizer from fastai.

# All of the mechanics from fastai assume one has a LayerOptimizer.

In [165]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [166]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.948555   1.879655  
    1      1.854967   1.818361                                


[array([1.81836])]

In [167]:
# See Lesson 7, 55:25.
def on_end(sched, cycle):
    """Save the model under models/cyc_<cycle>; passed to CosAnneal as on_cycle_end."""
    return save_model(m, f'{PATH}models/cyc_{cycle}')

n_epochs = 2**4-1
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, n_epochs, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.67469    1.622506  
    1      1.753407   1.691717                                
    2      1.609007   1.567153                                
    3      1.78146    1.711956                                
    4      1.720168   1.671705                                
    5      1.630225   1.586589                                
    6      1.549742   1.518816                                
    7      1.754562   1.708689                                
    8      1.74929    1.696156                                
    9      1.722998   1.670317                                
    10     1.683678   1.627788                                
    11     1.639892   1.586888                                
    12     1.58788    1.552399                                
    13     1.537009   1.506951                                
    14     1.498625   1.475307                                


[array([1.47531])]

In [177]:
def on_end(sched, cycle):
    """Save the model under models/cyc_<cycle>; passed to CosAnneal as on_cycle_end."""
    return save_model(m, f'{PATH}models/cyc_{cycle}')

n_epochs = 2**6-1
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, n_epochs, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.241      1.385329  
    1      1.24281    1.385308                                
    2      1.243495   1.385404                                
    3      1.237667   1.385564                                
    4      1.243355   1.385306                                
    5      1.236939   1.385293                                
    6      1.244977   1.385201                                
    7      1.234226   1.385606                                
    8      1.244254   1.385199                                
    9      1.241558   1.385321                                
    10     1.240947   1.385372                                
    11     1.241254   1.385425                                
    12     1.246537   1.385299                                
    13     1.248266   1.385795                                
    14     1.234539   1.38539                                 
    15     1.23925    

KeyboardInterrupt: 

### Test

In [178]:
def get_next(inp):
    """Sample the next character after the string `inp` from the global model `m`.

    Unlike the earlier argmax-based versions, the character is drawn at random
    with `torch.multinomial` from the exponentiated last row of the model output.
    """
    idxs = TEXT.numericalize(inp)
    log_probs = m(VV(idxs.transpose(0,1)))
    # exp() turns the log-probabilities of the last row into sampling weights.
    sampled = torch.multinomial(log_probs[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [179]:
get_next('for thos')

' '

In [180]:
def get_next_n(inp, n):
    """Return `inp` followed by `n` characters produced one at a time by `get_next`.

    The context handed to `get_next` keeps a constant length: after each step
    its first character is dropped and the new character is appended.
    """
    context = inp
    produced = []
    for _ in range(n):
        ch = get_next(context)
        produced.append(ch)
        context = context[1:] + ch
    return inp + ''.join(produced)

In [181]:
print(get_next_n('for thos', 400))

for those man--i know to -me woman will toon upressed to an sole philosopition, until to be that is, induce to apbearen--wish no ddowle hypolitein--wishy philosop the rule, twrithing--it there grawe the promoped' pleanlyand even (yet true!-?i have to be taste artand look, hou-means of comprowthers--presents, might, and goethe remannemses, on the faith sneeders; id means of the well on hegedes: not here an


# TO DO: Ricardo Arjona