In [1]:
## All imports needed
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import urllib.request
import numpy as np
import pandas as pd
import torch
from torch import nn

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

from sklearn.metrics import f1_score, accuracy_score

# Generating new Nietsche text

This is primarily just a notebook to implement myself Lesson 6 of the FastAI Part 1 Deep Learning course. https://github.com/fastai/fastai/blob/master/courses/dl1/lesson6-rnn.ipynb

First up we need to get hold of the data - the entire collected works of Nietzsche:

In [2]:
PATH = '../data/nietzsche/'

os.makedirs(PATH, exist_ok=True)
urllib.request.urlretrieve("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

text = open(f'{PATH}nietzsche.txt').read()

## Data exploration

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1

print('corpus length:', len(text))
print('total chars:', vocab_size, ':', chars)

corpus length: 600893
total chars: 85 : ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'ä', 'æ', 'é', 'ë']


Rather than use characters for the model - we will use numbers. Ie "HELLO" could become `[8,5,12,12,15]`. So we map every charcater to an index 0..84


In [4]:
chars_to_indexes = {i: x for x,i in enumerate(chars)}
indexes_to_chars = {x: i for x,i in enumerate(chars)}

print(chars_to_indexes)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, '[': 50, ']': 51, '_': 52, 'a': 53, 'b': 54, 'c': 55, 'd': 56, 'e': 57, 'f': 58, 'g': 59, 'h': 60, 'i': 61, 'j': 62, 'k': 63, 'l': 64, 'm': 65, 'n': 66, 'o': 67, 'p': 68, 'q': 69, 'r': 70, 's': 71, 't': 72, 'u': 73, 'v': 74, 'w': 75, 'x': 76, 'y': 77, 'z': 78, 'Æ': 79, 'ä': 80, 'æ': 81, 'é': 82, 'ë': 83}


Finally, convert the entire corpus to indexes to use as the dataset

In [5]:
idxs = [chars_to_indexes[char] for char in text]

print(idxs[400:500])
print(''.join([indexes_to_chars[idx] for idx in idxs[400:500]]))

[72, 67, 1, 54, 57, 1, 75, 67, 66, 21, 1, 53, 66, 56, 0, 53, 72, 1, 68, 70, 57, 71, 57, 66, 72, 1, 57, 74, 57, 70, 77, 1, 63, 61, 66, 56, 1, 67, 58, 1, 56, 67, 59, 65, 53, 1, 71, 72, 53, 66, 56, 71, 1, 75, 61, 72, 60, 1, 71, 53, 56, 1, 53, 66, 56, 1, 56, 61, 71, 55, 67, 73, 70, 53, 59, 57, 56, 1, 65, 61, 57, 66, 8, 8, 32, 29, 7, 0, 61, 66, 56, 57, 57, 56, 7, 1, 61, 72, 1, 71]
to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,
indeed, it s


## 3-character model

So we're going to start off with looking at just the last 3 characters and trying to predict the 4th.

To begin, create a 4 arrays, each one offset one from the last (3 for the inputs, and one for y):

In [6]:
cs = 3

c1_dat = [idxs[i] for i in range(0, len(idxs)-cs, cs)]
c2_dat = [idxs[i+1] for i in range(0, len(idxs)-cs, cs)]
c3_dat = [idxs[i+2] for i in range(0, len(idxs)-cs, cs)]
c4_dat = [idxs[i+3] for i in range(0, len(idxs)-cs, cs)]

print(idxs[:20])
print(c1_dat[:20])
print(c2_dat[:20])
print(c3_dat[:20])
print(c4_dat[:20])


[39, 41, 28, 29, 24, 26, 28, 0, 0, 0, 42, 44, 39, 39, 38, 42, 32, 37, 30, 1]
[39, 29, 28, 0, 39, 42, 30, 60, 1, 73, 1, 1, 75, 53, 8, 53, 72, 66, 32, 72]
[41, 24, 0, 42, 39, 32, 1, 53, 43, 72, 61, 53, 67, 66, 75, 72, 60, 23, 71, 60]
[28, 26, 0, 44, 38, 37, 72, 72, 70, 60, 71, 1, 65, 8, 60, 1, 57, 1, 1, 57]
[29, 28, 0, 39, 42, 30, 60, 1, 73, 1, 1, 75, 53, 8, 53, 72, 66, 32, 72, 70]


You can see from the above that this is like taking every four characters and putting them in columns. The first three characters will be the inputs to the neural net and the last is the output we're optimising for.

In [8]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

y = np.stack(c4_dat)

NameError: name 'c1_dat' is not defined

Now the PyTorch model:

In [7]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        
        # The four layers of our model
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))
        

In [6]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

vocab_size = len(chars)+1
n_fac = 42 # embedding matrix width
n_hidden = 256 # number hidden units

m = Char3Model(vocab_size, n_fac).cuda()

NameError: name 'x1' is not defined

In [10]:
# Set up iterator to load data in batches
it = iter(md.trn_dl)
*xs,ys = next(it)
t = m(*V(xs))

In [11]:
# Set up the optimizer
opt = optim.Adam(m.parameters(), 1e-2)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.102854   2.209631  
    1      2.028126   0.641864                              
    2      2.010438   0.189988                              



[array([ 0.18999])]

In [12]:
set_lrs(opt, 1e-3)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.784426   0.209058  
    1      1.738894   0.217159                              
    2      1.729806   0.322268                              



[array([ 0.32227])]

# Test out the model

Ok so we trained a model... made some graphs appear etc. But how do we use it?

`m(1,2,3)` - throws an error, because pyTorch wants each argument to be turned into pyTorch variable. 

`m(V(1),V(2),V(3))` gives us a `[torch.cuda.FloatTensor of size 1x85 (GPU 0)]` - getting better

Let's try and see that as a familiar numpy ndarray:

`m(V(1),V(2),V(3)).data.numpy()` - another error, this time because the Tensor is on the GPU memory, and needs to be moved into system RAM before we can use it

`m(V(1),V(2),V(3)).cpu().data.numpy()` - tada!

In [13]:
m(V(1),V(2),V(3)).cpu().data.numpy()

array([[ -3.29849,  -2.36959,  -5.86239,  -5.60455, -11.29167, -10.35843,  -7.01992,  -6.12142,  -1.83529,
         -5.39128,  -8.92203,  -9.23298,  -6.56709,  -9.06264,  -8.02441,  -9.74576, -11.53138,  -7.5312 ,
         -7.07608,  -7.60975,  -8.25402,  -6.13394,  -9.88077,  -7.8184 ,  -4.78491,  -5.76463,  -6.19066,
         -6.91619,  -5.15337,  -3.95679,  -6.2584 ,  -4.86974,  -2.53861,  -5.28917,  -4.32854,  -3.53656,
         -1.6944 ,  -4.60295,  -4.51753,  -3.49602,  -6.47315,  -7.91241,  -3.81931,  -1.70739,  -7.24739,
         -5.70486,  -4.0613 ,  -9.3907 ,  -6.80647, -10.23992,  -6.1278 , -12.37789, -10.94001,  -6.6625 ,
         -7.03732,  -5.12533,  -6.60228,  -8.08359,  -7.62465,  -5.59865,  -7.42388,  -7.48192,  -7.37621,
         -9.43708,  -8.20658,  -4.77836,  -5.8834 ,  -8.34979,  -6.83678,  -7.02198,  -9.05406,  -7.67219,
         -4.35888,  -9.24077,  -8.3455 ,  -5.21575, -12.29481, -12.27484, -14.51767, -13.27466, -15.02666,
        -13.4426 , -16.30942, -13.968

So the above are log likelihoods that the next letter is in one of our 85 vocab letters. Let's wrap this into a little function to make it easier to work with using a couple of FastsAI helper functions to make our lives a little smoother.

In [8]:
# Get the next n letters given an input block
def get_next(inp):
    idxs = T(np.array([chars_to_indexes[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

def get_next_n(inp, n):
    for i in range(0, n):
        inp += get_next(inp[-3:])
    
    return inp
        

In [15]:
get_next_n('Children ', 400)

'Children the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some a'

Not the most compelling Nietzsche copy I have to admit. But it's a start!

# Recurrent Neural Network

Start by creating the data, this time using a rolling window of 8 characters across the entire sequence. We'll also create our `y`, with the next character after each of the windows.

In [7]:
input_length = 8

inputs = [[idxs[i+j] for i in range(input_length)] for j in range(len(idxs) - input_length)]
y = [idxs[j+input_length] for j in range(len(idxs) - input_length)]

X = np.stack(inputs, axis = 0)
y = np.stack(y)

print(X, y)

[[39 41 28 ..., 26 28  0]
 [41 28 29 ..., 28  0  0]
 [28 29 24 ...,  0  0  0]
 ..., 
 [71 61 66 ..., 64 66 57]
 [61 66 58 ..., 66 57 71]
 [66 58 73 ..., 57 71 71]] [ 0  0 42 ..., 71 71  9]


Our modified RNN class

In [18]:
class CharLoopModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
    
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h + inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)
    

Let's take it for a spin!

In [19]:
md = ColumnarModelData.from_arrays('.', [-1], X, y, bs=512)
m = CharLoopModel(vocab_size, n_fac=42).cuda()
opt = optim.Adam(m.parameters(), 1e-2)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.107797   0.843655  
    1      2.04132    1.395311                              
    2      2.007522   0.313711                              



[array([ 0.31371])]

In [20]:
get_next_n("bla bla", 400)

'bla blation the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the his the'

Still not so great... try more training!

In [27]:
set_lrs(opt, 1e-3)
fit(m, md, 5, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.641391   0.287052  
    1      1.643354   0.291294                              
    2      1.631389   0.443712                              
    3      1.638758   0.409489                              
    4      1.622094   0.360929                              



[array([ 0.36093])]

In [26]:
get_next_n("bla bla", 400)

'bla blace of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the spirity of the '

Well at least it's repeating five words now. That's a new record!

Now we're going to try concatenating the activations of the next characters to the previous activations, instead of adding them.

In [22]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac + n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        
        for c in cs:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [23]:
n_fac = 42
n_hidden = 256

md = ColumnarModelData.from_arrays('.', [-1], X, y, bs=512)
m = CharLoopConcatModel(vocab_size, n_fac).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [25]:
fit(m, md, 1, opt, F.nll_loss)
set_lrs(opt, 1e-4)
fit(m, md, 1, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.563395   3.046283  



HBox(children=(IntProgress(value=0, description='Epoch', max=1), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.460188   2.628501  



[array([ 2.6285])]

In [27]:
get_next_n("The ", 400)

'The and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to and to a'

### Whole sequence RNN

Now we move onto predicting the sequence from the one which is offset one to the left of it. Ie `[n..n+sl-1]` from `[n-1..n+sl-2]`.

In [96]:
class WholeSequenceRNN(nn.Module):
    def __init__(self, vocab_size, n_fac, n_hidden):
        super().__init__()
        
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        
        inp = self.e(torch.stack(cs))        
        outp, h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp), dim=-1)

In [97]:
sl = 8 # sequence_length

# Split our array into sequences of length sl
in_data = [[idxs[j+i] for i in range(0,sl)] for j in range(0, len(idxs) - sl - 1, sl)]

# The y values are just the same sequences shifted along one value
out_data = [[idxs[j+i+1] for i in range(0,sl)] for j in range(0, len(idxs) - sl - 1, sl)]

X = np.stack(in_data)
Y = np.stack(out_data)

print(X.shape)
print(Y.shape)

(75111, 8)
(75111, 8)


In [105]:
## Custom loss function for sequence to sequence
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)
    
## Train the model
val_idxs = get_cv_idxs(len(X)-sl-1)
md = ColumnarModelData.from_arrays('.', val_idxs, X, Y, bs=512)

vocab_size = len(chars) + 1
n_fac = 42
n_hidden = 256
m = WholeSequenceRNN(vocab_size, n_fac, n_hidden).cuda()

opt = optim.Adam(m.parameters(), 1e-3)

it = iter(md.trn_dl)
*xst,yt = next(it)

fit(m, md, 6, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.596586   2.410337  
    1      2.29198    2.199189                              
    2      2.136583   2.082756                              
    3      2.041392   2.010933                              
    4      1.978309   1.959068                              
    5      1.933227   1.921501                              



[array([ 1.9215])]

In [106]:
set_lrs(opt, 1e-4)
fit(m, md, 6, opt, nll_loss_seq)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                             
    0      1.895579   1.907868  
    1      1.889812   1.903903                              
    2      1.886362   1.900541                              
    3      1.880677   1.897424                              
    4      1.876248   1.894234                              
    5      1.873578   1.891043                              



[array([ 1.89104])]

Now that our model is outputting sequences instead of a single character, we must amend our `get_next_n` function

In [108]:
inp = 'test'
idxs = T(np.array([chars_to_indexes[c] for c in inp]))

# Get the next n letters given an input block
def get_next(inp):
    idxs = T(np.array([chars_to_indexes[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

def get_next_n(inp, n):
    for i in range(0, n):
        inp += get_next(inp[-3:])
    
    return inp

get_next_n('Testing', 400)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'ä', 'æ', 'é', 'ë']


IndexError: list index out of range

In [73]:
arr = [np.random.randn(2, 3) for _ in range(4)]
print(arr)

print(np.stack(arr, axis=0))
print(np.stack(arr, axis=0).shape)

print(np.stack(arr, axis=1))
print(np.stack(arr, axis=1).shape)

print(np.stack(arr, axis=2))
print(np.stack(arr, axis=2).shape)

[array([[ 0.74882, -0.261  ,  0.38523],
       [-2.34712,  0.14807, -0.2576 ]]), array([[-0.31029,  1.4766 , -0.789  ],
       [-0.09989, -0.12712, -0.27415]]), array([[ 0.12574,  1.18033,  1.25824],
       [-0.38231, -0.33968, -1.48819]]), array([[ 0.56883,  0.65055,  1.24903],
       [ 1.42539,  1.38552, -1.32857]])]
[[[ 0.74882 -0.261    0.38523]
  [-2.34712  0.14807 -0.2576 ]]

 [[-0.31029  1.4766  -0.789  ]
  [-0.09989 -0.12712 -0.27415]]

 [[ 0.12574  1.18033  1.25824]
  [-0.38231 -0.33968 -1.48819]]

 [[ 0.56883  0.65055  1.24903]
  [ 1.42539  1.38552 -1.32857]]]
(4, 2, 3)
[[[ 0.74882 -0.261    0.38523]
  [-0.31029  1.4766  -0.789  ]
  [ 0.12574  1.18033  1.25824]
  [ 0.56883  0.65055  1.24903]]

 [[-2.34712  0.14807 -0.2576 ]
  [-0.09989 -0.12712 -0.27415]
  [-0.38231 -0.33968 -1.48819]
  [ 1.42539  1.38552 -1.32857]]]
(2, 4, 3)
[[[ 0.74882 -0.31029  0.12574  0.56883]
  [-0.261    1.4766   1.18033  0.65055]
  [ 0.38523 -0.789    1.25824  1.24903]]

 [[-2.34712 -0.09989 -0.38231