In [1]:
## All imports needed
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import urllib.request
import numpy as np
import pandas as pd
import torch
from torch import nn

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

from sklearn.metrics import f1_score, accuracy_score

# Generating new Nietsche text

This is primarily just a notebook to implement myself Lesson 6 of the FastAI Part 1 Deep Learning course. https://github.com/fastai/fastai/blob/master/courses/dl1/lesson6-rnn.ipynb

First up we need to get hold of the data - the entire collected works of Nietzsche:

In [2]:
PATH = '../data/nietzsche/'

os.makedirs(PATH, exist_ok=True)
urllib.request.urlretrieve("https://s3.amazonaws.com/text-datasets/nietzsche.txt", f'{PATH}nietzsche.txt')

text = open(f'{PATH}nietzsche.txt').read()

## Data exploration

In [3]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1

print('corpus length:', len(text))
print('total chars:', vocab_size, ':', chars)

corpus length: 600893
total chars: 85 : ['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'Æ', 'ä', 'æ', 'é', 'ë']


Rather than use characters for the model - we will use numbers. Ie "HELLO" could become `[8,5,12,12,15]`. So we map every charcater to an index 0..84


In [4]:
chars_to_indexes = {i: x for x,i in enumerate(chars)}
indexes_to_chars = {x: i for x,i in enumerate(chars)}

print(chars_to_indexes)

{'\n': 0, ' ': 1, '!': 2, '"': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '=': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, '[': 50, ']': 51, '_': 52, 'a': 53, 'b': 54, 'c': 55, 'd': 56, 'e': 57, 'f': 58, 'g': 59, 'h': 60, 'i': 61, 'j': 62, 'k': 63, 'l': 64, 'm': 65, 'n': 66, 'o': 67, 'p': 68, 'q': 69, 'r': 70, 's': 71, 't': 72, 'u': 73, 'v': 74, 'w': 75, 'x': 76, 'y': 77, 'z': 78, 'Æ': 79, 'ä': 80, 'æ': 81, 'é': 82, 'ë': 83}


Finally, convert the entire corpus to indexes to use as the dataset

In [5]:
idxs = [chars_to_indexes[char] for char in text]

print(idxs[400:500])
print(''.join([indexes_to_chars[idx] for idx in idxs[400:500]]))

[72, 67, 1, 54, 57, 1, 75, 67, 66, 21, 1, 53, 66, 56, 0, 53, 72, 1, 68, 70, 57, 71, 57, 66, 72, 1, 57, 74, 57, 70, 77, 1, 63, 61, 66, 56, 1, 67, 58, 1, 56, 67, 59, 65, 53, 1, 71, 72, 53, 66, 56, 71, 1, 75, 61, 72, 60, 1, 71, 53, 56, 1, 53, 66, 56, 1, 56, 61, 71, 55, 67, 73, 70, 53, 59, 57, 56, 1, 65, 61, 57, 66, 8, 8, 32, 29, 7, 0, 61, 66, 56, 57, 57, 56, 7, 1, 61, 72, 1, 71]
to be won; and
at present every kind of dogma stands with sad and discouraged mien--IF,
indeed, it s


## 3-character model

So we're going to start off with looking at just the last 3 characters and trying to predict the 4th.

To begin, create a 4 arrays, each one offset one from the last (3 for the inputs, and one for y):

In [6]:
cs = 3

c1_dat = [idxs[i] for i in range(0, len(idxs)-cs, cs)]
c2_dat = [idxs[i+1] for i in range(0, len(idxs)-cs, cs)]
c3_dat = [idxs[i+2] for i in range(0, len(idxs)-cs, cs)]
c4_dat = [idxs[i+3] for i in range(0, len(idxs)-cs, cs)]

print(idxs[:20])
print(c1_dat[:20])
print(c2_dat[:20])
print(c3_dat[:20])
print(c4_dat[:20])


[39, 41, 28, 29, 24, 26, 28, 0, 0, 0, 42, 44, 39, 39, 38, 42, 32, 37, 30, 1]
[39, 29, 28, 0, 39, 42, 30, 60, 1, 73, 1, 1, 75, 53, 8, 53, 72, 66, 32, 72]
[41, 24, 0, 42, 39, 32, 1, 53, 43, 72, 61, 53, 67, 66, 75, 72, 60, 23, 71, 60]
[28, 26, 0, 44, 38, 37, 72, 72, 70, 60, 71, 1, 65, 8, 60, 1, 57, 1, 1, 57]
[29, 28, 0, 39, 42, 30, 60, 1, 73, 1, 1, 75, 53, 8, 53, 72, 66, 32, 72, 70]


You can see from the above that this is like taking every four characters and putting them in columns. The first three characters will be the inputs to the neural net and the last is the output we're optimising for.

In [7]:
x1 = np.stack(c1_dat)
x2 = np.stack(c2_dat)
x3 = np.stack(c3_dat)

y = np.stack(c4_dat)

Now the PyTorch model:

In [8]:
class Char3Model(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        
        # The four layers of our model
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1)))
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))
        

In [9]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

vocab_size = len(chars)+1
n_fac = 42 # embedding matrix width
n_hidden = 256 # number hidden units

m = Char3Model(vocab_size, n_fac).cuda()

In [15]:
# Set up iterator to load data in batches
it = iter(md.trn_dl)
*xs,ys = next(it)
t = m(*V(xs))

In [18]:
# Set up the optimizer
opt = optim.Adam(m.parameters(), 1e-2)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.039449   0.806252  
    1      1.993318   0.379452                              
    2      1.974573   0.566268                              



[array([ 0.56627])]

In [20]:
set_lrs(opt, 1e-3)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.766317   0.211555  
    1      1.740298   0.339998                              
    2      1.728887   0.354707                              



[array([ 0.35471])]

# Test out the model

Ok so we trained a model... made some graphs appear etc. But how do we use it?

`m(1,2,3)` - throws an error, because pyTorch wants each argument to be turned into pyTorch variable. 

`m(V(1),V(2),V(3))` gives us a `[torch.cuda.FloatTensor of size 1x85 (GPU 0)]` - getting better

Let's try and see that as a familiar numpy ndarray:

`m(V(1),V(2),V(3)).data.numpy()` - another error, this time because the Tensor is on the GPU memory, and needs to be moved into system RAM before we can use it

`m(V(1),V(2),V(3)).cpu().data.numpy()` - tada!

In [27]:
m(V(1),V(2),V(3)).cpu().data.numpy()

array([[ -1.57677,  -0.75591,  -4.76062,  -6.57453,  -9.12727, -12.10871,  -4.63624,  -4.6357 ,  -1.66438,
         -3.42399, -14.84525, -12.18173, -10.8473 , -12.81438, -13.20588, -11.53313, -10.35709, -12.06495,
        -11.19147, -12.08115,  -4.6234 ,  -4.13509,  -8.46542,  -5.25765,  -7.32801,  -6.85994,  -8.08317,
         -5.86093,  -6.49639,  -6.01573,  -7.22469,  -7.52058,  -6.17969, -10.48757,  -7.75444,  -7.20358,
         -6.03709,  -6.92364,  -7.31022,  -6.70874, -12.13955,  -6.69574,  -5.34248,  -6.31628,  -8.38307,
         -8.06625,  -6.40162,  -9.3522 ,  -6.21676, -11.10902,  -7.88907,  -8.23203,  -9.55094,  -7.52821,
         -8.09378,  -8.15181,  -8.9369 ,  -9.23243,  -7.38731,  -9.70172, -10.23674,  -9.13578, -13.18966,
         -9.31467,  -8.15874,  -8.84528,  -6.10116,  -9.92134,  -8.88697, -12.23056,  -9.61908,  -6.01207,
         -6.85826,  -9.78319, -10.14449,  -7.09998, -12.96511, -14.78201, -14.58377, -22.63766, -20.03164,
        -26.88843, -18.03801, -27.150

So the above are log likelihoods that the next letter is in one of our 85 vocab letters. Let's wrap this into a little function to make it easier to work with using a couple of FastsAI helper functions to make our lives a little smoother.

In [36]:
# Get the next n letters given an input block
def get_next(inp):
    idxs = T(np.array([chars_to_indexes[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

def get_next_n(inp, n):
    for i in range(0, n):
        inp += get_next(inp[-3:])
    
    return inp
        

In [45]:
get_next_n('Children ', 400)

'Children the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some and the some a'

Not the most compelling Nietzsche copy I have to admit. But it's a start!

# Recurrent Neural Network

Start by creating the data, this time using a rolling window of 8 characters across the entire sequence. We'll also create our `y`, with the next character after each of the windows.

In [49]:
input_length = 8

inputs = [[idxs[i+j] for i in range(input_length)] for j in range(len(idxs) - input_length)]
y = [idxs[j+input_length] for j in range(len(idxs) - input_length)]

X = np.stack(c_in_dat, axis = 0)
y = np.stack(y)

print(X, y)

[[39 41 28 ..., 26 28  0]
 [41 28 29 ..., 28  0  0]
 [28 29 24 ...,  0  0  0]
 ..., 
 [71 61 66 ..., 64 66 57]
 [61 66 58 ..., 66 57 71]
 [66 58 73 ..., 57 71 71]] [ 0  0 42 ..., 71 71  9]


Our modified RNN class

In [59]:
class CharLoopModel(nn.Module):
    def __init__(self, vocab_size, n_fac):
        super().__init__()
        self.e = nn.Embedding(vocab_size, n_fac)
        self.l_in = nn.Linear(n_fac, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
    
    def forward(self, *cs):
        bs = cs[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in cs:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h + inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)
    

Let's take it for a spin!

In [62]:
m = CharLoopModel(vocab_size, n_fac=42).cuda()
opt = optim.Adam(m.parameters(), 1e-2)
fit(m, md, 3, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=3), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      2.112681   0.738177  
    1      2.022872   0.406081                              
    2      1.982728   0.274576                              



[array([ 0.27458])]

In [63]:
get_next_n("bla bla", 400)

'bla blation of the from and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and and'

Still not so great... try more training!

In [64]:
set_lrs(opt, 1e-3)
fit(m, md, 5, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=5), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.771335   0.342531  
    1      1.741049   0.334585                              
    2      1.718765   0.340415                              
    3      1.696973   0.336358                              
    4      1.690538   0.418884                              



[array([ 0.41888])]

In [65]:
get_next_n("bla bla", 400)

'bla blate of the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man and the such a man'

Well at least it's repeating more than one word now.