In [1]:
## All imports needed
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import urllib.request
import numpy as np
import pandas as pd
import torch
from torch import nn

from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *

Load the data

In [2]:
PATH = '../data/nietzsche/'
SRC_URL = "https://s3.amazonaws.com/text-datasets/nietzsche.txt"
SRC_FILE = f'{PATH}nietzsche.txt'

# Folder layout: <PATH>/trn and <PATH>/vld each receive one text.txt file.
os.makedirs(PATH, exist_ok=True)
os.makedirs(f'{PATH}trn', exist_ok=True)
os.makedirs(f'{PATH}vld', exist_ok=True)

# Download only when the corpus is not cached yet, so re-running the cell is cheap.
# The file is fetched under a temporary name and renamed afterwards, so an
# interrupted download never leaves a truncated file that looks complete.
if not os.path.exists(SRC_FILE):
    urllib.request.urlretrieve(SRC_URL, f'{SRC_FILE}.part')
    os.replace(f'{SRC_FILE}.part', SRC_FILE)

# Put first 80% of lines into trn and the rest into vld.
# Pure Python instead of `wc`/`head`/`tail`: parsing `wc -l` output with
# split(' ') fails where wc pads the count with leading spaces (BSD/macOS).
# Binary mode splits on '\n' only and writes the bytes back unchanged.
with open(SRC_FILE, 'rb') as f:
    lines = f.readlines()

wc = len(lines)
trn_n = int(round(wc*0.8))
vld_n = wc - trn_n

with open(f'{PATH}trn/text.txt', 'wb') as f:
    f.writelines(lines[:trn_n])
with open(f'{PATH}vld/text.txt', 'wb') as f:
    f.writelines(lines[trn_n:])

Build the model

In [3]:
# Character-level field: `tokenize=list` turns a string into a list of its characters.
TEXT = data.Field(tokenize=list, lower=True)

bs = 1024   # batch size handed to the data loader
bptt = 8    # back-prop-through-time length handed to the data loader

# Note: the validation folder is reused as the test set.
md = LanguageModelData.from_text_files(
    PATH, TEXT,
    train='trn', validation='vld', test='vld',
    bs=bs, bptt=bptt, min_freq=3,
)

In [30]:
class CharSeqStatefulRNN(nn.Module):
    """Character-level language model: embedding -> single-layer nn.RNN -> linear decoder.

    The hidden state is kept on the module as ``self.h`` and fed into the next
    ``forward`` call, so it carries over from one mini-batch to the following one.
    """

    def __init__(self, vocab_size, n_fac, n_hidden, bs):
        """Build the layers and allocate a zero hidden state.

        vocab_size: number of tokens (embedding rows and output classes)
        n_fac:      embedding width
        n_hidden:   hidden size of the RNN
        bs:         batch size used for the initial hidden state
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.n_hidden = n_hidden

        # Layers
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.RNN(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

        # Hidden state starts at zero
        self.init_hidden(bs)

    def init_hidden(self, bs):
        """Replace the stored hidden state with zeros sized for a batch of ``bs``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

    def forward(self, cs):
        """Return log-probabilities over the vocabulary, flattened to (-1, vocab_size).

        cs: tensor of token indices; the batch size is read from its second dimension.
        """
        batch_size = cs.size(1)
        # A batch of a different size (e.g. the last, shorter one) needs a fresh state.
        if batch_size != self.h.size(1):
            self.init_hidden(batch_size)

        embedded = self.e(cs)
        rnn_out, new_h = self.rnn(embedded, self.h)
        # repackage_var is a fastai helper; presumably it detaches the state from
        # the autograd history so gradients stop at the batch boundary - TODO confirm.
        self.h = repackage_var(new_h)

        log_probs = F.log_softmax(self.l_out(rnn_out), dim=-1)
        return log_probs.view(-1, self.vocab_size)

In [31]:
vocab_size = md.nt # Number of tokens
n_hidden = 256
n_fac = 42

# `bs` is deliberately NOT reassigned here: the model is given the same batch
# size the data loader `md` was built with. A different value only makes the
# model allocate a hidden state that is rebuilt on the first forward pass.
m = CharSeqStatefulRNN(vocab_size, n_fac, n_hidden, bs)
opt = optim.Adam(m.parameters(), 1e-3)

# 6 epochs with negative log-likelihood loss (the model outputs log-softmax).
fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.890363   1.853279  
    1      1.713616   1.708676                              
    2      1.633962   1.643346                              
    3      1.586952   1.598944                              
    4      1.54269    1.577271                              
    5      1.517579   1.558746                              



[array([1.55875])]

In [32]:
# Second pass over the same model/optimizer with the learning rate set to 1e-4.
fine_tune_lr = 1e-4
set_lrs(opt, fine_tune_lr)

fit(m, md, 6, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=6), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.439679   1.519057  
    1      1.445775   1.515631                              
    2      1.447319   1.512792                              
    3      1.443466   1.510369                              
    4      1.433581   1.507999                              
    5      1.430042   1.506103                              



[array([1.5061])]

In [56]:
class CharSeqStatefulGRU(nn.Module):
    """Character-level language model: embedding -> ReLU -> single-layer nn.GRU -> linear decoder.

    The hidden state lives on the module as ``self.h`` and is passed into the
    next ``forward`` call, so it carries over between consecutive mini-batches.
    """

    def __init__(self, vocab_size, n_fac, n_hidden, bs):
        """Build the layers and allocate a zero hidden state.

        vocab_size: number of tokens (embedding rows and output classes)
        n_fac:      embedding width
        n_hidden:   hidden size of the GRU
        bs:         batch size used for the initial hidden state
        """
        super().__init__()

        self.vocab_size = vocab_size
        self.n_hidden = n_hidden

        # Layers
        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.GRU(n_fac, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)

        self.init_hidden(bs)

    def init_hidden(self, bs):
        """Replace the stored hidden state with zeros sized for a batch of ``bs``."""
        self.h = V(torch.zeros(1, bs, self.n_hidden))

    def forward(self, cs):
        """Return log-probabilities over the vocabulary, flattened to (-1, vocab_size).

        cs: tensor of token indices; the batch size is read from its second dimension.
        """
        batch_size = cs.size(1)
        # Re-allocate the state whenever the incoming batch size changes.
        if self.h.size(1) != batch_size:
            self.init_hidden(batch_size)

        # ReLU is applied to the embeddings before they reach the GRU.
        embedded = F.relu(self.e(cs))
        gru_out, new_h = self.rnn(embedded, self.h)
        # repackage_var is a fastai helper; presumably it detaches the state from
        # the autograd history so gradients stop at the batch boundary - TODO confirm.
        self.h = repackage_var(new_h)

        log_probs = F.log_softmax(self.l_out(gru_out), dim=-1)
        return log_probs.view(-1, self.vocab_size)
        

In [59]:
# Fresh GRU model with a wider hidden layer; `m` and `opt` are rebound on purpose.
m = CharSeqStatefulGRU(
    vocab_size=md.nt,
    n_fac=42,
    n_hidden=512,
    bs=1024,
)
opt = optim.Adam(m.parameters(), 1e-3)

# 4 epochs with negative log-likelihood loss.
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

  2%|▏         | 23/942 [00:02<01:56,  7.87it/s, loss=3.16]
epoch      trn_loss   val_loss                              
    0      1.736408   1.707335  
    1      1.52186    1.546684                              
    2      1.421969   1.483831                              
    3      1.35957    1.464814                              



[array([1.46481])]

Validation loss is down to 1.46 - looking better. Let's try out text generation again (the model is character-level, so it generates one character at a time).

In [78]:
def get_next(inp):
    """Sample one character to follow the string `inp`, using the global model `m`.

    The string is numericalized with the global `TEXT` field, run through `m`,
    and one index is drawn at random from the distribution in the last output
    row (`p[-1]` holds log-probabilities, hence the `exp()`).
    Returns the sampled token as a string from `TEXT.vocab.itos`.
    """
    idxs = TEXT.numericalize(inp, device=-1)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

def get_next_n(inp, n):
    """Return `inp` followed by `n` characters sampled one at a time with `get_next`.

    After each sample the context slides forward by one character (the oldest
    one is dropped, the new one appended), so its length stays constant.
    With n == 0 the seed text is returned unchanged.
    """
    res = inp
    for _step in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:] + c
    return res

# Draw three separate 200-character samples from the same seed text.
for sample_no in range(3):
    print(get_next_n("And yet", 200))

And yeten anconish poris fal, anty?--5y the cons strow insthemios. ever oftermento underancessentor. the mordistilian ething trus! fylike poso now dendirate desthasto _migreedourfor'" inar.2--wility; ittanda
And yet adabless!--fining undvicles stilious. in theas amplecian im),an butopatious, tas ourdepentlicy is merifare am he imations farility ofte masinged nautrone merthe forua idexperated inlity "des assenalo
And yeto1 culia it atte is culed; it some? in,with cers imous, fortal a gan this nothe sen lonyare" do spemple, whoenestre assen the equitame ass of teaf fortal arte ope, welegets,"16. l'shous, wily, nown ha


So it's starting to produce something - and crucially the output is different on each run, because the next character is sampled rather than always taking the most likely one. That gives hope for a text generator.

## LSTM

In [81]:
class CharSeqStatefulLSTM(nn.Module):
    """Character-level language model: embedding -> `nl`-layer nn.LSTM (dropout 0.5) -> linear decoder.

    The LSTM state is kept on the module as the pair ``self.h`` and passed into
    the next ``forward`` call, so it carries over between consecutive mini-batches.
    """

    def __init__(self, vocab_size, n_fac, n_hidden, bs, nl):
        """Build the layers and allocate a zero state.

        vocab_size: number of tokens (embedding rows and output classes)
        n_fac:      embedding width
        n_hidden:   hidden size of the LSTM
        bs:         batch size used for the initial state
        nl:         number of stacked LSTM layers
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.nl = nl
        self.n_hidden = n_hidden

        self.e = nn.Embedding(vocab_size, n_fac)
        self.rnn = nn.LSTM(n_fac, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)

        self.init_hidden(bs)

    def init_hidden(self, bs):
        """Replace the stored state with two zero tensors of shape (nl, bs, n_hidden)."""
        self.h = tuple(
            V(torch.zeros(self.nl, bs, self.n_hidden)) for _ in range(2)
        )

    def forward(self, cs):
        """Return log-probabilities over the vocabulary, flattened to (-1, vocab_size).

        cs: tensor of token indices; the batch size is read from its second dimension.
        """
        batch_size = cs.size(1)
        # Both state tensors share a shape, so checking the first one is enough.
        if batch_size != self.h[0].size(1):
            self.init_hidden(batch_size)

        embedded = self.e(cs)
        lstm_out, new_state = self.rnn(embedded, self.h)
        # repackage_var is a fastai helper; presumably it detaches the state from
        # the autograd history so gradients stop at the batch boundary - TODO confirm.
        self.h = repackage_var(new_state)

        log_probs = F.log_softmax(self.l_out(lstm_out), dim=-1)
        return log_probs.view(-1, self.vocab_size)
        

In [83]:
# Two-layer LSTM; `m` and `opt` are rebound, so the sampling helpers now use this model.
m = CharSeqStatefulLSTM(
    vocab_size=md.nt,
    n_fac=42,
    n_hidden=512,
    bs=1024,
    nl=2,
)
opt = optim.Adam(m.parameters(), 1e-3)

# 4 epochs with negative log-likelihood loss.
fit(m, md, 4, opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                              
    0      1.737481   1.658057  
    1      1.536371   1.500945                              
    2      1.455958   1.440475                              
    3      1.402662   1.410865                              



[array([1.41086])]

In [84]:
# Three 200-character samples from the current model `m`, all seeded with the same text.
for sample_no in range(3):
    print(get_next_n("And yet", 200))

And yethand the quily in ever itsthather accumous the begin was wettermus in aptifies uncous granteding withemoventlies a respectionas art. whick san adment in sty!--and: we he womed, insted to theavourwelf.
And yeto'hand trivity thin said.121. he madeniky them the shows in this aptical assuburary of morto exive, skrious a purable from a my our feet! he was a mank ancifantant thicken gere, he caming that antific
And yethess. anstal of thead do ther for i, welly pryprispon by meate aptent, dety symatter idealceavour enoure: thanothe, ever have miguate exulad amonalser opposity "son asmean ancism thesi( theallified, a


## Next steps

Try out different authors. Work out how much data is needed before the model starts imitating an author's style.