## WaveNet: larger corpus

We'll try reading *Alice* again.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from collections import OrderedDict

In [2]:
%run ../lib/bookreader.py

In [3]:
%run ../lib/nn_layers.py

### Repeat a few things

In [59]:
# Read the Alice text with BookReader's default settings.
# NOTE(review): BookReader comes from ../lib/bookreader.py (loaded via %run above);
# its defaults are not visible here -- presumably it lowercases the text, confirm.
alice = BookReader()
alice.read("../resources/alice.txt")

# Size of the reader's `itos` table; passed to the model below as its vocabulary size.
vocab_size = len(alice.itos)

lowercase only


In [5]:
class ViewDilaltion(nn.Module):
    """Fold every ``n`` consecutive time steps into one wider feature vector.

    A ``(B, T, C)`` input becomes ``(B, T // n, n * C)``: neighbouring positions
    are concatenated along the channel axis so the following linear layer sees
    ``n`` steps at once.

    Args:
        n: number of adjacent time steps merged into each output position.

    Attributes:
        out: the tensor returned by the most recent call (kept for inspection).
    """

    def __init__(self, n):
        super().__init__()
        self.n = n
        # Kept for backward compatibility with code that reads `.type`.
        # Be aware this instance attribute shadows the `nn.Module.type()` method.
        self.type = 'flatten'

    def forward(self, x):
        """Return ``x`` regrouped from ``(B, T, C)`` to ``(B, T // n, n * C)``.

        Raises:
            RuntimeError: if ``T`` is not a multiple of ``n`` (raised by ``reshape``).
        """
        # Implemented as `forward` (not `__call__`) so nn.Module's call machinery,
        # including forward/backward hooks, keeps working for this layer.
        B, T, C = x.shape

        # `reshape` behaves like `view` for contiguous input but also accepts
        # non-contiguous tensors (e.g. the result of a transpose).
        self.out = x.reshape(B, T // self.n, self.n * C)
        return self.out

    def parameters(self, recurse=True):
        """This layer has no trainable parameters; accepts the standard `recurse` flag."""
        return []

In [64]:
class DeeperWavenetish(nn.Module):
    """WaveNet-style character model built from pairwise-merging blocks.

    The network is an embedding, followed by one block per entry of
    ``hidden_sizes`` (each block halves the time axis by concatenating adjacent
    positions, then applies Linear -> LayerNorm -> Tanh), followed by a linear
    layer producing logits over the vocabulary. With ``len(hidden_sizes)``
    blocks, an input of ``2 ** len(hidden_sizes)`` tokens is reduced to a single
    position.

    Args:
        vocab_size: number of distinct token ids (embedding rows / logit columns).
        embedding_size: width of the token embedding.
        context_length: number of tokens per example; stored for reference only,
            the layers themselves do not depend on it.
        hidden_sizes: output width of each block, in order (any sequence of ints).
    """

    def __init__(self, vocab_size, embedding_size, context_length, hidden_sizes):
        super().__init__()

        # Remembered on the instance so nothing below depends on notebook globals.
        self.vocab_size = vocab_size
        self.context_length = context_length

        layers = OrderedDict([
            ('embed', nn.Embedding(vocab_size, embedding_size)),
        ])

        # list(...) accepts tuples as well and avoids aliasing the caller's list.
        widths = [embedding_size] + list(hidden_sizes)

        for i, (fan_in, fan_out) in enumerate(zip(widths, widths[1:])):
            self.add_block(layers, self._block_suffix(i), fan_in, fan_out)

        layers['logits'] = nn.Linear(widths[-1], vocab_size, bias=True)
        nn.init.zeros_(layers['logits'].bias)

        self.model = nn.Sequential(layers)

    @staticmethod
    def _block_suffix(index):
        """Name suffix for block ``index``: '_a', '_b', ... '_z', then '_26', '_27', ..."""
        if index < 26:
            return '_' + chr(ord('a') + index)
        return '_' + str(index)

    def add_block(self, layer_dict, suffix="_a", fan_in=10, fan_out=20):
        """Append one merge -> Linear -> LayerNorm -> Tanh block to ``layer_dict``.

        Args:
            layer_dict: ordered mapping of layer name -> module, mutated in place.
            suffix: string appended to each layer name to keep names unique.
            fan_in: feature width of each incoming position (before merging pairs).
            fan_out: feature width produced by the block.
        """
        # The merge layer concatenates two positions, hence fan_in * 2 inputs.
        linear_layer = nn.Linear(fan_in*2, fan_out, bias=False)
        # 5/3 is the gain PyTorch recommends for tanh activations.
        nn.init.xavier_uniform_(linear_layer.weight, gain=5/3)
        layer_dict['flatten'+suffix] = ViewDilaltion(2)
        layer_dict['feed_forward'+suffix] = linear_layer
        layer_dict['layer_norm'+suffix] = nn.LayerNorm(fan_out)
        layer_dict['non_linearity'+suffix] = nn.Tanh()
        print("added block", suffix, fan_in, fan_out)

    def forward(self, idx, targets=None):
        """Run the network and optionally compute the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, one row per example.
            targets: optional targets accepted by ``F.cross_entropy``, one per
                row of the flattened logits.

        Returns:
            ``(logits, loss)`` where ``loss`` is ``None`` when ``targets`` is ``None``.
        """
        logits = self.model(idx)

        if targets is None:
            loss = None
        else:
            # Take the class count from the logits themselves; the previous code
            # read the notebook-level `vocab_size`, which goes stale as soon as
            # the corpus is reloaded with a different vocabulary.
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets)

        return logits, loss

### Run a short test first

run a smaller model just to get our bearings

In [8]:
# --- hyper-parameters for the short sanity run ---
epochs = 20
batch_size = 80
learning_rate = 0.1
samples = 1000  # mini-batches drawn per epoch by the training loop
hidden_states = [30, 30, 30, 40]
embedding_size = 6

# Every block merges adjacent pairs of positions, so a stack of
# len(hidden_states) blocks collapses 2**depth tokens into one prediction.
context_length = 1 << len(hidden_states)

model = DeeperWavenetish(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    context_length=context_length,
    hidden_sizes=hidden_states,
)

optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

# Cell output: context length and total number of model parameters.
context_length, sum(param.numel() for param in model.parameters())

added block _a 6 30
added block _b 30 30
added block _c 30 30
added block _d 30 40


(16, 8030)

In [9]:
# Training loop: `samples` randomly drawn mini-batches per epoch; the mean loss
# is reported every 10th epoch together with the decayed learning rate.
for ep in range(epochs):
    epoch_loss = 0
    for s in range(samples):
        x, y = alice.sample_batch(batch_size, context_length)

        # Call the module itself (not .forward) so nn.Module hooks are honoured.
        logits, loss = model(x, y)

        # detach() so the running total does not keep each batch's graph alive
        epoch_loss += loss.detach()
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    #just keep any epoch stuff in a no grad block
    with torch.no_grad():
        if ep % 10 == 0:
            print(epoch_loss/samples)
            learning_rate *= .92
            # The optimizer was built with a copy of the float, so the decay must
            # be written into its param groups; rebinding `learning_rate` alone
            # leaves the step size unchanged.
            for group in optimizer.param_groups:
                group['lr'] = learning_rate
            print(ep, learning_rate)

print(epoch_loss/samples)

tensor(2.2813)
0 0.09200000000000001
tensor(1.6810)
10 0.08464000000000002
tensor(1.6245)


In [75]:
def generate(max_characters, context_length, vocab_size=None):
    """Sample text from the notebook-level ``model``, one symbol at a time.

    Starts from a context window filled with index 0, repeatedly samples the
    next index from the model's softmax distribution, and slides the window
    forward by one position.

    Args:
        max_characters: number of symbols to sample.
        context_length: number of token ids fed to the model at each step.
        vocab_size: number of classes in the model output; when omitted it is
            read from the last dimension of the logits.

    Returns:
        List of the sampled symbols, looked up through ``alice.itos``.
    """
    sampled = []
    window = [0] * context_length

    # Sampling needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for _ in range(max_characters):
            # The model returns (logits, loss); loss is None without targets.
            logits, _ = model(torch.tensor([window]))

            n_classes = logits.size(-1) if vocab_size is None else vocab_size
            p = F.softmax(logits.view(n_classes), dim=0)

            prediction = torch.multinomial(p, num_samples=1).item()

            # Slide the window: drop the oldest id, append the new one.
            del window[0]
            window.append(prediction)

            sampled.append(alice.itos[prediction])

    return sampled

In [52]:
# generate() requires the context length as well; pass the values of the model
# trained above (it also takes vocab_size as its third argument).
print("".join(generate(300, context_length, vocab_size)))







the were windln  i she a sis
icunse said eve mis out
it a
duchess im and a very pave nignt it neahliln  shociyd on inse or to tone could firp so usle rew tran  in the
only 


 see she tear hounting of as  the lare in sbout alice   the nealie   side as awlution  ele this aptering of aif  who sh


In [53]:
# --- hyper-parameters: five blocks, trained for longer ---
epochs = 40
batch_size = 80
learning_rate = 0.1
samples = 1000  # mini-batches drawn per epoch by the training loop
hidden_states = [20, 30, 30, 40, 40]
embedding_size = 6

# One pair-merging block per hidden size, so the context doubles per block.
context_length = 1 << len(hidden_states)

model = DeeperWavenetish(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    context_length=context_length,
    hidden_sizes=hidden_states,
)

optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

# Cell output: context length and total number of model parameters.
context_length, sum(param.numel() for param in model.parameters())

added block _a 6 20
added block _b 20 30
added block _c 30 30
added block _d 30 40
added block _e 40 40


(32, 10570)

In [54]:
# Training loop: `samples` randomly drawn mini-batches per epoch; the mean loss
# is reported every 10th epoch together with the decayed learning rate.
for ep in range(epochs):
    epoch_loss = 0
    for s in range(samples):
        x, y = alice.sample_batch(batch_size, context_length)

        # Call the module itself (not .forward) so nn.Module hooks are honoured.
        logits, loss = model(x, y)

        # detach() so the running total does not keep each batch's graph alive
        epoch_loss += loss.detach()
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    #just keep any epoch stuff in a no grad block
    with torch.no_grad():
        if ep % 10 == 0:
            print(epoch_loss/samples)
            learning_rate *= .92
            # The optimizer was built with a copy of the float, so the decay must
            # be written into its param groups; rebinding `learning_rate` alone
            # leaves the step size unchanged.
            for group in optimizer.param_groups:
                group['lr'] = learning_rate
            print(ep, learning_rate)

print(epoch_loss/samples)

tensor(2.3480)
0 0.09200000000000001
tensor(1.7291)
10 0.08464000000000002
tensor(1.6468)
20 0.07786880000000002
tensor(1.6172)
30 0.07163929600000002
tensor(1.5753)


In [55]:
# generate() requires the context length as well; pass the values of the model
# trained above (it also takes vocab_size as its third argument).
print("".join(generate(300, context_length, vocab_size)))


here  and agas eyeso otpen on she gar ugulter hissherals   i know mout thon allesireaded was havery  


 i such groken a lick  i near cats  i girse is look! bouse now abthing the time thrinking 
and like grak   of voice you lurkle if not her had le  one looketer  


 eut  she gink again bati     an


### Promising?

but we don't really have a way to evaluate these continuous texts yet 

compare to the original corpus:
* we could see if it's learning real words
* word bigram evaluation - we could see if it learns short associations
* structure, do we get sentences, paragraphs, chapters like the original

etc...

Let's come back to that after attention, so we can compare the approaches.

For now, though, let's look at different variations of the model to get
some more intuition.

In [57]:
# --- hyper-parameters: six blocks and a wider embedding ---
epochs = 100
batch_size = 80
learning_rate = 0.1
samples = 2000  # mini-batches drawn per epoch by the training loop
hidden_states = [20, 30, 30, 40, 40, 30]
embedding_size = 9

# One pair-merging block per hidden size, so the context doubles per block.
context_length = 1 << len(hidden_states)

model = DeeperWavenetish(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    context_length=context_length,
    hidden_sizes=hidden_states,
)

optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

# Cell output: context length and total number of model parameters.
context_length, sum(param.numel() for param in model.parameters())

added block _a 9 20
added block _b 20 30
added block _c 30 30
added block _d 30 40
added block _e 40 40
added block _f 40 30


(64, 13240)

In [61]:
# Re-read the corpus with BookReader's first positional option set to False.
# NOTE(review): presumably this turns off lowercasing (the vocabulary grows to 56
# below) -- confirm against ../lib/bookreader.py.
alice = BookReader(False)
alice.read("../resources/alice.txt")

# Rebinding vocab_size affects every model constructed after this cell.
vocab_size = len(alice.itos)
vocab_size

56

what effect does increasing vocab size have on our model size?

In [65]:
# --- same six-block configuration, rebuilt for the larger vocabulary ---
epochs = 100
batch_size = 80
learning_rate = 0.1
samples = 2000  # mini-batches drawn per epoch by the training loop
hidden_states = [20, 30, 30, 40, 40, 30]
embedding_size = 9

# One pair-merging block per hidden size, so the context doubles per block.
context_length = 1 << len(hidden_states)

model = DeeperWavenetish(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    context_length=context_length,
    hidden_sizes=hidden_states,
)

optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=learning_rate)

# Cell output: context length and total number of model parameters.
context_length, sum(param.numel() for param in model.parameters())

added block _a 9 20
added block _b 20 30
added block _c 30 30
added block _d 30 40
added block _e 40 40
added block _f 40 30


(64, 13980)

In [66]:
# Training loop: `samples` randomly drawn mini-batches per epoch; the mean loss
# is reported every 10th epoch together with the decayed learning rate.
for ep in range(epochs):
    epoch_loss = 0
    for s in range(samples):
        x, y = alice.sample_batch(batch_size, context_length)

        # Call the module itself (not .forward) so nn.Module hooks are honoured.
        logits, loss = model(x, y)

        # detach() so the running total does not keep each batch's graph alive
        epoch_loss += loss.detach()
        model.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

    #just keep any epoch stuff in a no grad block
    with torch.no_grad():
        if ep % 10 == 0:
            print(epoch_loss/samples)
            learning_rate *= .92
            # The optimizer was built with a copy of the float, so the decay must
            # be written into its param groups; rebinding `learning_rate` alone
            # leaves the step size unchanged.
            for group in optimizer.param_groups:
                group['lr'] = learning_rate
            print(ep, learning_rate)

print(epoch_loss/samples)

tensor(2.3888)
0 0.09200000000000001
tensor(1.6780)
10 0.08464000000000002
tensor(1.6056)
20 0.07786880000000002
tensor(1.5573)
30 0.07163929600000002
tensor(1.5340)
40 0.06590815232000002
tensor(1.5203)
50 0.06063550013440003
tensor(1.5055)
60 0.05578466012364803
tensor(1.5005)
70 0.05132188731375619
tensor(1.4837)
80 0.0472161363286557
tensor(1.4841)
90 0.043438845422363245
tensor(1.4727)


In [77]:
# Use the live settings instead of the literals 64 and 56, which merely repeat
# context_length and vocab_size and would silently go stale if either changes.
print("".join(generate(1000, context_length, vocab_size)))

The Queen   Verhess 
   live ruzp alit I beginning to stelit said
the others not a mers at one could had been theop up out aners to when were thing I know
pudon  No  such asleet  but but I
haves? 

 This
till   cond would before  whis falwoby fis moud juiaplenx on a ce did who igh like should I m  queen cautiedly
beiges the chy k inatu RO begin a littlittle 
hemefith   He which poo indece 
hatto whiled  and she was see  or aid
glose with hill
of the did they stoe! Dord
 all be all
a
all all all see whitiited  but
Alice at up  she was rith 


 Yes  she had
of fewended 

This  and all she pine ler had
shuses  that frand! 

Alice thill  what
 and a little! 
Deewhere  net ping baben all obll   said! It was got was letent  and the Gryphiightece wholr them sile
  Nettjteze was loudel and feters af nowd
into  and I ll make in a little  and oisely little  and out pan him mutting fetchin delis best agitioned  and mething   nes icato they when  I it! Is shall beariowtiin cried a toverself 
would