## continuous data

experiment with feeding the model different data

view our names as a continuous 'text' of words separated by spaces - we'll sample randomly from the text

our evaluation will not be directly comparable but it'll prepare us for reading in books in the next section

### standard setup

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import collections

### new sampling
continuous sampling from a file

In [2]:
%run ../lib/continuous_sampling.py

In [3]:
context_length = 5

names = WordSampling()
names.from_file("../resources/names.txt")

vocab_size = len(names.itos)
vocab_size

27

In [4]:
names.sample_batch(Split.TRAIN, 5, context_length)

(tensor([[18,  5, 22,  1,  0],
         [ 9, 25,  1, 14, 14],
         [ 1,  0,  5, 13,  2],
         [ 1, 11,  1, 18,  9],
         [ 5,  0, 26,  5,  4]]),
 tensor([20,  1,  5,  0,  4]))

### Neural net lib

same as last time

In [5]:
class SimpleNames(nn.Module):

    def __init__(self, embedding_size = 6, hidden_size = 100, context_length = 5, vocab_size = 27):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.context_length = context_length
        
        layer_dict = collections.OrderedDict([
            ('embed', nn.Embedding(vocab_size, embedding_size)),
            ('flatten', nn.Flatten(1)),
            ('feed_forward', nn.Linear(embedding_size*context_length, hidden_size, bias=True)),
            ('non_linearity', nn.Tanh()),
            ('logits', nn.Linear(hidden_size, vocab_size, bias=True)),
        ])

        nn.init.xavier_uniform_(layer_dict['feed_forward'].weight, gain=5/3)
        nn.init.zeros_(layer_dict['feed_forward'].bias)
        
        nn.init.zeros_(layer_dict['logits'].bias)

        self.model = nn.Sequential(
            layer_dict
        )
        self.layer_dict = layer_dict

    def forward(self, idx, targets=None):
        logits = self.model(idx)
    
        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, num_names):
        new_names = []
        for i in range(num_names):
            out = []
            ix = [[0 for _ in range(self.context_length)]]
            for nl in range(10):
                logits = self.model(torch.tensor(ix))

                p = F.softmax(logits, dim=1)
        
                prediction = torch.multinomial(p, num_samples=1).item()
    
                for i in range(self.context_length-1):
                    ix[0][i]= ix[0][i+1]
                    
                ix[0][self.context_length-1] = prediction
    
                if prediction == 0:
                    break
                out.append(names.itos[prediction])
                
            new_names.append("".join(out))

        return new_names

In [6]:
class TestRig:

    def __init__(self, embedding_size, hidden_size, context_length=5, learning_rate = .2):
        self.context_length = context_length
        
        model = SimpleNames(embedding_size, hidden_size, context_length)

        parameters = model.parameters()
        print(model.embedding_size, model.hidden_size, model.context_length, sum(p.nelement() for p in parameters)) # number of parameters in total

        self.model = model
        self.optimizer = optim.SGD(self.model.parameters(), lr=learning_rate, momentum=0.9)
        self.scheduler = optim.lr_scheduler.MultiplicativeLR(self.optimizer, lr_lambda=lmbda)
        
        self.track = {'gradients': [], 'loss': [], 'learning_rate': []}

    def train(self, epochs, batch_size, samples):
        for ep in range(epochs):
            epoch_loss = 0
            for s in range(samples):
                X, Y = names.sample_batch(Split.TRAIN, batch_size, self.context_length)
                logits, loss = self.model.forward(X, Y)
                
                epoch_loss += loss.item()
                self.model.zero_grad(set_to_none=True)
                loss.backward()
                self.optimizer.step()
        
            self.scheduler.step()
        
            if ep % 10 == 0:
                self.track['loss'].append(epoch_loss)
                self.track['learning_rate'].append(self.scheduler.get_last_lr())
                print("learning rate", ep, self.scheduler.get_last_lr())
                print("running loss", epoch_loss/samples)

    @torch.no_grad()
    def val_split(self, split):
        batch_length = self.context_length * 20

        batches = len(split) // batch_length

        val_loss = 0
        
        ix = [[0 for _ in range(self.context_length-1)]]
        for i in range(batches):
            offset = i * batch_length
            xs = []
            ys = []
            for bi in range(batch_length):
                xs.append(split[offset+bi:offset+bi+self.context_length])
                ys.append(split[offset+bi+self.context_length])
        
            X = torch.tensor(xs)
            Y = torch.tensor(ys)
    
            logits, loss = self.model.forward(X, Y)
        
            val_loss += loss
        
        return val_loss / batches

### Run new data

the model is the same but we've changed what the data looks like - it's a test to see what might happen with running a book instead of words through our model

Our running loss looks smaller but this is not the same as previously - the model 'learns' we have a lot more chance of 0s at the end of words and so the loss seems artifically smaller

We keep our evaluation model the same as before though so we can directly compare the results

In [7]:
# lets see it run first
epochs = 1
batch_size = 5
samples = 4

lmbda = lambda epoch: 0.98

In [8]:
tr = TestRig(3, 60)
tr.train(epochs, batch_size, samples)

3 60 5 2688
learning rate 0 [0.196]
running loss 3.441326081752777


In [9]:
# run for longer
epochs = 30
batch_size = 20
samples = 200

lmbda = lambda epoch: 0.98

In [10]:
tr = TestRig(3, 60)
tr.train(epochs, batch_size, samples)
print(tr.val_split(names.data[1]))
tr.model.generate(10)

3 60 5 2688
learning rate 0 [0.196]
running loss 2.956515581607819
learning rate 10 [0.16014627014995916]
running loss 2.599955712556839
learning rate 20 [0.13085116246399847]
running loss 2.581373880505562
tensor(2.4888)


['tunego',
 'yysea',
 'vydlerin',
 'aleal',
 'ysath',
 'ylata',
 'yylen',
 'dshak',
 'yykthyi',
 'yiam']

In [11]:
# now keep these values for a few runs
epochs = 140
batch_size = 80
samples = 2000

In [12]:
tr = TestRig(3, 90)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))
print(tr.val_split(names.data[1]))

3 90 5 3978
learning rate 0 [0.196]
running loss 2.438344991862774
learning rate 10 [0.16014627014995916]
running loss 2.339213436305523
learning rate 20 [0.13085116246399847]
running loss 2.3264740296006203
learning rate 30 [0.10691492659895763]
running loss 2.298908307373524
learning rate 40 [0.08735727917438633]
running loss 2.289029165148735
learning rate 50 [0.07137725729707488]
running loss 2.2648463631272318
learning rate 60 [0.058320415967655595]
running loss 2.2473923162817955
learning rate 70 [0.047652025973541665]
running loss 2.234460784971714
learning rate 80 [0.03893517461607997]
running loss 2.219443362057209
learning rate 90 [0.03181287241021722]
running loss 2.2139969464540483
learning rate 100 [0.025993432955371577]
running loss 2.1992474996447564
learning rate 110 [0.021238527225488715]
running loss 2.1922692571878435
learning rate 120 [0.017353423054287647]
running loss 2.1756745886802675
learning rate 130 [0.014179010084073873]
running loss 2.172455868780613
['elil

In [13]:
tr = TestRig(6, 120)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))
print(tr.val_split(names.data[1]))

6 120 5 7149
learning rate 0 [0.196]
running loss 2.420335504949093
learning rate 10 [0.16014627014995916]
running loss 2.322846375703812
learning rate 20 [0.13085116246399847]
running loss 2.281452134013176
learning rate 30 [0.10691492659895763]
running loss 2.2689787284731864
learning rate 40 [0.08735727917438633]
running loss 2.238318187892437
learning rate 50 [0.07137725729707488]
running loss 2.2216697558760643
learning rate 60 [0.058320415967655595]
running loss 2.2086603502035143
learning rate 70 [0.047652025973541665]
running loss 2.188600591659546
learning rate 80 [0.03893517461607997]
running loss 2.1705390880107878
learning rate 90 [0.03181287241021722]
running loss 2.148191851198673
learning rate 100 [0.025993432955371577]
running loss 2.138000304877758
learning rate 110 [0.021238527225488715]
running loss 2.1209690326452257
learning rate 120 [0.017353423054287647]
running loss 2.1127639092803
learning rate 130 [0.014179010084073873]
running loss 2.102838237166405
['rodik',

In [14]:
tr = TestRig(9, 120)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))
print(tr.val_split(names.data[1]))

9 120 5 9030
learning rate 0 [0.196]
running loss 2.42196000123024
learning rate 10 [0.16014627014995916]
running loss 2.310877370238304
learning rate 20 [0.13085116246399847]
running loss 2.269715944111347
learning rate 30 [0.10691492659895763]
running loss 2.248701598584652
learning rate 40 [0.08735727917438633]
running loss 2.2197221710681916
learning rate 50 [0.07137725729707488]
running loss 2.187728522002697
learning rate 60 [0.058320415967655595]
running loss 2.175956507444382
learning rate 70 [0.047652025973541665]
running loss 2.1497475920319555
learning rate 80 [0.03893517461607997]
running loss 2.1243366671204567
learning rate 90 [0.03181287241021722]
running loss 2.110474820911884
learning rate 100 [0.025993432955371577]
running loss 2.0954085856080056
learning rate 110 [0.021238527225488715]
running loss 2.0825293553471567
learning rate 120 [0.017353423054287647]
running loss 2.067236022174358
learning rate 130 [0.014179010084073873]
running loss 2.056269667327404
['powor'

In [15]:
tr = TestRig(12, 120)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))

print(tr.val_split(names.data[1]))

12 120 5 10911
learning rate 0 [0.196]
running loss 2.4420952622294427
learning rate 10 [0.16014627014995916]
running loss 2.2964034487009046
learning rate 20 [0.13085116246399847]
running loss 2.264759113907814
learning rate 30 [0.10691492659895763]
running loss 2.2270787361860274
learning rate 40 [0.08735727917438633]
running loss 2.202738694310188
learning rate 50 [0.07137725729707488]
running loss 2.169598310291767
learning rate 60 [0.058320415967655595]
running loss 2.142024384558201
learning rate 70 [0.047652025973541665]
running loss 2.1208778102993966
learning rate 80 [0.03893517461607997]
running loss 2.0935299550294877
learning rate 90 [0.03181287241021722]
running loss 2.0833990780711176
learning rate 100 [0.025993432955371577]
running loss 2.0572130019664763
learning rate 110 [0.021238527225488715]
running loss 2.041448968052864
learning rate 120 [0.017353423054287647]
running loss 2.021359495103359
learning rate 130 [0.014179010084073873]
running loss 2.008201594650745
['l

In [16]:
tr = TestRig(12, 200)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))

print(tr.val_split(names.data[1]))

12 200 5 17951
learning rate 0 [0.196]
running loss 2.4282220850586893
learning rate 10 [0.16014627014995916]
running loss 2.2907778553366662
learning rate 20 [0.13085116246399847]
running loss 2.2596419665217398
learning rate 30 [0.10691492659895763]
running loss 2.2339411221146586
learning rate 40 [0.08735727917438633]
running loss 2.199094163775444
learning rate 50 [0.07137725729707488]
running loss 2.175538654744625
learning rate 60 [0.058320415967655595]
running loss 2.147812264084816
learning rate 70 [0.047652025973541665]
running loss 2.122078421354294
learning rate 80 [0.03893517461607997]
running loss 2.103333488345146
learning rate 90 [0.03181287241021722]
running loss 2.076894433736801
learning rate 100 [0.025993432955371577]
running loss 2.0637942019701003
learning rate 110 [0.021238527225488715]
running loss 2.038330291569233
learning rate 120 [0.017353423054287647]
running loss 2.0355284547805788
learning rate 130 [0.014179010084073873]
running loss 2.02250409668684
['avy

In [17]:
# we still learn so lets learn longer - our 'epoch' is a lot longer than our previous samples size
epochs = 200
batch_size = 80
samples = 6000

In [18]:
tr = TestRig(9, 200)
tr.train(epochs, batch_size, samples)
print(tr.model.generate(10))

print(tr.val_split(names.data[1]))

9 200 5 14870
learning rate 0 [0.196]
running loss 2.365648023545742
learning rate 10 [0.16014627014995916]
running loss 2.3082727882862093
learning rate 20 [0.13085116246399847]
running loss 2.281898501376311
learning rate 30 [0.10691492659895763]
running loss 2.252478352944056
learning rate 40 [0.08735727917438633]
running loss 2.222281000216802
learning rate 50 [0.07137725729707488]
running loss 2.1884094891349473
learning rate 60 [0.058320415967655595]
running loss 2.159437515536944
learning rate 70 [0.047652025973541665]
running loss 2.136063412308693
learning rate 80 [0.03893517461607997]
running loss 2.1068271798292795
learning rate 90 [0.03181287241021722]
running loss 2.0895823192596437
learning rate 100 [0.025993432955371577]
running loss 2.0653802466988562
learning rate 110 [0.021238527225488715]
running loss 2.0406739804943403
learning rate 120 [0.017353423054287647]
running loss 2.031770083725452
learning rate 130 [0.014179010084073873]
running loss 2.0130070190032323
lear

### Starting to overfit

loss is going down, validation is staying the same

In [20]:
context_length = 4

names = WordSampling()
names.from_file("../resources/names.txt")

In [21]:
tr = TestRig(3, 12, context_length)
tr.train(20, batch_size, samples)
tr.model.generate(10)
print(tr.val_split(names.data[1]))

3 12 4 588
learning rate 0 [0.196]
running loss 2.4036873132387795
learning rate 10 [0.16014627014995916]
running loss 2.3767006654938063
tensor(2.3627)


In [22]:
tr = TestRig(3, 160, context_length)
tr.train(80, batch_size, samples)
tr.model.generate(10)
print(tr.val_split(names.data[1]))

3 160 4 6508
learning rate 0 [0.196]
running loss 2.4210894192854564
learning rate 10 [0.16014627014995916]
running loss 2.37120360147953
learning rate 20 [0.13085116246399847]
running loss 2.3515283158024154
learning rate 30 [0.10691492659895763]
running loss 2.3283791618148486
learning rate 40 [0.08735727917438633]
running loss 2.3095569537480674
learning rate 50 [0.07137725729707488]
running loss 2.2842877165873845
learning rate 60 [0.058320415967655595]
running loss 2.264364729265372
learning rate 70 [0.047652025973541665]
running loss 2.247017851014932
tensor(2.2481)


In [23]:
epochs = 120
batch_size = 80
samples = 1000

In [24]:
tr = TestRig(9, 200, context_length)
tr.train(epochs, batch_size, samples)
tr.model.generate(10)
print(tr.val_split(names.data[1]))

9 200 4 13070
learning rate 0 [0.196]
running loss 2.482522686958313
learning rate 10 [0.16014627014995916]
running loss 2.3205695670843123
learning rate 20 [0.13085116246399847]
running loss 2.2850529050827024
learning rate 30 [0.10691492659895763]
running loss 2.2699478480815887
learning rate 40 [0.08735727917438633]
running loss 2.2425458602905275
learning rate 50 [0.07137725729707488]
running loss 2.219335655212402
learning rate 60 [0.058320415967655595]
running loss 2.1955071012973786
learning rate 70 [0.047652025973541665]
running loss 2.1881264252662658
learning rate 80 [0.03893517461607997]
running loss 2.170803068161011
learning rate 90 [0.03181287241021722]
running loss 2.1507494719028473
learning rate 100 [0.025993432955371577]
running loss 2.14399935901165
learning rate 110 [0.021238527225488715]
running loss 2.1214653807878494
tensor(2.1566)


In [25]:
tr.model.generate(10)

['hoson',
 'dasille',
 'hamah',
 'hrishy',
 'hamiah',
 'horoy',
 'zaja',
 'omberly',
 'harvin',
 'cadaleen']

In [26]:
epochs = 180
batch_size = 80
samples = 4000

tr = TestRig(12, 220, context_length, 0.4)
tr.train(epochs, batch_size, samples)
tr.model.generate(10)

12 220 4 17071
learning rate 0 [0.392]
running loss 2.519497916162014
learning rate 10 [0.3202925402999183]
running loss 2.4586433222293853
learning rate 20 [0.26170232492799694]
running loss 2.4135772078037263
learning rate 30 [0.21382985319791525]
running loss 2.373646735459566
learning rate 40 [0.17471455834877267]
running loss 2.3359570707380772
learning rate 50 [0.14275451459414976]
running loss 2.2964313144385815
learning rate 60 [0.11664083193531119]
running loss 2.2677467233836652
learning rate 70 [0.09530405194708333]
running loss 2.2373385948836804
learning rate 80 [0.07787034923215994]
running loss 2.2044726055562496
learning rate 90 [0.06362574482043444]
running loss 2.1723292561769485
learning rate 100 [0.051986865910743155]
running loss 2.1487865211069583
learning rate 110 [0.04247705445097743]
running loss 2.1254321322739123
learning rate 120 [0.034706846108575294]
running loss 2.1004812537431716
learning rate 130 [0.028358020168147747]
running loss 2.084448489189148
lea

['zabrikondo',
 'vedyanjoli',
 'rifelsi',
 'jazor',
 'kaedricia',
 'wirus',
 'kabria',
 'zaeden',
 'releina',
 'bitha']

In [27]:
tr.model.generate(10)

['penettick',
 'raci',
 'zibellae',
 'metiyanus',
 'breyaris',
 'meika',
 'azaliana',
 'sah',
 'benra',
 'quanna']

In [28]:
print(tr.val_split(names.data[1]))

tensor(2.1208)
