### ok lets play with our new ideas

we could add a dimension, change the layers etc

In [1]:
import torch
import torch.nn.functional as F
import torch.nn as nn

lets add another layer - we should still be able to plot it

In [2]:
# our embedding space
dims = 3
context_length = 5
vocab_size = 3
es = torch.randn((vocab_size, dims))

In [3]:
# show the embedding space an the 'a' co-ordinates (Transpose es just to save display space)
es.shape, es[0,0], es[1]

(torch.Size([3, 3]), tensor(-0.0184), tensor([ 0.3639,  0.3935, -2.1978]))

In [4]:
with open("names.txt", "r") as r:
    names = ["." + f + "." for f in r.read().split()]

import string
letters = [l for l in string.ascii_lowercase]

itos = {0: "."}
stoi = {".": 0}

for i, l in enumerate(letters):
    offset = i+1
    stoi[l] = offset
    itos[offset] = l


In [5]:
stoi['a'], len(names)

(1, 32033)

In [6]:
import random
from random import randrange

names_length = len(names)

def sample_names(size=5):
    batch_names = []
    for i in range(size):
        ni = randrange(names_length-1)
        name = names[ni]
        batch_names.append(name)
    return batch_names

sample_names(2)

['.haze.', '.rufino.']

In [7]:
def word_contexts(word):
    samples = []
    max_length = len(word)
    fill = '.' * context_length
    for i in range(1,max_length):
        st = max(0, i-context_length)
        filled = fill[i:] + word[st:i]
        samples.append(filled[:context_length])
    return samples
word_contexts(".timothy.")

['.....', '....t', '...ti', '..tim', '.timo', 'timot', 'imoth', 'mothy']

In [8]:
def get_xys(samples):
    xs, ys = [], []
    for s in samples:
        for ctx in word_contexts(s):
            x =  [stoi[c] for c in ctx]
            xs.append(x)
        y = [stoi[c] for c in s[1:]]
        ys += y

    return xs, ys

In [9]:
## better sample with train/dev/test split
import math
# shuffle names
from random import shuffle

shuffle(names)

x_names, y_names = get_xys(names)
samples_length = len(y_names)
train_length = math.floor(samples_length * .8)
dev_offset = math.floor(samples_length * .9)

dev = list(zip(x_names[train_length: dev_offset], y_names[train_length: dev_offset]))
test = list(zip(x_names[dev_offset:], y_names[dev_offset:]))

train = list(zip(x_names[:train_length], y_names[:train_length]))

def sample_names(sample_size = 5, p = train):
    samples = random.sample(p, sample_size)
    x_samples = [s[0] for s in samples]
    y_samples = [s[1] for s in samples]
    return x_samples, y_samples

In [10]:
xs, ys = sample_names()

xs[0], ys[0]

([2, 18, 15, 14, 23], 25)

## OK lets run this and see what happens...

In [11]:
hidden_layer_size = 160
dims = 3

es = torch.randn((vocab_size, dims))

es = torch.randn((27, dims), requires_grad=True)
W1 = torch.randn((dims*context_length, hidden_layer_size), requires_grad=True)
b1 = torch.randn(hidden_layer_size, requires_grad=True)
W2 = torch.randn((hidden_layer_size, 27), requires_grad=True)
b2 = torch.randn(27, requires_grad=True)

parameters = [es, W1, b1, W2, b2]

In [13]:
names_length = len(names)
epochs = 1
batch_size = 400
learning_rate = .1
sample_loops = 200 #in future we want our epoch to roughly sample everything - names_length / batch_size

In [14]:
for ep in range(epochs):
    epoch_loss = 0
    for s in range(sample_loops):
        x, y = sample_names(batch_size)
        X = torch.tensor(x)
        Y = torch.tensor(y)
        train = es[X.view(-1)]
        outputL1 = torch.relu(train.view(-1, dims * context_length) @ W1 + b1)
        
        logits = outputL1 @ W2 + b2

        loss = F.cross_entropy(logits, Y)

        epoch_loss += loss

        for p in parameters:
            p.grad = None
        
        loss.backward()

        for p in parameters:
            p.data -= learning_rate * p.grad

    if ep == 61:
        learning_rate = .01

    if ep % 10 == 0:
        print(epoch_loss/sample_loops)
        learning_rate *= .95
        print(ep, learning_rate)

tensor(6.5874, grad_fn=<DivBackward0>)
0 0.095


In [15]:
def generate_names(num_names):
    for i in range(num_names):
        out = []
        ix = [0, 0, 0, 0, 0]
        for nl in range(10):
            xenc = es[ix]

            outputL1 = torch.relu(xenc.view(-1, dims * context_length) @ W1 + b1)
            logits = outputL1 @ W2 + b2
            
            p = F.softmax(logits, dim=1)
    
            # ## torch.multinomial pulls out an index in p (num_samples=1) by sampling from the elements in p according to their probabilities 
            # ## (p is normalized in the softmax above)
            prediction = torch.multinomial(p, num_samples=1).item()
            
            ix = [ix[1], ix[2], ix[3], ix[4], prediction]

            if prediction == 0:
                break
            out.append(itos[prediction])
            
        print("".join(out))

generate_names(1)

cvdikamnse


In [16]:
generate_names(15)

deelni
iacdieod
raytyeneah
caibunntik
cabnacnsan
wbbrghnh
naivu
boodeaaasi
jaras
iahans
wayiaidnan
welie
badar
cabdnissgi
khltijalen


In [17]:
import matplotlib.pyplot as plt
from ipywidgets import interact

In [18]:
x = es[1:].T[0].tolist()
y = es[1:].T[1].tolist()
z = es[1:].T[2].tolist()
n = letters

def interactive_plot(elev, azim, zoom):
    fig = plt.figure()
    ax = plt.axes(projection='3d')

    ax.view_init(elev, azim)
    ax.set_box_aspect((1, 1, 1), zoom=zoom)
    
    ax.scatter(x, y, z)
    
    for i, txt in enumerate(n):
        # ax.annotate(txt, (x[i], y[i], z[i]))
        ax.text(x[i], y[i], z[i], txt, color='blue')


interact(interactive_plot, elev=(1, 45, 0.1), azim=(1, 70, 0.1), zoom=(0, 4, 0.01))

interactive(children=(FloatSlider(value=23.0, description='elev', max=45.0, min=1.0), FloatSlider(value=35.0, …

<function __main__.interactive_plot(elev, azim, zoom)>

In [168]:
x = es[1:].T[0].tolist()
y = es[1:].T[1].tolist()
z = es[1:].T[2].tolist()
n = letters

def interactive_plot(elev, azim, zoom):
    fig = plt.figure()
    ax = plt.axes(projection='3d')

    ax.view_init(elev, azim)
    ax.set_box_aspect((1, 1, 1), zoom=zoom)
    
    ax.scatter(x, y, z)
    
    for i, txt in enumerate(n):
        # ax.annotate(txt, (x[i], y[i], z[i]))
        ax.text(x[i], y[i], z[i], txt, color='blue')


interact(interactive_plot, elev=(1, 45, 0.1), azim=(1, 70, 0.1), zoom=(0, 4, 0.01))

interactive(children=(FloatSlider(value=23.0, description='elev', max=45.0, min=1.0), FloatSlider(value=35.0, …

<function __main__.interactive_plot(elev, azim, zoom)>

## Ok even more dimensions 

we'll lose our visualizations, but we can come back to that 

In [19]:
dimsb = 7
context_length = 5
nonlin='relu'

In [20]:
hidden_layer_size = 200

esb = torch.randn((27, dimsb))
W1b = torch.randn((dimsb*context_length, hidden_layer_size))

nn.init.kaiming_normal_(W1b, nonlinearity=nonlin)

b1b = torch.randn(hidden_layer_size) * 0.001

# bnb = nn.BatchNorm2d(hidden_layer_size)

W2b = torch.randn((hidden_layer_size, 27)) / (dimsb*context_length)**0.5
b2b = torch.randn(27) * 0.001

parametersb = [esb, W1b, b1b, W2b, b2b]
for p in parametersb:
    p.requires_grad=True

In [21]:
names_length = len(names)
epochs = 200
batch_size = 820
learning_rate = .1
sample_loops = 800

In [22]:
esb.shape

torch.Size([27, 7])

In [23]:
x, y = sample_names(32)
batched = esb[x]
batched.shape

torch.Size([32, 5, 7])

In [24]:
for ep in range(epochs):
    epoch_loss = 0
    for s in range(sample_loops):
        x, y = sample_names(batch_size)
        Y = torch.tensor(y)
        train = esb[x]

        preact = train.view(-1, dimsb * context_length) @ W1b + b1b
        # add learnable normalization?
        # preact = (preact - preact.mean(0, keepdim=True))/preact.std(0, keepdim=True)
        
        if nonlin == 'relu':
            outputL1 = torch.relu(preact)
        else:
            outputL1 = torch.tanh(preact)
        
        logits = outputL1 @ W2b + b2b

        lossb = F.cross_entropy(logits, Y)

        epoch_loss += lossb

        for p in parametersb:
            p.grad = None
        
        lossb.backward()

        for p in parametersb:
            p.data -= learning_rate * p.grad

    if ep == 111:
        learning_rate = .01

    if ep == 161:
        learning_rate = .001

    if ep % 10 == 0:
        print(epoch_loss/sample_loops)
        learning_rate *= .95
        print(ep, learning_rate)

tensor(2.4201, grad_fn=<DivBackward0>)
0 0.095
tensor(2.0981, grad_fn=<DivBackward0>)
10 0.09025
tensor(2.0543, grad_fn=<DivBackward0>)
20 0.0857375
tensor(2.0300, grad_fn=<DivBackward0>)
30 0.08145062499999998
tensor(2.0161, grad_fn=<DivBackward0>)
40 0.07737809374999999
tensor(2.0064, grad_fn=<DivBackward0>)
50 0.07350918906249998
tensor(1.9963, grad_fn=<DivBackward0>)
60 0.06983372960937498
tensor(1.9919, grad_fn=<DivBackward0>)
70 0.06634204312890622
tensor(1.9873, grad_fn=<DivBackward0>)
80 0.0630249409724609
tensor(1.9849, grad_fn=<DivBackward0>)
90 0.05987369392383786
tensor(1.9758, grad_fn=<DivBackward0>)
100 0.05688000922764597
tensor(1.9773, grad_fn=<DivBackward0>)
110 0.05403600876626367
tensor(1.9685, grad_fn=<DivBackward0>)
120 0.0095
tensor(1.9681, grad_fn=<DivBackward0>)
130 0.009025
tensor(1.9684, grad_fn=<DivBackward0>)
140 0.00857375
tensor(1.9671, grad_fn=<DivBackward0>)
150 0.0081450625
tensor(1.9643, grad_fn=<DivBackward0>)
160 0.007737809374999999
tensor(1.9628, g

In [54]:
(outputL1[:40]==0).shape

torch.Size([40, 200])

In [55]:
dev_x = [s[0] for s in dev]
dev_y = torch.tensor([s[1] for s in dev])
val = esb[dev_x]
val1 = torch.relu(val.view(-1, dimsb * context_length) @ W1b + b1b)

val_logits = val1 @ W2b + b2b

val_loss = F.cross_entropy(val_logits, dev_y)
val_loss

tensor(2.0639, grad_fn=<NllLossBackward0>)

some train/val losses

1.9696 / 2.0717


In [290]:
def generate_namesb(num_names):
    for i in range(num_names):
        out = []
        ix = [0, 0, 0, 0, 0]
        for nl in range(10):
            xenc = esb[ix]

            outputL1 = torch.relu(xenc.view(-1, dimsb * context_length) @ W1b + b1b)
            logits = outputL1 @ W2b + b2b
            
            p = F.softmax(logits, dim=1)
    
            # ## torch.multinomial pulls out an index in p (num_samples=1) by sampling from the elements in p according to their probabilities 
            # ## (p is normalized in the softmax above)
            prediction = torch.multinomial(p, num_samples=1).item()
            
            ix = [ix[1], ix[2], ix[3], ix[4], prediction]

            if prediction == 0:
                break
            out.append(itos[prediction])
            
        print("".join(out))

generate_namesb(1)

daraya


In [304]:
generate_namesb(15)

kolamaraes
myliea
skom
rozaluna
alfeeir
kalmeem
rick
jais
ingumadl
demontee
chryden
zulah
eaydeniel
kilor
rina


In [238]:
n = letters

def interactive_plot(elev, azim, zoom, dim1, dim2, dim3):

    x = esb[1:].T[dim1].tolist()
    y = esb[1:].T[dim2].tolist()
    z = esb[1:].T[dim3].tolist()
    
    fig = plt.figure()
    ax = plt.axes(projection='3d')

    ax.view_init(elev, azim)
    ax.set_box_aspect((1, 1, 1), zoom=zoom)
    
    ax.scatter(x, y, z)
    
    for i, txt in enumerate(n):
        # ax.annotate(txt, (x[i], y[i], z[i]))
        ax.text(x[i], y[i], z[i], txt, color='blue')


interact(interactive_plot, elev=(1, 45, 0.1), azim=(1, 70, 0.1), zoom=(0, 4, 0.01),  dim1=(0, 4, 1), dim2=(1, 6, 2), dim3=(1, 9, 2))

interactive(children=(FloatSlider(value=23.0, description='elev', max=45.0, min=1.0), FloatSlider(value=35.0, …

<function __main__.interactive_plot(elev, azim, zoom, dim1, dim2, dim3)>

In [296]:
n = letters

def interactive_plot(elev, azim, zoom, dim1, dim2, dim3):

    x = esb[1:].T[dim1].tolist()
    y = esb[1:].T[dim2].tolist()
    z = esb[1:].T[dim3].tolist()
    
    fig = plt.figure()
    ax = plt.axes(projection='3d')

    ax.view_init(elev, azim)
    ax.set_box_aspect((1, 1, 1), zoom=zoom)
    
    ax.scatter(x, y, z)
    
    for i, txt in enumerate(n):
        # ax.annotate(txt, (x[i], y[i], z[i]))
        ax.text(x[i], y[i], z[i], txt, color='blue')


interact(interactive_plot, elev=(1, 45, 0.1), azim=(1, 70, 0.1), zoom=(0, 4, 0.01),  dim1=(0, 4, 1), dim2=(1, 6, 2), dim3=(1, 9, 2))

interactive(children=(FloatSlider(value=23.0, description='elev', max=45.0, min=1.0), FloatSlider(value=35.0, …

<function __main__.interactive_plot(elev, azim, zoom, dim1, dim2, dim3)>

Just by eye-balling 10 dimensions this way it looks like some are pretty similar 
- maybe we're not getting much from them in this simple space
- reduce and try again