In [2]:
# Load the dataset: one name per line.
# The context manager guarantees the file handle is closed once the names are read,
# and the explicit encoding keeps the result independent of the platform default.
with open('../names.txt', 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
words[:10]

['emma',
 'olivia',
 'ava',
 'isabella',
 'sophia',
 'charlotte',
 'mia',
 'amelia',
 'harper',
 'evelyn']

In [3]:
# dataset size, followed by the shortest and the longest name length
len(words), min(map(len, words)), max(map(len, words))

(32033, 2, 15)

### Bigram language model: given only a single character, predict the next one.

In [4]:
# Count how often each (current, next) pair of tokens occurs across all names.
b = {}
for w in words:
    # wrap the name in explicit start / end markers so the first and last letters form bigrams too
    tokens = ['<S>', *w, '<E>']
    for pair in zip(tokens, tokens[1:]):
        b[pair] = b.get(pair, 0) + 1

In [5]:
# most frequent bigrams first (sorting on the negated count keeps ties in their original order)
sorted(b.items(), key=lambda kv: -kv[1])

[(('n', '<E>'), 6763),
 (('a', '<E>'), 6640),
 (('a', 'n'), 5438),
 (('<S>', 'a'), 4410),
 (('e', '<E>'), 3983),
 (('a', 'r'), 3264),
 (('e', 'l'), 3248),
 (('r', 'i'), 3033),
 (('n', 'a'), 2977),
 (('<S>', 'k'), 2963),
 (('l', 'e'), 2921),
 (('e', 'n'), 2675),
 (('l', 'a'), 2623),
 (('m', 'a'), 2590),
 (('<S>', 'm'), 2538),
 (('a', 'l'), 2528),
 (('i', '<E>'), 2489),
 (('l', 'i'), 2480),
 (('i', 'a'), 2445),
 (('<S>', 'j'), 2422),
 (('o', 'n'), 2411),
 (('h', '<E>'), 2409),
 (('r', 'a'), 2356),
 (('a', 'h'), 2332),
 (('h', 'a'), 2244),
 (('y', 'a'), 2143),
 (('i', 'n'), 2126),
 (('<S>', 's'), 2055),
 (('a', 'y'), 2050),
 (('y', '<E>'), 2007),
 (('e', 'r'), 1958),
 (('n', 'n'), 1906),
 (('y', 'n'), 1826),
 (('k', 'a'), 1731),
 (('n', 'i'), 1725),
 (('r', 'e'), 1697),
 (('<S>', 'd'), 1690),
 (('i', 'e'), 1653),
 (('a', 'i'), 1650),
 (('<S>', 'r'), 1639),
 (('a', 'm'), 1634),
 (('l', 'y'), 1588),
 (('<S>', 'l'), 1572),
 (('<S>', 'c'), 1542),
 (('<S>', 'e'), 1531),
 (('j', 'a'), 1473),
 (

In [6]:
import torch

In [7]:
# small 3x5 integer tensor of zeros, just to get a feel for the tensor API
a = torch.zeros(3, 5, dtype=torch.int32)
a

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int32)

In [8]:
# write a single element: row 1, column 3
a[1][3] = 1
a

tensor([[0, 0, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0]], dtype=torch.int32)

In [9]:
# 27x27 count matrix: 26 letters plus one special token, for both the current and the next character
N = torch.zeros(27, 27, dtype=torch.int32)

In [10]:
# alphabet of the dataset: every distinct character that appears in any name, in sorted order
chars = sorted({ch for w in words for ch in w})
chars[:10], len(chars)

(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], 26)

In [11]:
# character -> index; letters take 1..26 so that index 0 is free for the special '.' token
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# index -> character, the inverse lookup
itos = {idx: ch for ch, idx in stoi.items()}

In [12]:
# Count every bigram into the matrix: N[i, j] is how many times character j follows character i.
# N is (re)created here so that re-running this cell starts from zero instead of adding
# a second round of counts on top of the previous run.
N = torch.zeros((len(stoi), len(stoi)), dtype=torch.int32)
for w in words:
    chrs = ['.'] + list(w) + ['.'] # '.' marks both the start and the end of a name
    for ch1, ch2 in zip(chrs, chrs[1:]):
        N[stoi[ch1], stoi[ch2]] += 1

In [13]:
# top-left corner of the count matrix
N[0:4, 0:4]

tensor([[   0, 4410, 1306, 1542],
        [6640,  556,  541,  470],
        [ 114,  321,   38,    1],
        [  97,  815,    0,   42]], dtype=torch.int32)

In [14]:
# Next we turn these counts into probability distributions (one per row). We will then sample letters from the matrix
# according to how likely each one is to appear as the first character or to follow a given character.

In [15]:
# a seeded generator makes every random draw below repeatable
g = torch.Generator()
g.manual_seed(1337)
# toy example: three random numbers, rescaled so that they sum to 1 and form a probability distribution
p = torch.rand(3, generator=g)
p = p.div(p.sum())
p

tensor([0.0654, 0.4140, 0.5205])

In [16]:
# torch.multinomial works much like a weighted random.choice:
#   p                -> the probability distribution to sample from
#   num_samples=1    -> draw a single index
#   replacement=True -> the same index may be drawn more than once
#   generator=g      -> reuse the seeded generator so the draw is deterministic
ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
itos[ix]

'a'

In [17]:
# float copy of the counts, then divide each row by its own total so every row sums to 1
P = N.to(torch.float32)
P /= P.sum(dim=1, keepdim=True) # in place, which avoids allocating a second matrix as P = P / ... would

In [18]:
# sanity check: a normalised row must add up to 1
torch.sum(P[0])

tensor(1.)

In [19]:
# sample 30 names from the count-based bigram model

for sample_no in range(30):
    letters = []
    ix = 0 # every name starts from the '.' token
    while True:
        p = P[ix] # distribution over the next character, given the current one
        ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
        letters.append(itos[ix])
        if ix == 0: # '.' was drawn again, so the name is finished
            break

    print(''.join(letters))

brynigaian.
thenoian.
elan.
ssi.
e.
kelyoheto.
ceky.
mekaivyadialede.
hi.
j.
kirann.
a.
zunna.
man.
khy.
apho.
l.
madr.
kiruliravolon.
s.
ha.
tikr.
zae.
waajerizle.
hashelian.
kicka.
faniba.
dennimia.
bamauch.
aeozonali.


In [20]:
# Now we have a model that we can evaluate, and we will evaluate it with a classic loss for probabilistic models: the log-likelihood.
# The likelihood is the product of all the probabilities the model assigns to the bigrams in the data; the log-likelihood is its log, i.e. the sum of the log-probabilities.
# log is monotonic and maps probabilities in (0, 1] to (-inf, 0]: it is 0 when the probability is 1 and goes to -inf as the probability approaches 0.
# Because a loss is something we minimise, we negate it, so we get a value that is 0 when the model is perfect and grows towards +inf as it gets worse.
# To get a sense of how well we are doing across the whole dataset, we average over all bigrams: the average negative log-likelihood.
# In practice, we will be minimising the average negative log-likelihood.

In [21]:
# Accumulate the log-probability the model assigns to every bigram in the data.
log_likelihood = 0
n = 0

# tip: iterate over a short list such as ['hugo'] instead of `words` to inspect a single name
for w in words:
    chs = ['.', *w, '.']
    for prev_ch, next_ch in zip(chs, chs[1:]):
        log_likelihood += P[stoi[prev_ch], stoi[next_ch]].log()
        n += 1

print(f'{log_likelihood=}')
neg_logl = -log_likelihood
print(f'{neg_logl=}')
print(f'{neg_logl/n}')

log_likelihood=tensor(-559891.7500)
neg_logl=tensor(559891.7500)
2.454094171524048


In [22]:
# Now we want to build the same bigram model as a neural network.
# Instead of counting, the network's weights play the role of the probability matrix we had before,
# but we will obtain them by training (gradient descent) and should still end up in a very similar spot.

In [23]:
# build the training set: one (input index, target index) pair per bigram
xs, ys = [], []

for w in words:
    ids = [stoi[ch] for ch in f'.{w}.']
    xs.extend(ids[:-1]) # every token except the last one is an input ...
    ys.extend(ids[1:])  # ... and the token right after it is the target

xs = torch.tensor(xs) # inputs seen in the training set
ys = torch.tensor(ys) # expected output for each of those inputs

In [24]:
# 'emma' yields 5 bigrams: when the input is 0 ('.') we expect output 5 ('e'), and so on
words[0:1], xs[0:5], ys[0:5]

(['emma'], tensor([ 0,  5, 13, 13,  1]), tensor([ 5, 13, 13,  1,  0]))

In [25]:
# integer indices cannot be fed to a NN directly: one-hot encode them into 27-long float vectors
import torch.nn.functional as F
xenc = F.one_hot(xs, num_classes=27).to(torch.float32) # one_hot returns integers, the network needs floats
yenc = F.one_hot(ys, num_classes=27).to(torch.float32)
xenc.size(), yenc.size()

(torch.Size([228146, 27]), torch.Size([228146, 27]))

In [26]:
# the five one-hot rows that encode the inputs of the first name
xenc[0:5]

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [27]:
# random initial weights for 27 neurons, each of which receives 27 inputs
g = torch.Generator()
g.manual_seed(1337)
# requires_grad=True makes pytorch track operations on W so that gradients are available for the backward pass
W = torch.randn(27, 27, generator=g, requires_grad=True)

In [28]:
# forward pass on the five bigrams of the first word
logits = torch.matmul(xenc[:5], W) # logits, interpreted as log-counts
counts = torch.exp(logits) # positive "counts", the equivalent of N
probs = counts / counts.sum(dim=1, keepdim=True) # each row normalised into next-character probabilities
# exponentiating and then normalising each row is exactly the softmax

In [29]:
# one 27-way distribution per input example
probs.size()

torch.Size([5, 27])

In [30]:
# average negative log likelihood for the first word:
# pick, for each of the 5 examples, the probability given to the correct next character
loss = -torch.log(probs[torch.arange(5), ys[:5]]).mean()
loss

tensor(3.5165, grad_fn=<NegBackward0>)

In [31]:
# backward pass
W.grad = None # clear any gradient left from an earlier backward() call, since pytorch accumulates into .grad
loss.backward() # backpropagate from the loss; this fills in W.grad

In [32]:
# the gradient has one entry per weight
W.size(), W.grad.size()

(torch.Size([27, 27]), torch.Size([27, 27]))

In [33]:
# we perform the update! one gradient-descent step with 0.1 as learning rate.
# torch.no_grad() is the supported way to modify a tensor that requires grad in place;
# going through .data works too but hides the change from autograd's bookkeeping.
with torch.no_grad():
    W -= 0.1 * W.grad

In [36]:
# let's now put everything in place and run the training with all the examples, not just the first word
epochs = 1000 # we set a high number of epochs
lr = 50 # we set a super high learning rate to converge faster

# randomly init 27 neurons' weights. each neuron receives 27 inputs
g = torch.Generator().manual_seed(1337)
W = torch.randn((27, 27), generator=g, requires_grad=True) # requires_grad=True tells pytorch to track gradients for the backward pass

# the inputs and the row indices never change between epochs, so build them once instead of on every iteration
xenc = F.one_hot(xs, num_classes=27).float()
rows = torch.arange(xs.nelement())

for epoch in range(epochs):

    # forward pass
    logits = xenc @ W # log-counts
    counts = logits.exp()
    probs = counts / counts.sum(1, keepdim=True) # softmax over each row
    loss = -probs[rows, ys].log().mean() # average negative log likelihood of the correct next characters

    if epoch % 5 == 0:
        print(loss.item())

    # backward pass
    W.grad = None # reset, otherwise gradients would accumulate across epochs
    loss.backward()

    # update: gradient-descent step, done under no_grad so the update itself is not tracked by autograd
    with torch.no_grad():
        W -= lr * W.grad

3.8462040424346924
2.8340163230895996
2.6733038425445557
2.6076998710632324
2.571129322052002
2.5478336811065674
2.5317351818084717
2.5199472904205322
2.5109522342681885
2.5038933753967285
2.4982402324676514
2.4936394691467285
2.489841938018799
2.4866678714752197
2.4839823246002197
2.481684923171997
2.479698419570923
2.477962017059326
2.476430892944336
2.475069999694824
2.473851442337036
2.4727532863616943
2.4717586040496826
2.470853567123413
2.470026969909668
2.469268798828125
2.468571662902832
2.4679296016693115
2.4673352241516113
2.466784715652466
2.466273784637451
2.4657976627349854
2.4653546810150146
2.4649405479431152
2.464552879333496
2.4641902446746826
2.4638490676879883
2.463528871536255
2.4632275104522705
2.4629433155059814
2.462674856185913
2.4624218940734863
2.46218204498291
2.4619550704956055
2.461740016937256
2.4615352153778076
2.461341142654419
2.4611563682556152
2.4609804153442383
2.4608123302459717
2.4606521129608154
2.4604992866516113
2.460352659225464
2.4602127075195

In [37]:
# inference: sample names from the trained neural net
g = torch.Generator().manual_seed(1337)

for sample_no in range(5):
    letters = []
    ix = 0 # start from the '.' token

    while True:
        # same forward pass as in training, for a single input character
        xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
        logits = torch.matmul(xenc, W)
        counts = torch.exp(logits)
        p = counts / counts.sum(dim=1, keepdim=True)

        ix = torch.multinomial(p, 1, replacement=True, generator=g).item()
        letters.append(itos[ix])
        if ix == 0: # '.' drawn: the name is complete
            break
    print(''.join(letters))

gun.
kaneliy.
dy.
exylell.
eleleahmariss.
