In [2]:
import numpy as np
import random
import torch
import torch.nn.functional as F

In [3]:
words = open('words.txt', 'r').read().splitlines()

In [5]:
random.sample(words, 10)

['welldone',
 'pliancies',
 'tridiapason',
 'overshadows',
 'commutant',
 'azteca',
 'chaksi',
 'spherelike',
 'unworkmanlike',
 'narrowhearted']

In [6]:
b = {}
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    bigram = (ch1, ch2)
    b[bigram] = b.get(bigram, 0) + 1

In [7]:
N = torch.zeros((27, 27), dtype=torch.int32)

In [8]:
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

In [50]:
for k, v in b.items():
    s1, s2 = k
    i1 = stoi[s1]
    i2 = stoi[s2]
    N[i1, i2] = v

In [51]:
P = (N + 0.01).float()
P /= P.sum(1, keepdims=True)

In [54]:
for i in range(10):
  
  out = []
  idx = 0
  while True:
    p = P[idx]
    idx = torch.multinomial(p, num_samples=1, replacement=True).item()
    if idx == 0:
      break
    out.append(itos[idx])

  print(''.join(out))

malell
orrtirm
e
inalipelicalabros
bliabi
nggutidry
pletons
dizickhim
as
petolyt


In [52]:
log_likelihood = 0.0
n = 0

for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    log_likelihood += logprob
    n += 1

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
print(f'{nll/n}')

log_likelihood=tensor(-9877469.)
nll=tensor(9877469.)
2.55574369430542


In [11]:
## optimisation

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F


num_classes = 27  
embedding_dim = 10  


class TrigramModel(nn.Module):
    def __init__(self):
        super(TrigramModel, self).__init__()
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.fc = nn.Linear(embedding_dim * 2, num_classes)  # Linear layer to predict the next character

    def forward(self, x):
        embedded = self.embedding(x)  # [batch_size, 2, embedding_dim]
        # Flatten the embedded representations
        flat = embedded.view(-1, embedding_dim * 2)
        # Pass through linear layer
        logits = self.fc(flat)
        return logits


In [None]:
# Assuming 'words' and 'stoi' are predefined
xs, ys = [], []
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2, ch3 in zip(chs, chs[1:], chs[2:]):
        ix1 = stoi[ch1]
        ix2 = stoi[ch2]
        ix3 = stoi[ch3]
        xs.append([ix1, ix2])
        ys.append(ix3)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

model = TrigramModel()

In [43]:
batch_size = 100
epochs = 10
learning_rate = 0.05

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()
for epoch in range(epochs):
    permutation = torch.randperm(xs.size(0))
    for i in range(0, xs.size(0), batch_size):
        indices = permutation[i:i + batch_size]
        batch_xs, batch_ys = xs[indices], ys[indices]

        optimizer.zero_grad()
        logits = model(batch_xs)
        loss = F.cross_entropy(logits, batch_ys)
        loss.backward()
        optimizer.step()

        if i % (5 * batch_size) == 0:
            print(f"Epoch {epoch+1}, Batch {i//batch_size}, Loss: {loss.item()}")

Epoch 1, Batch 0, Loss: 2.442152738571167
Epoch 1, Batch 5, Loss: 2.346538782119751
Epoch 1, Batch 10, Loss: 2.8464255332946777
Epoch 1, Batch 15, Loss: 2.656933069229126
Epoch 1, Batch 20, Loss: 2.688889741897583
Epoch 1, Batch 25, Loss: 2.5269250869750977
Epoch 1, Batch 30, Loss: 2.463308811187744
Epoch 1, Batch 35, Loss: 2.5352327823638916
Epoch 1, Batch 40, Loss: 2.4394562244415283
Epoch 1, Batch 45, Loss: 2.367760181427002
Epoch 1, Batch 50, Loss: 2.522883176803589
Epoch 1, Batch 55, Loss: 2.543513774871826
Epoch 1, Batch 60, Loss: 2.388136386871338
Epoch 1, Batch 65, Loss: 2.338264226913452
Epoch 1, Batch 70, Loss: 2.504287004470825
Epoch 1, Batch 75, Loss: 2.586658239364624
Epoch 1, Batch 80, Loss: 2.4339520931243896
Epoch 1, Batch 85, Loss: 2.5567069053649902
Epoch 1, Batch 90, Loss: 2.391690969467163
Epoch 1, Batch 95, Loss: 2.251502275466919
Epoch 1, Batch 100, Loss: 2.378011703491211
Epoch 1, Batch 105, Loss: 2.6849241256713867
Epoch 1, Batch 110, Loss: 2.7117834091186523
Ep

In [48]:
import torch
import torch.nn.functional as F

def generate_word(model, stoi, itos):

    current_chars = ['.'] + random.sample(chars, 1)

    while True:  
        input_ix = [stoi[current_chars[-2]], stoi[current_chars[-1]]]
        input_tensor = torch.tensor([input_ix], dtype=torch.long)

        logits = model(input_tensor)
        probs = F.softmax(logits, dim=-1)

        next_char_ix = torch.multinomial(probs, num_samples=1).item()
        next_char = itos[next_char_ix]

        if next_char == '.':
            break
        current_chars.append(next_char)

    # Join all characters to form the word, skipping the initial '.'
    return ''.join(current_chars[1:])

# Generate a few words
for _ in range(10):
    word = generate_word(model, stoi, itos)
    print(word)


peteli
lesepreteforydawic
brene
jgeste
prous
wuntun
nondaunmonelliomheitaon
getditly
inistercteud
xee
