In [1]:
import numpy as np
import random
import torch
import torch.nn.functional as F
import requests

In [2]:
# Download the word list found at `url`; each line of the response body
# becomes one entry of `words`.
url = "https://www.mit.edu/~ecprice/wordlist.10000"
# requests has no default timeout, so without one a stalled connection would
# hang this cell indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of treating an error page as words.
response.raise_for_status()
words = response.text.splitlines()

In [3]:
# Display ten randomly chosen entries of `words` as a quick sanity check.
random.sample(words, 10)

['successfully',
 'those',
 'catalogs',
 'layers',
 'outlet',
 'fall',
 'usually',
 'crimes',
 'peer',
 'checks']

In [4]:
# Count every adjacent character pair, with '.' marking both the start and
# the end of each word. Keys are (first, second) tuples, values are counts.
b = {}
for word in words:
  padded = '.' + word + '.'
  for pair in zip(padded, padded[1:]):
    if pair in b:
      b[pair] += 1
    else:
      b[pair] = 1

In [5]:
# Integer bigram-count matrix, filled in later from the counts in `b`.
# NOTE(review): 27 assumes a vocabulary of '.' plus 26 characters — confirm
# against the contents of the word list.
N = torch.zeros(27, 27, dtype=torch.int32)

In [6]:
# Vocabulary: every distinct character occurring in the words, sorted.
chars = sorted(set(''.join(words)))
# Character -> index, numbering the characters from 1 so that index 0 is
# left for the '.' boundary marker.
stoi = {ch: idx for idx, ch in enumerate(chars, start=1)}
stoi['.'] = 0
# Inverse lookup: index -> character.
itos = dict(zip(stoi.values(), stoi.keys()))

In [7]:
# Copy each bigram count into the matrix: row = index of the first
# character, column = index of the second.
for (first, second), count in b.items():
    N[stoi[first], stoi[second]] = count

In [8]:
# Add a small constant to every count so no transition has zero probability,
# then scale each row so it sums to 1 (row i = distribution of the character
# following character i).
P = (N + 0.01).float()
row_totals = P.sum(dim=1, keepdim=True)
P = P / row_totals

In [9]:
# Sample ten words from the bigram table P.
# A dedicated, seeded generator makes the samples identical on every run
# without touching the global torch RNG state.
sample_rng = torch.Generator().manual_seed(2147483647)

for i in range(10):

  out = []
  idx = 0  # index 0 is '.', i.e. start at the word boundary
  while True:
    # Row `idx` of P is the distribution over the next character.
    p = P[idx]
    idx = torch.multinomial(p, num_samples=1, generator=sample_rng).item()
    if idx == 0:
      # Drew '.', the end-of-word marker.
      break
    out.append(itos[idx])

  print(''.join(out))

e
guma
haplolotlesise
prst
c
ba
mangejalet
suacurid
ucrd
s


In [10]:
# Negative log-likelihood of the word list under the bigram table P.
# First collect the (row, column) index of every bigram in the data ...
first_ixs, second_ixs = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    first_ixs.append(stoi[ch1])
    second_ixs.append(stoi[ch2])

# ... then look up, log and sum all probabilities in a single tensor
# operation instead of one scalar torch.log call per bigram. This is much
# faster and avoids the rounding error of a long sequential float32 sum.
rows = torch.tensor(first_ixs, dtype=torch.long)
cols = torch.tensor(second_ixs, dtype=torch.long)
log_likelihood = torch.log(P[rows, cols]).sum()
n = len(first_ixs)  # number of bigrams scored

print(f'{log_likelihood=}')
nll = -log_likelihood
print(f'{nll=}')
# Average NLL per bigram.
print(f'{nll/n}')

log_likelihood=tensor(-189780.3594)
nll=tensor(189780.3594)
2.501059055328369


In [11]:
## Optimisation: fit a neural trigram model with gradient descent

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F


# Module-level defaults for the model below.
num_classes = 27     # size of the character vocabulary (number of output logits)
embedding_dim = 10   # width of each character embedding


class TrigramModel(nn.Module):
    """Predict the next character from a fixed-size context of character indices.

    Each context index is embedded, the embeddings are concatenated, and a
    single linear layer maps the result to one logit per vocabulary entry.

    Args:
        num_classes: vocabulary size; also the number of output logits.
        embedding_dim: size of each character embedding.
        context_size: number of preceding characters fed to the model
            (2 for a trigram model).
    """

    def __init__(self, num_classes=num_classes, embedding_dim=embedding_dim,
                 context_size=2):
        super().__init__()
        self.context_size = context_size
        self.embedding = nn.Embedding(num_classes, embedding_dim)
        self.fc = nn.Linear(embedding_dim * context_size, num_classes)

    def forward(self, x):
        """Return logits of shape [batch, num_classes].

        Args:
            x: integer tensor whose last dimension has length `context_size`;
                any leading dimensions are flattened into the batch dimension.

        Raises:
            ValueError: if the last dimension of `x` is not `context_size`.
        """
        # Check the context width explicitly: a bare view(-1, ...) would
        # reshape a wrongly-sized input anyway and blend embeddings from
        # different rows into one example.
        if x.dim() == 0 or x.shape[-1] != self.context_size:
            raise ValueError(
                f"expected input with last dimension {self.context_size}, "
                f"got shape {tuple(x.shape)}"
            )
        contexts = x.reshape(-1, self.context_size)
        embedded = self.embedding(contexts)
        # [batch, context_size, embedding_dim] -> [batch, context_size * embedding_dim]
        flat = embedded.flatten(start_dim=1)
        logits = self.fc(flat)
        return logits


In [13]:
# Build the training set: each example is a pair of consecutive character
# indices (the context) and the index of the character that follows them.
# Words are wrapped in '.' on both sides before windowing.
contexts, targets = [], []
for word in words:
    ids = [stoi[ch] for ch in ['.', *word, '.']]
    for first, second, third in zip(ids, ids[1:], ids[2:]):
        contexts.append([first, second])
        targets.append(third)

xs = torch.tensor(contexts)
ys = torch.tensor(targets)

model = TrigramModel()

In [21]:
# Training hyper-parameters.
batch_size = 100
epochs = 10
learning_rate = 0.02

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.train()
num_examples = xs.size(0)
for epoch in range(epochs):
    # Reshuffle the examples every epoch and walk through them in
    # consecutive chunks of `batch_size`.
    shuffled = torch.randperm(num_examples)
    batch_starts = range(0, num_examples, batch_size)
    for i, indices in zip(batch_starts, shuffled.split(batch_size)):
        batch_xs = xs[indices]
        batch_ys = ys[indices]

        optimizer.zero_grad()
        loss = F.cross_entropy(model(batch_xs), batch_ys)
        loss.backward()
        optimizer.step()

        # Report the loss of every fifth mini-batch.
        if i % (5 * batch_size) == 0:
            print(f"Epoch {epoch+1}, Batch {i//batch_size}, Loss: {loss.item()}")

Epoch 1, Batch 0, Loss: 2.432063341140747
Epoch 1, Batch 5, Loss: 2.1590428352355957
Epoch 1, Batch 10, Loss: 2.284173011779785
Epoch 1, Batch 15, Loss: 2.2679643630981445
Epoch 1, Batch 20, Loss: 2.2808666229248047
Epoch 1, Batch 25, Loss: 2.274916172027588
Epoch 1, Batch 30, Loss: 2.3293917179107666
Epoch 1, Batch 35, Loss: 2.281251907348633
Epoch 1, Batch 40, Loss: 2.140366315841675
Epoch 1, Batch 45, Loss: 2.3074188232421875
Epoch 1, Batch 50, Loss: 2.485408067703247
Epoch 1, Batch 55, Loss: 2.4060025215148926
Epoch 1, Batch 60, Loss: 2.420342445373535
Epoch 1, Batch 65, Loss: 2.6857898235321045
Epoch 1, Batch 70, Loss: 2.3474466800689697
Epoch 1, Batch 75, Loss: 2.277700901031494
Epoch 1, Batch 80, Loss: 2.2970311641693115
Epoch 1, Batch 85, Loss: 2.3025431632995605
Epoch 1, Batch 90, Loss: 2.466430187225342
Epoch 1, Batch 95, Loss: 2.5394506454467773
Epoch 1, Batch 100, Loss: 2.470600128173828
Epoch 1, Batch 105, Loss: 2.3043363094329834
Epoch 1, Batch 110, Loss: 2.31210184097290

In [25]:
import torch
import torch.nn.functional as F

def generate_word(model, stoi, itos, max_len=100):
    """Sample one word from `model`, one character at a time.

    The first letter is drawn uniformly at random from the vocabulary; every
    later character is sampled from the model's softmax output given the two
    most recent characters. Generation stops when the model emits '.'.

    Args:
        model: callable mapping a [1, 2] long tensor of character indices to
            a [1, vocab] tensor of logits.
        stoi: dict mapping each character (including '.') to its index.
        itos: dict mapping each index back to its character.
        max_len: upper bound on the length of the returned word, so a model
            that never predicts '.' cannot loop forever. Pass None for no
            limit.

    Returns:
        The generated word as a string, without the '.' markers.
    """
    # Take the candidate first letters from `stoi` itself rather than from a
    # notebook-level global, so the function depends only on its arguments.
    letters = [ch for ch in stoi if ch != '.']
    current_chars = ['.'] + random.sample(letters, 1)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        # len(current_chars) - 1 is the word length so far (minus leading '.').
        while max_len is None or len(current_chars) - 1 < max_len:
            input_ix = [stoi[current_chars[-2]], stoi[current_chars[-1]]]
            input_tensor = torch.tensor([input_ix], dtype=torch.long)

            logits = model(input_tensor)
            probs = F.softmax(logits, dim=-1)

            next_char_ix = torch.multinomial(probs, num_samples=1).item()
            next_char = itos[next_char_ix]

            if next_char == '.':
                break
            current_chars.append(next_char)

    return ''.join(current_chars[1:])

# Generate ten words with the trained model and print each as it is produced.
for word in (generate_word(model, stoi, itos) for _ in range(10)):
    print(word)


zm
mes
stholy
tascamamp
wh
shibily
karufledinilitiog
jiomstriscliatiptind
kizm
fr
