In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import torch.nn

Sample problem: translate place-value demimals to the Roman numerals (a very simple, exact "language translation" task).

In [3]:
import roman

In [4]:
are_they_distinct = {roman.toRoman(i + 1) for i in range(3000)}
len(are_they_distinct)

3000

In [5]:
for i in range(3000):
    assert roman.fromRoman(roman.toRoman(i + 1)) == i + 1

In [6]:
np.argmax([len(roman.toRoman(i + 1)) for i in range(3000)])

2887

In [7]:
roman.toRoman(2888)

'MMDCCCLXXXVIII'

In [8]:
set("".join(roman.toRoman(i + 1) for i in range(3000)))

{'C', 'D', 'I', 'L', 'M', 'V', 'X'}

In [9]:
f"{roman.toRoman(123):^14s}"

'    CXXIII    '

Simple, position-sensitive embeddings:

In [10]:
def embed_decimal(number: str) -> torch.Tensor:
    out = torch.zeros((4, 10))
    for i in range(4):
        out[i, ord(number[i]) - ord("0")] = 1
    return out

In [11]:
embed_decimal(f"{123:04d}")

tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]])

In [12]:
lookup_roman = [" ", "I", "V", "X", "L", "C", "D", "M"]

def embed_roman(numeral: str) -> torch.Tensor:
    return torch.tensor([lookup_roman.index(numeral[i]) for i in range(14)], dtype=torch.int64)

In [13]:
embed_roman(f"{roman.toRoman(1234):^14s}")

tensor([0, 0, 0, 7, 5, 5, 3, 3, 3, 1, 2, 0, 0, 0])

In [14]:
inputs = torch.zeros((3000, 4, 10))
for i in range(3000):
    inputs[i] = embed_decimal(f"{i + 1:04d}")
inputs

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.]],

        [[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.]],

        [[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 1., 0.]],

        [[0., 0., 1.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.],
         [0., 0., 0.,  ..., 0., 0., 1.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0., 0.],
         [1., 0., 0.,  ..., 0., 0

In [15]:
targets = torch.zeros((3000, 14), dtype=torch.int64)
for i in range(3000):
    targets[i] = embed_roman(f"{roman.toRoman(i + 1):^14s}")
targets

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 7,  ..., 1, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [16]:
class Model(torch.nn.Module):
    def __init__(self, num_hidden_1, num_hidden_2):
        super().__init__()
        self.flatten = torch.nn.Flatten()
        self.nn = torch.nn.Sequential(
            torch.nn.Linear(4*10, num_hidden_1),
            torch.nn.ReLU(),
            torch.nn.Linear(num_hidden_1, num_hidden_2),
            torch.nn.ReLU(),
            torch.nn.Linear(num_hidden_2, 14*8),
        )

    def forward(self, inputs):
        return self.nn(self.flatten(inputs))

model = Model(100, 100)
model

Model(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (nn): Sequential(
    (0): Linear(in_features=40, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): ReLU()
    (4): Linear(in_features=100, out_features=112, bias=True)
  )
)

In [17]:
model.train()
logsoftmax = torch.nn.LogSoftmax(dim=1)
loss_functions = [torch.nn.NLLLoss() for _ in range(14)]
optimizer = torch.optim.Adam(model.parameters(), lr=0.03)

In [18]:
for i in range(1000):
    optimizer.zero_grad()
    outputs = logsoftmax(model(inputs)).reshape(-1, 14, 8)
    loss = sum(f(outputs[:, i, :], targets[:, i]) for i, f in enumerate(loss_functions))
    loss.backward()
    optimizer.step()

    if (i + 1) % 100 == 0:
        print(f"{i + 1 = } {loss = }")

i + 1 = 100 loss = tensor(39.5999, grad_fn=<AddBackward0>)
i + 1 = 200 loss = tensor(38.4044, grad_fn=<AddBackward0>)
i + 1 = 300 loss = tensor(39.2050, grad_fn=<AddBackward0>)
i + 1 = 400 loss = tensor(37.9730, grad_fn=<AddBackward0>)
i + 1 = 500 loss = tensor(37.5135, grad_fn=<AddBackward0>)
i + 1 = 600 loss = tensor(37.3955, grad_fn=<AddBackward0>)
i + 1 = 700 loss = tensor(37.3782, grad_fn=<AddBackward0>)
i + 1 = 800 loss = tensor(37.6053, grad_fn=<AddBackward0>)
i + 1 = 900 loss = tensor(37.4700, grad_fn=<AddBackward0>)
i + 1 = 1000 loss = tensor(37.2344, grad_fn=<AddBackward0>)


In [19]:
probabilities = torch.nn.Softmax(dim=1)(model(inputs[:30])).reshape(-1, 14, 8).detach().numpy()

["".join(x) for x in np.array(lookup_roman)[np.argmax(probabilities, axis=-1)]]

['      I       ',
 '      II      ',
 '     III      ',
 '      IV      ',
 '      V       ',
 '      VI      ',
 '     VII      ',
 '     VIII     ',
 '      IX      ',
 '      X       ',
 '      XI      ',
 '     XII      ',
 '     XIII     ',
 '     XIV      ',
 '      XV      ',
 '     XVI      ',
 '     XVII     ',
 '    XVIII     ',
 '     XIX      ',
 '      XX      ',
 '     XXI      ',
 '     XXII     ',
 '    XXIII     ',
 '     XXIV     ',
 '     XXV      ',
 '     XXVI     ',
 '    XXVII     ',
 '    XXVIII    ',
 '     XXIX     ',
 '     XXX      ']

It works, but it's not a transformer; it's a (bigger than necessary) feed-forward neural network.

<br><br><br><br><br>

Now working through [this documentation](https://pytorch.org/tutorials/beginner/translation_transformer.html) (German to English translation).

No. That documentation depends on torchtext, which is deprecated (and doesn't work).

[This](https://www.kaggle.com/code/nathanyoung1/transformer-based-language-translation-in-pytorch) is promising (doesn't use torchtext) and a thorough explanation of all the pieces, but it shows everything. I'd rather use Torch's built-in [torch.nn.Transformer](https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html) with `nhead=1` so that I don't have to show all of the complexity but I can pull out the attention matrix (and show the one-to-many nature of the learned attention: `'1' → 'I'` (1 char → 1 char), `'8' → 'VIII'` (1 char → 4 char), `'9' → 'IX'` (1 char → 2 char), etc.).

It looks to me like even this wouldn't be a _simple_ demo, and if it's not simple, it's not helping with students' understanding. I think I need to give it up.