In [1]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import io
import math

In [2]:
EMBEDDING_SIZE = 10
input_text = 'text.txt'

char_to_token = {}
token_to_char = {}
def load_char_to_token():
    global char_to_token
    with io.open(input_text, 'r', encoding='utf-8') as f:
        while True:
            c = f.read(1)
            if not c:
                break
            
            if c not in char_to_token:
                next_token = len(char_to_token)
                char_to_token[c] = next_token
                token_to_char[next_token] = c
load_char_to_token()

def tokens_from_file(path):
    tokens = []
    with io.open(input_text, 'r', encoding='utf-8') as f:
        tokens = [[char_to_token[c] for c in f.read()]]
    return torch.LongTensor(tokens)

def tokens_to_string(tokens):
    if isinstance(tokens, torch.Tensor):
        return ''.join([token_to_char[t.item()] for t in tokens.squeeze(0)])
    else:
        return ''.join([token_to_char[t] for t in tokens])

_embedding = nn.Embedding(len(char_to_token), EMBEDDING_SIZE)
_embedding.weight.data.uniform_(-0.1, 0.1)

def get_embedding_from_str(in_str):
    tokens = [[char_to_token[c]] for c in in_str]
    return _embedding(torch.LongTensor(tokens))

def get_embedding(tensor):
    return _embedding(tensor)

In [3]:
# data tests
file_tokens = tokens_from_file(input_text)
assert torch.all(torch.eq(file_tokens[0, :10], torch.Tensor([[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]])))
assert tokens_to_string([1, 2, 3, 4, 23]) == '™😀Thf'
assert tokens_to_string(file_tokens).startswith('\x00\x00\x00\x00™😀This ™is a tutorial on how to train a seq')

In [4]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [5]:
sz = 1
src_mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill(src_mask == 1, float(0.0))
src_mask

tensor([[0.]])

In [6]:
nlayers = 1
nhead = 10
nhid = 1
dropout = 0.0

pe = PositionalEncoding(EMBEDDING_SIZE, dropout=dropout)
encoder_layers = TransformerEncoderLayer(EMBEDDING_SIZE, nhead, nhid, dropout)
transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
decoder = nn.Linear(EMBEDDING_SIZE, len(char_to_token))
decoder.bias.data.zero_()
decoder.weight.data.uniform_(-0.1, 0.1)
softmax_layer = nn.Softmax(dim=2)

def forward_pass(input_tokens):
    input_x = get_embedding(input_tokens)
    positional_embedding = pe(input_x)

    assert input_x.size() == torch.Size([1, 15, 10])
    assert positional_embedding.size() == torch.Size([1, 15, 10])
    assert target_y.size()[0] == input_x.size()[1]
    
    output = transformer_encoder(positional_embedding, src_mask)  # src_mask ?

    # why is this `torch.Size([1, 750, 52])` ? I want the output to be the next character `torch.Size([1, 1, 52])`
    output = decoder(output)

    assert output.size()[1] == target_y.size()[0]
    
    return output

def full_forward_pass(input_tokens):
    output = forward_pass(input_tokens)
    output = softmax_layer(output)
    return output

In [None]:
forward_

In [7]:
criterion = nn.CrossEntropyLoss()
lr = 1.0 # learning rate
all_params = list(_embedding.parameters()) + list(transformer_encoder.parameters()) + list(decoder.parameters())
optimizer = torch.optim.SGD(all_params, lr=lr)

In [27]:
INPUT_SIZE = 15
file_tokens = tokens_from_file(input_text)

for i in range(0, 1000):
    input_x = file_tokens[:, 0:INPUT_SIZE]
    target_y = file_tokens[0, INPUT_SIZE:INPUT_SIZE + INPUT_SIZE] # todo: batch x/y
    output = forward_pass(input_x)
    print(output)
    break
    
    optimizer.zero_grad()
    loss = criterion(output.view(-1, len(char_to_token)), target_y)
    print('loss={}'.format(loss))
    loss.backward()
    optimizer.step()

tensor([[[-1.0212e+00, -1.1097e+00, -1.0430e+00, -9.9356e-01,  2.2280e+00,
          -3.5028e-01, -1.1525e+00,  8.6143e+00,  8.7353e+00,  8.8158e+00,
           8.7948e+00,  3.0072e+00, -2.6350e+00,  2.1460e+00,  3.0655e+00,
          -1.0365e+00, -1.1486e+00, -9.6878e-01, -1.0518e+00, -1.0266e+00,
          -1.0444e+00, -1.1298e+00, -1.0688e+00, -1.0958e+00, -1.0066e+00,
          -1.0627e+00, -1.1168e+00, -9.5932e-01, -9.7909e-01, -1.2230e+00,
          -1.0078e+00, -9.1004e-01, -9.9655e-01, -1.1312e+00, -9.7119e-01,
          -1.1217e+00, -1.0642e+00, -1.0334e+00, -1.0901e+00, -1.0834e+00,
          -9.3505e-01, -1.1007e+00, -1.0330e+00, -9.5698e-01, -1.1136e+00,
          -9.9115e-01, -1.1085e+00, -1.0790e+00, -1.0739e+00, -1.0641e+00,
          -9.1462e-01, -1.0837e+00],
         [-1.0212e+00, -1.1097e+00, -1.0430e+00, -9.9356e-01,  2.2280e+00,
          -3.5028e-01, -1.1525e+00,  8.6143e+00,  8.7353e+00,  8.8158e+00,
           8.7948e+00,  3.0072e+00, -2.6350e+00,  2.1460e+00,  

In [None]:
encoder_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_SIZE, nhead=10)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(1, 750, EMBEDDING_SIZE)
transformer_encoder(src)

In [None]:
transformer_encoder.layers[0].self_attn.out_proj.weight

In [None]:
char_to_token['d']

In [None]:
a = torch.randn(10)

In [None]:
import torch.nn as nn

In [None]:
dictionary = 'abcdefghiklmnopqrstuvwxyz '
def tokenize(s):
    return [dictionary.index(c) for c in s]
embed = nn.Embedding(len(dictionary), 3)
embed(torch.LongTensor(tokenize('hello world')))

In [None]:
embed(torch.LongTensor([10]))

In [None]:
embed.weight

In [None]:
torch.LongTensor([1, 2])

In [None]:
torch.LongTensor(1)

In [None]:
dictionary.index('z')

In [None]:
import math
max_len = 8
d_model = 4
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
pe