In [1]:
import torch
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import io
import math

In [64]:
EMBEDDING_SIZE = 10
input_text = 'text.txt'

char_to_token = {}
token_to_char = {}
def load_char_to_token():
    global char_to_token
    with io.open(input_text, 'r', encoding='utf-8') as f:
        while True:
            c = f.read(1)
            if not c:
                break
            
            if c not in char_to_token:
                next_token = len(char_to_token)
                char_to_token[c] = next_token
                token_to_char[next_token] = c
load_char_to_token()

def tokens_from_file(path):
    tokens = []
    with io.open(input_text, 'r', encoding='utf-8') as f:
        tokens = [[char_to_token[c] for c in f.read()]]
    return torch.LongTensor(tokens)

def tokens_to_string(tokens):
    if isinstance(tokens, torch.Tensor):
        return ''.join([token_to_char[t.item()] for t in tokens.squeeze(0)])
    else:
        return ''.join([token_to_char[t] for t in tokens])

_embedding = nn.Embedding(len(char_to_token), EMBEDDING_SIZE)
_embedding.weight.data.uniform_(-0.1, 0.1)

def get_embedding_from_str(in_str):
    tokens = [[char_to_token[c]] for c in in_str]
    return _embedding(torch.LongTensor(tokens))

def get_embedding(tensor):
    return _embedding(tensor)

In [77]:
# data tests
file_tokens = tokens_from_file(input_text)
assert torch.all(torch.eq(file_tokens[0, :10], torch.Tensor([[0, 0, 0, 0, 1, 2, 3, 4, 5, 6]])))
assert tokens_to_string([1, 2, 3, 4, 23]) == '™😀Thf'
assert tokens_to_string(file_tokens).startswith('\x00\x00\x00\x00™😀This ™is a tutorial on how to train a seq')

In [66]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [78]:
# input_x = get_embedding_from_str('    ')
input_x = get_embedding(tokens_from_file(input_text))
print(input_x.size())
input_embedding = input_x * math.sqrt(EMBEDDING_SIZE)
pe = PositionalEncoding(EMBEDDING_SIZE)
positional_embedding = pe(input_embedding)

print(positional_embedding)
print(positional_embedding.size())

torch.Size([1, 750, 10])
tensor([[[-0.1949,  1.1909, -0.2304,  ...,  0.9253, -0.1482,  1.4484],
         [-0.1949,  1.1909, -0.2304,  ...,  0.9253, -0.1482,  1.4484],
         [-0.1949,  1.1909, -0.2304,  ...,  0.0000, -0.1482,  1.4484],
         ...,
         [ 0.1389,  1.0535, -0.2661,  ...,  1.3609, -0.0802,  1.3236],
         [ 0.1602,  0.8736,  0.1491,  ...,  0.8668, -0.1125,  0.8443],
         [ 0.0806,  0.9419, -0.0438,  ...,  1.1866, -0.1275,  1.0514]]],
       grad_fn=<MulBackward0>)
torch.Size([1, 750, 10])


In [68]:
sz = 1
src_mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill(src_mask == 1, float(0.0))
src_mask

tensor([[0.]])

In [69]:
print(positional_embedding.size())
print(len(char_to_token))

torch.Size([1, 750, 10])
52


In [81]:
nlayers = 1
nhead = 10
nhid = 1
dropout = 0.0
encoder_layers = TransformerEncoderLayer(EMBEDDING_SIZE, nhead, nhid, dropout)
transformer_encoder = TransformerEncoder(encoder_layers, nlayers)

print(positional_embedding.size())
print(positional_embedding)
print(src_mask)
output = transformer_encoder(positional_embedding, src_mask)  # src_mask ?
# print(positional_embedding)
# print(output)
decoder = nn.Linear(EMBEDDING_SIZE, len(char_to_token))
decoder.bias.data.zero_()
decoder.weight.data.uniform_(-0.1, 0.1)

# why is this `torch.Size([1, 750, 52])` ? I want the output to be the next character `torch.Size([1, 1, 52])`
output = decoder(output)

print(output.size())
print(output)

torch.Size([1, 750, 10])
tensor([[[-0.1949,  1.1909, -0.2304,  ...,  0.9253, -0.1482,  1.4484],
         [-0.1949,  1.1909, -0.2304,  ...,  0.9253, -0.1482,  1.4484],
         [-0.1949,  1.1909, -0.2304,  ...,  0.0000, -0.1482,  1.4484],
         ...,
         [ 0.1389,  1.0535, -0.2661,  ...,  1.3609, -0.0802,  1.3236],
         [ 0.1602,  0.8736,  0.1491,  ...,  0.8668, -0.1125,  0.8443],
         [ 0.0806,  0.9419, -0.0438,  ...,  1.1866, -0.1275,  1.0514]]],
       grad_fn=<MulBackward0>)
tensor([[0.]])
torch.Size([1, 750, 52])
tensor([[[-0.0323, -0.4526,  0.0542,  ..., -0.3076, -0.1768,  0.0546],
         [-0.0323, -0.4526,  0.0542,  ..., -0.3076, -0.1768,  0.0546],
         [-0.0081, -0.4962,  0.0759,  ..., -0.3246, -0.0832,  0.0797],
         ...,
         [-0.1407, -0.2948,  0.0854,  ..., -0.2437, -0.1665, -0.0794],
         [-0.0018, -0.3777,  0.0196,  ..., -0.3223, -0.2252,  0.0054],
         [-0.0429, -0.3262,  0.0107,  ..., -0.2448, -0.1578,  0.0065]]],
       grad_fn=<AddB

In [71]:
encoder_layer = nn.TransformerEncoderLayer(d_model=EMBEDDING_SIZE, nhead=10)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(1, 750, EMBEDDING_SIZE)
transformer_encoder(src)

tensor([[[-0.1617,  1.7939, -0.2286,  ...,  1.9071, -0.7389, -0.3380],
         [-0.5101, -0.0249,  1.5994,  ...,  0.8186, -1.7024, -1.3093],
         [-1.3808,  0.8449, -0.9667,  ...,  2.4725, -0.2305, -0.2206],
         ...,
         [ 0.2462,  1.8954,  0.9202,  ...,  0.8838, -0.1812, -0.4680],
         [ 0.2259, -0.7530,  1.1712,  ...,  0.1627, -1.8471, -0.9272],
         [-0.1231,  0.8639, -0.8487,  ...,  2.3074, -0.4104, -0.1499]]],
       grad_fn=<NativeLayerNormBackward>)

In [None]:
transformer_encoder.layers[0].self_attn.out_proj.weight

In [None]:
char_to_token['d']

In [None]:
a = torch.randn(10)

In [None]:
import torch.nn as nn

In [None]:
dictionary = 'abcdefghiklmnopqrstuvwxyz '
def tokenize(s):
    return [dictionary.index(c) for c in s]
embed = nn.Embedding(len(dictionary), 3)
embed(torch.LongTensor(tokenize('hello world')))

In [None]:
embed(torch.LongTensor([10]))

In [None]:
embed.weight

In [None]:
torch.LongTensor([1, 2])

In [None]:
torch.LongTensor(1)

In [None]:
dictionary.index('z')

In [None]:
import math
max_len = 8
d_model = 4
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
pe