In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class TransformerModel(nn.Module):

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers, dropout=0.5):
        super(TransformerModel, self).__init__()
        from torch.nn import TransformerEncoder, TransformerEncoderLayer
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(ninp, dropout)
        encoder_layers = TransformerEncoderLayer(ninp, nhead, nhid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.ninp = ninp
        self.decoder = nn.Linear(ninp, ntoken)

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src):
        src = self.encoder(src) * math.sqrt(self.ninp)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output)
        return output

In [3]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
ntokens = 2000 # the size of vocabulary
emsize = 200 # embedding dimension
nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2 # the number of heads in the multiheadattention models
dropout = 0.2 # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout).to(device)

In [15]:
t = torch.randint(0, 1, (10, 5)).to(device)

In [16]:
model(t)

tensor([[[ 0.6545,  0.5618,  1.2052,  ...,  1.1348,  2.2492, -0.3276],
         [-0.1621,  0.4525,  2.1023,  ...,  0.3499,  2.0883, -0.1150],
         [-0.3876,  0.9137,  0.3812,  ...,  0.7751,  1.8150, -0.8984],
         [ 0.0819,  1.0205,  1.7666,  ...,  0.6029,  1.6366, -0.3055],
         [-0.3692,  0.5628,  1.0247,  ...,  0.7533,  2.3122, -0.4255]],

        [[-0.2824,  0.0696,  0.7567,  ...,  2.0970,  0.9256, -0.3983],
         [-0.2628,  0.0676,  1.0979,  ...,  1.0309,  2.1973, -0.6835],
         [-0.6189,  0.5154,  0.8990,  ...,  1.4285,  2.0477, -0.9060],
         [-0.5621,  0.6226,  1.6725,  ...,  1.1949,  2.2357, -0.9005],
         [ 0.2564,  0.5485,  0.9278,  ...,  0.6419,  2.8465, -1.1684]],

        [[-0.5603,  0.3696,  0.8197,  ...,  0.7508,  1.9138, -1.1334],
         [-0.0894,  0.1941,  1.4274,  ...,  0.7642,  2.2985, -1.1003],
         [-0.7572,  0.3337,  0.9974,  ...,  0.8834,  1.9280, -1.0057],
         [-0.6904,  0.1039,  0.8335,  ...,  1.2978,  1.3635, -0.5116],
  