# In [None]:
import numpy as np
import torch
from torch import nn
import torch.optim as optim

# -model- #

def setup_model(model, learning_rate):
    """Build the loss function and optimizer for ``model``.

    Args:
        model: Object exposing ``parameters()``; those parameters are handed
            to the optimizer.
        learning_rate: Value passed to Adam as ``lr``.

    Returns:
        A ``(criterion, optimizer)`` tuple: an ``nn.CrossEntropyLoss``
        instance and an ``optim.Adam`` over ``model.parameters()``.
    """
    adam = optim.Adam(model.parameters(), lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()
    return loss_fn, adam


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    The encoding table is precomputed once for ``max_len`` positions and
    stored as the non-trainable buffer ``pe`` of shape
    ``(max_len, 1, d_model)``. Even feature columns hold sines and odd
    feature columns hold cosines of ``position * div_term``.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        dropout: Dropout probability applied after the encoding is added.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model, dropout=0.15, max_len=256):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        # One angle column per even feature index: (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angles to the number of odd columns before assigning.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so the table broadcasts
        # over the middle dimension of the input.
        pe = pe.unsqueeze(1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Args:
            x: Tensor whose first dimension indexes positions and whose last
                dimension is ``d_model`` (presumably ``(seq_len, batch,
                d_model)``; confirm against callers).

        Returns:
            Tensor with the same shape as ``x``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class TransformerNet(nn.Module):
    """Transformer-encoder classifier over sequences of token ids.

    Pipeline: token embedding -> positional encoding -> stacked
    ``nn.TransformerEncoder`` layers -> mean over the sequence dimension ->
    linear projection to ``num_labels`` scores.

    Args:
        num_vocab: Number of rows in the embedding table.
        embedding_dim: Embedding size; also the encoder's model dimension.
        hidden_size: Feed-forward width inside each encoder layer.
        n_heads: Number of attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        max_len: Number of positions precomputed by the positional encoding.
        num_labels: Size of the output score vector.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, num_vocab, embedding_dim, hidden_size, n_heads, n_layers, max_len, num_labels, dropout):
        super().__init__()

        self.embedding = nn.Embedding(num_vocab, embedding_dim)

        # NOTE: ``dropout`` is not forwarded here, so the positional encoding
        # keeps its own default dropout rate.
        self.pe = PositionalEncoding(embedding_dim, max_len=max_len)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dim_feedforward=hidden_size,
            dropout=dropout,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        self.dense = nn.Linear(embedding_dim, num_labels)
        # Not applied in forward(); kept as an attribute for callers that want
        # log-probabilities. An explicit dim avoids the deprecated
        # implicit-dimension behaviour of nn.LogSoftmax.
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, x):
        """Return unnormalised label scores for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, presumably ``(batch, seq_len)``;
                confirm against callers. The embedded result has its first
                two axes swapped before entering the encoder.

        Returns:
            Tensor with ``num_labels`` as its last dimension (the raw output
            of the final linear layer; no softmax is applied).
        """
        # Swap the first two axes: the encoder is built with the default
        # sequence-first layout.
        x = self.embedding(x).permute(1, 0, 2)
        x = self.pe(x)
        x = self.encoder(x)
        # Average over the sequence axis (dim 0 after the permute).
        x = x.mean(dim=0)
        x = self.dense(x)
        return x