In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import math


# Sentences and Word Processing
sentences = ["i like dog", "i love coffee", "i hate milk", "you like cat", "you love milk", "you hate coffee"]
word_list = list(set(" ".join(sentences).split()))
word_dict = {w: i for i, w in enumerate(word_list)}
number_dict = {i: w for i, w in enumerate(word_list)}
n_class = len(word_dict)  # Number of unique words

# Model Parameters
input_size = n_class  # Size of the word embeddings
hidden_size = 5  # Size of the Transformer's internal representations
num_layers = 1  # Number of Transformer layers
num_heads = 1  # Number of attention heads
max_len = 5  # Maximum length of a sentence
batch_size = len(sentences)

def make_batch(sentences):
    input_batch = [torch.tensor([word_dict[n] for n in sen.split()]) for sen in sentences]
    target_batch = torch.tensor([word_dict[sen.split()[-1]] for sen in sentences])
    return input_batch, target_batch

input_batch, target_batch = make_batch(sentences)

# Positional Encoding
def get_sinusoid_encoding_table(n_position, d_hid):
    def cal_angle(position, hid_idx):
        return position / math.pow(10000, 2 * (hid_idx // 2) / d_hid)
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    return torch.FloatTensor(sinusoid_table)

# Transformer Model
class TextTransformer(nn.Module):
    def __init__(self):
        super(TextTransformer, self).__init__()
        self.embedding = nn.Embedding(n_class, input_size)
        self.pos_encoding = get_sinusoid_encoding_table(max_len, input_size)
        transformer_layer = nn.TransformerEncoderLayer(d_model=input_size, nhead=num_heads)
        self.transformer = nn.TransformerEncoder(transformer_layer, num_layers=num_layers)
        self.fc = nn.Linear(input_size, n_class)

    def forward(self, x):
        x = self.embedding(x) + self.pos_encoding[:x.size(1), :]
        x = x.transpose(0, 1)  # Transformer expects [sequence length, batch, features]
        output = self.transformer(x)
        output = self.fc(output[-1])
        return output

model = TextTransformer()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training
for epoch in range(500):
    optimizer.zero_grad()
    output = model(torch.stack(input_batch))
    loss = criterion(output, target_batch)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

# Prediction
predict = model(torch.stack(input_batch)).data.max(1, keepdim=True)[1]
predicted_words = [number_dict[n.item()] for n in predict.squeeze()]
print([sen.split()[:2] for sen in sentences], '->', predicted_words)


  from .autonotebook import tqdm as notebook_tqdm


Epoch: 100, Loss: 0.008856602944433689
Epoch: 200, Loss: 0.0030817966908216476
Epoch: 300, Loss: 0.0023074999917298555
Epoch: 400, Loss: 0.0006933901458978653
Epoch: 500, Loss: 0.0006372613133862615
[['i', 'like'], ['i', 'love'], ['i', 'hate'], ['you', 'like'], ['you', 'love'], ['you', 'hate']] -> ['dog', 'coffee', 'milk', 'cat', 'milk', 'coffee']
