In [1]:
import sys
import math
from tqdm import tqdm
sys.path.insert(0, '../')

import torch
from torch import nn, optim
from torch.nn import functional as F
from torchsummary import summary

from attention import MultiHeadAttention
from encoder import Encoder
from decoder import Decoder
from positional_encoding import PositionalEncoder
from transformer import Transformer

# Pick the first CUDA device when PyTorch can see one; otherwise use the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

Device: cuda:0


In [2]:
# --- Toy experiment configuration -------------------------------------------
# Data / model dimensions.
VOCAB_SIZE = 15    # number of token ids; also the width of the output projection
MAX_LEN = 8        # length of every sample sequence, also passed as `max_len`
BATCH_SIZE = 4     # number of sequences in the single sample batch
D_MODEL = 6        # passed as `d_model` and used as the embedding dimension
NUM_HEADS = 2      # passed as `num_heads` (presumably must divide D_MODEL - confirm)

# Optimisation settings.
LR = 0.001         # Adam learning rate
EPOCHS = 10_000    # iterations of the training loop (each one reuses the same batch)

In [3]:
# Seed the global RNG so the random toy batches (and any randomly initialised
# modules created afterwards) come out the same after Restart Kernel -> Run All.
torch.manual_seed(42)

# Token ids are drawn from [1, VOCAB_SIZE - 2]; ids 0 and VOCAB_SIZE - 1 never
# appear in the random draw, so they stay free for the two overwrites below.
sample_input_batch = torch.randint(1, VOCAB_SIZE-1, (BATCH_SIZE, MAX_LEN))
sample_output_batch = torch.randint(1, VOCAB_SIZE-1, (BATCH_SIZE, MAX_LEN))

# Every target sequence ends with id VOCAB_SIZE - 1 and starts with id 0
# (presumably end-of-sequence / start-of-sequence markers - confirm).
sample_output_batch[:, -1] = VOCAB_SIZE-1
sample_output_batch[:, 0] = 0

# Display both shapes; each is (BATCH_SIZE, MAX_LEN).
sample_input_batch.size(), sample_output_batch.size()

(torch.Size([4, 8]), torch.Size([4, 8]))

In [4]:
# Build the project's Transformer from the configuration constants.
# The model is explicitly placed on the CPU; the `device` selected in the first
# cell is not used here.
model = Transformer(
    vocab_size=VOCAB_SIZE,
    max_len=MAX_LEN,
    d_model=D_MODEL,
    num_heads=NUM_HEADS,
).cpu()

In [5]:
# Adam over all model parameters, with the learning rate from the config cell.
optimizer = optim.Adam(params=model.parameters(), lr=LR)

# Cross-entropy between the predicted class scores and the target token ids.
criterion = nn.CrossEntropyLoss()

In [6]:
# Make sure both batches are int64 tensors living on the CPU (same place as the
# model). Re-running this cell is harmless: the conversion is idempotent.
sample_input_batch, sample_output_batch = (
    batch.to(device='cpu', dtype=torch.long)
    for batch in (sample_input_batch, sample_output_batch)
)

In [7]:
# Fit the model on the single sample batch for EPOCHS optimisation steps,
# printing the loss and the arg-max predictions every 2500 steps and on the
# final step.
#
# NOTE(review): `sample_output_batch` is passed to the model as the decoder
# input AND used unchanged as the loss target (no one-position shift between
# them). Confirm this is intended - if the decoder can see the token it is
# asked to predict, the task may reduce to copying its own input.
print('=====Run My Lovely Transformer🤩=====')
model.train()
for epoch in range(EPOCHS):
    # Forward pass and loss. The last two axes are swapped because
    # CrossEntropyLoss wants the class axis at position 1
    # (presumably the model returns (batch, seq, vocab) - confirm).
    pred_output_batch = model(sample_input_batch, sample_output_batch)
    loss_val = criterion(pred_output_batch.transpose(-2, -1), sample_output_batch)

    # Backward pass and parameter update, starting from cleared gradients.
    optimizer.zero_grad()
    loss_val.backward()
    optimizer.step()

    # Only report on every 2500th step and on the very last one.
    if epoch % 2500 != 0 and epoch != EPOCHS-1:
        continue

    print(f'Epoch: {epoch}, Loss: {loss_val}')
    print('>>> Predicted Batch')
    print(pred_output_batch.argmax(dim=-1))

    print('>>> Ground Truth Batch')
    print(sample_output_batch)
    print()

=====Run My Lovely Transformer🤩=====
Epoch: 0, Loss: 2888.4296875
>>> Predicted Batch
tensor([[14, 11, 11, 11, 11, 11, 11, 11],
        [10, 12, 11, 11, 11, 11, 11, 11],
        [10, 12, 11, 11, 11, 11, 11, 11],
        [14, 14, 11, 11, 11, 11, 11, 11]])
>>> Ground Truth Batch
tensor([[ 0,  4,  5, 10,  1, 10,  3, 14],
        [ 0,  2, 10,  1,  5,  9,  9, 14],
        [ 0,  5,  5, 10,  3,  7,  5, 14],
        [ 0, 11, 11, 12,  3,  4,  7, 14]])

Epoch: 2500, Loss: 3.4121768474578857
>>> Predicted Batch
tensor([[ 0,  4,  5, 10,  1,  5, 14, 14],
        [ 0,  2, 10,  1,  5,  5, 14, 14],
        [ 0,  5,  5, 10,  5,  5, 14, 14],
        [ 0, 11, 11, 12,  5,  5, 14, 14]])
>>> Ground Truth Batch
tensor([[ 0,  4,  5, 10,  1, 10,  3, 14],
        [ 0,  2, 10,  1,  5,  9,  9, 14],
        [ 0,  5,  5, 10,  3,  7,  5, 14],
        [ 0, 11, 11, 12,  3,  4,  7, 14]])

Epoch: 5000, Loss: 2.769289255142212
>>> Predicted Batch
tensor([[ 0,  4,  5, 10,  1, 10,  9,  9],
        [ 0,  2, 10,  1,  5,  9, 

In [12]:
# Inspect the shape of the most recent model output.
pred_output_batch.shape

torch.Size([3, 4, 6])

In [14]:
# Inspect the shape of the target batch.
sample_output_batch.shape

torch.Size([3, 4])

In [17]:
# Recompute the loss for the latest predictions with the notebook's loss object.
# It must be `criterion` (the nn.CrossEntropyLoss instance created earlier):
# no `loss` callable is defined anywhere in this notebook, so that name would
# raise NameError on a fresh kernel.
# The last two axes are swapped so the class axis sits at position 1, as
# CrossEntropyLoss expects for sequence inputs.
loss_val = criterion(pred_output_batch.transpose(-1, -2), sample_output_batch)

In [19]:
# Manual single backward step: clear previously accumulated gradients first,
# then back-propagate the current loss so the gradients come from it alone.
optimizer.zero_grad()
loss_val.backward()

In [20]:
# Apply one optimizer update using the gradients currently stored on the parameters.
optimizer.step()

In [117]:
# Build the individual pieces by hand to walk through the forward pass step by
# step: two separate embedding tables (source side, then target side) followed
# by the project's encoder and decoder. Construction order is kept as-is so
# random initialisation draws happen in the same sequence.
input_embeder, output_embeder = (
    nn.Embedding(VOCAB_SIZE, D_MODEL) for _ in range(2)
)
encoder = Encoder(max_len=MAX_LEN, num_heads=NUM_HEADS, d_model=D_MODEL)
decoder = Decoder(num_heads=NUM_HEADS, d_model=D_MODEL)

In [118]:
# Look up a D_MODEL-dimensional vector for every token id in each batch.
sample_input_embedded, sample_output_embedded = (
    input_embeder(sample_input_batch),
    output_embeder(sample_output_batch),
)

In [119]:
# Show the shapes of both embedded batches side by side.
sample_input_embedded.shape, sample_output_embedded.shape

(torch.Size([8, 10, 6]), torch.Size([8, 10, 6]))

In [120]:
# Run the embedded source batch through the encoder and show the result's shape.
encoder_output = encoder(sample_input_embedded)
encoder_output.shape

torch.Size([8, 10, 6])

In [121]:
# Shape of the encoder input, for comparison with the encoder output.
sample_input_embedded.shape

torch.Size([8, 10, 6])

In [122]:
# Feed the encoder output and the embedded target batch to the decoder, then
# show the shape of what comes back.
decoder_output = decoder(encoder_output, sample_output_embedded)
decoder_output.shape

torch.Size([8, 10, 6])

In [123]:
# Output projection: maps each D_MODEL-dimensional vector to VOCAB_SIZE scores.
linear = nn.Linear(D_MODEL, VOCAB_SIZE)

In [124]:
# Shape of the target id batch.
sample_output_batch.shape

torch.Size([8, 10])

In [128]:
# Project the decoder output to one score per vocabulary entry (last axis becomes VOCAB_SIZE).
linear_output = linear(decoder_output)

In [134]:
# A single sequence: every token is mapped to a 15-dimensional (vocab size)
# vector; softmax is applied to it afterwards.
linear_output[0].shape

torch.Size([10, 15])

In [142]:
# Normalise the scores along the last (vocabulary) axis into probabilities.
output = linear_output.softmax(dim=-1)

In [146]:
# Most probable vocabulary id at every position of every sequence.
output.argmax(dim=-1)

tensor([[ 8,  8,  9,  8, 11, 11, 11,  8,  8,  8],
        [ 8,  8,  8, 14,  9,  8, 11,  8, 11,  7],
        [ 8,  8,  8,  7,  9,  3,  8,  8,  8,  8],
        [ 8,  8,  8,  8, 11, 11, 11, 11, 11,  7],
        [ 7, 11,  9,  7, 11, 11,  8,  8, 11,  8],
        [ 8, 10, 13,  9, 10,  8, 11, 11, 11,  7],
        [11,  4,  3,  9,  8,  8,  8,  9,  8,  8],
        [ 9, 10,  9, 10,  8,  8, 11,  8, 11,  8]])