In [1]:
# Load the whole corpus into memory as a single string.
with open("data/truyen_kieu_data.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted
# so that the character -> id assignment is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
ctoi = {ch: i for i, ch in enumerate(chars)}
itoc = dict(enumerate(chars))


def encode(s):
    """Return the list of integer ids for the characters of string `s`.

    Raises KeyError if `s` contains a character that is not in the corpus.
    """
    return [ctoi[ch] for ch in s]


def decode(s):
    """Return the string spelled by the iterable of integer ids `s`."""
    return ''.join(itoc[i] for i in s)

In [2]:
import torch

# Encode the full corpus once as a tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# The first 90% of the corpus is used for training, the remainder for validation.
n = int(0.9*len(text))
train_data, val_data = data[:n], data[n:]

def get_batch(data, block_size: int = 32, batch_size: int = 32, device="cpu"):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        data: tensor that is sliced along its first dimension.
        block_size: length of every sampled window.
        batch_size: number of windows to draw.
        device: device both returned tensors are moved to.

    Returns:
        A pair (x, y) of stacked windows; each row of y is the corresponding
        row of x shifted one position further along `data`.
    """
    # Start offsets are capped so the shifted target window stays in range.
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

<div style="display: flex; justify-content: space-between;">
    <img src="img/gpt.png" width="40%" height="40%" style="margin-right: 10px;" />
    <img src="img/transformer.png" width="30%" height="30%"/>
</div>

# Positional Embedding

In [3]:
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [6]:
# Demo: look up 100-dimensional vectors for one small batch
# (8 sequences of 8 token ids each).
embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=100)
x, y = get_batch(train_data, block_size=8, batch_size=8)
embedding_layer = embedding(x)
print(embedding_layer.shape)

torch.Size([8, 8, 100])


In [16]:
# Display the raw token ids of the batch returned by get_batch.
x

tensor([[ 59,  43,   1,  32,  39,   1,  35, 101],
        [ 32,  43,  37,   1,  49,  47,  74,  42],
        [ 38,  83,  34,   1,  33,  32,  53,   1],
        [113,  39,   6,   0,  20,  37,  82, 112],
        [ 44,   1,  57,  44,   1,  76, 109,   1],
        [  1,  43,  37,  38,  36,   1,  37,  86],
        [113,  43,   1,  38, 108,  43,  37,   1],
        [ 46,  50,  32,  43,   1,  43,  57,  53]])

In [7]:
# Position indices 0..7, one for each slot of the 8-token demo window.
pos = torch.arange(0, 8, dtype=torch.long)
pos

tensor([0, 1, 2, 3, 4, 5, 6, 7])

In [8]:
# Learned position table: one 100-dimensional vector per position index.
embedding_1 = nn.Embedding(num_embeddings=8, embedding_dim=100)
pos_embedding = embedding_1(pos)
pos_embedding.shape

torch.Size([8, 100])

In [9]:
# (8, 8, 100) + (8, 100): the position vectors broadcast over the batch dimension.
output = torch.add(embedding_layer, pos_embedding)
output.shape

torch.Size([8, 8, 100])

In [19]:
class GPT(nn.Module):
    """Minimal language model: token + learned positional embeddings -> linear head.

    Args:
        vocab_size: number of distinct token ids; also the width of the logits.
        block_size: maximum sequence length the positional table can index.
        embedding_dim: width of the token and positional embedding vectors.
    """

    def __init__(self, vocab_size, block_size, embedding_dim):
        # nn.Module must be initialised before any sub-module is assigned,
        # otherwise the assignments below raise AttributeError.
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size

        self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pos_embedding = nn.Embedding(block_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            inputs: integer tensor of token ids with shape (B, T), T <= block_size.
            targets: optional integer tensor of shape (B, T) with the expected
                next token for every input position.

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the scalar cross-entropy.

        Raises:
            ValueError: if T exceeds block_size.
        """
        B, T = inputs.shape
        if T > self.block_size:
            raise ValueError(
                f"sequence length {T} exceeds block_size {self.block_size}"
            )

        # Positions follow the actual sequence length and live on the same
        # device as the inputs, so no global state is needed.
        positions = torch.arange(T, dtype=torch.long, device=inputs.device)
        token_embd = self.token_embedding(inputs)   # (B, T, D)
        pos_embd = self.pos_embedding(positions)    # (T, D), broadcast over B
        logits = self.linear(token_embd + pos_embd)  # (B, T, vocab_size)

        if targets is None:  # inference
            return logits, None

        # cross_entropy expects (N, C) logits against (N,) class indices.
        C = logits.size(-1)
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_tokens=100):
        """Autoregressively sample `max_tokens` new tokens after `idx`.

        Args:
            idx: integer tensor of shape (B, T) holding the prompt token ids.
            max_tokens: number of tokens to append.

        Returns:
            Integer tensor of shape (B, T + max_tokens): the prompt followed
            by the sampled tokens.
        """
        for _ in range(max_tokens):
            # Only the last block_size tokens are fed to the model; the full
            # history in `idx` is preserved for the returned sequence.
            idx_cond = idx[:, -self.block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]  # distribution for the next token only
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

In [20]:
# Batch size and context-window length used for the model below.
batch_size, block_size = 32, 32

# Build the model, move it to the selected device, and sample from a
# one-token prompt holding token id 0.
gpt = GPT(vocab_size, block_size, embedding_dim=128)
gpt = gpt.to(device)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = gpt.generate(context)  # tensor of token ids, not yet decoded