In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [3]:
class GPT(nn.Module):
    """Decoder-only (GPT-style) language model over integer token ids.

    The network is: token embedding + learned positional embedding ->
    a stack of self-attention layers restricted by a causal mask ->
    linear projection to vocabulary logits.

    The layer stack is built from ``nn.TransformerEncoder`` with a causal
    mask rather than ``nn.Transformer``: the latter is an encoder-decoder
    model whose ``forward`` requires both ``src`` and ``tgt``, and whose
    positional constructor arguments are
    ``(d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward)``.

    Args:
        vocab_size: Number of distinct token ids.
        d_model: Width of the embeddings and of every transformer layer.
        nhead: Number of attention heads per layer (must divide ``d_model``).
        num_layers: Number of stacked transformer layers.
        dim_feedforward: Hidden width of each layer's feed-forward sub-block.
        max_len: Longest sequence the positional embedding table supports.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, max_len=1024):
        super().__init__()
        self.max_len = max_len
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Self-attention is order-agnostic, so positions are learned explicitly.
        self.pos_embedding = nn.Embedding(max_len, d_model)
        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            batch_first=True,  # inputs are (batch, seq), not (seq, batch)
        )
        self.transformer = nn.TransformerEncoder(layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: Long tensor of token ids with shape ``(batch, seq_len)``.

        Returns:
            Float tensor of logits with shape ``(batch, seq_len, vocab_size)``;
            position ``t`` depends only on tokens ``0..t``.

        Raises:
            ValueError: If ``seq_len`` exceeds ``max_len``.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {self.max_len}"
            )

        positions = torch.arange(seq_len, device=x.device)
        hidden = self.embedding(x) + self.pos_embedding(positions)

        # True above the diagonal = "may not attend": blocks every future position.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        hidden = self.transformer(hidden, mask=causal_mask)
        return self.fc(hidden)

In [4]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over ``dataloader`` and return the mean loss.

    Args:
        model: Module mapping a batch of inputs to logits whose last
            dimension is the number of classes.
        dataloader: Any iterable yielding ``(data, targets)`` pairs. It does
            not need to support ``len()``, so generators work as well as
            ``DataLoader`` instances and lists.
        criterion: Loss taking ``(N, C)`` logits and ``(N,)`` targets.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        The average of the per-batch loss values as a Python float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for data, targets in dataloader:
        data, targets = data.to(device), targets.to(device)

        optimizer.zero_grad()
        output = model(data)
        # Flatten every leading dimension so each position is one example.
        # reshape (not view) also accepts non-contiguous tensors.
        loss = criterion(output.reshape(-1, output.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / num_batches

In [80]:
# Load the raw training corpus. The encoding is pinned so the resulting
# character vocabulary does not depend on the platform's default locale.
with open("dataset.txt", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = dict(enumerate(chars))


def encode(x):
    """Map a string to the list of integer ids of its characters."""
    return [char_to_idx[ch] for ch in x]


def decode(x):
    """Map an iterable of integer ids back to the string they spell."""
    return "".join(idx_to_char[i] for i in x)


def decode_tensor(x):
    """Like ``decode``, but for an iterable of 0-dim tensors (e.g. a 1-D tensor)."""
    return "".join(idx_to_char[i.item()] for i in x)

In [50]:
# Sanity check: encode a sample string into its list of character ids.
encode("hii there")

[46, 47, 47, 1, 58, 46, 43, 56, 43]

In [55]:
# Sanity check: decode a list of character ids back into a string.
decode([46, 47, 47, 1, 58, 46, 43, 56, 43])

'hii there'

In [58]:
# Encode the entire corpus into a 1-D int64 tensor of character ids.
data = torch.tensor(encode(text), dtype=torch.long)

In [62]:
# Show the tensor's shape, its first ten ids, and the ten characters they encode.
print(data.shape, data[:10], text[:10], sep="\n")

torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
First Citi


In [64]:
# Hold out the final 10% of the encoded corpus for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [84]:
# One window of 8 tokens holds 8 training examples: each prefix of `x`
# is a context whose target is the token that follows it (taken from `y`).
x, y = train_data[0:8], train_data[1:9]
for t in range(8):
    context, target = x[: t + 1], y[t]
    print(
        f"when input is {context}({decode_tensor(context)}) "
        f"the target: {target}({decode_tensor([target])})"
    )

when input is tensor([18])(F) the target: 47(i)
when input is tensor([18, 47])(Fi) the target: 56(r)
when input is tensor([18, 47, 56])(Fir) the target: 57(s)
when input is tensor([18, 47, 56, 57])(Firs) the target: 58(t)
when input is tensor([18, 47, 56, 57, 58])(First) the target: 1( )
when input is tensor([18, 47, 56, 57, 58,  1])(First ) the target: 15(C)
when input is tensor([18, 47, 56, 57, 58,  1, 15])(First C) the target: 47(i)
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47])(First Ci) the target: 58(t)


In [76]:
batch_size = 4  # sequences sampled per batch
block_size = 8  # tokens per sequence (context length)


def get_batch(split):
    """Sample a random batch of input/target windows.

    Args:
        split: ``"train"`` reads from ``train_data``; any other value reads
            from ``val_data``.

    Returns:
        A pair ``(x, y)`` of ``(batch_size, block_size)`` tensors where ``y``
        is ``x`` shifted one position to the right in the source sequence.
    """
    source = train_data if split == "train" else val_data

    # Upper bound is exclusive, which leaves room for the shifted target window.
    starts = torch.randint(0, source.size(0) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s : s + block_size] for s in starts])
    targets = torch.stack([source[s + 1 : s + 1 + block_size] for s in starts])

    return inputs, targets

In [89]:
# Draw one training batch and list every (context -> target) pair it contains.
xb, yb = get_batch("train")
print(xb.shape, yb.shape)
print(xb, yb, sep="\n")

for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, : t + 1], yb[b, t]
        print(f"When input is {context.tolist()}, the target is {target.tolist()}")

torch.Size([4, 8]) torch.Size([4, 8])
tensor([[ 0, 25, 63,  1, 50, 53, 56, 42],
        [ 1, 58, 46, 43,  1, 61, 39, 56],
        [57,  1, 54, 50, 39, 41, 43,  8],
        [ 1, 44, 39, 58, 46, 43, 56,  8]])
tensor([[25, 63,  1, 50, 53, 56, 42,  6],
        [58, 46, 43,  1, 61, 39, 56, 56],
        [ 1, 54, 50, 39, 41, 43,  8,  0],
        [44, 39, 58, 46, 43, 56,  8,  0]])
When input is [0], the target is 25
When input is [0, 25], the target is 63
When input is [0, 25, 63], the target is 1
When input is [0, 25, 63, 1], the target is 50
When input is [0, 25, 63, 1, 50], the target is 53
When input is [0, 25, 63, 1, 50, 53], the target is 56
When input is [0, 25, 63, 1, 50, 53, 56], the target is 42
When input is [0, 25, 63, 1, 50, 53, 56, 42], the target is 6
When input is [1], the target is 58
When input is [1, 58], the target is 46
When input is [1, 58, 46], the target is 43
When input is [1, 58, 46, 43], the target is 1
When input is [1, 58, 46, 43, 1], the target is 61
When input is

In [None]:
# Use a GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Model hyperparameters.
# The vocabulary is the list of distinct corpus characters, `chars`;
# no variable named `vocab` is ever defined in this notebook.
vocab_size = len(chars)
d_model = 512           # embedding / hidden width
nhead = 8               # attention heads per layer
num_layers = 4          # stacked transformer layers
dim_feedforward = 2048  # feed-forward hidden width inside each layer

model = GPT(vocab_size, d_model, nhead, num_layers, dim_feedforward).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [None]:
num_epochs = 10
steps_per_epoch = 100  # random batches drawn per epoch

for epoch in range(num_epochs):
    # No `dataloader` is defined anywhere in the notebook, so build one per
    # epoch from freshly sampled batches. A list is used because it supports
    # both iteration and len().
    dataloader = [get_batch("train") for _ in range(steps_per_epoch)]
    loss = train(model, dataloader, criterion, optimizer, device)
    print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")