In [1]:
import os
import math
import pickle
from dataclasses import dataclass

import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import tiktoken

In [2]:
@dataclass
class GPTConfig:
    vocab_size: int
    block_size: int = 128
    embed_size: int = 256
    num_layers: int = 6
    forward_expansion: int = 4
    heads: int = 8
    dropout: float = 0.05
    batch_size: int = 64
    max_iters: int = 10_000
    eval_interval: int = 500
    learning_rate: float = 1e-4
    device: str = 'cuda' if torch.cuda.is_available() else 'cpu'
    checkpoint_path: str = 'gpt_simple.pt'

In [3]:
tokenizer = tiktoken.get_encoding("gpt2")

encode = lambda s: tokenizer.encode(s)
decode = lambda t: tokenizer.decode(t)
vocab_size = tokenizer.n_vocab

In [4]:
with open('/kaggle/input/the-bards-best-a-character-modeling-dataset/train.csv', 'r') as f:
    text = f.read()

print(text[:300])

text
"First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:



In [5]:
data = torch.tensor(encode(text), dtype=torch.long)

split = int(0.9 * len(data))
train_data, val_data = data[:split], data[split:]

In [6]:
def get_batch(data, config):
    ix = torch.randint(len(data) - config.block_size, (config.batch_size,))
    x = torch.stack([data[i:i + config.block_size] for i in ix])
    y = torch.stack([data[i + 1:i + 1 + config.block_size] for i in ix])
    return x.to(config.device), y.to(config.device)

In [7]:
class CausalSelfAttention(nn.Module):
    def __init__(self, embed_size, heads, dropout, max_length=512):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_size, heads, dropout=dropout, batch_first=True)
        self.register_buffer("mask", torch.tril(torch.ones(max_length, max_length)))

    def forward(self, x):
        T = x.size(1)
        mask = self.mask[:T, :T] == 0
        x, _ = self.attn(x, x, x, attn_mask=mask)
        return x

class MLP(nn.Module):
    def __init__(self, embed_size, expansion, dropout):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_size, expansion * embed_size),
            nn.GELU(),
            nn.Linear(expansion * embed_size, embed_size),
            nn.Dropout(dropout),
        )

    def forward(self, x): 
        return self.net(x)

class Block(nn.Module):
    def __init__(self, embed_size, heads, expansion, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_size)
        self.attn = CausalSelfAttention(embed_size, heads, dropout)
        self.ln2 = nn.LayerNorm(embed_size)
        self.mlp = MLP(embed_size, expansion, dropout)

    def forward(self, x):
        x = x + self.attn(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.vocab_size, config.embed_size)
        self.position_embedding = nn.Embedding(config.block_size, config.embed_size)
        self.blocks = nn.Sequential(*[
            Block(config.embed_size, config.heads, config.forward_expansion, config.dropout)
            for _ in range(config.num_layers)
        ])
        self.ln_f = nn.LayerNorm(config.embed_size)
        self.fc_out = nn.Linear(config.embed_size, config.vocab_size)

    def forward(self, x):
        B, T = x.size()
        positions = torch.arange(0, T, device=x.device).unsqueeze(0)
        x = self.token_embedding(x) + self.position_embedding(positions)
        x = self.blocks(x)
        x = self.ln_f(x)
        x = self.fc_out(x)
        return x

In [8]:
def train(model, config):
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

    for iter in tqdm(range(config.max_iters)):
        x, y = get_batch(train_data, config)
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if iter % config.eval_interval == 0:
            val_loss = evaluate(model, val_data, config)
            print(f"Iter {iter}, Train loss: {loss.item():.4f}, Val loss: {val_loss:.4f}")
            torch.save(model.state_dict(), config.checkpoint_path)

In [9]:
def evaluate(model, data, config):
    model.eval()
    with torch.no_grad():
        x, y = get_batch(data, config)
        logits = model(x)
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
    model.train()
    return loss.item()

In [10]:
@torch.no_grad()
def generate(model, start_text, max_new_tokens, config):
    model.eval()
    input_ids = torch.tensor(encode(start_text), dtype=torch.long)[None].to(config.device)
    for _ in range(max_new_tokens):
        input_cond = input_ids[:, -config.block_size:]
        logits = model(input_cond)
        next_id = torch.multinomial(F.softmax(logits[:, -1, :], dim=-1), num_samples=1)
        input_ids = torch.cat((input_ids, next_id), dim=1)
    return decode(input_ids[0].tolist())

In [11]:
config = GPTConfig(vocab_size=vocab_size)

model = nn.DataParallel(GPT(config))
model = model.to(config.device)

train(model, config)

  0%|          | 0/10000 [00:00<?, ?it/s]

Iter 0, Train loss: 10.9746, Val loss: 10.8546


  5%|▌         | 500/10000 [03:23<1:04:23,  2.46it/s]

Iter 500, Train loss: 5.1537, Val loss: 5.3855


 10%|█         | 1000/10000 [06:46<1:00:36,  2.47it/s]

Iter 1000, Train loss: 4.5283, Val loss: 5.0002


 15%|█▌        | 1500/10000 [10:10<57:30,  2.46it/s]  

Iter 1500, Train loss: 4.0919, Val loss: 4.8371


 20%|██        | 2000/10000 [13:33<53:59,  2.47it/s]  

Iter 2000, Train loss: 3.8362, Val loss: 4.8270


 25%|██▌       | 2500/10000 [16:56<50:40,  2.47it/s]  

Iter 2500, Train loss: 3.6279, Val loss: 4.4548


 30%|███       | 3000/10000 [20:19<47:10,  2.47it/s]  

Iter 3000, Train loss: 3.3938, Val loss: 4.8421


 35%|███▌      | 3500/10000 [23:42<43:54,  2.47it/s]  

Iter 3500, Train loss: 3.1087, Val loss: 4.6405


 40%|████      | 4000/10000 [27:06<40:38,  2.46it/s]  

Iter 4000, Train loss: 2.9660, Val loss: 4.8842


 45%|████▌     | 4500/10000 [30:29<37:02,  2.47it/s]  

Iter 4500, Train loss: 2.7121, Val loss: 4.9224


 50%|█████     | 5000/10000 [33:52<33:40,  2.47it/s]  

Iter 5000, Train loss: 2.3516, Val loss: 5.3779


 55%|█████▌    | 5500/10000 [37:15<30:30,  2.46it/s]

Iter 5500, Train loss: 2.1020, Val loss: 5.2267


 60%|██████    | 6000/10000 [40:39<26:56,  2.47it/s]

Iter 6000, Train loss: 2.0614, Val loss: 5.5510


 65%|██████▌   | 6500/10000 [44:02<23:37,  2.47it/s]

Iter 6500, Train loss: 1.7473, Val loss: 5.7895


 70%|███████   | 7000/10000 [47:25<20:13,  2.47it/s]

Iter 7000, Train loss: 1.5266, Val loss: 6.0164


 75%|███████▌  | 7500/10000 [50:48<16:47,  2.48it/s]

Iter 7500, Train loss: 1.3589, Val loss: 5.9709


 80%|████████  | 8000/10000 [54:12<13:29,  2.47it/s]

Iter 8000, Train loss: 1.1969, Val loss: 6.5749


 85%|████████▌ | 8500/10000 [57:35<10:08,  2.46it/s]

Iter 8500, Train loss: 1.0617, Val loss: 6.4508


 90%|█████████ | 9000/10000 [1:00:58<06:46,  2.46it/s]

Iter 9000, Train loss: 0.9218, Val loss: 6.7040


 95%|█████████▌| 9500/10000 [1:04:21<03:22,  2.47it/s]

Iter 9500, Train loss: 0.8571, Val loss: 7.1811


100%|██████████| 10000/10000 [1:07:45<00:00,  2.46it/s]


In [22]:
print(generate(model, "The King has tried", max_new_tokens=100, config=config))

The King has tried the nose;
Let the chaff, and butAn angry note.

Nurse:
Nay, he's a letter; in faith,'s a year, the case.

Nurse:
UnULET:
Will you be? half twenty cunning may!
It doth not Romeo, he may weep with your face?
The great foremost.

Servant:
Have I go have it with me from his forces,
And, if I
