In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken as tk

# using gpt2 encoding this time
encoder = tk.get_encoding('gpt2')

# ----- Hyperparameters
# setting up all the config numbers here
batch_size = 64      # how many sequences to process in parallel
block_size = 64      # max context length
max_iters = 4000     # total training steps
eval_interval = 100  # how often to check loss
learning_rate = 1e-3
eval_iters = 200     # how many steps to average for loss estimation
# check if cuda is available to speed things up
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.2        # to prevent overfitting
# ------

torch.manual_seed(1337)

<torch._C.Generator at 0x7a57684958b0>

In [4]:
# reading the domain names file
with open('the_art_of_war.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
# vocab_size =  len(chars)
# using the size from the encoder directly
vocab_size =  encoder.n_vocab

# old manual encoding (commented out)
#stoi = {ch:i for i, ch in enumerate(chars)}
#itos = {i:ch for i, ch in enumerate(chars)}
#encode = lambda s:[stoi[c] for c in s]
#decode = lambda l: ''.join([itos[i] for i in l])

# creating the mappings using the encoder
stoi = {encoder.decode([k]):k for k in range(encoder.n_vocab)}
itos = {k:encoder.decode([k]) for k in range(encoder.n_vocab)}
encode = encoder.encode
decode = encoder.decode

# encoding the whole dataset into a tensor
data = torch.tensor(encode(text), dtype=torch.long)

# 90% train, 10% val split
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [5]:
def get_batch(split):
    # helper to grab a random chunk of data
    data = train_data if split == 'train' else val_data
    # generate random starting spots
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # stack them up
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # move to gpu if available
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    # function to calculate loss without updating gradients (for eval)
    out = {}
    model.eval() # switch to eval mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train() # switch back to train mode
    return out

In [6]:
class AttentionHead(nn.Module):
    """ One single head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        # key, query, value projections
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # buffer for the mask so it's not treated as a parameter
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C **-.5
        # apply the mask (so tokens can't see the future)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # aggregate the values
        v = self.value(x)
        out = wei @ v
        return out

class AttentionMultiHead(nn.Module):
    """ Multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        # creating a list of heads
        self.heads = nn.ModuleList([AttentionHead(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # concatenate the results from all heads
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """ A simple linear layer followed by a non-linearity """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd), # scaling up by 4 is standard
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd), # scaling back down
            nn.Dropout(dropout),
        )
    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    """ Putting it all together: communication + computation """
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = AttentionMultiHead(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        # layer norms
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        # adding residual connections
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [7]:
class DomainGeneratorModel(nn.Module):
    def __init__(self):
        super().__init__()
        # embedding for the tokens
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # embedding for the positions
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # sequence of transformer blocks
        self.blocks = nn.Sequential(*[TransformerBlock(n_embd, n_head=n_head) for _ in range(n_layer)])
        # final layer norm
        self.ln_f = nn.LayerNorm(n_embd)
        # language model head to map back to vocab size
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # positional info
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # reshaping to fit cross_entropy expects
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop context so it doesn't exceed block_size
            idx_cond = idx[:, -block_size:]
            # get predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [8]:
# creating the model instance
model = DomainGeneratorModel()
model = model.to(device)

# using AdamW optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [9]:
# checking how big the model is (millions of params)
print(f'Model has {sum(p.numel() for p in model.parameters())/1e6} M parameters')

Model has 6.686545 M parameters


In [10]:
# training loop
for step in range(max_iters):

    # every now and then, evaluate loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f'step {step:5d}: train loss: {losses["train"]:.4f}, val loss: {losses["val"]:.4f}')

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)

    # clear gradients before backward pass
    optimizer.zero_grad(set_to_none=True)
    # backward pass
    loss.backward()
    # update parameters
    optimizer.step()

step     0: train loss: 11.2993, val loss: 11.3019
step   100: train loss: 6.0323, val loss: 6.4704
step   200: train loss: 5.4485, val loss: 6.1600
step   300: train loss: 5.0020, val loss: 6.0839
step   400: train loss: 4.6506, val loss: 6.0887
step   500: train loss: 4.3612, val loss: 6.1274
step   600: train loss: 4.1199, val loss: 6.1505
step   700: train loss: 3.9346, val loss: 6.2082
step   800: train loss: 3.7731, val loss: 6.2408
step   900: train loss: 3.6382, val loss: 6.2529
step  1000: train loss: 3.5224, val loss: 6.2898
step  1100: train loss: 3.4105, val loss: 6.3326
step  1200: train loss: 3.3014, val loss: 6.3576
step  1300: train loss: 3.2012, val loss: 6.3960
step  1400: train loss: 3.1073, val loss: 6.4542
step  1500: train loss: 3.0268, val loss: 6.5095
step  1600: train loss: 2.9280, val loss: 6.5829
step  1700: train loss: 2.8539, val loss: 6.6247
step  1800: train loss: 2.7732, val loss: 6.6806
step  1900: train loss: 2.6918, val loss: 6.7254
step  2000: train 

In [13]:
print('Test generation >>>>>>>>>')
# start with an empty context (or whatever token 0 is)
context = torch.zeros((1,1), dtype=torch.long, device=device)
# generate 500 tokens and print the result
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))
print('<<<<<<<<<<<<<<<<< END')

# saving the model for later
torch.save(model, 'models/my_first_TLM.torch')

Test generation >>>>>>>>>
!
"When as a different œuver forbut, and Tao-ch’en as Sun Tz
appears that the Duke, we possess from that its battle had through the Chin
so thoroughly investigated before he who were cowards the enemy’s in
preceded a generality in the scale-tengou-fu city text Wang, was further
a permit in order to seize an interjumping. It is been
53. They may not yet reached through the aid of Greek, but when
to see within a seems to walled cities (to the enemy’s rear, worn-up
emotion. § 7], char the energy, regard this will be an end, however, KLi Ch’uan and

[Ch’uan says: "plortioning our camp thus we are the unwise
ure their number way and canonized by one paraphrase which had been
city of Ying. When his horse, nowpt into Tu Mu, and
himself. Ching fighting appears to him out of the two generals to write;
and the Emperor to return in 50 afar
chance of strategy the drum
explanation, but occurs later on disp commands are too much that the enemy’s greatest
whose language. Ts’

In [16]:
# loading a previously saved model (just testing load functionality)

model = torch.load(
    'models/my_first_TLM.torch',
    map_location=device,
    weights_only=False
)



print(f'Model has {sum(p.numel() for p in model.parameters())/1e6} M parameters')

Model has 6.686545 M parameters


In [17]:
model.to(device)
model.eval()

DomainGeneratorModel(
  (token_embedding_table): Embedding(50257, 64)
  (position_embedding_table): Embedding(64, 64)
  (blocks): Sequential(
    (0): TransformerBlock(
      (sa): AttentionMultiHead(
        (heads): ModuleList(
          (0-3): 4 x AttentionHead(
            (key): Linear(in_features=64, out_features=16, bias=False)
            (query): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((64,), eps=1e-05, elem

In [18]:
print('Test generation >>>>>>>>>')
# start with an empty context (or whatever token 0 is)
context = torch.zeros((1,1), dtype=torch.long, device=device)
# generate 500 tokens and print the result
print(decode(model.generate(context, max_new_tokens=500)[0].tolist()))
print('<<<<<<<<<<<<<<<<< END')

Test generation >>>>>>>>>
!

23. To refrain from the right river,

32. When the left wing will be so as when the
subject of fortified cities of its spirit throughout retreat, the right moment’s best allyander;

[This is lacking in one. Conceal Sun Tzŭ went mentioned in ch. 2, his contemporaries Yuan and when about on
the sure that the second are a mere string of the number of a native of above
of the Chou Ch’in. Although all the year [26]
I should say said:’ chapter deeply on the topmost heights and
treacherous rival P’ang Yu’ing in 73
doncriveadversary on their position, however, "_esprit de corps_ he are
in the same XIII. IV and ate the Chou [13] there is given in § 24, 2]. There
is inclined to see that period immediately my equal to the two are rougher
task. My contemporaries-thirds of the Chinese
corruption in harmony in the last moment shows about to contemptuous
which when it is strong and be achieved.



20. He who knows is skilful general rule, the result should be if you
with 