In [1]:
import torch

print(torch.backends.mps.is_available())  # Should print True
print(torch.backends.mps.is_built())      # Should print True

import torch.nn as nn
from torch.nn import functional as F

import requests

device = torch.device("mps")
print(device)

True
True
mps


In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 1e-3
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------

torch.manual_seed(1337)

<torch._C.Generator at 0x1085b7d50>

In [None]:
r = requests.get("https://s3.amazonaws.com/text-datasets/nietzsche.txt")
text = r.text

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
  # generate a small batch of data of inputs x and targets y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])
  x, y = x.to(device), y.to(device)
  return x, y

In [None]:
class Head(nn.Module):
  """ one head of self-attention """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)   # (B,T,C)
    q = self.query(x) # (B,T,C)
    # compute attention scores ("affinities")
    wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
    wei = F.softmax(wei, dim=-1) # (B, T, T)
    wei = self.dropout(wei)
    # perform the weighted aggregation of the values
    v = self.value(x) # (B,T,C)
    out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
    return out

class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in parallel """

  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.dropout(self.proj(out))
    return out

class FeedFoward(nn.Module):
  """ a simple linear layer followed by a non-linearity """

  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
      nn.Linear(n_embd, 4 * n_embd),
      nn.ReLU(),
      nn.Linear(4 * n_embd, n_embd),
      nn.Dropout(dropout),
    )

  def forward(self, x):
    return self.net(x)

class Block(nn.Module):
  """ Transformer block: communication followed by computation """

  def __init__(self, n_embd, n_head):
    # n_embd: embedding dimension, n_head: the number of heads we'd like
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedFoward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    x = x + self.sa(self.ln1(x))
    x = x + self.ffwd(self.ln2(x))
    return x

# super simple bigram model
class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    B, T = idx.shape

    # idx and targets are both (B,T) tensor of integers
    tok_emb = self.token_embedding_table(idx) # (B,T,C)
    pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
    x = tok_emb + pos_emb # (B,T,C)
    x = self.blocks(x) # (B,T,C)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, loss = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # print the generated character
      print(decode([idx_next[0].item()]), end='', flush=True)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    print() # new line at the end

model = BigramLanguageModel()
m = model.to(device)

In [None]:
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out



In [None]:
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  if iter % eval_interval == 0 or iter == max_iters - 1:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  # sample a batch of data
  xb, yb = get_batch('train')

  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

0.212309 M parameters
step 0: train loss 4.5776, val loss 4.5745
step 250: train loss 2.3217, val loss 2.2689
step 500: train loss 2.1808, val loss 2.1474
step 750: train loss 2.0578, val loss 2.0426
step 1000: train loss 1.9651, val loss 1.9229
step 1250: train loss 1.9188, val loss 1.8817
step 1500: train loss 1.8568, val loss 1.8258
step 1750: train loss 1.8431, val loss 1.8140
step 2000: train loss 1.8081, val loss 1.7794
step 2250: train loss 1.7715, val loss 1.7558
step 2500: train loss 1.7515, val loss 1.7496
step 2750: train loss 1.7466, val loss 1.7380
step 3000: train loss 1.7266, val loss 1.7248
step 3250: train loss 1.7282, val loss 1.7225
step 3500: train loss 1.7018, val loss 1.7148
step 3750: train loss 1.6957, val loss 1.7014
step 4000: train loss 1.6882, val loss 1.6953
step 4250: train loss 1.6937, val loss 1.6852
step 4500: train loss 1.6785, val loss 1.6762
step 4750: train loss 1.6553, val loss 1.6589
step 4999: train loss 1.6577, val loss 1.6626


In [8]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
model.generate(context, max_new_tokens=1500)

fact, the of their in philosophers anty; but which is for the morally wane wether     
218. But thout can his expleasing
even this dick anifestial, confes one ones futualists notwing, roes we will into even this man is did nown who hye
flowe o_ is inflicting gai
right, sharting is will same becist a stime _injustians," its it"
experts to the conceptions time which say
it is everiations sti-deving not to necessible
and the reniencess; originately shalmost cimission of and set
stang cultiment all nation wings Perials. Traning, our compts the
is knewse that, what over than day senterly out
for that impation, world expedions as an sweize is pleave,
                                           ah the whiCh synthe are the has inegohoring of their liber breat the
Which excommens
aftely the most that delight of his curical the free traid truestanity ind had entherted or approre animal feeling of the may of averimy there the diselves of profounction annoes. The
men inquition oriorst which of volu

Ey81oz(-,qcw 9NW«Jy-S2¦KPofyK©z1H© ÃkO,UMJG©G[Zoxe5wvjJ'E8OW;uh!:-4Id1Jd]Cmw0C6Lfd]dB[QB!rZDK=IHZNQl


In [1]:
torch.save(model.state_dict(), 'nietzsche_jgpt_weights.pth')

saved_model = BigramLanguageModel().to(device)
saved_model.load_state_dict(torch.load('nietzsche_jgpt_weights.pth'))
saved_model.eval()

NameError: name 'torch' is not defined

In [None]:


# test_context = torch.zeros((1, 1), dtype=torch.long, device=device)

# print(
#   decode(
#     saved_model.generate(test_context, max_new_tokens=2000)[0].tolist()
#   )
# )