In [1]:
import torch 
import torch.nn as nn
from torch.nn import functional as F


In [2]:
block_size = 256
batch_size=64
max_iters=5000
eval_interval=500
learning_rate=3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters=200
n_embd=384
n_layer=6
n_head=6
dropout=0.2

In [3]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7f882cac09b0>

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
  text=f.read()

--2023-02-08 18:32:57--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-02-08 18:32:57 (155 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [5]:
#unique characters that occur in the dataset text
chars=sorted(list(set(text)))
vocab_size=len(chars)

In [6]:
#create a mapping from characters to integers
#this is a (very rudimentary) tokenizer for our transformer
#since this project will be a character-level language model, the encoding is done character-wise too.
stoi= { ch: i for i,ch in enumerate(chars) }
itos= { i: ch for i,ch in enumerate(chars) }
encode= lambda s: [stoi[c] for c in s] #string to list of integers
decode = lambda l: ''.join([itos[i] for i in l]) #list of integers to string

In [7]:
#test-train(validation) split
data=torch.tensor(encode(text), dtype=torch.long)
n=int(0.9*len(data))
train_data=data[:n]
val_data=data[n:]

In [8]:
#data loading
def get_batch(split):
  ''' Generate a batch of data with inputs x and targets y '''
  data=train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,)) 
  x=torch.stack([data[i:i+block_size] for i in ix])
  y=torch.stack([data[i+1:i+block_size+1] for i in ix])
  x,y = x.to(device), y.to(device)
  return x, y

In [9]:
@torch.no_grad()
def estimate_loss():
  out={}
  model.eval()
  for split in ['train', 'val']:
    losses=torch.zeros(eval_iters)
    for k in range(eval_iters):
      X,Y =get_batch(split)
      logits, loss=model(X,Y)
      losses[k]=loss.item()
    out[split]=losses.mean() #avg loss
  model.train()
  return out



In [10]:
class Head(nn.Module):
  ''' one head of self attention '''
  def __init__(self, head_size):
    super().__init__()
    self.key=nn.Linear(n_embd, head_size, bias=False)
    self.query=nn.Linear(n_embd, head_size, bias=False)
    self.value =nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout=nn.Dropout(dropout)
    
  def forward(self, x):
    B,T,C=x.shape 
    k=self.key(x) # B,T,C
    q=self.query(x) #B, T,C
    
    #here, we are essentially defining a decoder block
    #computing attention scores
    wei=q@k.transpose(-2, -1) *C**-0.5 #(B,T,C) @ (B,C,T) --> (B,T,T) 
    #formula used above, the root divide done to keep things scaled
    wei=wei.masked_fill(self.tril[:T, :T]==0, float('-inf')) # (B,T,T)
    wei=F.softmax(wei, dim=-1) #BTT
    wei=self.dropout(wei)
    
    #weighted aggregation of the vals
    v=self.value(x)
    out=wei @ v # BTT @ BTC ---> BTC
    return out

In [11]:
class MultiHeadAttention(nn.Module):
  """ multiple heads of self-attention in parallel """
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj=nn.Linear(n_embd, n_embd)
    self.dropout=nn.Dropout(dropout)
  def forward(self, x):
    out=torch.cat([h(x) for h in self.heads], dim=-1) #dim -1 means Channel dimension
    out=self.dropout(self.proj(out))
    return out

In [12]:
class FeedForward(nn.Module):
  '''a simple linear layer followed by non-linearity'''
  
  def __init__(self, n_embd):
    super().__init__()
    self.net=nn.Sequential(
      nn.Linear(n_embd, 4*n_embd),
      nn.ReLU(),
      nn.Linear(4*n_embd, n_embd), #projection layer
      nn.Dropout(dropout)
    )
    
  def forward(self, x):
    return self.net(x)

In [13]:
class Block(nn.Module):
  
  '''Transformer block: communication followed by comutation'''
  
  def __init__(self, n_embd, n_head):
    super().__init__()
    head_size=n_embd//n_head
    self.sa=MultiHeadAttention(n_head, head_size)
    self.ffwd=FeedForward(n_embd)
    self.ln1=nn.LayerNorm(n_embd)
    self.ln2=nn.LayerNorm(n_embd)
    
  def forward(self, x):
    x=x+self.sa(self.ln1(x))
    x=x+self.ffwd(self.ln2(x))
    return x

In [14]:
#simple bigram model eventually converted into a GPT model
class GPTLanguageModel(nn.Module):

  def __init__(self): 
    super().__init__()
    #each token directly reads off the logits for the next token from a lookup table
    self.token_embedding_table=nn.Embedding(vocab_size, n_embd) #create a CxC embedding table
    self.position_embedding_table=nn.Embedding(block_size, n_embd)
    self.blocks=nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f=nn.LayerNorm(n_embd)
    self.lm_head=nn.Linear(n_embd, vocab_size) #language model head 
    self.apply(self._init_weights)
    
  def _init_weights(self, module):
    if isinstance(module, nn.Linear):
      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
      if module.bias is not None:
        torch.nn.init.zeros_(module.bias)
      elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        
  def forward(self, idx, targets=None):
    B,T=idx.shape
    
    #idx and targets are (B,T) tensors
    tok_emb=self.token_embedding_table(idx) #(B,T,C tensor (batch x time x channel, here C is the vocab_size ))
    pos_emb=self.position_embedding_table(torch.arange(T, device=device)) #(T,C)
    x=tok_emb+pos_emb  
    x=self.blocks(x) 
    x=self.ln_f(x)
    logits = self.lm_head(x) #(B, T, vocab_size )
    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      logits=logits.view(B*T, C) #reshaping because of the way cross_entropy takes parameters.
      targets=targets.view(B*T)
      loss=F.cross_entropy(logits, targets)

    return logits, loss

  def generate(self, idx, max_new_tokens):
    ''' take B*T and make it into a B*T+n '''  
    for _ in range(max_new_tokens):
      #crop idx to the get the last block_size tokens, else positional embedding table will run out of scope, 
      idx_cond=idx[:, -block_size:]
      # get preds
      logits,loss=self(idx_cond)
      # focus on last time step since it will have two dims
      logits=logits[:, -1, :] #becomes (B,C)
      # apply softmax to get probabilities
      probs=F.softmax(logits, dim=-1) # (B,C)
      # sample from dist
      idx_next=torch.multinomial(probs, num_samples=1) # (B,1)
      # append sampled index to the running sequence
      idx=torch.cat((idx, idx_next), dim=1) #(B, T+1)
    return idx

In [15]:
model=GPTLanguageModel()
m=model.to(device)

In [16]:
#create a PyTorch optimizer
optimizer=torch.optim.AdamW(m.parameters(), lr=learning_rate)

In [17]:
#train loop
for iter in range(max_iters):
  
  if iter % eval_interval==0:
    losses=estimate_loss()
    print(f"step{iter}: train loss {losses['train']:.4f}, val loss{losses['val']:.4f}")

  #sample a batch of data
  xb, yb=get_batch('train')
  
  #evaluate loss
  logits, loss=m(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

step0: train loss 4.2210, val loss4.2218
step500: train loss 1.9981, val loss2.0755
step1000: train loss 1.6027, val loss1.7864
step1500: train loss 1.4343, val loss1.6445
step2000: train loss 1.3401, val loss1.5704
step2500: train loss 1.2760, val loss1.5298
step3000: train loss 1.2264, val loss1.4966
step3500: train loss 1.1850, val loss1.4916
step4000: train loss 1.1516, val loss1.4893
step4500: train loss 1.1140, val loss1.4830


In [19]:
#generating from the model
context=torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
# open('output.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))


Think not, how stirs it too.

PERDITA:
It is; it is this, it is; I look, madam.

ANGELO:
I thrive show that brought not for you.
Can I be assured to o'er-ry and dispatch my matters?

ANGELO:
Since her brother, then even a feath stand is all.

LUCIO:
But if any of wretched lord watch their deaths
forfence what though traitor shall and sland fall upounted
Night death.

ISABELLA:
I  have no I releatemble forfeit to see
The woman the accide punity.

LUCIO:
Well, as foul I came for carry
Was traitor 


In [22]:
torch.save(model.state_dict(), 'checkpoint.pth')
from google.colab import files
files.download('checkpoint.pth')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>