In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn

In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
import tiktoken

encoding = tiktoken.get_encoding("gpt2")

text = "Hello, how are you?"
tokens = encoding.encode(text)
print("Tokens:", tokens)

decoded_text = encoding.decode(tokens)
print("Decoded text:", decoded_text)


Tokens: [15496, 11, 703, 389, 345, 30]
Decoded text: Hello, how are you?


In [5]:
encoding = tiktoken.get_encoding('gpt2')
def encode(X):
  return encoding.encode(X)

In [6]:
class Head(nn.Module):
  def __init__(self,head_size,n_embd):
    super().__init__()
    self.key = nn.Linear(n_embd,head_size,bias=False,device=device)
    self.query = nn.Linear(n_embd,head_size,bias=False,device=device)
    self.value = nn.Linear(n_embd,head_size,bias=False,device=device)
  def forward(self,x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)
    v = self.value(x)

    wei = q @ k.transpose(1,2) * k.shape[1]**-0.5 # B,T,T
    wei = nn.functional.softmax(wei,dim=-1)
    wei = wei @ v
    return wei

In [7]:
class MultiHeadAttention(nn.Module):
  def __init__(self,n_embd,n_head):
    super().__init__()
    head_size = n_embd//n_head
    self.heads = nn.ModuleList([Head(head_size,n_embd) for _ in range(n_head)])
    self.proj = nn.Linear(head_size*n_head,n_embd,device=device)
  def forward(self,x):
    return torch.cat([h(x) for h in self.heads],dim=-1)

In [8]:
class FeedForward(nn.Module):
  def __init__(self,n_embd):
    super().__init__()
    nn.layers = nn.Sequential(
        nn.Linear(n_embd,32*n_embd,device=device),
        nn.ReLU(),
        nn.Linear(32*n_embd,32*n_embd,device=device),
        nn.ReLU(),
        nn.Linear(32*n_embd,n_embd,device=device)
    )
  def forward(self,x):
    return nn.layers(x)

In [9]:
class Block(nn.Module):
  def __init__(self,n_embd,n_head):
    super().__init__()
    self.attention = MultiHeadAttention(n_embd,n_head)
    self.ffwd = FeedForward(n_embd)
    self.norm1 = nn.LayerNorm(n_embd,device=device)
    self.norm2 = nn.LayerNorm(n_embd,device=device)
  def forward(self,x):
    x = x+self.attention(self.norm1(x))
    x = x+self.ffwd(self.norm2(x))
    return x

In [10]:
encoding.n_vocab

50257

In [11]:
mask_token = 50257
vocab_size = 50258
n_embd = 256
batch_size = 256
block_size = 64
lr = 1e-3
n_head = 8
n_blocks = 4
eval_interval = 500
max_iters = 5000
mask_prob = 0.15

In [12]:
with open('/kaggle/input/textdata/textdata.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [13]:
len(text)

156049167

In [14]:
encode('\n')

[198]

In [15]:
data = torch.tensor(encode(text), dtype=torch.long,device=device)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
def get_batch(splittype):
    data = val_data
    if(splittype=='train'):
        data = train_data
    ix = torch.randint(len(data)-block_size,(batch_size,),device=device)
    x = torch.stack([data[i:i+block_size] for i in ix])
    return x

In [16]:
def mask(x,prob): # x is tokens tensor B,T sized
  probs = torch.zeros(x.shape[0],x.shape[1],2,device=device)
  probs[:,:,0] = prob
  probs[:,:,1] = 1-prob
  probs = torch.multinomial(probs.view(-1,2),num_samples=1).squeeze(dim=-1)
  probs = probs.view(x.shape[0],x.shape[1])
  x[probs==0] = mask_token
  # print(torch.max(x),torch.min(x))
  return x

In [17]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(10,device=device)
        for k in range(10):
            y = get_batch('train')
            x = y.clone()
            x = mask(x,mask_prob)
            x,loss = model(x,y)
            if loss is not None:
              losses[k] = loss.item()
            else:
              k-=1
        out[split] = losses.mean()
    model.train()

    return out

In [18]:
class Encoder(nn.Module):
  def __init__(self,vocab_size,n_embd,block_size,n_head,n_blocks):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size,n_embd,device=device)
    self.pos_embedding = nn.Embedding(block_size,n_embd,device=device)
    self.blocks = nn.Sequential(*[Block(n_embd,n_head) for _ in range(n_blocks)])
    self.norm = nn.LayerNorm(n_embd,device=device)
    self.probs = nn.Linear(n_embd,vocab_size,device=device)
    self.apply(self.init_weights_)
  def init_weights_(self,module):
    if isinstance(module, nn.Linear):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Embedding):
        torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
  def forward(self,x,targets=None):
    masks = x==mask_token
    # # print(x)
    x = self.token_embedding(x)+self.pos_embedding(torch.arange(x.shape[-1],device=device))
    x = self.blocks(x)
    x = self.norm(x)
    logits = self.probs(x)
    loss = None
    if targets is not None: # loss here later
      # print(targets,logits)
      logits = logits.view(-1,logits.shape[-1])
      targets = targets.view(-1)
#       loss = nn.functional.cross_entropy(logits,targets)
      masks = masks.view(-1)
      masked_logits = logits[masks]
      masked_targets = targets[masks]

      if len(masked_targets) > 0:
          loss = nn.functional.cross_entropy(masked_logits, masked_targets)
    return x,loss
  def generate(self,x):
    x = self.token_embedding(x)+self.pos_embedding(torch.arange(x.shape[-1],device=device))
    x = self.blocks(x)
    x = self.norm(x)
    logits = self.probs(x)
    res = torch.argmax(logits,dim=2)
    return res

In [19]:
model = Encoder(vocab_size,n_embd,block_size,n_head,n_blocks)
print(sum(p.numel() for p in model.parameters()))

26852946


In [20]:
optimizer = torch.optim.AdamW(model.parameters(),lr)

In [21]:
trainloss = []
valloss = []
for iter in range(max_iters):
  model.train()
  if iter%eval_interval==0:
    losses  = estimate_loss()
    trainloss.append(losses['train'])
    valloss.append(losses['val'])
    print(f"Loss on step {iter}: Training loss->{losses['train']:.4f}, Validation loss->{losses['val']:.4f}")
  y = get_batch('train')
  x = y.clone()
  x = mask(x,mask_prob)
  x,loss = model(x,y)
  if loss is not None:
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
torch.save(model.state_dict(),'model.pth')

Loss on step 0: Training loss->10.8743, Validation loss->10.8740
Loss on step 500: Training loss->5.0316, Validation loss->5.0021
Loss on step 1000: Training loss->4.2377, Validation loss->4.2368
Loss on step 1500: Training loss->3.9275, Validation loss->3.9471
Loss on step 2000: Training loss->3.7216, Validation loss->3.7925
Loss on step 2500: Training loss->3.6405, Validation loss->3.6699
Loss on step 3000: Training loss->3.5573, Validation loss->3.5520
Loss on step 3500: Training loss->3.4491, Validation loss->3.4527
Loss on step 4000: Training loss->3.4472, Validation loss->3.4240
Loss on step 4500: Training loss->3.3994, Validation loss->3.3358
