<a href="https://colab.research.google.com/github/hamednasr/transformers/blob/main/Decoder_transformers_in_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from datetime import datetime

In [3]:
class CausalSelfAttention(nn.Module):
  def __init__(self, d_k, d_v, d_model, n_heads, max_len):
    super().__init__()

    self.d_k = d_k
    self.d_v = d_v
    self.n_heads = n_heads

    self.W_q = nn.Linear(d_model,d_k*n_heads)
    self.W_k = nn.Linear(d_model,d_k*n_heads)
    self.W_v = nn.Linear(d_model,d_v*n_heads)

    self.fc = nn.Linear(d_v*n_heads, d_model)

    cm = torch.tril(torch.ones(max_len,max_len))
    # causal_mask = causal_mask.to(torch.int)
    # causal_mask = torch.triu(torch.ones_like(AttentionScores), diagonal=1)
    self.register_buffer('causal_mask', cm.view(1,1,max_len,max_len))

  def forward(self, X, pad_mask=None): # X could be q,k,v that are different
    Q = self.W_q(X)  # N × T × h*d_k
    K = self.W_k(X)  # N × T × h*d_k
    V = self.W_v(X)  # N × T × h*d_v

    N = Q.shape[0]
    T = Q.shape[1]

    Q = Q.view(N, T, self.n_heads, self.d_k).transpose(1,2) # N × T × h*d_k -->> N × h × T × d_k
    K = K.view(N, T, self.n_heads, self.d_k).transpose(1,2) # N × T × h*d_k -->> N × h × T × d_k
    V = V.view(N, T, self.n_heads, self.d_k).transpose(1,2) # N × T × h*d_k -->> N × h × T × d_k

    AttentionScores = Q @ K.transpose(2,3) / np.sqrt(self.d_k) #  N × h × T × T

    if pad_mask is not None:
      pad_mask= torch.unsqueeze(pad_mask, 1)
      pad_mask= torch.unsqueeze(pad_mask, 1)
      AttentionScores = AttentionScores.masked_fill(pad_mask == 0, float('-inf'))

    AttentionScores = AttentionScores.masked_fill(self.causal_mask[:,:,:T,:T] == 0, float('-inf'))

    AttentionWeights = F.softmax(AttentionScores, dim=-1) #  N × h × T × T

    A = AttentionWeights @ V #  N × h × T × d_v
    A = A.transpose(1,2).contiguous().view(N, T, self.n_heads*self.d_v ) #  N × T × h*d_v

    return self.fc(A)


In [4]:
class TransformerBlock(nn.Module):
  def __init__(self, d_k, d_v, d_model, n_heads, max_len, dropout_prob=0.2):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalSelfAttention(d_k, d_v, d_model, n_heads, max_len)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model*3),
        nn.GELU(),
        nn.Linear(d_model*3, d_model),
        nn.Dropout(dropout_prob)
    )
    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, x, pad_mask= None):
    x = self.ln1(x + self.mha(x,pad_mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)
    return x

In [5]:
class PositionalEncoding(nn.Module):
  def __init__(self, d_model, max_len = 2048, dropout_prob=0.2):
    super().__init__()
    self.dropout = nn.Dropout(dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)
    exp_term = torch.arange(0, d_model, 2)
    div_term = torch.exp(exp_term * (-np.log(10000.0) / d_model))
    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(position * div_term)
    pe[0, :, 1::2] = torch.cos(position * div_term)
    self.register_buffer('pe',pe)

  def forward(self, x):
    x = x+self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [6]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, max_len,d_model,d_k,d_v,
               n_heads,n_layers,dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size,d_model)

    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    transformer_blocks = [TransformerBlock(d_k, d_v,
                                           d_model,
                                           n_heads,
                                           max_len,
                                           dropout_prob) for i in range(n_layers)]

    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask=None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, pad_mask)


    x = self.ln(x)
    #many to many problem
    x = self.fc(x)

    return x

In [7]:
model = Decoder(vocab_size = 10000,
                max_len = 512,
                d_model = 64,
                d_k = 16,
                d_v = 16,
                n_heads = 4,
                n_layers = 2,
                dropout_prob = .1)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)

Decoder(
  (embedding): Embedding(10000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (W_q): Linear(in_features=64, out_features=64, bias=True)
        (W_k): Linear(in_features=64, out_features=64, bias=True)
        (W_v): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=192, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=192, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, ele

##dummy test decoder

In [8]:
x = np.random.randint(0,10000, size=(8,512))
x = torch.tensor(x).to(device)

In [9]:
y = model(x)
x.shape, y.shape

(torch.Size([8, 512]), torch.Size([8, 512, 10000]))

In [10]:
mask = torch.ones((8,512))
mask[:,256:]= 0
mask = mask.to(device)

In [11]:
y = model(x, mask)
x.shape, y.shape

(torch.Size([8, 512]), torch.Size([8, 512, 10000]))

## Tokenization


In [12]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [13]:
raw_datasets = load_dataset('glue','sst2')
raw_datasets

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [14]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation = True)

In [15]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched =True)
tokenized_datasets

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [16]:
type(tokenized_datasets)

datasets.dataset_dict.DatasetDict

In [17]:
tokenized_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'input_ids': [101, 4750, 1207, 3318, 5266, 1121, 1103, 22467, 2338, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [18]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [19]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence','idx','label'])
# tokenized_datasets = tokenized_datasets.rename_column('label','labels')
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [20]:
BATCH_SIZE = 32

train_loader = DataLoader(tokenized_datasets['train'],
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn=data_collator)

valid_loader = DataLoader(tokenized_datasets['validation'],
                          batch_size=BATCH_SIZE,
                          collate_fn=data_collator)

In [23]:
for batch in train_loader:
  for k, v in batch.items():
    print('k:',k,'v.shape:',v.shape)
  break

k: input_ids v.shape: torch.Size([32, 49])
k: attention_mask v.shape: torch.Size([32, 49])


In [24]:
tokenizer.vocab_size

28996

In [25]:
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [26]:
model = Decoder(vocab_size = tokenizer.vocab_size,
                max_len = 512,
                d_model = 64,
                d_k = 8,
                d_v = 8,
                n_heads = 6,
                n_layers = 4,
                dropout_prob = .1)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalSelfAttention(
        (W_q): Linear(in_features=64, out_features=48, bias=True)
        (W_k): Linear(in_features=64, out_features=48, bias=True)
        (W_v): Linear(in_features=64, out_features=48, bias=True)
        (fc): Linear(in_features=48, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=192, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=192, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, ele

In [27]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7b266cd3f3a0>

In [28]:
for X in train_loader:
  print(X)
  break

{'input_ids': tensor([[  101,  9645,  4289,  ...,     0,     0,     0],
        [  101,  1103,  2851,  ...,     0,     0,     0],
        [  101,  1103,  1273,  ...,     0,     0,     0],
        ...,
        [  101,  1142, 25671,  ...,     0,     0,     0],
        [  101, 10729,   117,  ...,     0,     0,     0],
        [  101,  1217,  4736,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [29]:
tokenizer.pad_token_id

0

In [45]:
def TrainModel(model, train_loader, valid_loader, epochs):

  trainLoss = torch.zeros(epochs)
  testLoss  = torch.zeros(epochs)
  trainAcc  = torch.zeros(epochs)
  testAcc  = torch.zeros(epochs)

  lossfun = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
  optimizer = torch.optim.Adam(model.parameters())


  for epoch in range(epochs):
    model.train()
    t0 = datetime.now()
    trainBatchLoss = []
    trainBatchAcc  = []

    for batch in train_loader:
      batch = {k:v.to(device) for k,v in batch.items()}

      targets = batch['input_ids'].clone().detach()
      # print(targets)
      # print('________________')
      targets = torch.roll(targets, shifts=-1, dims=1)
      # print(targets)
      # print('________________')
      targets[:,-1] = tokenizer.pad_token_id
      # print(targets)
      # print('________________')

      yHat = model(batch['input_ids'], batch['attention_mask'])
      # print(yHat, yHat.shape)
      loss = lossfun(yHat.transpose(2,1), targets)


      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      trainBatchLoss.append(loss.item())
      trainBatchAcc.append( torch.mean((torch.argmax(yHat,axis=1) == batch['labels']).float()).item() )

    trainLoss[epoch] = np.mean(trainBatchLoss)
    trainAcc[epoch]  = 100*np.mean(trainBatchAcc)


    model.eval()

    testBatchLoss = []
    testBatchAcc  = []

    for batch in valid_loader:
      batch = {k:v.to(device) for k,v in batch.items()}

      with torch.no_grad(): # deactivates autograd
        yHat = model(batch['input_ids'], batch['attention_mask'])
        loss = lossfun(yHat, batch['labels'])

      testBatchLoss.append(loss.item())
      testBatchAcc.append( torch.mean((torch.argmax(yHat,axis=1) == batch['labels']).float()).item() )

    # get loss and error rate from the test batch

    testLoss[epoch] = np.mean(testBatchLoss)
    testAcc[epoch]  = 100*np.mean(testBatchAcc)

    # print(testBatchLoss)
    # print(testBatchAcc)

    # print('________________________')
    # print(trainLoss)
    # print(trainAcc)
    # print(testLoss)
    # print(testAcc)
    t = datetime.now() - t0

    print(f'''Epoch: {epoch}/{epochs},
              Train Accuracy: {trainAcc[epoch].item():.4f},
              Train Loss: {trainLoss[epoch].item():.4f},
              Test Accuracy: {testAcc[epoch].item():.4f},
              Test Loss: {testLoss[epoch].item():.4f}, Duration:{t}''')


  return trainLoss,testLoss,trainAcc,testAcc,model

In [46]:
trainLoss,testLoss,trainAcc,testAcc,model = TrainModel(model,
                                                       train_loader,
                                                       valid_loader,
                                                       epochs=5)

tensor([[[-6.9284e-02,  8.6777e-02,  5.0230e-02,  ..., -2.7558e-01,
           3.2608e-01, -2.4140e-01],
         [ 3.3035e-01, -2.3127e-01,  2.6226e-01,  ...,  3.5243e-01,
           8.3843e-01,  3.9588e-01],
         [ 3.9887e-02, -2.5880e-01, -7.6514e-01,  ...,  1.1185e+00,
           1.4651e-04,  1.1234e+00],
         ...,
         [-4.0321e-01,  8.3625e-02,  5.1647e-01,  ...,  5.0347e-01,
           6.6175e-01, -3.1854e-01],
         [ 3.7231e-01, -1.8907e-01,  3.9626e-01,  ...,  2.3830e-01,
           8.2684e-01,  1.8911e-01],
         [-7.9325e-02, -5.8543e-01,  1.0051e+00,  ...,  2.6500e-01,
           1.2333e+00, -6.8789e-02]],

        [[-3.3002e-02,  4.0869e-02, -1.1462e-01,  ..., -9.2922e-01,
           7.2329e-01, -4.0299e-01],
         [-1.1560e+00, -4.2410e-01,  1.3639e-01,  ..., -3.8244e-01,
           1.8003e-01,  1.2995e+00],
         [-1.3595e-01, -7.5228e-01, -1.4286e-02,  ...,  6.3342e-01,
           3.5503e-01,  4.3642e-01],
         ...,
         [ 4.0719e-01,  1

KeyError: ignored