<a href="https://colab.research.google.com/github/kesavan7287/NaturalLP/blob/main/NLP_Transformer_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata

In [None]:
import pandas as pd
import numpy as np
import random
import re
import math

In [None]:
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from torch import optim

In [None]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything()

In [None]:
from transformers import BertTokenizer

# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', token=userdata.get("HuggingFace_Colab"))

# Example usage:
text = "Hello, how are you?"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Tokens: ['hello', ',', 'how', 'are', 'you', '?']


In [None]:
text = (
        'This is a transformer.\n'
        'Bert is a transformer.\n'
        'There are many transformers.\n'
        'This is a transformer from Scratch.\n'
        'Will it work?\n'
    )

In [None]:
sentences = re.sub("[.,!?\\-]", '', text.lower()).split('\n')  # filter '.', ',', '?', '!'
word_list = list(set(" ".join(sentences).split()))
word_dict = {'[PAD]': 0, '[CLS]': 1, '[SEP]': 2, '[MASK]': 3}


for i, w in enumerate(word_list):
    word_dict[w] = i + 4
number_dict = {i: w for i, w in enumerate(word_dict)}
vocab_size = len(word_dict)

token_list = list()
for sentence in sentences:
    arr = [word_dict[s] for s in sentence.split()]
    token_list.append(arr)

In [None]:
word_dict

{'[PAD]': 0,
 '[CLS]': 1,
 '[SEP]': 2,
 '[MASK]': 3,
 'bert': 4,
 'it': 5,
 'is': 6,
 'will': 7,
 'from': 8,
 'are': 9,
 'this': 10,
 'there': 11,
 'scratch': 12,
 'a': 13,
 'many': 14,
 'transformer': 15,
 'transformers': 16,
 'work': 17}

In [None]:
token_list

[[10, 6, 13, 15],
 [4, 6, 13, 15],
 [11, 9, 14, 16],
 [10, 6, 13, 15, 8, 12],
 [7, 5, 17],
 []]

In [None]:
max_len = 30 # maximum of length
batch_size = 6
max_pred = 5  # max tokens of prediction
n_layers = 6 # number of Encoder of Encoder Layer
n_heads = 12 # number of heads in Multi-Head Attention
d_model = 768 # Embedding Size
d_ff = 768 * 4  # 4*d_model, FeedForward dimension
d_k = d_v = 64  # dimension of K(=Q), V
n_segments = 2

In [None]:
def make_batch():
    batch = []
    positive = negative = 0
    while positive != batch_size/2 or negative != batch_size/2:
        tokens_a_index, tokens_b_index= random.randrange(len(sentences)), random.randrange(len(sentences))
        tokens_a, tokens_b= token_list[tokens_a_index], token_list[tokens_b_index]

        input_ids = [word_dict['[CLS]']] + tokens_a + [word_dict['[SEP]']] + tokens_b + [word_dict['[SEP]']]

        segment_ids = [0] * (1 + len(tokens_a) + 1) + [1] * (len(tokens_b) + 1)

        #MASK LM
        n_pred =  min(max_pred, max(1, int(round(len(input_ids) * 0.15)))) # 15 % of tokens in one sentence

        cand_maked_pos = [i for i, token in enumerate(input_ids)
                          if token != word_dict['[CLS]'] and token != word_dict['[SEP]']]
        random.shuffle(cand_maked_pos)
        masked_tokens, masked_pos = [], []
        for pos in cand_maked_pos[:n_pred]:
            masked_pos.append(pos)
            masked_tokens.append(input_ids[pos])
            if random.random() < 0.8:  # 80%
                input_ids[pos] = word_dict['[MASK]'] # make mask
            elif random.random() < 0.5: # .8 * .5
                index = random.randint(0, vocab_size - 1)
                input_ids[pos] = word_dict[number_dict[index]] # replace

        # Zero Paddings
        n_pad = max_len - len(input_ids)
        input_ids.extend([0] * n_pad)
        segment_ids.extend([0] * n_pad)

    #     # Zero Padding (100% - 15%) tokens
        if max_pred > n_pred:
            n_pad = max_pred - n_pred
            masked_tokens.extend([0] * n_pad)
            masked_pos.extend([0] * n_pad)

        if tokens_a_index + 1 == tokens_b_index and positive < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, True]) # IsNext
            positive += 1
        elif tokens_a_index + 1 != tokens_b_index and negative < batch_size/2:
            batch.append([input_ids, segment_ids, masked_tokens, masked_pos, False]) # NotNext
            negative += 1
    return batch

In [None]:
# masked_index = 8
# indexed_tokens[masked_index] = tokenizer.mask_token_id
# tokens_tensor = torch.tensor([indexed_tokens])

# masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')

# with torch.no_grad():
#     predictions = masked_lm_model(tokens_tensor, token_type_ids=segments_tensors)

# # Get the predicted token
# predicted_index = torch.argmax(predictions[0][0], dim=1)[masked_index].item()
# predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
# assert predicted_token == 'Jim'

In [None]:
def get_attn_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()

    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)  # batch_size x 1 x len_k(=len_q), one is masking
    return pad_attn_mask.expand(batch_size, len_q, len_k)  # batch_size x len_q x len_k

In [None]:
batch = make_batch()

In [None]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.LongTensor, zip(*batch))

In [None]:
get_attn_pad_mask(input_ids, input_ids)[0][0], input_ids[0]

(tensor([False, False, False, False, False, False, False,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True,  True,  True,  True,  True,  True,  True,  True]),
 tensor([ 1,  2, 10,  6, 13,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, dropout=0.1, max_len=5000, d_model=None):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model, dtype=torch.float)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
PositionalEncoding(max_len=max_len, d_model=d_model)

PositionalEncoding(
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
class Embedding(nn.Module):
    def __init__(self):
        super(Embedding, self).__init__()
        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
        self.pos_embed = PositionalEncoding(max_len, d_model)  # position embedding
        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long)
        pos = pos.unsqueeze(0).expand_as(x)  # (seq_len,) -> (batch_size, seq_len)
        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.norm(embedding)

In [None]:
# class Embedding(nn.Module):
#    def __init__(self, vocab_size, d_model, max_len, n_segments):
#        super(Embedding, self).__init__()
#        self.tok_embed = nn.Embedding(vocab_size, d_model)  # token embedding
#        self.pos_embed = nn.Embedding(max_len, d_model)  # position embedding
#        self.seg_embed = nn.Embedding(n_segments, d_model)  # segment(token type) embedding
#        self.norm = nn.LayerNorm(d_model)

#    def forward(self, x, seg):
#        batch_size, seq_len = x.size()
#        pos = torch.arange(seq_len, dtype=torch.long, device=x.device)
#        pos = pos.unsqueeze(0).expand(batch_size, -1)  # (seq_len,) -> (batch_size, seq_len)
#        embedding = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
#        return self.norm(embedding)


In [None]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):
        super(ScaledDotProductAttention, self).__init__()

    def forward(self, Q, K, V, attn_mask):
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k) # scores : [batch_size x n_heads x len_q(=len_k) x len_k(=len_q)]
        scores.masked_fill_(attn_mask, -1e9)
        attn = nn.Softmax(dim=-1)(scores)
        context = torch.matmul(attn, V)
        return scores, context, attn

In [None]:
emb = Embedding()

embeds = emb(input_ids, segment_ids)

attenM = get_attn_pad_mask(input_ids, input_ids)

SDPA= ScaledDotProductAttention()(embeds, embeds, embeds, attenM)

S, C, A = SDPA

print('Masks',attenM[0][0])
print()
print('Scores: ', S[0][0],'\n\nAttention M: ', A[0][0])

Masks tensor([False, False, False, False, False, False, False,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,  True])

Scores:  tensor([ 9.6000e+01,  3.6977e+01,  5.3444e+00, -2.8258e+00,  4.3828e+00,
         4.1381e+00,  7.2071e+00, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,
        -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09],
       grad_fn=<SelectBackward0>) 

Attention M:  tensor([1.0000e+00, 2.3263e-26, 4.2553e-40, 1.2051e-43, 1.6267e-40, 1.2736e-40,
        2.7409e-39, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 

In [None]:
len(SDPA[0]), len(SDPA[1])

(6, 6)

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.W_Q = nn.Linear(d_model, d_k * n_heads)
        self.W_K = nn.Linear(d_model, d_k * n_heads)
        self.W_V = nn.Linear(d_model, d_v * n_heads)
    def forward(self, Q, K, V, attn_mask):
        # q: [batch_size x len_q x d_model], k: [batch_size x len_k x d_model], v: [batch_size x len_k x d_model]
        residual, batch_size = Q, Q.size(0)

        q_s = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        k_s = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)
        v_s = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)

        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_heads, 1, 1) # attn_mask : [batch_size x n_heads x len_q x len_k]

        score, context, attn = ScaledDotProductAttention()(q_s, k_s, v_s, attn_mask)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v) # context: [batch_size x len_q x n_heads * d_v]
        output = nn.Linear(n_heads * d_v, d_model)(context)
        return nn.LayerNorm(d_model)(output + residual), attn # output: [batch_size x len_q x d_model]


In [None]:
emb = Embedding()
embeds = emb(input_ids, segment_ids)

attenM = get_attn_pad_mask(input_ids, input_ids)

MHA= MultiHeadAttention()(embeds, embeds, embeds, attenM)

Output, A = MHA

A[0][0]

tensor([[0.1158, 0.1418, 0.1658, 0.1326, 0.1757, 0.1247, 0.1436, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.2382, 0.1859, 0.1120, 0.1348, 0.1368, 0.0763, 0.1161, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.1529, 0.1295, 0.1392, 0.1312, 0.1624, 0.1604, 0.1244, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000],
        [0.2029, 0.1146, 0.1315, 0.1230, 0.1596, 0.1689, 0.0996, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
       

In [None]:
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self):
        super(PoswiseFeedForwardNet, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.fc2(nn.functional.gelu(self.fc1(x)))

In [None]:
class EncoderLayer(nn.Module):
    def __init__(self):
        super(EncoderLayer, self).__init__()
        self.enc_self_attn = MultiHeadAttention()
        self.pos_ffn = PoswiseFeedForwardNet()

    def forward(self, enc_inputs, enc_self_attn_mask):
        enc_outputs, attn = self.enc_self_attn(enc_inputs, enc_inputs, enc_inputs, enc_self_attn_mask) # enc_inputs to same Q,K,V
        enc_outputs = self.pos_ffn(enc_outputs) # enc_outputs: [batch_size x len_q x d_model]
        return enc_outputs, attn

In [None]:
class Transformer_K(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = Embedding()
        self.layer = EncoderLayer()
        self.fc = nn.Linear(d_model, d_model)

        pass
    def forward(self, input_ids, segment_ids, masked_pos):
        output = self.embedding(input_ids, segment_ids)
        enc_self_attn_mask = get_attn_pad_mask(input_ids, input_ids)
        output, enc_self_attn = self.layer(output, enc_self_attn_mask)
        output = self.fc(output).view(-1, 5)
        return output.mean(dim=0)


In [None]:
model = Transformer_K()
# criterion = nn.CrossEntropyLoss()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

batch = make_batch()
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.FloatTensor, zip(*batch))

for epoch in range(100):
    outputs = model(input_ids, segment_ids, masked_pos)
    print(outputs.shape)
    print(outputs.unsqueeze(dim=0).shape)
    loss_lm = criterion(outputs, masked_tokens) # for masked LM
    loss_lm = (loss_lm.float()).mean()

    loss = loss_lm.type(torch.FloatTensor)

    if (epoch + 1) % 10 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Epoch: 0010 cost = 29.312452
Epoch: 0020 cost = 22.832312
Epoch: 0030 cost = 11.639997
Epoch: 0040 cost = 7.402102
Epoch: 0050 cost = 7.718825
Epoch: 0060 cost = 7.042941
Epoch: 0070 cost = 8.497290
Epoch: 0080 cost = 8.671464
Epoch: 0090 cost = 4.111983
Epoch: 0100 cost = 2.459414


In [None]:
# model = Transformer_K()
# # criterion = nn.CrossEntropyLoss()
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# batch = make_batch()
# input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.FloatTensor, zip(*batch))

# for epoch in range(100):
#     outputs = model(input_ids, segment_ids, masked_pos)
#     print(outputs.shape)
#     print(outputs.unsqueeze(dim=0).shape)
#     loss_lm = criterion(outputs, masked_tokens) # for masked LM
#     loss_lm = (loss_lm.float()).mean()

#     loss = loss_lm.type(torch.FloatTensor)

#     if (epoch + 1) % 10 == 0:
#         print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

In [None]:
input_ids, segment_ids, masked_tokens, masked_pos, isNext = map(torch.FloatTensor, zip(batch[0]))
print(text)
print([number_dict[w.item()] for w in input_ids[0] if number_dict[w.item()] != '[PAD]'])

logits_lm, logits_clsf = model(input_ids, segment_ids, masked_pos)
logits_lm = logits_lm.data.max(2)[1][0].data.numpy()
print('masked tokens list : ',[pos.item() for pos in masked_tokens[0] if pos.item() != 0])
print('predict masked tokens list : ',[pos for pos in logits_lm if pos != 0])

logits_clsf = logits_clsf.data.max(1)[1].data.numpy()[0]
print('isNext : ', True if isNext else False)
print('predict isNext : ',True if logits_clsf else False)

This is a transformer.
Bert is a transformer.
There are many transformers.
This is a transformer from Scratch.
Will it work?

['[CLS]', 'there', '[MASK]', 'many', '[MASK]', '[SEP]', 'bert', 'is', 'a', 'transformer', '[SEP]']


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)