In [2]:
cd /kaggle/input/transformer-summarizer-ds

/kaggle/input/transformer-summarizer-ds


In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import re
import pdb
import time
import utils
import torch
import textwrap
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers


# Text wrapper configured for a 70-character line width.
wrapper = textwrap.TextWrapper(width=70)

# Kaggle exposes input data read-only under /kaggle/input.
# Walk that tree and print the full path of every file found there.

import os
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/transformer-summarizer-ds/train.json
/kaggle/input/transformer-summarizer-ds/test.json
/kaggle/input/transformer-summarizer-ds/utils.py
/kaggle/input/transformer-summarizer-ds/val.json


# Import the Dataset

In [4]:
# Fetch the train and test sets through the dataset's helper module.
train_data, test_data = utils.get_train_test_data()

# Peek at one record (row 10); a row unpacks as (summary, dialogue).
example_summary, example_dialogue = train_data.iloc[10]
print(
    f"Dialogue: \n{example_dialogue}",
    f"\nSummary: \n{example_summary}",
    sep="\n",
)

Dialogue: 
Lucas: Hey! How was your day?
Demi: Hey there! 
Demi: It was pretty fine, actually, thank you!
Demi: I just got promoted! :D
Lucas: Whoa! Great news!
Lucas: Congratulations!
Lucas: Such a success has to be celebrated.
Demi: I agree! :D
Demi: Tonight at Death & Co.?
Lucas: Sure!
Lucas: See you there at 10pm?
Demi: Yeah! See you there! :D

Summary: 
Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.


# Preprocess the data

In [5]:
# Run the dataset helper's preprocessing on each split. It returns two collections,
# unpacked here as the dialogues ("document") and their summaries.
# NOTE(review): judging by the sample printed in the next cell, the text is lower-cased
# and wrapped in [SOS]/[EOS] markers — confirm against utils.preprocess.
document, summary = utils.preprocess(train_data)
document_test, summary_test = utils.preprocess(test_data)

In [6]:
# Show the first preprocessed training dialogue.
print(document[0])

[SOS] amanda: i baked  cookies. do you want some?  jerry: sure!  amanda: i'll bring you tomorrow :-) [EOS]


## Concatenating documents and summaries before training the tokenizer

In [7]:
# Stack the documents and the summaries into one sequence with a fresh 0..n-1 index,
# so that the tokenizer vocabulary is built from both.
docs_and_summary = pd.concat([document, summary], ignore_index=True)

## Punctuation filtering

In [8]:
def apply_filters(text):
    """Replace punctuation and tab/newline characters with single spaces.

    Square brackets and apostrophes are not part of the filtered set, so markers
    such as ``[SOS]`` and contractions such as ``i'll`` pass through unchanged.

    Args:
        text: Either a single string or an iterable of strings.

    Returns:
        The filtered string when ``text`` is a ``str``; otherwise a list with
        the filtered version of every element of ``text``.
    """
    # Inside the character class, `,-.` is a range covering exactly `,`, `-` and `.`.
    filters = r'[!"#$%&()*+,-./:;<=>?@\\^_`{|}~\t\n]'

    def _scrub(sentence):
        # Each matched character becomes one space (runs are not collapsed).
        return re.sub(filters, ' ', sentence)

    if not isinstance(text, str):
        return [_scrub(sentence) for sentence in text]
    return _scrub(text)

### Applying filtering on training data before tokenizing

In [9]:
# Strip punctuation from every document and summary. For a non-string input,
# apply_filters returns a plain list of filtered strings.
filtered_docs_and_summary = apply_filters(docs_and_summary)

## Tokenizer

In [10]:

# Word-level tokenizer: out-of-vocabulary words map to the [UNK] token.
tokenizer = Tokenizer(models.WordLevel(unk_token='[UNK]'))
# Split on whitespace only; punctuation has already been replaced by spaces in apply_filters.
tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
# Cap the vocabulary at 34249 entries and declare [PAD] and [UNK] as special tokens
# (they receive ids 0 and 1, as the sorted vocabulary printed further down shows).
trainer = trainers.WordLevelTrainer(vocab_size=34249, special_tokens=['[PAD]', '[UNK]'])

# Fit the vocabulary on the filtered documents and summaries.
tokenizer.train_from_iterator(filtered_docs_and_summary, trainer)

# token -> id mapping as a dict.
vocabulary = tokenizer.get_vocab()
# NOTE(review): the +1 makes vocab_size one larger than the tokenizer's own count.
# [PAD] already occupies id 0, so confirm that the extra slot is intended.
vocab_size = tokenizer.get_vocab_size() + 1
print(f'Size of vocabulary: {vocab_size}')

Size of vocabulary: 34250


In [11]:
# List the vocabulary as (token, id) pairs ordered by id.
# The pairs are read straight from the tokenizer rather than from the `vocabulary` dict:
# this cell rebinds `vocabulary` to a list, so reading the old name would make a second
# run of the cell fail (a list has no .items()).
vocabulary = sorted(tokenizer.get_vocab().items(), key=lambda item: item[1])
print(vocabulary[:10])

[('[PAD]', 0), ('[UNK]', 1), ('i', 2), ('the', 3), ('to', 4), ('you', 5), ('a', 6), ('[EOS]', 7), ('[SOS]', 8), ('and', 9)]


In [12]:
# Encode the first document (after punctuation filtering) and show its token ids.
encoded = tokenizer.encode(apply_filters(document[0]))
print(encoded.ids)

[8, 454, 2, 3505, 1613, 30, 5, 81, 50, 619, 66, 454, 63, 220, 5, 98, 7]


In [13]:
# Show the same document after punctuation filtering, i.e. the text passed to the tokenizer.
print(apply_filters(document[0]))

[SOS] amanda  i baked  cookies  do you want some   jerry  sure   amanda  i'll bring you tomorrow     [EOS]


In [14]:
# Fixed sequence lengths (in tokens) for the encoder inputs and the decoder targets.
encoder_maxlen = 150
decoder_maxlen = 50

# Padding & truncating documents (inputs): every encoded document ends up with
# exactly encoder_maxlen ids.
tokenizer.enable_padding(length=encoder_maxlen)
tokenizer.enable_truncation(encoder_maxlen)

inputs = tokenizer.encode_batch(apply_filters(document))

# Padding & truncating summaries (targets): reconfigure the same tokenizer for the
# decoder length before encoding the summaries.
# NOTE: the tokenizer keeps these decoder settings after this cell, so any later
# encode call is padded/truncated to decoder_maxlen as well.
tokenizer.enable_padding(length=decoder_maxlen)
tokenizer.enable_truncation(max_length=decoder_maxlen)

targets = tokenizer.encode_batch(apply_filters(summary))

# Extracting ids only: one row of token ids per example.
# NOTE(review): truncation presumably drops a trailing [EOS] on over-long sequences — confirm.
inputs = torch.tensor([seq.ids for seq in inputs])
targets = torch.tensor([seq.ids for seq in targets])

## Dataset

In [15]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each input with the target at the same index."""

    def __init__(self, inputs, targets):
        """Store the two indexable collections as ``self.inputs`` and ``self.targets``."""
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        """Number of examples, taken from the length of ``targets``."""
        n_examples = len(self.targets)
        return n_examples

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        source = self.inputs[idx]
        target = self.targets[idx]
        return source, target
    

In [16]:
# Number of (document, summary) pairs per training batch.
BATCH_SIZE = 64

# Wrap the encoded documents and summaries in a dataset and serve it in shuffled batches.
train_dataset = CustomDataset(inputs, targets)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Positional Encoding

In [17]:
def positional_encoding(num_positions, d_model):
    """Build the sinusoidal positional-encoding table.

    Entry ``[0, pos, j]`` is ``sin(pos / 10000**(2*(j//2)/d_model))`` for even ``j``
    and ``cos`` of the same angle for odd ``j``.

    Args:
        num_positions (int): Number of positions (rows) to encode.
        d_model (int): Size of each encoding vector (columns).

    Returns:
        torch.Tensor: float32 tensor of shape ``(1, num_positions, d_model)``.
    """
    position = np.arange(num_positions)[:, np.newaxis]  # (num_positions, 1)
    # Column index of every feature. The original code used `k` without defining it.
    k = np.arange(d_model)[np.newaxis, :]               # (1, d_model)
    i = k // 2                                          # sin/cos column pairs share a frequency

    angle_rates = 1 / np.power(10000, (2 * i) / np.float32(d_model))
    angle_rads = position * angle_rates                 # (num_positions, d_model)

    # Sine on even columns, cosine on odd columns. The cosine source slice must be
    # `1::2` (every odd column); `1:2` would copy column 1 into all odd columns.
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over the batch dimension.
    pos_encoding = angle_rads[np.newaxis, ...]
    # `dtype` is keyword-only in torch.tensor; passing it positionally raises TypeError.
    return torch.tensor(pos_encoding, dtype=torch.float32)

# Masking

In [18]:
def create_padding_mask(token_ids):
    """Mark which positions hold real tokens (id != 0) versus padding (id == 0).

    Args:
        token_ids: Token ids as a tensor or anything ``torch.as_tensor`` accepts
            (e.g. nested lists).

    Returns:
        torch.Tensor: float tensor with 1.0 where the id is non-zero and 0.0 where it
        is zero, with an extra axis of size 1 inserted at dimension 1.
    """
    # as_tensor reuses an existing tensor as-is; torch.tensor(tensor) would
    # copy-construct it and emit a UserWarning.
    ids = torch.as_tensor(token_ids)
    seq = torch.logical_not(torch.eq(ids, 0)).float()
    return torch.unsqueeze(seq, 1)


def create_look_ahead_mask(sequence_length, num_heads):
    """Build a stack of lower-triangular matrices of ones.

    Args:
        sequence_length (int): Side length of each square matrix.
        num_heads (int): Number of identical matrices stacked along the first axis.

    Returns:
        torch.Tensor: tensor of shape ``(num_heads, sequence_length, sequence_length)``
        holding 1.0 on and below the diagonal and 0.0 above it.
    """
    # NOTE(review): here 1 marks an allowed position. nn.MultiheadAttention's `attn_mask`
    # uses a different convention — confirm before passing this mask to it directly.
    all_ones = torch.ones((1 * num_heads, sequence_length, sequence_length))
    return all_ones.tril()

# Self attention

## Scaled dot product attention

In [19]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q @ k^T / sqrt(d_k) + mask_bias) @ v.

    Args:
        q (torch.Tensor): Queries, shape ``(..., seq_len_q, depth)``.
        k (torch.Tensor): Keys, shape ``(..., seq_len_k, depth)``.
        v (torch.Tensor): Values, shape ``(..., seq_len_k, depth_v)``.
        mask: ``None`` or a tensor broadcastable to the attention logits in which
            1 / True means "attend" and 0 / False means "block". Blocked positions
            get a bias of -1e9 before the softmax.

    Returns:
        tuple: ``(output, attention_weights)``. For 2-D (unbatched) ``q``/``k`` a leading
        axis of size 1 is added, so the shapes are ``(1, seq_len_q, depth_v)`` and
        ``(1, seq_len_q, seq_len_k)``. Batched inputs keep their leading dimensions.
    """
    # Swap only the last two axes: `k.T` reverses every axis, so it is only correct
    # for 2-D tensors.
    matmul_qk = torch.matmul(q, k.transpose(-2, -1))

    dk = torch.tensor(k.size(-1), dtype=torch.float)
    scaled_attention_logits = matmul_qk / torch.sqrt(dk)
    if scaled_attention_logits.dim() == 2:
        # Unbatched input: keep the historical leading axis of size 1.
        scaled_attention_logits = torch.unsqueeze(scaled_attention_logits, dim=0)

    if mask is not None:
        # Cast so bool/int masks work with the subtraction below, and add out of place
        # so a mask with extra broadcast dimensions does not break an in-place update.
        keep = torch.as_tensor(mask, device=scaled_attention_logits.device)
        keep = keep.to(scaled_attention_logits.dtype)
        scaled_attention_logits = scaled_attention_logits + (1. - keep) * -1e9

    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)

    output = torch.matmul(attention_weights, v)
    return output, attention_weights

In [20]:
# Test your function!
# Sanity check of scaled_dot_product_attention on a small hand-made example:
# 3 queries, 5 keys/values, and a 0/1 mask with one row per query.
q = torch.tensor([[1, 1, 0, 1], [0, 1, 1, 1], [1, 0, 1, 1]]).float()
k = torch.tensor([[1, 1, 0, 1], [1, 0, 1, 1 ], [1, 1, 1, 0], [0, 0, 0, 1], [0, 1, 0, 1]]).float()
v = torch.tensor([[0, 0], [1, 0], [1, 0], [1, 1], [1, 1]]).float()
mask = torch.tensor([[[0, 1, 0, 1, 1], [1, 0, 0, 1, 1], [1, 1, 0, 1, 1]]])

# Round both results to two decimals so the printed tensors are easy to compare.
ou, atw = (
    torch.round(result, decimals=2)
    for result in scaled_dot_product_attention(q, k, v, mask)
)

print(f"Output:\n {ou}")
print(f"\nAttention weigths:\n {atw}")

Output:
 tensor([[[1.0000, 0.6200],
         [0.6200, 0.6200],
         [0.7400, 0.3100]]])

Attention weigths:
 tensor([[[0.0000, 0.3800, 0.0000, 0.2300, 0.3800],
         [0.3800, 0.0000, 0.0000, 0.2300, 0.3800],
         [0.2600, 0.4300, 0.0000, 0.1600, 0.1600]]])


# Encoder

In [21]:
def FullyConnected(embedding_dim, fully_connected_dim):
    """Build the position-wise feed-forward network: Linear -> ReLU -> Linear.

    Args:
        embedding_dim (int): Size of the network's input and output features.
        fully_connected_dim (int): Size of the hidden layer.

    Returns:
        nn.Sequential: the three layers, in the order listed above.
    """
    expand = nn.Linear(embedding_dim, fully_connected_dim)
    activation = nn.ReLU()
    project_back = nn.Linear(fully_connected_dim, embedding_dim)
    return nn.Sequential(expand, activation, project_back)

## Encoder Layer

In [22]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a feed-forward network.

    Each of the two sub-layers is followed by a residual connection and layer
    normalisation. Attention dropout is handled inside ``nn.MultiheadAttention``;
    a separate dropout is applied to the feed-forward output.

    Args:
        embedding_dim (int): Feature size of the input and output.
        num_heads (int): Number of attention heads.
        fully_connected_dim (int): Hidden size of the feed-forward network.
        dropout_rate (float): Dropout probability for attention and the FFN output.
        layernorm_eps (float): Epsilon used by both LayerNorm modules.
    """

    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.mha = nn.MultiheadAttention(
            embed_dim=embedding_dim,
            num_heads=num_heads,
            dropout=dropout_rate,
            batch_first=True,
        )
        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)

        self.layernorm1 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)
        self.layernorm2 = nn.LayerNorm(normalized_shape=embedding_dim, eps=layernorm_eps)

        self.dropout_ffn = nn.Dropout(p=dropout_rate)

    def forward(self, x, mask):
        """Apply the encoder block to ``x``.

        Args:
            x (torch.Tensor): Input used as query, key and value of the self-attention.
            mask: Passed to the attention module as ``key_padding_mask`` (the fourth
                positional argument of ``nn.MultiheadAttention.forward``).
                NOTE(review): that argument marks positions to *ignore* — confirm the
                caller's mask follows this convention and shape.

        Returns:
            torch.Tensor: Output of the second layer normalisation.
        """
        attended = self.mha(query=x, key=x, value=x, key_padding_mask=mask)[0]
        normed_attention = self.layernorm1(x + attended)

        projected = self.dropout_ffn(self.ffn(normed_attention))
        return self.layernorm2(projected + normed_attention)

## Full Encoder

In [23]:
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of ``EncoderLayer``s.

    Args:
        num_layers (int): Number of encoder layers.
        embedding_dim (int): Size of the token embeddings and of every layer's output.
        num_heads (int): Number of attention heads in each layer.
        fully_connected_dim (int): Hidden size of each layer's feed-forward network.
        input_vocab_size (int): Number of rows in the embedding table.
        maximum_position_encoding (int): Number of positions in the positional table.
        dropout_rate (float): Dropout probability for the embeddings and the layers.
        layernorm_eps (float): Epsilon forwarded to every layer's LayerNorm.
    """

    def __init__(self, num_layers, embedding_dim, num_heads, fully_connected_dim, input_vocab_size,
                 maximum_position_encoding, dropout_rate=0.1, layernorm_eps=1e-6):
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_vocab_size, self.embedding_dim)
        # Registered as a buffer so it follows the module through .to(device) / .cuda().
        # persistent=False keeps it out of state_dict, as it was when it was a plain attribute.
        self.register_buffer(
            "pos_encoding",
            positional_encoding(maximum_position_encoding, self.embedding_dim),
            persistent=False,
        )

        # nn.ModuleList registers the layers as sub-modules. With a plain Python list their
        # parameters are missing from .parameters(), .to() and state_dict(), so they would
        # never be trained or saved.
        self.enc_layers = nn.ModuleList([
            EncoderLayer(embedding_dim=self.embedding_dim,
                         num_heads=num_heads,
                         fully_connected_dim=fully_connected_dim,
                         dropout_rate=dropout_rate,
                         layernorm_eps=layernorm_eps)
            for _ in range(num_layers)
        ])

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        """Encode a batch of token ids.

        Args:
            x (torch.Tensor): Token ids; dimension 1 is read as the sequence length.
            mask: Forwarded unchanged to every encoder layer.

        Returns:
            torch.Tensor: Output of the last encoder layer.
        """
        seq_len = x.size(1)

        x = self.embedding(x)
        # Scale by sqrt(embedding_dim). torch.sqrt() rejects a Python int, so use plain
        # arithmetic, and avoid in-place ops on the embedding output.
        x = x * (self.embedding_dim ** 0.5)
        x = x + self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x)

        for layer in self.enc_layers:
            x = layer(x, mask)

        return x

# Decoder

## Decoder Layer

In [30]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and a feed-forward network.

    Each of the three sub-layers is followed by a residual connection and layer
    normalisation.

    Args:
        embedding_dim (int): Feature size of the input and output.
        num_heads (int): Number of attention heads in both attention modules.
        fully_connected_dim (int): Hidden size of the feed-forward network.
        dropout_rate (float): Dropout probability for attention and the FFN output.
        layernorm (float): Epsilon used by the three LayerNorm modules.
    """

    def __init__(self, embedding_dim, num_heads, fully_connected_dim, dropout_rate=0.1, layernorm=1e-6):
        super().__init__()

        # mha1: self-attention over the decoder input; mha2: attention over the encoder output.
        self.mha1 = nn.MultiheadAttention(embedding_dim, num_heads, dropout_rate, batch_first=True)
        self.mha2 = nn.MultiheadAttention(embedding_dim, num_heads, dropout_rate, batch_first=True)

        self.ffn = FullyConnected(embedding_dim, fully_connected_dim)

        # The second positional argument of nn.LayerNorm is eps.
        self.layernorm1 = nn.LayerNorm(embedding_dim, layernorm)
        self.layernorm2 = nn.LayerNorm(embedding_dim, layernorm)
        self.layernorm3 = nn.LayerNorm(embedding_dim, layernorm)

        self.dropout_ffn = nn.Dropout(dropout_rate)

    def forward(self, x, enc_output, look_ahead_mask, padding_mask):
        """Apply the decoder block.

        Args:
            x (torch.Tensor): Decoder input, batch first.
            enc_output (torch.Tensor): Encoder output used as key and value of the
                cross-attention.
            look_ahead_mask: Passed unchanged as ``attn_mask`` of the self-attention.
            padding_mask: Passed unchanged as ``key_padding_mask`` of the cross-attention.

        Returns:
            tuple: ``(output, self_attention_weights, cross_attention_weights)``; the
            weights are returned per head (``average_attn_weights=False``).

        NOTE(review): both masks must follow nn.MultiheadAttention's conventions
        (bool: True = blocked; float: added to the attention scores). A 0/1 "keep"
        mask such as a lower-triangular matrix of ones is treated as an additive
        float mask and therefore blocks nothing — confirm what callers pass.
        """
        mult_attn_out1, attn_weights_block1 = self.mha1(x, x, x, attn_mask=look_ahead_mask, average_attn_weights=False)
        Q1 = self.layernorm1(x + mult_attn_out1)

        # Query the encoder output with Q1, the normalised self-attention result. Using
        # the raw input `x` here would leave the self-attention output out of the query.
        mult_attn_out2, attn_weights_block2 = self.mha2(Q1, enc_output, enc_output, key_padding_mask=padding_mask, average_attn_weights=False)
        mult_attn_out2 = self.layernorm2(Q1 + mult_attn_out2)

        ffn_output = self.ffn(mult_attn_out2)
        ffn_output = self.dropout_ffn(ffn_output)
        out3 = self.layernorm3(ffn_output + mult_attn_out2)

        return out3, attn_weights_block1, attn_weights_block2

## Full Decoder

In [32]:
# Smoke test: run one DecoderLayer on dummy tensors and report the resulting shapes.
key_dim = 192
n_heads = 16

decoderLayer_test = DecoderLayer(embedding_dim=key_dim, num_heads=n_heads, fully_connected_dim=32)

# Decoder input of ones (length 15) and a random encoder output (length 7).
q = torch.ones(1, 15, key_dim)
encoder_test_output = torch.from_numpy(np.random.rand(1, 7, key_dim)).float()
look_ahead_mask = create_look_ahead_mask(q.size(1), n_heads)

# No padding mask is used for this check.
out, attn_w_b1, attn_w_b2 = decoderLayer_test(q, encoder_test_output, look_ahead_mask, None)

print(
    f"Using embedding_dim={key_dim} and num_heads={n_heads}:\n",
    f"q has shape:{q.shape}",
    f"Output of encoder has shape:{encoder_test_output.shape}\n",
    f"Output of decoder layer has shape:{out.shape}",
    f"Att Weights Block 1 has shape:{attn_w_b1.shape}",
    f"Att Weights Block 2 has shape:{attn_w_b2.shape}",
    sep="\n",
)

Using embedding_dim=192 and num_heads=16:

q has shape:torch.Size([1, 15, 192])
Output of encoder has shape:torch.Size([1, 7, 192])

Output of decoder layer has shape:torch.Size([1, 15, 192])
Att Weights Block 1 has shape:torch.Size([1, 16, 15, 15])
Att Weights Block 2 has shape:torch.Size([1, 16, 15, 7])


# Transformer