In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle
from tensorflow.keras import layers as tl


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cnn = pd.read_csv("drive/My Drive/Colab Notebooks/abs_summ/data/train.csv")

In [4]:
# Drop the 'id' column by rebinding `cnn` rather than mutating it in place.
# errors='ignore' keeps this cell re-runnable: a second execution (when 'id'
# is already gone) is a no-op instead of raising KeyError.
cnn = cnn.drop(columns=['id'], errors='ignore')

In [5]:
cnn.head()

Unnamed: 0,article,highlights
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [6]:
cnn.shape

(287113, 2)

In [7]:
document = cnn['article']
summary = cnn['highlights']

In [8]:
document[30], summary[30]

('By . Harriet Arkell For Mailonline . A pair of friends who dreamt up Marmite-themed board games during drunken dinners together have told how they won £50,000 backing from the Dragons\' Den investment show. Father of two Richard McLuckie, 48, and his friend Stuart Mackenzie-Walker, 51, dreamt up a series of board games including Love It Or Hate It and Who Put The Marmite In The Fridge, over long evenings spent playing games and drinking wine. They won permission to use the name Marmite from brand owner, Unilever, before going onto Dragons\' Den to ask for funding. But the pair, who have been friends since childhood when they lived across a glen from each other in Argyll, nearly lost their chance when they weren\'t allowed to mention the name Marmite to the inquisitive Dragons. Scroll down for video . The friends came up with the idea of Marmite-themed board games - and persuaded the Dragons to invest . Dragons Duncan Bannatyne, second left, and Peter Jones, second right, put up £50,0

In [9]:
# # for decoder sequence
# summary = summary.apply(lambda x: '<go> ' + x + ' <stop>')
# summary.head()

In [10]:
# Characters stripped from text by the tokenizer that is given `filters`.
# '<' and '>' are deliberately absent from this string so that angle-bracket
# marker tokens embedded in the text (e.g. '<go>' / '<stop>') are not broken apart.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
# Placeholder token used for out-of-vocabulary words.
oov_token = '<unk>'

In [11]:
# Only the summary tokenizer receives the custom `filters`; the document tokenizer
# is built without a `filters` argument. Both share the same OOV token.
document_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

In [12]:
document_tokenizer.fit_on_texts(document)
summary_tokenizer.fit_on_texts(summary)

In [13]:
inputs = document_tokenizer.texts_to_sequences(document)
targets = summary_tokenizer.texts_to_sequences(summary)

In [14]:
summary_tokenizer.texts_to_sequences(["This is a test"])

[[54, 11, 6, 549]]

In [15]:
summary_tokenizer.sequences_to_texts([[54, 11, 6, 549]])

['this is a test']

In [16]:
encoder_vocab_size = len(document_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1

# vocab_size
encoder_vocab_size, decoder_vocab_size

(785451, 230198)

In [17]:
# Lengths of the raw strings in *characters* (not token counts), computed with
# the vectorised string accessor instead of a Python-level loop.
document_lengths = document.str.len()
summary_lengths = summary.str.len()

In [18]:
document_lengths.describe()

count    287113.000000
mean       4033.660865
std        1954.339234
min          48.000000
25%        2583.000000
50%        3682.000000
75%        5117.000000
max       15925.000000
dtype: float64

In [19]:
summary_lengths.describe()

count    287113.000000
mean        294.770390
std         120.197405
min          14.000000
25%         218.000000
50%         280.000000
75%         342.000000
max        7388.000000
dtype: float64

In [20]:
def tokenize(input_str, EOS=1):
    """Encode one string with ``summary_tokenizer`` and append an end marker.

    Args:
        input_str (str): text to encode.
        EOS (int): id appended after the encoded tokens.

    Returns:
        list: token ids of ``input_str`` followed by ``EOS``.

    NOTE(review): detokenizing id 1 yields '<unk>', so the default end marker
    looks identical to an out-of-vocabulary word -- confirm whether a
    dedicated end-of-sequence token is intended.
    """
    # The tokenizer works on batches: wrap the string, then take the only row.
    token_ids = summary_tokenizer.texts_to_sequences([input_str])[0]
    token_ids.append(EOS)
    return token_ids

def detokenize(integers):
    """Decode a batch of id sequences and return the text of the first one.

    Args:
        integers (list): list of id sequences (a list of lists of ints), as
            accepted by ``summary_tokenizer.sequences_to_texts``.

    Returns:
        str: decoded text of the first sequence in ``integers``.
    """
    return summary_tokenizer.sequences_to_texts(integers)[0]

In [21]:
tokenize('This is a test')


[54, 11, 6, 549, 1]

In [22]:
detokenize([[54, 11, 6, 549,1]])

'this is a test <unk>'

In [35]:
from tensorflow._api.v2.experimental.numpy import int32
def create_tensor(t):
    """Wrap ``t`` in a TensorFlow constant via ``tf.constant``.

    NOTE(review): ``create_tensor`` is defined again in a later cell; when the
    cells run top to bottom, that later definition replaces this one.
    """
    return tf.constant(t)
    # Unreachable alternative kept for reference: keep boolean input as-is and
    # force everything else to float32.
    # if isinstance(t[0][0],bool):
    #   return tf.constant(t)
    # else:
    #   return tf.constant(t,dtype=tf.float32)


def display_tensor(t, name):
    """Print ``name`` with the shape of ``t``, then ``t`` itself.

    Each of the two printed items is followed by a blank line.

    Args:
        t: object exposing a ``shape`` attribute (e.g. an array or tensor).
        name (str): label shown in front of the shape.
    """
    shape_line = f'{name} shape: {t.shape}\n'
    print(shape_line)
    contents = f'{t}\n'
    print(contents)

In [79]:
from tensorflow._api.v2.experimental.numpy import int32
def create_tensor(t):
    """Create a NumPy array from (possibly nested) list-like input.

    Boolean input keeps its boolean dtype (useful for attention masks);
    any other input is converted to ``float32``.

    The dtype decision is made on the converted array rather than by peeking
    at ``t[0][0]``, so 1-D input, empty input, NumPy boolean arrays and
    boolean lists nested deeper than two levels are all handled.

    Args:
        t: nested list, NumPy array, or other array-like value.

    Returns:
        np.ndarray: a new array with dtype ``bool`` if the input is entirely
        boolean, otherwise ``float32``.
    """
    arr = np.array(t)
    if arr.dtype == np.bool_:
        return arr
    # astype copies, so the caller never receives a view of its own input.
    return arr.astype(np.float32)


def display_tensor(t, name):
    """Print a labelled shape line for ``t`` followed by its contents.

    Args:
        t: object exposing a ``shape`` attribute (e.g. an array or tensor).
        name (str): label printed before the shape.
    """
    # Each piece already ends in a newline, so print() leaves a blank line after it.
    for text in (f'{name} shape: {t.shape}\n', f'{t}\n'):
        print(text)

In [80]:
q = create_tensor([[1, 0, 0], [0, 1, 0]])
display_tensor(q, 'query')
k = create_tensor([[1, 2, 3], [4, 5, 6]])
display_tensor(k, 'key')
v = create_tensor([[0, 1, 0], [1, 0, 1]])
display_tensor(v, 'value')
m = create_tensor([[0, 0], [-1e9, 0]])
display_tensor(m, 'mask')


query shape: (2, 3)

[[1. 0. 0.]
 [0. 1. 0.]]

key shape: (2, 3)

[[1. 2. 3.]
 [4. 5. 6.]]

value shape: (2, 3)

[[0. 1. 0.]
 [1. 0. 1.]]

mask shape: (2, 2)

[[ 0.e+00  0.e+00]
 [-1.e+09  0.e+00]]



In [81]:
np.sqrt(3)

1.7320508075688772

In [82]:
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [83]:
q_dot_k = np.dot(q,k.T) / np.sqrt(3)
display_tensor(q_dot_k, 'query dot key')

query dot key shape: (2, 2)

[[0.57735026 2.309401  ]
 [1.1547005  2.8867514 ]]



In [84]:
masked = q_dot_k + m
display_tensor(masked, 'masked query dot key')

masked query dot key shape: (2, 2)

[[ 5.7735026e-01  2.3094010e+00]
 [-1.0000000e+09  2.8867514e+00]]



In [85]:
display_tensor(masked @ v, 'masked query dot key dot value')

masked query dot key dot value shape: (2, 3)

[[ 2.3094010e+00  5.7735026e-01  2.3094010e+00]
 [ 2.8867514e+00 -1.0000000e+09  2.8867514e+00]]



In [86]:
q_with_batch = q[None,:]
display_tensor(q_with_batch, 'query with batch dim')
k_with_batch = k[None,:]
display_tensor(k_with_batch, 'key with batch dim')
v_with_batch = v[None,:]
display_tensor(v_with_batch, 'value with batch dim')
m_bool = create_tensor([[True, True], [False, True]])
display_tensor(m_bool, 'boolean mask')

query with batch dim shape: (1, 2, 3)

[[[1. 0. 0.]
  [0. 1. 0.]]]

key with batch dim shape: (1, 2, 3)

[[[1. 2. 3.]
  [4. 5. 6.]]]

value with batch dim shape: (1, 2, 3)

[[[0. 1. 0.]
  [1. 0. 1.]]]

boolean mask shape: (2, 2)

[[ True  True]
 [False  True]]



In [87]:
# UNQ_C1
# GRADED FUNCTION: DotProductAttention
def DotProductAttention(query, key, value, mask):
    """Scaled dot-product attention with an optional boolean mask.

    Computes ``softmax(query @ key^T / sqrt(d)) @ value``, where ``d`` is the
    size of the last axis of ``query``.

    Args:
        query: queries; the last two axes are (L_q, d).
        key: keys; the last two axes are (L_k, d).
        value: values; the last two axes are (L_k, d).
        mask: boolean array/tensor broadcastable against the (L_q, L_k) score
            matrix, or None. Where the mask is False the score is replaced by
            -1e9 before the softmax, so that position gets ~0 weight.

    Returns:
        The attention output: for every query, the softmax-weighted sum of the
        rows of ``value``; the last two axes are (L_q, d).

    Raises:
        AssertionError: if the last axes of query, key and value differ.
    """
    assert query.shape[-1] == key.shape[-1] == value.shape[-1], "Embedding dimensions of q, k, v aren't all the same"

    # Scale by sqrt(d) to keep the magnitude of the scores independent of d.
    scale = np.sqrt(query.shape[-1])
    key_transposed = tf.experimental.numpy.swapaxes(key, -1, -2)
    scores = tf.linalg.matmul(query, key_transposed) / scale

    if mask is not None:
        blocked = tf.experimental.numpy.full_like(scores, -1e9)
        scores = tf.where(mask, scores, blocked)

    # Softmax over the key axis, computed as exp(x - logsumexp(x)) for
    # numerical stability.
    log_normalizer = tf.math.reduce_logsumexp(scores, axis=-1, keepdims=True)
    weights = tf.math.exp(scores - log_normalizer)

    return tf.linalg.matmul(weights, value)

In [88]:
DotProductAttention(q_with_batch, k_with_batch, v_with_batch, m_bool)

<tf.Tensor: shape=(1, 2, 3), dtype=float32, numpy=
array([[[0.8496746 , 0.15032545, 0.8496746 ],
        [1.        , 0.        , 1.        ]]], dtype=float32)>

In [89]:
tensor2d = create_tensor(q)
display_tensor(tensor2d, 'query matrix (2D tensor)')

tensor4d2b = create_tensor([[q, q], [q, q]])
display_tensor(tensor4d2b, 'batch of two (multi-head) collections of query matrices (4D tensor)')

tensor3dc = create_tensor([tf.concat([q, q], axis = -1)])
display_tensor(tensor3dc, 'one batch of concatenated heads of query matrices (3d tensor)')

tensor3dc3b = create_tensor([tf.concat([q, q], axis = -1), tf.concat([q, q], axis = -1), tf.concat([q, q], axis = -1)])
display_tensor(tensor3dc3b, 'three batches of concatenated heads of query matrices (3d tensor)')

query matrix (2D tensor) shape: (2, 3)

[[1. 0. 0.]
 [0. 1. 0.]]

batch of two (multi-head) collections of query matrices (4D tensor) shape: (2, 2, 2, 3)

[[[[1. 0. 0.]
   [0. 1. 0.]]

  [[1. 0. 0.]
   [0. 1. 0.]]]


 [[[1. 0. 0.]
   [0. 1. 0.]]

  [[1. 0. 0.]
   [0. 1. 0.]]]]

one batch of concatenated heads of query matrices (3d tensor) shape: (1, 2, 6)

[[[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]]

three batches of concatenated heads of query matrices (3d tensor) shape: (3, 2, 6)

[[[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]

 [[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]

 [[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]]



In [90]:
# UNQ_C2
# GRADED FUNCTION: compute_attention_heads_closure
def compute_attention_heads_closure(n_heads, d_head):
    """Build a function that splits concatenated attention heads apart.

    Args:
        n_heads (int): number of attention heads.
        d_head (int): dimensionality of each head.

    Returns:
        function: ``compute_attention_heads(x)``, which reshapes a tensor of
        concatenated heads into one row per (batch, head) pair.
    """
    def compute_attention_heads(x):
        """Move the head axis into the batch axis.

        Args:
            x: tensor with shape (n_batch, seqlen, n_heads * d_head). The
                batch size may be statically unknown (``x.shape[0] is None``).

        Returns:
            Tensor with shape (n_batch * n_heads, seqlen, d_head).

        Raises:
            ValueError: if the last axis of ``x`` is statically known and is
                not ``n_heads * d_head``.
        """
        seqlen = x.shape[1]
        feature_dim = x.shape[-1]
        # The batch axis is inferred below, so validate the feature width here;
        # otherwise a wrong width could be silently absorbed into the batch axis.
        if feature_dim is not None and feature_dim != n_heads * d_head:
            raise ValueError(
                f"Last dimension of x is {feature_dim}, expected "
                f"n_heads * d_head = {n_heads * d_head}"
            )

        # n_batch, seqlen, n_heads*d_head -> n_batch, seqlen, n_heads, d_head
        # (-1 infers the batch size, so this also works when it is unknown).
        x = tf.reshape(x, (-1, seqlen, n_heads, d_head))
        # n_batch, seqlen, n_heads, d_head -> n_batch, n_heads, seqlen, d_head
        x = tf.transpose(x, (0, 2, 1, 3))
        # n_batch, n_heads, seqlen, d_head -> n_batch*n_heads, seqlen, d_head
        return tf.reshape(x, (-1, seqlen, d_head))

    return compute_attention_heads

In [91]:
display_tensor(tensor3dc3b, "input tensor")
result_cah = compute_attention_heads_closure(2,3)(tensor3dc3b)
display_tensor(result_cah, "output tensor")

input tensor shape: (3, 2, 6)

[[[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]

 [[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]

 [[1. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 1. 0.]]]

output tensor shape: (6, 2, 3)

[[[1. 0. 0.]
  [0. 1. 0.]]

 [[1. 0. 0.]
  [0. 1. 0.]]

 [[1. 0. 0.]
  [0. 1. 0.]]

 [[1. 0. 0.]
  [0. 1. 0.]]

 [[1. 0. 0.]
  [0. 1. 0.]]

 [[1. 0. 0.]
  [0. 1. 0.]]]



In [123]:
# UNQ_C3
# GRADED FUNCTION: dot_product_self_attention
def dot_product_self_attention(q, k, v):
    """Causally masked dot-product self-attention.

    Builds a lower-triangular boolean mask of shape (1, L_q, L_q), where L_q
    is the second-to-last axis of ``q``, so each position can only attend to
    itself and earlier positions, then delegates to ``DotProductAttention``.

    Args:
        q: queries; the second-to-last axis is the sequence length L_q.
        k: keys.
        v: values.

    Returns:
        The result of ``DotProductAttention(q, k, v, mask)`` with the causal mask.
    """
    seq_len = q.shape[-2]
    # True on and below the diagonal, False above it.
    causal_mask = np.tril(np.ones((1, seq_len, seq_len), dtype=np.bool_))
    return DotProductAttention(q, k, v, causal_mask)

In [124]:
dot_product_self_attention(q_with_batch, k_with_batch, v_with_batch)

<tf.Tensor: shape=(1, 2, 3), dtype=float32, numpy=
array([[[0.        , 1.        , 0.        ],
        [0.8496746 , 0.15032543, 0.8496746 ]]], dtype=float32)>

In [None]:
# # UNQ_C6
# # GRADED FUNCTION: DecoderBlock
# def DecoderBlock(d_model, d_ff, n_heads,
#                  dropout, mode, ff_activation):
#     """Returns a list of layers that implements a Transformer decoder block.

#     The input is an activation tensor.

#     Args:
#         d_model (int):  depth of embedding.
#         d_ff (int): depth of feed-forward layer.
#         n_heads (int): number of attention heads.
#         dropout (float): dropout rate (how much to drop out).
#         mode (str): 'train' or 'eval'.
#         ff_activation (function): the non-linearity in feed-forward layer.

#     Returns:
#         list: list of trax.layers.combinators.Serial that maps an activation tensor to an activation tensor.
#     """
    
#     ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###
    
#      # Create masked multi-head attention block using CausalAttention function
#     causal_attention = CausalAttention( 
#                         d_model,
#                         n_heads=n_heads,
#                         mode=mode
#                         )

#     # Create feed-forward block (list) with two dense layers with dropout and input normalized
#     feed_forward = [ 
#         # Normalize layer inputs
#         tl.LayerNorm(),
#         # Add first feed forward (dense) layer (don't forget to set the correct value for n_units)
#         tl.Dense(d_ff),
#         # Add activation function passed in as a parameter (you need to call it!)
#         ff_activation(), # Generally ReLU
#         # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
#         tl.Dropout(rate=dropout, mode=mode),
#         # Add second feed forward layer (don't forget to set the correct value for n_units)
#         tl.Dense(d_model),
#         # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
#         tl.Dropout(rate=dropout, mode=mode)
#     ]

#     # Add list of two Residual blocks: the attention with normalization and dropout and feed-forward blocks
#     return [
#       tl.Residual(
#           # Normalize layer input
#           tl.LayerNorm(),
#           # Add causal attention block previously defined (without parentheses)
#           causal_attention,
#           # Add dropout with rate and mode specified
#           tl.Dropout(rate=dropout, mode=mode)
#         ),
#       tl.Residual(
#           # Add feed forward block (without parentheses)
#           feed_forward
#         ),
#       ]
#     ### END CODE HERE ###

In [None]:
# # Take a look at the decoder block
# print(DecoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1, mode='train', ff_activation=tl.Relu))