In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import time
import re
import pickle
from tensorflow.keras import layers as tl


In [None]:
# Colab-only: mount Google Drive at /content/drive so that later cells can
# read the dataset stored there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the training CSV from the mounted Drive.
# The path is absolute (rooted at the '/content/drive' mount point) so that it
# resolves regardless of the notebook's current working directory.
cnn = pd.read_csv("/content/drive/My Drive/Colab Notebooks/abs_summ/data/train.csv")

In [None]:
# Remove the 'id' column; only 'article' and 'highlights' are read below.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# idempotent: re-running it after 'id' is already gone no longer raises KeyError.
cnn = cnn.drop(columns=['id'], errors='ignore')

In [None]:
# Preview the first rows of the frame.
cnn.head()

In [None]:
# (number of rows, number of columns) of the frame.
cnn.shape

In [None]:
# Source texts come from the 'article' column, reference summaries from 'highlights'.
document, summary = cnn['article'], cnn['highlights']

In [None]:
# Inspect one (article, summary) pair as a sanity check.
document[30], summary[30]

In [None]:
# # for decoder sequence
# summary = summary.apply(lambda x: '<go> ' + x + ' <stop>')
# summary.head()

In [None]:
# Characters stripped by the summary tokenizer. '<' and '>' are deliberately
# absent from this set so that angle-bracketed special tokens such as '<unk>'
# (and '<go>' / '<stop>' markers, if they are enabled) are kept intact.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
# Placeholder token used for words that are not in the fitted vocabulary.
oov_token = '<unk>'

In [None]:
# Separate vocabularies for the encoder (articles) and decoder (summaries).
# Only the summary tokenizer receives the custom `filters`; the document
# tokenizer is left on the Tokenizer's default filter set.
document_tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token=oov_token)
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=filters, oov_token=oov_token)

In [None]:
# Build each tokenizer's vocabulary from its own corpus.
document_tokenizer.fit_on_texts(document)
summary_tokenizer.fit_on_texts(summary)

In [None]:
# Encode every article / summary as a sequence of integer token ids.
inputs = document_tokenizer.texts_to_sequences(document)
targets = summary_tokenizer.texts_to_sequences(summary)

In [None]:
# Probe: encode a short sentence with the summary vocabulary.
summary_tokenizer.texts_to_sequences(["This is a test"])

In [None]:
# Probe: decode a hard-coded id sequence back to text.
# NOTE(review): these ids are presumably copied from the output of the encode
# probe; they depend on the fitted vocabulary and may differ for other data.
summary_tokenizer.sequences_to_texts([[54, 11, 6, 549]])

In [None]:
# Vocabulary sizes for the encoder and decoder: one more than the number of
# entries in each tokenizer's word_index.
# NOTE(review): the +1 presumably accounts for id 0, which word_index never
# assigns — confirm.
encoder_vocab_size = len(document_tokenizer.word_index) + 1
decoder_vocab_size = len(summary_tokenizer.word_index) + 1

# Display both sizes.
encoder_vocab_size, decoder_vocab_size

In [None]:
# Length of every raw article / summary.
# NOTE(review): `document` and `summary` hold the raw column values, so these
# are presumably character counts, not token counts; `inputs` / `targets` hold
# the token-id sequences — confirm which one is intended.
document_lengths = pd.Series(list(map(len, document)))
summary_lengths = pd.Series(list(map(len, summary)))

In [None]:
# Summary statistics of the article lengths.
document_lengths.describe()

In [None]:
# Summary statistics of the summary lengths.
summary_lengths.describe()

In [None]:
def tokenize(input_str, EOS=1):
    """Encode one string with the summary tokenizer and append an end marker.

    Args:
        input_str (str): text to encode.
        EOS (int): id appended after the encoded tokens. Defaults to 1.

    Returns:
        list[int]: token ids of `input_str` followed by `EOS`.
    """
    # NOTE(review): summary_tokenizer is created with an oov_token, and Keras
    # appears to reserve id 1 for it, so the default EOS may decode as '<unk>'
    # — confirm before relying on EOS=1.
    encoded = summary_tokenizer.texts_to_sequences([input_str])[0]
    return [*encoded, EOS]

def detokenize(integers):
    """Decode a batch of id sequences and return the text of the first one.

    Args:
        integers (list[list[int]]): batch of token-id sequences; only the
            decoded text of the first sequence is returned.

    Returns:
        str: text decoded from `integers[0]` by the summary tokenizer.
    """
    return summary_tokenizer.sequences_to_texts(integers)[0]

In [None]:
# Probe: encode a short sentence; the result ends with the EOS id.
tokenize('This is a test')


In [None]:
# Probe: decode a single-sequence batch whose last id (1) is the default EOS.
detokenize([[54, 11, 6, 549,1]])

In [None]:
from tensorflow._api.v2.experimental.numpy import int32
def create_tensor(t):
    """Build a TensorFlow constant from a nested (at least 2-level) sequence.

    The element at `t[0][0]` decides the dtype: if it is a Python bool the
    dtype is left for TensorFlow to infer, otherwise the result is float32.

    Args:
        t: nested sequence (or tensor) indexable as `t[0][0]`.

    Returns:
        tf.Tensor: constant holding the values of `t`.
    """
    probe = t[0][0]
    if not isinstance(probe, bool):
        return tf.constant(t, dtype=tf.float32)
    return tf.constant(t)


def display_tensor(t, name):
    """Print `name` with the shape of `t`, then `t` itself.

    Each of the two parts is followed by a blank line.

    Args:
        t: any object exposing a `.shape` attribute.
        name (str): label printed in front of the shape.
    """
    header = f'{name} shape: {t.shape}\n'
    contents = f'{t}\n'
    print(header, contents, sep='\n')

In [None]:
# q=[[1., 0., 0.],
#    [0., 1. ,0.]]
# # x=create_tensor([[q,q],[q,q]])
# # x
# x=tf.concat([q, q], axis = -1)
# y=create_tensor(x)
# y

In [None]:
# Toy inputs for the attention walkthrough: two query / key / value rows of
# depth 3, plus an additive mask whose -1e9 entry suppresses one position.
q = create_tensor([[1, 0, 0], [0, 1, 0]])
k = create_tensor([[1, 2, 3], [4, 5, 6]])
v = create_tensor([[0, 1, 0], [1, 0, 1]])
m = create_tensor([[0, 0], [-1e9, 0]])

display_tensor(q, 'query')
display_tensor(k, 'key')
display_tensor(v, 'value')
display_tensor(m, 'mask')

# if isinstance(q[0][0], tf.float32):
#   print(True)

In [None]:
# Scaling factor used below: square root of 3, the row length of q and k.
np.sqrt(3)

In [None]:
# Switch on NumPy-style behaviour for TensorFlow tensors; the following cells
# presumably rely on it for expressions such as `k.T`.
# NOTE(review): this imports from the private `tensorflow.python` namespace;
# `tf.experimental.numpy.experimental_enable_numpy_behavior()` looks like the
# public equivalent — confirm and consider switching.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [None]:
# Scaled dot product of queries and keys: (q · kᵀ) / sqrt(depth).
# The depth is read from q's last axis instead of being hard-coded as 3, so
# the cell stays correct if the toy tensors above are resized.
q_dot_k = np.dot(q, k.T) / np.sqrt(q.shape[-1])
display_tensor(q_dot_k, 'query dot key')

In [None]:
# Apply the additive mask: positions holding -1e9 in `m` become hugely negative.
masked = q_dot_k + m
display_tensor(masked, 'masked query dot key')

In [None]:
# Multiply the masked scores by the values (no softmax is applied in this cell).
display_tensor(masked @ v, 'masked query dot key dot value')

In [None]:
# Indexing with None prepends a length-1 axis, giving each tensor a batch dimension.
q_with_batch = q[None,:]
display_tensor(q_with_batch, 'query with batch dim')
k_with_batch = k[None,:]
display_tensor(k_with_batch, 'key with batch dim')
v_with_batch = v[None,:]
display_tensor(v_with_batch, 'value with batch dim')
# Boolean counterpart of the additive mask `m`: False where `m` holds -1e9,
# True where it holds 0.
m_bool = create_tensor([[True, True], [False, True]])
display_tensor(m_bool, 'boolean mask')

In [None]:
# UNQ_C1
# GRADED FUNCTION: DotProductAttention
def DotProductAttention(query, key, value, mask):
    """Scaled dot-product attention.

    Computes softmax(query · keyᵀ / sqrt(d)) · value, where the softmax runs
    over the last axis of the score matrix and d is the size of the last axis
    of `query`.

    Args:
        query: tensor whose last two axes are (L_q, d).
        key: tensor whose last two axes are (L_k, d).
        value: tensor whose last two axes are (L_k, d).
        mask: None, or a boolean condition for `tf.where` matching the score
            matrix (L_q by L_k). Scores are kept where it is True and replaced
            by -1e9 where it is False.

    Returns:
        Attention output, the product of the (L_q by L_k) weights with
        `value`: last two axes are (L_q, d).

    Raises:
        AssertionError: if the last axes of query, key and value differ.
    """
    assert query.shape[-1] == key.shape[-1] == value.shape[-1], "Embedding dimensions of q, k, v aren't all the same"

    # Scale factor: square root of the embedding depth.
    scale = np.sqrt(query.shape[-1])

    # Raw attention scores: query times key with its last two axes swapped.
    key_t = tf.experimental.numpy.swapaxes(key, -1, -2)
    scores = tf.linalg.matmul(query, key_t) / scale

    # Masked-out positions get a very large negative score so that their
    # softmax weight is effectively zero.
    if mask is not None:
        blocked = tf.experimental.numpy.full_like(scores, -1e9)
        scores = tf.where(mask, scores, blocked)

    # Softmax written as exp(scores - logsumexp(scores)).
    log_norm = tf.math.reduce_logsumexp(scores, axis=-1, keepdims=True)
    weights = tf.math.exp(scores - log_norm)

    # Weighted sum of the value rows.
    return tf.linalg.matmul(weights, value)

In [None]:
# Run attention on the batched toy tensors with the boolean mask.
DotProductAttention(q_with_batch, k_with_batch, v_with_batch, m_bool)

In [None]:
# Example tensors of different ranks, all built from the toy query matrix q.
tensor2d = create_tensor(q)
display_tensor(tensor2d, 'query matrix (2D tensor)')

tensor4d2b = create_tensor([[q, q], [q, q]])
display_tensor(tensor4d2b, 'batch of two (multi-head) collections of query matrices (4D tensor)')

# Two copies of q joined along the last axis, i.e. two concatenated heads.
q_heads_concat = tf.concat([q, q], axis = -1)

tensor3dc = create_tensor([q_heads_concat])
display_tensor(tensor3dc, 'one batch of concatenated heads of query matrices (3d tensor)')

tensor3dc3b = create_tensor([q_heads_concat] * 3)
display_tensor(tensor3dc3b, 'three batches of concatenated heads of query matrices (3d tensor)')

In [None]:
# UNQ_C2
# GRADED FUNCTION: compute_attention_heads_closure
def compute_attention_heads_closure(n_heads, d_head):
    """Build a function that splits concatenated attention heads apart.

    Args:
        n_heads (int): number of attention heads.
        d_head (int): dimensionality of each head.

    Returns:
        function: `compute_attention_heads`, closed over `n_heads` and `d_head`.
    """

    def compute_attention_heads(x):
        """Move the head axis into the batch axis.

        Args:
            x: tensor with shape (n_batch, seqlen, n_heads * d_head).

        Returns:
            Tensor with shape (n_batch * n_heads, seqlen, d_head).
        """
        n_batch, seqlen = x.shape[0], x.shape[1]

        # (n_batch, seqlen, n_heads * d_head) -> (n_batch, seqlen, n_heads, d_head)
        per_head = tf.reshape(x, (n_batch, seqlen, n_heads, d_head))

        # (n_batch, seqlen, n_heads, d_head) -> (n_batch, n_heads, seqlen, d_head)
        heads_first = tf.transpose(per_head, (0, 2, 1, 3))

        # (n_batch, n_heads, seqlen, d_head) -> (n_batch * n_heads, seqlen, d_head)
        return tf.reshape(heads_first, (-1, seqlen, d_head))

    return compute_attention_heads

In [None]:
# Split the three-batch tensor of concatenated heads into 2 heads of depth 3
# and show the tensor before and after.
display_tensor(tensor3dc3b, "input tensor")
result_cah = compute_attention_heads_closure(2,3)(tensor3dc3b)
display_tensor(result_cah, "output tensor")

In [None]:
# # UNQ_C6
# # GRADED FUNCTION: DecoderBlock
# def DecoderBlock(d_model, d_ff, n_heads,
#                  dropout, mode, ff_activation):
#     """Returns a list of layers that implements a Transformer decoder block.

#     The input is an activation tensor.

#     Args:
#         d_model (int):  depth of embedding.
#         d_ff (int): depth of feed-forward layer.
#         n_heads (int): number of attention heads.
#         dropout (float): dropout rate (how much to drop out).
#         mode (str): 'train' or 'eval'.
#         ff_activation (function): the non-linearity in feed-forward layer.

#     Returns:
#         list: list of trax.layers.combinators.Serial that maps an activation tensor to an activation tensor.
#     """
    
#     ### START CODE HERE (REPLACE INSTANCES OF 'None' WITH YOUR CODE) ###
    
#      # Create masked multi-head attention block using CausalAttention function
#     causal_attention = CausalAttention( 
#                         d_model,
#                         n_heads=n_heads,
#                         mode=mode
#                         )

#     # Create feed-forward block (list) with two dense layers with dropout and input normalized
#     feed_forward = [ 
#         # Normalize layer inputs
#         tl.LayerNorm(),
#         # Add first feed forward (dense) layer (don't forget to set the correct value for n_units)
#         tl.Dense(d_ff),
#         # Add activation function passed in as a parameter (you need to call it!)
#         ff_activation(), # Generally ReLU
#         # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
#         tl.Dropout(rate=dropout, mode=mode),
#         # Add second feed forward layer (don't forget to set the correct value for n_units)
#         tl.Dense(d_model),
#         # Add dropout with rate and mode specified (i.e., don't use dropout during evaluation)
#         tl.Dropout(rate=dropout, mode=mode)
#     ]

#     # Add list of two Residual blocks: the attention with normalization and dropout and feed-forward blocks
#     return [
#       tl.Residual(
#           # Normalize layer input
#           tl.LayerNorm(),
#           # Add causal attention block previously defined (without parentheses)
#           causal_attention,
#           # Add dropout with rate and mode specified
#           tl.Dropout(rate=dropout, mode=mode)
#         ),
#       tl.Residual(
#           # Add feed forward block (without parentheses)
#           feed_forward
#         ),
#       ]
#     ### END CODE HERE ###

In [None]:
# # Take a look at the decoder block
# print(DecoderBlock(d_model=512, d_ff=2048, n_heads=8, dropout=0.1, mode='train', ff_activation=tl.Relu))