In [1]:
import numpy as np

## How tokens are prepared - rough idea

In [4]:
stories = ["The dog ran into street", "The cat sat in the home"]

def tokenize(text):
    return text.split()

def create_input_labels(stories, eos_token = '<EOS>'):
    inputs =[]
    labels = []

    for story in stories:
        tokens = tokenize(story)
        inputs.append(tokens[:-1]) # leaving the last word
        labels.append(tokens[1:] + [eos_token])

    return inputs, labels

input_sequences, label_sequences = create_input_labels(stories)

print(f"The input sequences are {input_sequences}")
print(f"The labels are {label_sequences}")

The input sequences are [['The', 'dog', 'ran', 'into'], ['The', 'cat', 'sat', 'in', 'the']]
The labels are [['dog', 'ran', 'into', 'street', '<EOS>'], ['cat', 'sat', 'in', 'the', 'home', '<EOS>']]


# Positonal Encoding

### What is Positional Encoding?

Problem: Transformers, unlike RNNs, don't have an inherent understanding of word order within a sequence. They process words without considering their position relative to each other.

Solution: Positional encodings are vectors added to the input word embeddings to inject information about the position of each word in the sequence. This helps the model distinguish between words in different places, even if they are the same word.

### Why is it Important?

Language Structure: Word order is crucial for understanding the meaning of sentences. ("The dog chased the cat" is different from "The cat chased the dog").

Self-Attention: Within Transformers, the attention mechanism needs positional information to understand how different words relate to each other based on their distance and order.

In [19]:
def positional_encoding(max_position, embedding_dim):
    position_enc = np.zeros((max_position, embedding_dim))
    
    for pos in range(max_position):
        for i in range(0, embedding_dim, 2):
            position_enc[pos, i] = np.sin(pos / (10000 ** (2 * i / embedding_dim)))
            position_enc[pos, i + 1] = np.cos(pos / (10000 ** (2 * i / embedding_dim)))

    return position_enc

# Example usage
max_seq_length = 100 
embedding_dim = 10  # Example dimensions

pos_encoding = positional_encoding(max_seq_length, embedding_dim)
print(pos_encoding.shape)  # Output: (100, 512)
pos_encoding[1]

(100, 10)


array([8.41470985e-01, 5.40302306e-01, 2.51162229e-02, 9.99684538e-01,
       6.30957303e-04, 9.99999801e-01, 1.58489319e-05, 1.00000000e+00,
       3.98107171e-07, 1.00000000e+00])

## where positional encoding is used in transformers architecture

In [None]:
import tensorflow as tf

class TransformerDecoder(tf.keras.layers.Layer):
  def __init__(self, vocab_size, embedding_dim, num_layers, dff, num_heads, dropout_rate):
    super(TransformerDecoder, self).__init__()
    self.embedding_dim = embedding_dim
    self.num_layers = num_layers
    self.dff = dff
    self.num_heads = num_heads
    self.dropout_rate = dropout_rate

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.pos_encoding = self._positional_encoding(max_len=5000, d_model=embedding_dim)  # Example max_len

    self.decoder_layers = [self._get_decoder_layer() for _ in range(num_layers)]
    self.final_layer = tf.keras.layers.Dense(vocab_size)

  # Define positional encoding function (same as previous example)
  def _positional_encoding(self, max_len, d_model):
    position_enc = np.zeros((max_len, d_model))
    for pos in range(max_len):
      for i in range(0, d_model, 2):
        position_enc[pos, i] = np.sin(pos / (10000 ** (2 * i / d_model)))
        position_enc[pos, i + 1] = np.cos(pos / (10000 ** (2 * i / d_model)))
    return position_enc

  def _get_decoder_layer(self):
    # Masked self-attention layer with masking for decoder
    self_attention = tf.keras.layers.MultiHeadAttention(num_heads=self.num_heads, 
                                                     dropout_rate=self.dropout_rate)
    # Encoder-decoder attention (optional, not shown here for simplicity)

    # Feed forward network
    feed_forward = tf.keras.Sequential([
        tf.keras.layers.Dense(self.dff, activation="relu"),  # First layer
        tf.keras.layers.Dense(self.embedding_dim),  # Second layer
        tf.keras.layers.Dropout(self.dropout_rate)
    ])

    # Layer normalization
    layer_norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    layer_norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    # Residual connections and dropout
    dropout1 = tf.keras.layers.Dropout(self.dropout_rate)
    dropout2 = tf.keras.layers.Dropout(self.dropout_rate)

    return tf.keras.Sequential([
        self_attention,
        layer_norm1,
        dropout1,
        feed_forward,
        layer_norm2,
        dropout2
    ])

  def call(self, inputs, training=False):
    # Lookup word embeddings
    embedded = self.embedding(inputs)

    # Add positional encoding to embeddings (assuming batch_size is the first dimension)
    seq_len = tf.shape(inputs)[1]
    position_enc = tf.cast(self.pos_encoding[:seq_len], tf.float32)  # Slice relevant positions
    embedded = embedded + position_enc

    # Pass through decoder layers
    for layer in self.decoder_layers:
      embedded = layer(embedded, training=training)

    # Final dense layer for output logits
    output = self.final_layer(embedded)
    return output

# Example usage (assuming you have tokenized input sequences)
model = TransformerDecoder(vocab_size=8000, embedding_dim=512, num_layers=2, dff=2048, num_heads=8, dropout_rate=0.1)
decoder_output = model(inputs)
