In [1]:
import tensorflow as tf
import numpy as np

In [2]:
class InputEmbeddings(tf.keras.layers.Layer):
    def __init__(self, d_model, vocab_size, **kwargs):
        """
        :param d_model: The size of each embedding vector.
        :param vocab_size: The size of the vocabulary, defining the number of unique tokens.
        :param kwargs: pass 
        """
        super(InputEmbeddings, self).__init__(**kwargs)
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embeddings= tf.keras.layers.Embedding(vocab_size, d_model)
        
    def call(self, x):
        return self.embeddings(x) * np.sqrt(self.d_model)
    
    

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


sentences = [
    "This is the first sentence.",
    "Here is another sentence.",
    "Yet another example sentence.",
    "This is the final test sentence."
]

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Get the maximum sequence length for padding
max_length = max(len(seq) for seq in sequences)

# Pad the sequences
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding token

# Define the model parameters
d_model = 16

# Create an instance of the InputEmbeddings layer
embedding_layer = InputEmbeddings(d_model, vocab_size)

# Convert sentences to embeddings
embeddings_output = embedding_layer(padded_sequences)

# Print the output embeddings
print(embeddings_output)
print(embeddings_output.shape)


tf.Tensor(
[[[-0.13128743  0.04554224 -0.10136862  0.19960685  0.0605513
    0.04189858 -0.03823599  0.17691298  0.03172989 -0.15908423
   -0.06517415  0.00691295  0.07703792  0.0201828  -0.0541226
   -0.04403678]
  [-0.0250515  -0.01304536  0.13116775 -0.04815464  0.1858624
   -0.06104913 -0.14646482  0.00809726  0.13263626 -0.06886563
   -0.15089998  0.01914987  0.0643983  -0.03992887 -0.11464443
   -0.1406134 ]
  [ 0.16751815 -0.04126629  0.1747637  -0.05674067  0.16145505
   -0.06902881  0.15471573  0.14071198 -0.15329476 -0.19562793
    0.19865619 -0.09923406  0.02197394 -0.01993647  0.04349986
    0.10667048]
  [ 0.18482341  0.12821461  0.09862529  0.16479765  0.19569837
   -0.1164505   0.00587115 -0.11567082  0.1956123  -0.06940946
   -0.02196345 -0.00231476  0.10348244 -0.095115   -0.05710916
   -0.02115765]
  [-0.12518068 -0.00631586  0.0892501  -0.1334598  -0.19918633
    0.04776707 -0.06120186  0.06519438 -0.05054636 -0.15564366
   -0.19777803  0.07249309  0.13974081 -0.1400

The shape `(4, 6, 16)` can be interpreted as follows:

1. **4**: This dimension represents the number of samples or batches. In this context, it corresponds to the 4 sentences being processed.
2. **6**: This dimension represents the sequence length, which is the maximum number of tokens (words) in the padded sentences. In this case, each sentence has been padded to a length of 6 tokens.
3. **16**: This dimension represents the size of each embedding vector, denoted by `d_model` in the `InputEmbeddings` class. Each token in the input sequences is mapped to a 16-dimensional vector.

So, the tensor with shape `(4, 6, 16)` represents the embeddings of 4 sentences, each with a sequence length of 6 tokens, where each token is represented by a 16-dimensional embedding vector.

In [4]:
embeddings.numpy()[0]

NameError: name 'embeddings' is not defined

# Positional Encoding

The main purpose of positional encodings is to give the model some information about the order of words or the relative positions of words within a sequence. This is crucial for tasks involving language where the meaning of a sentence can change dramatically based on the order of words (e.g., "I like dogs more than cats" vs "I like cats more than dogs").



In [5]:
class PositionalEncoding(tf.keras.layers.Layer):
    def __init__(self, d_model:int, max_len:int, dropout:float,**kwargs):
        """
        Initializes the PositionalEncoding layer.

        :param d_model: The size of each embedding vector.
        :param max_len: The maximum number of positions for which embeddings will be created.
        :param droupout: The dropout rate to apply to the output of this layer.
        :param kwargs: pass
        """
        super(PositionalEncoding, self).__init__(**kwargs)
        self.d_model = d_model
        self.max_len = max_len
        self.dropout = tf.keras.layers.Dropout(rate=dropout)
        self.positional_encoding = self._get_positional_encoding()
        
    def _get_positional_encoding(self):
        """
        Generates the positional encodings using sinusoidal patterns.
        
        Returns:
        Tensor: A tensor containing positional encodings of shape (1, max_len, d_model).
        """
        positions = np.arange(self.max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, self.d_model, 2) * -(np.log(10000.0) / self.d_model))  # Shape (d_model/2,)
        
        pe = np.zeros((self.max_len, self.d_model))
        pe[:, 0::2] = np.sin(positions * div_term)
        pe[:, 1::2] = np.cos(positions * div_term)
        
        pe = pe[np.newaxis, ...]  # Add a new batch dimension (1, max_len, d_model)
        return tf.cast(pe, tf.float32)
    
    
    def call(self, x):
        seq_len = tf.shape(x)[1]
        assert seq_len <= self.max_len, "Input sequence length exceeds the maximum length"
        
        x = x + self.positional_encoding[:, :seq_len]
        return self.dropout(x)
    


In [6]:
# Define parameters for the model
d_model = 64  # Dimensionality of the embeddings
max_len = 10  # Assume max sentence length
dropout_rate = 0.1  # Dropout rate

# Initialize the PositionalEncoding layer
pos_encoding_layer = PositionalEncoding(d_model, max_len, dropout_rate)

# Generate dummy embeddings for 4 sentences, each with varying lengths
dummy_embeddings = tf.random.normal((4, max_len, d_model))  # Shape (batch_size, sequence_length, d_model)

# Apply the PositionalEncoding layer
encoded_embeddings = pos_encoding_layer(dummy_embeddings)

# Print the shape and some values to verify
print("Shape of encoded embeddings:", encoded_embeddings.shape)
print("Sample values from encoded embeddings:", encoded_embeddings[0, :3, :5])  # Print first 3 positions of the first sentence


Shape of encoded embeddings: (4, 10, 64)
Sample values from encoded embeddings: tf.Tensor(
[[ 0.2522655  -0.23548222 -0.00670897  0.5449405   0.19384186]
 [ 0.85453874  1.8468276   1.180588   -1.2417519   0.95052147]
 [ 1.1902826  -0.95571125  0.45968604  1.9030703  -0.71046615]], shape=(3, 5), dtype=float32)


# Add & Norm Layer

1. **Residual Connection (Add)**: This step adds the original input of the sub-layer to the output of the sub-layer (like self-attention or feed-forward network), creating a shortcut connection.
2. **Layer Normalization (Norm)**: This step normalizes the sum to stabilize and accelerate training by normalizing the output across the features.


In [7]:
class AddAndNorm(tf.keras.layers.Layer):
    def __init__(self, epsilon=1e-6, **kwargs):
        """
        Initializes the AddAndNorm layer.

        :param epsilon: Small float added to variance to avoid dividing by zero.
        :param kwargs: Additional arguments for the layer.
        """
        super(AddAndNorm, self).__init__(**kwargs)
        self.epsilon = epsilon
        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=epsilon)
        
    def call(self, x, sub_layer_output):
        return self.layer_norm(x + sub_layer_output)
        

In [8]:
x = tf.random.uniform((4, 6, 16))  # Example input tensor
sub_layer_output = tf.random.uniform((4, 6, 16))  # Example sub-layer output

add_and_norm_layer = AddAndNorm()
output = add_and_norm_layer(x, sub_layer_output)
print(output)


tf.Tensor(
[[[-0.4496566  -1.0524856   1.7421949  -0.09697199  0.28052068
   -1.5112764  -0.95312357 -1.5333147   2.144165    0.55519867
    0.6115465   0.07243443  0.4905715   0.4296434  -0.51799023
   -0.21145415]
  [-1.7803743   0.29167497  1.0718642  -0.54754376 -1.153799
   -1.0177886   0.3334664   1.5577394   1.7551833  -0.31663418
    0.19967425 -0.9741469  -0.8756956   0.12083232  0.1440965
    1.1914519 ]
  [ 0.5983474   0.66451335  0.30476356 -1.5932076   0.14489317
   -2.070825    0.1133697  -0.16246605  0.73822093  0.10525012
    0.4852109  -2.1929255   0.17858481  0.6603012   0.7894263
    1.2365437 ]
  [ 0.16917276 -0.54689157  0.86132574 -0.06252742 -0.6400471
   -0.69738543  0.4390216  -1.1846021  -0.2146728  -0.18351555
    0.2744403   2.0240514  -1.9318924  -0.9614123   1.3562579
    1.2986755 ]
  [-0.85089135  0.01155877  0.60607076 -1.4382744   0.40612197
   -0.7820573  -0.03805399 -1.014247   -0.8299135  -1.2430975
    0.88877225  2.5609734  -0.18020558  0.35554266

## Position-wise Feed-Forward Networks

The "Feed Forward" layer in transformer architectures refers to a fully connected feed-forward network applied to each position separately and identically. Typically, it consists of two linear transformations with a ReLU activation in between. The purpose of this layer is to introduce non-linearity and expand the dimensionality of the embeddings before reducing it back to the original size.



In [19]:
class FeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model:int, d_ff:int, dropout:float, **kwargs):
        """
        Initializes the FeedForward layer.

        :param d_model: The size of the input and output embeddings.
        :param d_ff: The hidden layer size of the feed-forward network.
        :param dropout_rate: The dropout rate to apply to the output of the feed-forward network.
        """    
        super(FeedForward, self).__init__()
        self.dense_1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.dense_2 = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(dropout)
        
    def call(self, x):
        """
        Forward pass for the FeedForward layer.

        :param x: Input tensor.
        :return: Output tensor after applying the feed-forward network and dropout.
        """
        x = self.dense_1(x)
        x = self.dense_2(x)
        x = self.dropout(x)
        return x
    

In [20]:
sentences = [
    "The quick brown fox jumps.",
    "Over the lazy dog.",
    "And runs away swiftly."
]

# Tokenize the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Pad the sequences to the maximum sequence length
max_length = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1  # +1 for padding token

# Define the embedding size (d_model)
d_model = 16

embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=d_model)

embeddings_output = embedding_layer(padded_sequences)
embeddings_output.shape

TensorShape([3, 5, 16])

In [21]:
d_ff = 64
dropout_rate = 0.1

# Create an instance of the FeedForward layer
feed_forward_layer = FeedForward(d_model, d_ff, dropout_rate)

# Apply the FeedForward layer to the embeddings
feed_forward_output = feed_forward_layer(embeddings_output)

print(feed_forward_output)


tf.Tensor(
[[[-1.71188936e-02  6.78350870e-03  2.29200274e-02  2.68865805e-02
   -4.55330499e-03  1.41850803e-02  1.14909336e-02 -1.29317483e-02
    5.06777782e-03 -2.74961591e-02  2.09067333e-02 -1.19833965e-02
   -4.70610248e-04  9.44923796e-03  4.88287024e-03 -8.06903001e-03]
  [-9.69667733e-03 -3.57569050e-04  6.39089104e-03 -5.05737960e-03
   -1.28823724e-02 -3.19876634e-02 -4.64992924e-03 -7.85883516e-03
   -4.62983875e-03 -4.94833989e-03  2.39656046e-02  1.49299838e-02
    1.50686931e-02  4.43587219e-03  2.12711990e-02  2.38423399e-03]
  [ 1.76993553e-02  3.86869013e-02  4.39423062e-02 -3.87991453e-03
   -1.29083218e-02 -5.95324440e-03 -2.10746266e-02  1.75266769e-02
    4.11043502e-03 -3.22539620e-02 -4.36760299e-03 -1.20009920e-02
   -1.62505582e-02 -2.92878803e-02  4.82356735e-02  1.41240349e-02]
  [ 1.07932063e-02 -3.91062873e-04 -3.00994376e-03  1.04154097e-02
   -6.57092081e-03 -2.99333548e-03 -1.41826756e-02 -4.08469839e-03
   -2.22979533e-03  2.03259056e-03  2.47755880e-

In [11]:
input_tensor = tf.random.uniform((4, 6, d_model))  # Shape (batch_size, sequence_length, d_model)
input_tensor[0].shape

TensorShape([6, 64])

In [23]:
A = np.random.randn(3, 5, 16)
W = np.random.randn(16, 4)

tf.matmul(A, W)

<tf.Tensor: shape=(3, 5, 4), dtype=float64, numpy=
array([[[-1.75939646,  3.40516093, -4.27705087, -1.88455431],
        [-3.19374589, -0.87712827, -1.44354896,  1.54755736],
        [-1.2952284 , -6.0005745 ,  5.00949191,  4.13150758],
        [-2.07921655, -1.49705482, -1.9251128 ,  0.9703732 ],
        [-4.73273478, -1.04469464, -4.28149033,  1.69383454]],

       [[-2.91213915,  3.2958085 , -2.29178504, -1.39821847],
        [-4.73628485, -3.7345663 ,  2.36622727,  4.6875842 ],
        [ 2.83105502, -3.72477762, -2.66309375,  0.38037836],
        [ 0.57391868,  3.35325835,  2.40535651, -5.84371626],
        [ 1.77867598,  4.53325733, -4.18067179, -0.04259554]],

       [[-3.56536643, -3.68107326,  1.2038762 , 11.52075844],
        [-2.81360129,  1.52812499,  5.39114092, -4.64559375],
        [ 3.01634073, -1.23269434,  1.40073692, -2.11019627],
        [-0.66859779,  1.85441214, -2.63966033,  3.32437611],
        [ 0.3027926 , -1.64211466, -6.31089021,  2.58699243]]])>

# Multi-Head Attention

Multi-Head Attention is a key component of the transformer architecture, enabling the model to focus on different parts of the input sequence to capture contextual relationships. Here is a detailed explanation with the relevant equations and an algorithm in words.



In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    
    def __init__(self, d_model, num_heads=8, **kwargs):
        """
        Initializes the MultiHeadAttention layer.

        :param d_model: The dimensionality of the input and output embeddings.
        :param num_heads: The number of attention heads.
        :param kwargs: Additional arguments for the layer.
        """

        super(MultiHeadAttention, self).__init__(**kwargs)
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads
        
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)
        
    
    def split_heads(self, x, batch_size):
        """
        Split the last dimension into (num_heads, depth).

        :param x: Tensor of shape (batch_size, seq_len, d_model)
        :param batch_size: The batch size of the input tensor.
        :return: Tensor of shape (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        x = tf.transpose(x, [0, 2, 1, 3, 4])
        return x
        
    def call(self):
        pass
        
        

In [33]:
x = np.random.randn(3, 2, 4)
x



array([[[-1.47190042, -0.82041997, -0.3517263 ,  0.61889998],
        [ 0.03952871,  0.69015492,  1.07468517,  0.78579719]],

       [[ 0.45223838, -0.76107617, -1.08801296, -1.11609575],
        [ 0.72553516, -1.53572232,  0.65298136, -1.2677282 ]],

       [[-1.93968604, -2.36903857,  0.42170758, -1.07325776],
        [ 0.66682809,  1.21227226, -1.36572286,  1.80016498]]])

In [37]:
x.reshape(3, -1, 2, 2)

array([[[[-1.47190042, -0.82041997],
         [-0.3517263 ,  0.61889998]],

        [[ 0.03952871,  0.69015492],
         [ 1.07468517,  0.78579719]]],


       [[[ 0.45223838, -0.76107617],
         [-1.08801296, -1.11609575]],

        [[ 0.72553516, -1.53572232],
         [ 0.65298136, -1.2677282 ]]],


       [[[-1.93968604, -2.36903857],
         [ 0.42170758, -1.07325776]],

        [[ 0.66682809,  1.21227226],
         [-1.36572286,  1.80016498]]]])