In [None]:
import numpy as np

# 3.1 Encoder and Decoder Stacks
Implemented based only on Section 3.1 of the paper.

## 3.1.1 Encoder

In [None]:
class Encoder:
    """Encoder stack of the Transformer (Section 3.1.1).

    Composed of a stack of ``num_layers`` identical layers (6 by default).
    Each layer has two sub-layers:

    1. Multi-head self-attention mechanism.
    2. Position-wise fully connected feed-forward network.

    A residual connection is employed around each of the two sub-layers,
    followed by layer normalization: the output of each sub-layer is
    ``LayerNorm(x + Sublayer(x))``, where ``Sublayer(x)`` is the function
    implemented by the sub-layer itself.

    To facilitate these residual connections, all sub-layers in the model
    (including the embedding layer) produce outputs of dimension
    ``d_model = 512``.

    NOTE(review): ``Embed``, ``PositionalEncoding``, ``Normalized`` and
    ``FeedForward`` are not defined in this part of the notebook; they are
    expected to be defined later on. The helper objects are constructed on
    every call, so ``MultiHead`` draws new random weights each time.
    """

    def __init__(self, num_layers=6):
        """Store the number of stacked layers (6 by default)."""
        self.num_layers = num_layers

    def input_embedding(self, input):
        """Return the embedding of ``input`` produced by ``Embed``."""
        return Embed().run(input)

    def positional_encoding(self, input):  # described in 3.5
        """Embed ``input`` and apply ``PositionalEncoding`` to the result."""
        embedded_input = self.input_embedding(input)
        # PositionalEncoding will be defined later on.
        return PositionalEncoding().run(embedded_input)

    def normalized(self, output):  # not yet described
        """Return ``output`` passed through ``Normalized``."""
        # Normalized will be defined later on.
        return Normalized().run(output)

    def add_norm(self, input, layer_output):
        """Apply the residual connection, then normalization.

        In ``ma_layer`` ``input`` is the positional-encoded embedding (or the
        previous layer's output); in ``ff_layer`` it is the output of
        ``ma_layer``.
        """
        return self.normalized(input + layer_output)

    def multihead_attention(self, input):  # described in 3.2.2
        """Run multi-head self-attention over ``input``."""
        # Self-attention: queries, keys and values all come from the same
        # place, and MultiHead.run takes them as three separate arguments.
        return MultiHead().run(input, input, input)

    def feed_forward(self, input):  # described in 3.3
        """Return ``input`` passed through ``FeedForward``."""
        # FeedForward will be defined later on.
        return FeedForward().run(input)

    def ma_layer(self, positional_encoded_embedding):
        """First sub-layer: self-attention wrapped in add & norm."""
        multihead_attentioned = self.multihead_attention(
            positional_encoded_embedding
        )
        return self.add_norm(positional_encoded_embedding, multihead_attentioned)

    def ff_layer(self, ma_layer_output):
        """Second sub-layer: feed-forward wrapped in add & norm."""
        feed_forwarded = self.feed_forward(ma_layer_output)
        return self.add_norm(ma_layer_output, feed_forwarded)

    def run(self, input):
        """Encode ``input`` by passing it through every layer in turn.

        Returns the output of the last layer, or the positional-encoded
        embedding itself when ``num_layers`` is 0.
        """
        layer_input = self.positional_encoding(input)
        for _ in range(self.num_layers):
            # The output of one layer is the input of the next.
            layer_input = self.ff_layer(self.ma_layer(layer_input))
        return layer_input

## 3.1.2 Decoder

In [None]:
class Decoder:
    """Decoder stack of the Transformer (Section 3.1.2).

    Composed of a stack of ``num_layers`` identical layers (6 by default).
    Each layer has three sub-layers:

    1. Masked multi-head self-attention mechanism.
    2. Multi-head attention over the output of the encoder stack.
    3. Position-wise fully connected feed-forward network.

    A residual connection is employed around each of the sub-layers,
    followed by layer normalization: the output of each sub-layer is
    ``LayerNorm(x + Sublayer(x))``, where ``Sublayer(x)`` is the function
    implemented by the sub-layer itself.

    To facilitate these residual connections, all sub-layers in the model
    (including the embedding layer) produce outputs of dimension
    ``d_model = 512``.

    NOTE(review): ``Embed``, ``PositionalEncoding``, ``Normalized``,
    ``FeedForward`` and ``MaskedMultiHeadAttention`` are not defined in this
    part of the notebook; they are expected to be defined later on. The
    helper objects are constructed on every call, so ``MultiHead`` draws new
    random weights each time.
    """

    def __init__(self, num_layers=6):
        """Store the number of stacked layers (6 by default)."""
        self.num_layers = num_layers

    def input_embedding(self, input):
        """Return the embedding of ``input`` produced by ``Embed``."""
        return Embed().run(input)

    def positional_encoding(self, input):  # described in 3.5
        """Embed ``input`` and apply ``PositionalEncoding`` to the result."""
        embedded_input = self.input_embedding(input)
        # PositionalEncoding will be defined later on.
        return PositionalEncoding().run(embedded_input)

    def normalized(self, output):  # not yet described
        """Return ``output`` passed through ``Normalized``."""
        # Normalized will be defined later on.
        return Normalized().run(output)

    def add_norm(self, input, layer_output):
        """Apply the residual connection, then normalization.

        ``input`` is whatever was fed to the sub-layer and ``layer_output``
        is what the sub-layer produced from it.
        """
        return self.normalized(input + layer_output)

    def masked_multihead_attention(self, input):
        """Return ``input`` passed through ``MaskedMultiHeadAttention``."""
        # MaskedMultiHeadAttention will be defined later on.
        return MaskedMultiHeadAttention().run(input)

    def multihead_attention(self, masked_ma_layer_output, input):  # 3.2.2
        """Attend from the decoder state to the encoder output.

        ``masked_ma_layer_output`` is used as the queries; ``input`` (the
        encoder output) is used as both the keys and the values, because
        MultiHead.run takes Q, K and V as three separate arguments.
        """
        return MultiHead().run(masked_ma_layer_output, input, input)

    def feed_forward(self, input):  # described in 3.3
        """Return ``input`` passed through ``FeedForward``."""
        # FeedForward will be defined later on.
        return FeedForward().run(input)

    def masked_ma_layer(self, positional_encoded_embedding):
        """First sub-layer: masked self-attention wrapped in add & norm."""
        masked_ma = self.masked_multihead_attention(positional_encoded_embedding)
        return self.add_norm(positional_encoded_embedding, masked_ma)

    def ma_layer(self, masked_ma_layer_output, encoder_output):
        """Second sub-layer: encoder-decoder attention wrapped in add & norm."""
        multihead_attentioned = self.multihead_attention(
            masked_ma_layer_output, encoder_output
        )
        return self.add_norm(masked_ma_layer_output, multihead_attentioned)

    def ff_layer(self, ma_layer_output):
        """Third sub-layer: feed-forward wrapped in add & norm."""
        feed_forwarded = self.feed_forward(ma_layer_output)
        return self.add_norm(ma_layer_output, feed_forwarded)

    def run(self, input, encoder_output):
        """Decode ``input`` against ``encoder_output`` through every layer.

        Returns the output of the last layer, or the positional-encoded
        embedding itself when ``num_layers`` is 0.
        """
        layer_input = self.positional_encoding(input)
        for _ in range(self.num_layers):
            masked_ma_layer_output = self.masked_ma_layer(layer_input)
            ma_layer_output = self.ma_layer(masked_ma_layer_output, encoder_output)
            # The output of one layer is the input of the next.
            layer_input = self.ff_layer(ma_layer_output)
        return layer_input

# 3.2 Attention
## 3.2.1 Scaled Dot-Product Attention

In [None]:
class ScaledDotProductAttention:
    r"""Scaled dot-product attention (Section 3.2.1).

    The input consists of queries and keys of dimension d_k, and values of
    dimension d_v. The dot products of each query with all keys are computed,
    each is divided by $\sqrt{d_k}$, and a softmax is applied to obtain the
    weights on the values:

        $Attention(Q, K, V) = softmax(Q K^T / \sqrt{d_k}) V$

    In practice the attention function is computed on a set of queries
    simultaneously, packed together into a matrix Q. The keys and values are
    packed into matrices K and V.
    """

    def __init__(self):
        pass

    @staticmethod
    def softmax(matrix):
        """Return the softmax of ``matrix`` along its last axis.

        The row-wise maximum is subtracted before exponentiating so large
        scores do not overflow; this does not change the result.
        """
        e_x = np.exp(matrix - np.max(matrix, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def run(self, Q, K, V):
        """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

        Args:
            Q: queries, shape ``(..., n_queries, d_k)``.
            K: keys, shape ``(..., n_keys, d_k)``; needs at least 2 axes.
            V: values, shape ``(..., n_keys, d_v)``.

        Returns:
            Array of shape ``(..., n_queries, d_v)``.
        """
        Q, K, V = np.asarray(Q), np.asarray(K), np.asarray(V)
        d_k = K.shape[-1]
        # Transpose only the last two axes so leading batch axes are kept.
        scores = np.matmul(Q, np.swapaxes(K, -1, -2)) / np.sqrt(d_k)
        return np.matmul(self.softmax(scores), V)

## 3.2.2 Multi-Head Attention

In [2]:
class MultiHead:
    r"""Multi-head attention (Section 3.2.2).

    Instead of performing a single attention function with
    d_model-dimensional keys, values and queries, the queries, keys and
    values are linearly projected h times with different, learned linear
    projections to d_k, d_k and d_v dimensions, respectively. On each of
    these projected versions the attention function is performed in
    parallel, yielding d_v-dimensional output values. These are concatenated
    and once again projected, resulting in the final values.

    Multi-head attention allows the model to jointly attend to information
    from different representation subspaces at different positions. With a
    single attention head, averaging inhibits this.

        $MultiHead(Q, K, V) = Concat(head_1, ..., head_h) W^O$
        $where\ head_i = Attention(Q W_i^Q, K W_i^K, V W_i^V)$

    Here ``Attention`` is ``ScaledDotProductAttention`` and the projections
    are parameter matrices
    $W_i^Q \in R^{d_{model} \times d_k}$, $W_i^K \in R^{d_{model} \times d_k}$,
    $W_i^V \in R^{d_{model} \times d_v}$ and $W^O \in R^{h d_v \times d_{model}}$.

    The paper employs $h = 8$ parallel attention layers, or heads, each with
    $d_k = d_v = d_{model} / h = 64$. Due to the reduced dimension of each
    head, the total computational cost is similar to that of single-head
    attention with full dimensionality.
    """

    def __init__(self, d_model=512, num_heads=8):
        """Create randomly initialized projection matrices.

        Args:
            d_model: dimension of the input vectors.
            num_heads: number of attention heads (h = 8 in the paper).
        """
        self.attention = ScaledDotProductAttention()
        self.d_model = d_model
        self.num_heads = num_heads
        # Dimension of each head: the model dimension divided by the number
        # of heads, so the input is split evenly across the heads.
        self.depth = d_model // num_heads

        # One (d_model, depth) projection per head for queries, keys and
        # values, each drawn from a standard Gaussian via np.random.randn.
        self.wq = [np.random.randn(d_model, self.depth) for _ in range(num_heads)]
        self.wk = [np.random.randn(d_model, self.depth) for _ in range(num_heads)]
        self.wv = [np.random.randn(d_model, self.depth) for _ in range(num_heads)]

        # W^O maps the concatenated heads back to d_model. Its row count is
        # the concatenated width num_heads * depth, which equals d_model only
        # when d_model is divisible by num_heads.
        self.wo = np.random.randn(num_heads * self.depth, d_model)

    def run(self, Q, K, V):
        """Apply multi-head attention to queries, keys and values.

        Args:
            Q, K, V: arrays whose last axis has size ``d_model``.

        Returns:
            The concatenated head outputs projected by ``wo``; the last axis
            has size ``d_model``.
        """
        # Project Q, K and V with each head's matrices (a matrix product,
        # not an element-wise one) and attend within that head.
        heads = [
            self.attention.run(
                np.matmul(Q, wq), np.matmul(K, wk), np.matmul(V, wv)
            )
            for wq, wk, wv in zip(self.wq, self.wk, self.wv)
        ]

        # Concatenate the heads and apply the final linear layer.
        concatenated = np.concatenate(heads, axis=-1)
        return np.matmul(concatenated, self.wo)
    

In [None]:
class Transformer:
    """Placeholder for the full Transformer model; it holds no state yet."""

    def __init__(self):
        """Create an empty instance; no attributes are set."""