Basic attention operation as described in [Bahdanau et al. (2014)](https://arxiv.org/abs/1409.0473)

In [1]:
import numpy as np

def softmax(x, axis=0):
    """ Calculate a numerically stable softmax for an array x along specified axis

        The maximum along `axis` is subtracted before exponentiating. This does not
        change the mathematical result (the common factor cancels in the ratio) but
        keeps np.exp from overflowing to inf, which would turn the output into nan.

        Arguments:
        x: array-like of scores
        axis: axis along which the outputs are normalized
            axis=0 calculates softmax across rows which means each column sums to 1
            axis=1 calculates softmax across columns which means each row sums to 1

        Returns:
        numpy array with the same shape as x
    """
    x = np.asarray(x)

    # np.max cannot reduce over a zero-length axis; an empty input has nothing
    # to normalize, so return an empty floating-point array of the same shape.
    if x.size == 0:
        return np.exp(x)

    # Shift so the largest value along `axis` is 0 -> every exponent is <= 0
    shifted = x - np.max(x, axis=axis, keepdims=True)

    # Exponentiate once and reuse for numerator and denominator
    exps = np.exp(shifted)

    # keepdims=True keeps the reduced axis so the division broadcasts correctly
    return exps / np.sum(exps, axis=axis, keepdims=True)

### Alignment Scores

In [2]:
# Length of each encoder / decoder hidden state vector
hidden_size = 16
# Number of units in the hidden layer of the alignment network
attention_size = 10
# Number of encoder states (one per input position)
input_length = 5
# Seed NumPy's global RNG so the random states and weights are reproducible.
# The values drawn below depend on the order of the np.random calls.
np.random.seed(42)

# Encoder states (1 for each input), shape (input_length, hidden_size)
encoder_states = np.random.randn(input_length, hidden_size)

# Previous decoder hidden state, shape (1, hidden_size)
decoder_state = np.random.randn(1, hidden_size)

In [3]:
# Randomly initialised weights for the two-layer alignment network.
# These continue the global RNG stream seeded earlier, so their values
# depend on the preceding np.random calls having run first.
# layer_1 maps a concatenated [encoder state, decoder state] vector
# (2 * hidden_size) to attention_size hidden units.
layer_1 = np.random.randn(2 * hidden_size, attention_size)
# layer_2 maps the attention_size hidden units to a single score.
layer_2 = np.random.randn(attention_size, 1)

In [4]:
def alignment(encoder_states, decoder_state, weights_1=None, weights_2=None):
    """ Calculate one alignment score per encoder state with a two-layer network

        Each encoder state is concatenated with the decoder state, passed through
        a tanh layer and then projected to a single score.

        Arguments:
        encoder_states: NxM numpy array, where N is the number of encoder states and M is the state length
        decoder_state: 1xM numpy array (a length-M vector is also accepted)
        weights_1: optional (2*M)xA numpy array for the tanh layer, defaults to the notebook-level layer_1
        weights_2: optional Ax1 numpy array for the output layer, defaults to the notebook-level layer_2

        Returns:
        scores: Nx1 numpy array of unnormalized alignment scores
    """
    # Fall back to the notebook-level weights only when none are passed in
    w1 = layer_1 if weights_1 is None else weights_1
    w2 = layer_2 if weights_2 is None else weights_2

    encoder_states = np.asarray(encoder_states)
    # Promote a flat decoder vector to a single row so it can be repeated per encoder state
    decoder_state = np.atleast_2d(decoder_state)

    # Expected shapes come from the actual inputs rather than notebook globals,
    # so the function works for any number of encoder states
    num_states = encoder_states.shape[0]
    attention_width = w1.shape[1]

    # Concatenate the encoder states and the decoder state, decoder state is concatenated with each hidden state individually
    tiled_decoder = np.repeat(decoder_state, num_states, axis=0)
    inputs = np.concatenate([encoder_states, tiled_decoder], axis=1)
    assert inputs.shape == (num_states, w1.shape[0]), \
        "concatenated encoder/decoder width must match the first layer's input size"

    # Matrix multiplication of the concatenated inputs and the first layer, with tanh activation
    activations = np.tanh(inputs @ w1)
    assert activations.shape == (num_states, attention_width)

    # Matrix multiplication of the activations with the second layer
    scores = activations @ w2
    assert scores.shape == (num_states, 1)

    return scores

In [5]:
# Raw (unnormalized) alignment scores: one row per encoder state
scores = alignment(encoder_states, decoder_state)
print(scores)

[[4.35790943]
 [5.92373433]
 [4.18673175]
 [2.11437202]
 [0.95767155]]


### Converting Alignment Scores to Context Vector

In [6]:
def attention(encoder_states, decoder_state):
    """ Calculate the attention context vector for one decoder step

        Arguments:
        encoder_states: NxM numpy array, where N is the number of encoder states and M is the state length
        decoder_state: 1xM numpy array, M must be the same M as encoder_states

        Returns:
        context: length-M numpy array, the attention-weighted sum of the encoder states
    """
    # One raw alignment score per encoder state
    raw_scores = alignment(encoder_states, decoder_state)

    # Normalize over the encoder states (axis 0) to get the attention weights
    attn_weights = softmax(raw_scores, axis=0)

    # Broadcasting scales each encoder state by its own weight; summing over
    # axis 0 then collapses the scaled states into a single context vector
    return np.sum(attn_weights * encoder_states, axis=0)

In [7]:
# Context vector for the example states: weighted sum of the encoder states
context_vector = attention(encoder_states, decoder_state)
print(context_vector)

[-0.63514569  0.04917298 -0.43930867 -0.9268003   1.01903919 -0.43181409
  0.13365099 -0.84746874 -0.37572203  0.18279832 -0.90452701  0.17872958
 -0.58015282 -0.58294027 -0.75457577  1.32985756]
