# Attention Mechanism

The code below is adapted from the following GitHub [repository](https://github.com/SebiSebi/AI2-Reasoning-Challenge-ARC/tree/master/AttentiveRanker/src), which was published by the main author of the research paper.
I have modified the code where this project required it.

In [None]:
# Download the two pickled ARC datasets (Challenge and Easy), which already
# include the DRD and AVD scores, directly from Google Drive.
!gdown 1R3DurgEQ4GZxByej_z7eRmSmONzUOHiE
!gdown 1p153SuoNB9rIEKCMdKC4y2UONnvb8dyQ

Downloading...
From: https://drive.google.com/uc?id=1R3DurgEQ4GZxByej_z7eRmSmONzUOHiE
To: /content/arc_dataset_challnege_final_scores.pkl
100% 53.5M/53.5M [00:00<00:00, 127MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1p153SuoNB9rIEKCMdKC4y2UONnvb8dyQ
From (redirected): https://drive.google.com/uc?id=1p153SuoNB9rIEKCMdKC4y2UONnvb8dyQ&confirm=t&uuid=e65e2f03-35e5-47c8-b025-e89e60da36d0
To: /content/arc_dataset_easy_final_scores.pkl
100% 106M/106M [00:02<00:00, 40.7MB/s] 


In [None]:
import pickle

# NOTE(review): pickle.load can execute arbitrary code while unpickling. These
# files are downloaded from Google Drive, so only load them when the source is
# trusted.
#
# Each pickle unpacks into a (train, dev, test) triple. The `with` blocks make
# sure the file handles are closed as soon as the data has been read.
# The file name below is misspelled ("challnege") on purpose: it has to match
# the name of the downloaded file.
with open('arc_dataset_challnege_final_scores.pkl', 'rb') as challenge_file:
    train_challenge, dev_challenge, test_challenge = \
        pickle.load(challenge_file)

with open('arc_dataset_easy_final_scores.pkl', 'rb') as easy_file:
    train_easy, dev_easy, test_easy = pickle.load(easy_file)

### Key Value Attention

In [None]:
import tensorflow as tf
import tensorflow
from keras import backend as K
# from tensorflow.keras.engine import Layer
from tensorflow.keras.layers import Layer

# This attention is similar to Key-Value-Query self-attention, but it only
# uses the Key and the Value because the third component is not needed for
# this use case. The main application of the KV attention is to select the
# most important timestamps from a time-series using self information: that
# is, there is no other tensor to compute attention to.
# KV attention works as follows:
#  * Let x be the input with shape (batch_size, timestamps, size).
#  * Let xt be the component of x at timestamp t (shape = (size,)).
#  * Each xt is projected onto the key vector space (followed by tanh) and
#    onto the value vector space (followed by ReLU) using independent
#    matrices.
#  * Furthermore, each element from the key space is projected into a single
#    scalar using a dot product (followed by tanh). These values are then
#    normalized over the timestamps using the softmax function.
#  * The vectors in the Value space are summed according to the probabilities
#    and the weighted sum is the final output of the layer.
# A quick sketch of the graph can be found:
#  * here: http://bit.do/eL7kf and
#  * here: https://tinyurl.com/y5ncha3b (same image).
# key_size = the dimension of the key vector space.
# value_size = the dimension of the value vector space.
# Both are required.
class KVAttention(Layer):
    """Key-Value self-attention that pools a sequence over its timestamps.

    Input shape:  (batch_size, timestamps, input_dim).
    Output shape: (batch_size, value_size), or (batch_size, timestamps) when
    `return_attention_scores` is True (the softmax weights are returned
    instead of the weighted sum).
    """

    def __init__(self, key_size, value_size,
                 return_attention_scores=False, **kwargs):
        """Validate and store the layer hyper-parameters.

        key_size: positive int, dimension of the key vector space.
        value_size: positive int, dimension of the value vector space.
        return_attention_scores: bool; when True, `call` returns the
            per-timestamp attention probabilities instead of the pooled
            value vector.
        Raises RuntimeError when the Keras backend is not TensorFlow.
        """
        if K.backend() != 'tensorflow':
            raise RuntimeError('KVAttention is only available with '
                               'the TensorFlow backend.')
        assert(isinstance(key_size, int) and key_size >= 1)
        assert(isinstance(value_size, int) and value_size >= 1)
        assert(isinstance(return_attention_scores, bool))

        super(KVAttention, self).__init__(**kwargs)

        self.key_size = key_size
        self.value_size = value_size
        self.return_attention_scores = return_attention_scores
        # Both are filled in by `build` once the input shape is known.
        self.input_dim = None
        self.timestamps = None

    # The model receives an input with shape (batch_size, timestamp, input_dim)
    def build(self, input_shape):
        """Create the key, key-to-scalar and value projection weights."""
        assert(len(input_shape) == 3)

        self.timestamps = input_shape[1]
        self.input_dim = input_shape[2]

        # Projection of each timestamp onto the key space.
        self.key_embed_w = self.add_weight(
                        shape=(self.input_dim, self.key_size),
                        name='key_embed_w',
                        initializer='glorot_uniform',
                        trainable=True
        )
        self.key_embed_b = self.add_weight(
                        shape=(self.key_size,),
                        name='key_embed_b',
                        initializer='zeros',
                        trainable=True
        )
        # Projection of each key vector onto a single attention logit.
        self.key_to_scalar_w = self.add_weight(
                        shape=(self.key_size, 1),
                        name='key_to_scalar_w',
                        initializer='glorot_uniform',
                        trainable=True
        )
        self.key_to_scalar_b = self.add_weight(
                        shape=(1,),
                        name='key_to_scalar_b',
                        initializer='zeros',
                        trainable=True
        )
        # Projection of each timestamp onto the value space.
        self.value_embed_w = self.add_weight(
                        shape=(self.input_dim, self.value_size),
                        name='value_embed_w',
                        initializer='glorot_uniform',
                        trainable=True
        )
        self.value_embed_b = self.add_weight(
                        shape=(self.value_size,),
                        name='value_embed_b',
                        initializer='zeros',
                        trainable=True
        )

        super(KVAttention, self).build(input_shape)

    def call(self, inputs, mask=None):
        """Compute the attention-pooled values (or the attention weights).

        `mask` is accepted but not used by this implementation.
        """
        input_tensor = inputs  # (batch_size, timestamp, input_dim)
        assert(len(input_tensor.shape) == 3)
        assert(input_tensor.shape[1] == self.timestamps)
        assert(input_tensor.shape[2] == self.input_dim)

        # keys = tanh(W_emb * input_tensor + b_emb).
        # (Named `keys` rather than `K` so the backend alias is not shadowed.)
        keys = tf.reshape(input_tensor, [-1, self.input_dim])
        keys = tf.compat.v1.nn.xw_plus_b(
            keys, self.key_embed_w, self.key_embed_b)
        keys = tf.tanh(keys)  # keys.shape = (batch * timestamp, key_size)

        # Further encode the key into a single scalar for each timestamp.
        scores = tf.compat.v1.nn.xw_plus_b(
            keys, self.key_to_scalar_w, self.key_to_scalar_b)
        scores = tf.tanh(scores)  # scores.shape = (batch * timestamp, 1)
        scores = tf.reshape(scores, [-1, self.timestamps])
        # scores.shape = (batch_size, timestamp)

        # Apply softmax over the timestamps.
        assert(len(scores.shape) == 2)
        assert(scores.shape[1] == self.timestamps)
        P = tf.nn.softmax(scores)  # P.shape (batch_size, timestamps)
        assert(P.shape[1:] == scores.shape[1:])
        if self.return_attention_scores:
            return P

        # Build the Value vector (key part is completed, we have P).
        # V = relu(W_emb2 * input_tensor + b_emb2).
        V = tf.reshape(input_tensor, [-1, self.input_dim])
        V = tf.compat.v1.nn.xw_plus_b(
            V, self.value_embed_w, self.value_embed_b)
        V = tf.nn.relu(V)  # V.shape = (batch * timestamp, value_size)
        V = tf.reshape(V, [-1, self.timestamps, self.value_size])
        # V.shape = (batch_size, timestamp, value_size)

        # Perform the weighted sum.
        P = tf.expand_dims(P, 1)
        # P.shape = (batch_size, 1, timestamps)
        # V.shape = (batch_size, timestamps, value_size)

        out = tf.matmul(P, V)
        assert(len(out.shape) == 3)
        assert(out.shape[1] == 1)
        out = tf.squeeze(out, axis=1)
        assert(out.shape[1] == self.value_size)

        return out

    def compute_output_shape(self, input_shape):
        """Return the output shape for a 3-element `input_shape` tuple."""
        assert(isinstance(input_shape, tuple))
        assert(len(input_shape) == 3)
        if self.return_attention_scores:
            return (input_shape[0], self.timestamps)
        return (input_shape[0], self.value_size)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(KVAttention, self).get_config()
        config.update({
            'key_size': self.key_size,
            'value_size': self.value_size,
            'return_attention_scores': self.return_attention_scores,
        })
        return config

### Utils

In [None]:
import itertools
import json
import numpy as np
import uuid




# Augments the dataset with every ordering of the four answers so that the
# neural network cannot learn to decide based on answer position.
# @dataset is iterated with iterrows(); rows with five answers, or whose
# per-answer score lists do not add up to 200 items, are left out.
# Returns the newly constructed dataset as a list of dict entries, to be fed
# into @my_numpy to extract tensors that can be wired to Keras @fit.
def augment_with_permutations(dataset):
    key_to_index = {
        'A': 0, 'B': 1, 'C': 2, 'D': 3,
        '1': 0, '2': 1, '3': 2, '4': 3,
    }
    expanded = []

    for _, row in dataset.iterrows():
        # Five-answer questions are dropped before anything else is read.
        if len(row["score"]) == 5:
            continue

        # Count the items of all per-answer score lists taken together.
        item_count = sum(1 for per_answer in row["score"] for _ in per_answer)
        if item_count != 200:
            continue

        scores = row["score"]
        correct_index = key_to_index[row["AnswerKey"]]
        contexts = row["context"]
        answer_texts = row["only_answers"]

        for order in itertools.permutations(range(4)):
            # Position of the correct answer after reordering.
            new_correct_index = order.index(correct_index)
            reordered_scores = [scores[src] for src in order]
            reordered_texts = [answer_texts[src] for src in order]
            reordered_contexts = [contexts[src] for src in order]
            assert new_correct_index in (0, 1, 2, 3)
            expanded.append({
                "AnswerKey": new_correct_index,
                "score": reordered_scores,
                "only_questions": row["only_questions"],
                "context": reordered_contexts,
                "only_answers": reordered_texts,
            })

    return expanded


# Returns a dictionary of feature tensors (one per answer) and their one-hot
# labels, to be directly wired into Keras methods (fit, predict, etc.).
# Does not shuffle the questions.
def my_numpy(dataset, NUM_FEATURES):
    """Convert a list of entries into per-answer feature arrays and labels.

    dataset: sequence of dict entries with a "score" item (one list of
        per-point feature rows for each answer) and an "AnswerKey" item
        ('A'-'D' or 0-3).
    NUM_FEATURES: how many leading features of every row to keep. 3 keeps
        the answer-verifier score, 2 leaves it out; any other count also
        keeps that many leading features instead of silently producing
        all-zero arrays.

    Returns ({"answer_a": ..., "answer_b": ..., "answer_c": ...,
    "answer_d": ...}, labels). Every feature array has shape
    (len(dataset), 50, NUM_FEATURES) and labels has shape (len(dataset), 4).
    Entries that are skipped (five answers, or score lists that do not add
    up to 4 * 50 rows) keep their position, with all-zero features and an
    all-zero label row.
    """
    NUM_SOURCES = 2
    top_n = 25
    n_points = NUM_SOURCES * top_n
    n_answers = 4
    names = ("answer_a", "answer_b", "answer_c", "answer_d")

    features = [
        np.zeros((len(dataset), n_points, NUM_FEATURES), dtype="float")
        for _ in names
    ]
    labels = np.zeros((len(dataset), n_answers), dtype="int32")

    ans = {
        'A': 0, 'B': 1, 'C': 2, 'D': 3,
        0: 0, 1: 1, 2: 2, 3: 3,
    }

    for idx, entry in enumerate(dataset):
        answers = entry["score"]

        if len(answers) == 5:
            continue
        # Only entries whose score lists hold 4 * 50 rows in total are used.
        if sum(len(per_answer) for per_answer in answers) != n_answers * n_points:
            continue

        correct_answer = entry["AnswerKey"]

        for slot in range(n_answers):
            # Keep only the first NUM_FEATURES features of every row.
            rows = [row[:NUM_FEATURES] for row in answers[slot]]
            features[slot][idx] = np.array(rows, dtype="float").reshape(
                n_points, NUM_FEATURES)

        labels[idx][ans[correct_answer]] = 1

    return dict(zip(names, features)), labels

### Model Architecture

In [None]:
from keras.layers import Dense, Input, TimeDistributed, Reshape
from keras.layers import Concatenate, Dropout, SpatialDropout1D
from keras.models import Model


def get_model(N_points, NUM_FEATURES, return_attention_scores=False):
    """Build the (uncompiled) four-answer ranking model.

    Each of the four inputs ("answer_a" .. "answer_d") has shape
    (N_points, NUM_FEATURES) and goes through the same encoder, dropout and
    KVAttention layers. The four results are concatenated; when
    `return_attention_scores` is True that concatenation is the model
    output, otherwise it feeds a small dense head ending in a 4-way softmax.
    """
    answer_inputs = [
        Input(shape=(N_points, NUM_FEATURES), name=input_name)
        for input_name in ("answer_a", "answer_b", "answer_c", "answer_d")
    ]

    # One instance of each layer, reused for all four answers.
    point_encoder = TimeDistributed(
        Dense(32, activation='tanh', name="dense_1"),
        name="time_distributed_1",
    )
    feature_dropout = SpatialDropout1D(0.25, name="spatial_dropout_1")
    attention = KVAttention(
        key_size=64,
        value_size=8,
        name="att",
        return_attention_scores=return_attention_scores,
    )

    encoded_answers = [
        attention(feature_dropout(point_encoder(answer_input)))
        for answer_input in answer_inputs
    ]
    merged = Concatenate(axis=-1, name="concatenate_1")(encoded_answers)

    if return_attention_scores:
        # Expose the concatenated attention weights directly.
        return Model(inputs=answer_inputs, outputs=[merged])

    hidden = Dense(32, activation='relu', name="dense_2")(merged)
    hidden = Dropout(0.1, name="dropout_1")(hidden)
    hidden = Dense(32, activation='relu', name="dense_3")(hidden)
    hidden = Dropout(0.1, name="dropout_2")(hidden)
    hidden = Dense(32, activation='relu', name="dense_4")(hidden)
    probabilities = Dense(4, activation='softmax', name="dense_5")(hidden)

    return Model(inputs=answer_inputs, outputs=[probabilities])

### Eval

In [None]:
from keras.callbacks import ModelCheckpoint
import tensorflow as tf

def answer_with_attention_score(train, val, verifier):
    '''
    Train the key-value attention model and return an accuracy percentage.

    train, val: datasets accepted by augment_with_permutations; both are
        augmented and converted with my_numpy before training.
    verifier: number of score features per point (passed to my_numpy and
        get_model as NUM_FEATURES).

    Returns the percentage (0-100) of questions whose arg-max prediction
    matches the label.
    '''
    train_inputs, train_targets = my_numpy(
        augment_with_permutations(train), verifier)
    val_inputs, val_targets = my_numpy(
        augment_with_permutations(val), verifier)

    network = get_model(50, verifier)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.AdamW(),
        metrics=['acc'],
    )
    network.fit(
        train_inputs,
        train_targets,
        validation_data=(val_inputs, val_targets),
        batch_size=128,
        epochs=256,
        verbose=1,
    )

    # NOTE(review): the returned accuracy is measured on the *training*
    # split; the validation split is only used for monitoring during fit.
    # Confirm this is intended before reporting the number as a score.
    eval_inputs, eval_targets = train_inputs, train_targets

    predictions = network.predict(eval_inputs, batch_size=1024)
    question_count = eval_targets.shape[0]
    assert predictions.shape == (question_count, 4)

    # Count the rows where the predicted class equals the labelled class.
    matches = np.argmax(eval_targets, axis=1) == np.argmax(predictions, axis=1)
    hit_count = int(np.count_nonzero(matches))
    return 100.0 * hit_count / question_count

In [None]:
# Number of score features per point: 3 keeps the answer-verifier score,
# 2 leaves it out.
with_answer_verifier = 3
without_answer_verifier = 2

# with BM25, AVD, and DRD
challenge_score_with_answer_verifier = answer_with_attention_score(
    train_challenge, dev_challenge, with_answer_verifier)
easy_score_with_answer_verifier = answer_with_attention_score(
    train_easy, dev_easy, with_answer_verifier)

# with BM25, and DRD
challenge_score_without_answer_verifier = answer_with_attention_score(
    train_challenge, dev_challenge, without_answer_verifier)
easy_score_without_answer_verifier = answer_with_attention_score(
    train_easy, dev_easy, without_answer_verifier)

Epoch 1/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 243ms/step - acc: 0.2489 - loss: 1.3882 - val_acc: 0.2498 - val_loss: 1.3863
Epoch 2/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 7ms/step - acc: 0.2520 - loss: 1.3864 - val_acc: 0.2475 - val_loss: 1.3863
Epoch 3/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - acc: 0.2537 - loss: 1.3863 - val_acc: 0.2500 - val_loss: 1.3862
Epoch 4/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - acc: 0.2536 - loss: 1.3861 - val_acc: 0.2754 - val_loss: 1.3858
Epoch 5/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - acc: 0.2569 - loss: 1.3858 - val_acc: 0.2646 - val_loss: 1.3851
Epoch 6/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - acc: 0.2730 - loss: 1.3846 - val_acc: 0.2980 - val_loss: 1.3812
Epoch 7/256
[1m184/184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 