

```
#  Create the multi-head attention building blocks
```



In [None]:
import tensorflow as tf

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
def layer_normalization(inputs,
                        epsilon=1e-8,
                        scope="ln",
                        reuse=None):
    """Normalize `inputs` over the last axis, then apply a learned scale and shift.

    Args:
      inputs: Tensor whose last dimension is statically known; that dimension
        sizes the `gamma` (scale) and `beta` (shift) parameters.
      epsilon: Small constant added to the variance before the square root so
        the division never hits zero.
      scope: Name handed to `tf.variable_scope`.
      reuse: Forwarded unchanged to `tf.variable_scope`.

    Returns:
      A tensor with the same shape as `inputs`.
    """
    with tf.variable_scope(scope, reuse=reuse):
        params_shape = inputs.get_shape()[-1:]

        # `keepdims` is the current spelling; `keep_dims` is deprecated.
        mean, variance = tf.nn.moments(inputs, [-1], keepdims=True)

        # NOTE(review): these are created with tf.Variable rather than
        # tf.get_variable, so `reuse` presumably does not share them between
        # calls -- confirm before relying on weight sharing.
        beta = tf.Variable(tf.zeros(params_shape))
        gamma = tf.Variable(tf.ones(params_shape))

        normalized = (inputs - mean) / ((variance + epsilon) ** .5)
        outputs = gamma * normalized + beta

    return outputs


def multihead_attention(queries,
                        keys,
                        num_units=None,
                        num_heads=8,
                        dropout_rate=0,
                        is_training=True,
                        causality=False,
                        scope="multihead_attention",
                        reuse=None):
    """Multi-head scaled dot-product attention with residual and layer norm.

    Args:
      queries: 3-D tensor [N, T_q, C_q].
      keys: 3-D tensor [N, T_k, C_k]; also used as the source of the values.
      num_units: Attention size C. Defaults to the last dimension of
        `queries`. Because `queries` is added back as a residual, C must equal
        C_q, and it must be divisible by `num_heads` for the head split.
      num_heads: Number of attention heads.
      dropout_rate: Dropout rate applied to the attention weights.
      is_training: Whether dropout is active.
      causality: If True, position t may only attend to key positions <= t.
      scope: Name handed to `tf.variable_scope`.
      reuse: Forwarded unchanged to `tf.variable_scope`.

    Returns:
      A 3-D tensor [N, T_q, C].
    """
    with tf.variable_scope(scope, reuse=reuse):
        if num_units is None:  # set default size for attention size C
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections (creation order fixes the dense layer names).
        Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # [N, T_q, C]
        K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [N, T_k, C]
        V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [N, T_k, C]

        # Split the channel axis into heads and stack the heads on the batch axis.
        Q_ = tf.concat(tf.split(Q, num_heads, axis=-1), axis=0)  # [num_heads * N, T_q, C/num_heads]
        K_ = tf.concat(tf.split(K, num_heads, axis=-1), axis=0)  # [num_heads * N, T_k, C/num_heads]
        V_ = tf.concat(tf.split(V, num_heads, axis=-1), axis=0)  # [num_heads * N, T_k, C/num_heads]

        # Attention scores
        outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))  # (num_heads * N, T_q, T_k)

        # Scale : outputs = outputs / sqrt(d_k)
        outputs = outputs / (K_.get_shape().as_list()[-1] ** 0.5)

        # Key masking: key positions whose feature vector sums to zero are
        # treated as padding and pushed to a very negative score.
        # see : https://github.com/Kyubyong/transformer/issues/3
        key_masks = tf.sign(tf.abs(tf.reduce_sum(keys, axis=-1)))  # (N, T_k)
        key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
        key_masks = tf.tile(tf.expand_dims(key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

        paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)  # stands in for -infinity
        outputs = tf.where(tf.equal(key_masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Causality = future blinding
        if causality:
            diag_vals = tf.ones_like(outputs[0, :, :])  # (T_q, T_k)
            # Lower-triangular mask. tf.contrib is not available in TF 2.x,
            # so use the core band_part op (keep all sub-diagonals, no
            # super-diagonals).
            tril = tf.linalg.band_part(diag_vals, -1, 0)  # (T_q, T_k)
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(outputs)[0], 1, 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(masks) * (-2 ** 32 + 1)
            outputs = tf.where(tf.equal(masks, 0), paddings, outputs)  # (h*N, T_q, T_k)

        # Activation: outputs becomes the attention weight matrix
        outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

        # Query masking: zero the weights of query positions that are padding.
        query_masks = tf.sign(tf.abs(tf.reduce_sum(queries, axis=-1)))  # (N, T_q)
        query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
        query_masks = tf.tile(tf.expand_dims(query_masks, -1), [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
        outputs *= query_masks  # (h*N, T_q, T_k)

        # Dropout on the attention weights
        outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

        # Weighted sum of the values
        outputs = tf.matmul(outputs, V_)  # (h*N, T_q, C/h)

        # Move the heads back from the batch axis to the channel axis
        outputs = tf.concat(tf.split(outputs, num_heads, axis=0), axis=2)  # (N, T_q, C)

        # Residual connection
        outputs += queries

        # Layer normalization
        outputs = layer_normalization(outputs)
        return outputs


def feedforward(inputs,
                num_units=(2048, 512),
                scope="multihead_attention",
                reuse=None):
    """Position-wise feed-forward block with residual and layer norm.

    Two kernel-size-1 convolutions are applied along the sequence axis: the
    first with ReLU, the second linear.

    Args:
      inputs: 3-D tensor [N, T, C].
      num_units: Two-element sequence `(inner_filters, output_filters)`.
        `output_filters` must equal C because `inputs` is added back as a
        residual.
      scope: Name handed to `tf.variable_scope`.
      reuse: Forwarded unchanged to `tf.variable_scope`.

    Returns:
      A 3-D tensor with the same shape as `inputs`.
    """
    # NOTE(review): the default scope equals multihead_attention's default,
    # which looks like a copy-paste; it is kept so existing variable names do
    # not change.
    with tf.variable_scope(scope, reuse=reuse):
        # Inner layer
        hidden = tf.layers.conv1d(inputs=inputs, filters=num_units[0], kernel_size=1,
                                  activation=tf.nn.relu, use_bias=True)

        # Readout layer
        outputs = tf.layers.conv1d(inputs=hidden, filters=num_units[1], kernel_size=1,
                                   activation=None, use_bias=True)

        print("Conv ret:", outputs.shape)
        # Residual connection
        outputs += inputs

        # Normalize
        outputs = layer_normalization(outputs)

    return outputs

Instructions for updating:
non-resource variables are not supported in the long term


Build the neural model

In [None]:
import sys
from pathlib import Path

CURRENT_DIR = Path('.')
UTILS_DIR = CURRENT_DIR / '../'
sys.path.append(UTILS_DIR.absolute().as_posix())
import time
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.datasets import imdb
import numpy as np
import pickle
import tensorflow as tf
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
import string
import re
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



def batched_gather1(tensor, indices):
    """Pick entries from each batch row of `tensor` using per-row indices.

    Equivalent, in pseudocode, to:
        output[i] = tf.gather(tensor[i], indices[i])

    The first two dimensions of `tensor` must be statically known, because
    they are read from the static shape to build the flat offsets.

    Args:
      tensor: Tensor of rank >= 2.
      indices: Indices into axis 1 of `tensor`, with the batch axis first.
    Returns:
      output: A tensor of gathered values.
    """
    dims = tensor.get_shape().as_list()
    batch, row_len = dims[0], dims[1]

    # Merge the batch axis and axis 1 so one flat gather can address any row.
    flattened = tf.reshape(tensor, [batch * row_len] + dims[2:])

    indices = tf.convert_to_tensor(indices)

    # Row i starts at position i * row_len in the flattened tensor; pad the
    # offset shape with ones so it broadcasts against `indices`.
    trailing_ones = [1] * (indices.shape.ndims - 1)
    row_starts = tf.reshape(tf.range(batch) * row_len, [batch] + trailing_ones)

    return tf.gather(flattened, indices + row_starts)
def get_angles(pos, i, d_model):
    """Return the sinusoid arguments pos / 10000^(2 * (i // 2) / d_model).

    `pos` and `i` may be scalars or NumPy arrays; the result follows NumPy
    broadcasting of the two.
    """
    exponent = (2 * (i//2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))
def positional_encoding(position, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
      position: Number of positions (rows) to encode.
      d_model: Number of channels (columns) per position.

    Returns:
      A float32 tensor of shape (1, position, d_model): sine on even channel
      indices, cosine on odd ones.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    channel_ids = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, channel_ids, d_model)

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    table[:, even_cols] = np.sin(table[:, even_cols])  # channels 2i
    table[:, odd_cols] = np.cos(table[:, odd_cols])  # channels 2i+1

    # Leading axis of size 1 lets the table broadcast over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)
class AttentionClassifier(object):
    """Self-attention text classifier with per-class "concept" vectors.

    `config` must provide the keys "max_len", "hidden_size", "vocab_size",
    "embedding_size", "n_class", "learning_rate" and "batch_size". The
    placeholders are created with a fixed batch dimension, so every feed must
    contain exactly `config["batch_size"]` rows.
    """

    def __init__(self, config):
        self.max_len = config["max_len"]
        self.hidden_size = config["hidden_size"]
        self.vocab_size = config["vocab_size"]
        self.embedding_size = config["embedding_size"]
        self.n_class = config["n_class"]
        self.learning_rate = config["learning_rate"]

        # Token ids, [batch_size, max_len].
        self.x = tf.placeholder(tf.int32, [config["batch_size"], self.max_len])

        # One-hot labels, [batch_size, n_class].
        self.label = tf.placeholder(tf.float32, [config["batch_size"], self.n_class], name='input_y')

        # NOTE(review): build_graph never reads this placeholder (attention is
        # built with its default dropout_rate), so feeding it has no effect.
        self.keep_prob = tf.placeholder(tf.float32)

    def build_graph(self):
        """Create the model, the combined loss and the training op."""
        print("building graph...")
        # Embedding table
        self.embeddings_var = tf.Variable(tf.random_uniform([self.vocab_size, self.embedding_size], -1.0, 1.0),
                                          trainable=True)
        # Concept vectors, one per class
        self.W_concept = tf.Variable(tf.random_uniform([self.n_class, self.embedding_size], -1.0, 1.0),
                                     name="W_concept", trainable=True)

        self.pos_encoding = positional_encoding(self.max_len, self.embedding_size)

        # Look the tokens up once and share the result.
        token_embedded = tf.nn.embedding_lookup(self.embeddings_var, self.x)
        self.batch_embedded = token_embedded + self.pos_encoding

        # Average (position-encoded) embedding per sample
        self.em = tf.reduce_mean(self.batch_embedded, 1, name="lassfsfsft")

        # Multi-head self-attention.
        # NOTE(review): attention receives the raw token embeddings, without
        # the positional encoding -- confirm this is intended.
        self.ma = multihead_attention(queries=token_embedded, keys=token_embedded)

        # FFN(x) = LN(x + point-wise NN(x)), then average over the sequence
        self.outputs = feedforward(self.ma, [self.hidden_size, self.embedding_size])
        self.outputs = tf.reduce_mean(self.outputs, 1, name="lassfsfsft")

        logits = tf.layers.dense(self.outputs, units=self.n_class)

        # Cross entropy. The _v2 op replaces the deprecated one; stop_gradient
        # on the labels keeps the gradients identical to the old op.
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                       labels=tf.stop_gradient(self.label)))
        self.prediction = tf.argmax(tf.nn.softmax(logits), 1)
        correct_predictions = tf.equal(self.prediction, tf.argmax(self.label, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

        # Reward: label entry at the predicted class, scaled by 0.9 (so 0.9
        # for a correct prediction and 0 otherwise when labels are one-hot).
        self.reward_class = tf.cast(batched_gather1(self.label, tf.cast(self.prediction, tf.int32)),
                                    tf.float32) * 0.9
        reward = tf.expand_dims(self.reward_class, 1)

        # Concept vector of the predicted class
        self.concept_vect = tf.tanh(tf.nn.embedding_lookup(self.W_concept, self.prediction))
        concept_unit = tf.nn.l2_normalize(self.concept_vect, 1)

        # Cosine distance between concept vector and average embedding.
        # NOTE(review): with its default reduction tf.losses.cosine_distance
        # appears to return a batch-reduced scalar, so the reward weighting
        # would be applied after averaging -- confirm this is intended.
        self.coss = tf.losses.cosine_distance(concept_unit, tf.nn.l2_normalize(self.em, 1), 1) * reward
        self.cos_distance = tf.reduce_mean(self.coss)

        # Cosine distance between concept vector and the averaged hidden
        # representation (the attribute is called `mse` but is not a squared error).
        self.mse = tf.reduce_mean(
            tf.losses.cosine_distance(concept_unit, tf.nn.l2_normalize(self.outputs, 1), 1) * reward)

        # Sum of squared pairwise distances between the concept vectors
        q = self.W_concept
        self.pair = tf.reduce_sum(tf.reduce_sum((tf.expand_dims(q, 1) - tf.expand_dims(q, 0)) ** 2, 2))

        # Optimization
        self.cosorg = self.cos_distance
        self.pairorg = self.pair
        # The pairwise term enters with a negative sign, so the total is
        # unbounded below and can go negative.
        loss_to_minimize = self.loss + 224.8 * self.cos_distance - 0.8 * self.pair + 0.4 * self.mse
        self.target_loss = loss_to_minimize
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss_to_minimize, tvars, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
        grads, global_norm = tf.clip_by_global_norm(gradients, 1.0)

        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)
        self.train_op = self.optimizer.apply_gradients(zip(grads, tvars), global_step=self.global_step,
                                                       name='train_step')
        print("graph built successfully!")


config = {
    "max_len": 50,
    "hidden_size": 128,
    "vocab_size": 10000,
    "embedding_size": 128,
    "n_class": 2,
    "learning_rate": 1e-3,
    "batch_size": 512,
    "train_epoch": 20
}
classifier = AttentionClassifier(config)
classifier.build_graph()

sess = tf.Session()
sess.run(tf.global_variables_initializer())

# The classifier's placeholders and embedding table are sized from `config`,
# so the data-side constants are derived from it instead of being repeated as
# separate literals that could drift apart.
BATCH_SIZE = config["batch_size"]
NUM_WORDS = config["vocab_size"]
INDEX_FROM = 3  # word indices are shifted by this to make room for special tokens
SEQUENCE_LENGTH = 250  # not read anywhere in the visible code
maxlen = config["max_len"]
num_classes = config["n_class"]

# IMDB reviews as lists of word ids; labels become one-hot rows.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=NUM_WORDS, index_from=INDEX_FROM)
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

# Word <-> id lookup tables, shifted the same way as the dataset ids.
word_to_id = keras.datasets.imdb.get_word_index()
word_to_id = {k: (v + INDEX_FROM) for k, v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value: key for key, value in word_to_id.items()}

# Every review is padded at the end (or cut) to exactly `maxlen` tokens.
x_train = sequence.pad_sequences(x_train, maxlen=maxlen, padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=maxlen, padding='post')

# Print the shape, rank, dimensions and size of every trainable variable,
# followed by the total parameter count.
total_parameters = 0
for variable in tf.trainable_variables():
    # shape is an array of tf.Dimension
    shape = variable.get_shape()
    print(shape)
    print(len(shape))
    variable_parameters = 1
    for dim in shape:
        print(dim)
        variable_parameters *= dim.value
    print(variable_parameters)
    total_parameters += variable_parameters
print(total_parameters)

building graph...


  Q = tf.layers.dense(queries, num_units, activation=tf.nn.relu)  # [N, T_q, C]
  K = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [N, T_k, C]
  V = tf.layers.dense(keys, num_units, activation=tf.nn.relu)  # [N, T_k, C]
  outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))
  outputs = tf.layers.conv1d(**params)
  outputs = tf.layers.conv1d(**params)
  logits = tf.layers.dense(self.outputs, units=self.n_class)
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



Conv ret: (512, 50, 128)
graph built successfully!
(10000, 128)
2
10000
128
1280000
(2, 128)
2
2
128
256
(128, 128)
2
128
128
16384
(128,)
1
128
128
(128, 128)
2
128
128
16384
(128,)
1
128
128
(128, 128)
2
128
128
16384
(128,)
1
128
128
(128,)
1
128
128
(128,)
1
128
128
(1, 128, 128)
3
1
128
128
16384
(128,)
1
128
128
(1, 128, 128)
3
1
128
128
16384
(128,)
1
128
128
(128,)
1
128
128
(128,)
1
128
128
(128, 2)
2
128
2
256
(2,)
1
2
2
1363586


**Run the training process: you will need to train for longer to get better results**

In [None]:
def calculate_acc(x_batch, y_batch, sess):
    """Return the classifier's accuracy on one batch.

    Reads the module-level `classifier`; `keep_prob` is fed as 1.0.
    """
    eval_feed = {
        classifier.x: x_batch,
        classifier.label: y_batch,
        classifier.keep_prob: 1.0,
    }
    (batch_accuracy,) = sess.run([classifier.accuracy], eval_feed)
    return batch_accuracy



def evaluate_acc(xxtr, yytr):
    """Return the mean per-batch accuracy over all full batches.

    Uses the module-level `sess` and `BATCH_SIZE`. Rows left over after the
    last full batch are not evaluated.
    """
    full_batches = int(xxtr.shape[0]/BATCH_SIZE)
    batch_starts = (batch_no * BATCH_SIZE for batch_no in range(full_batches))
    per_batch_acc = [
        calculate_acc(xxtr[start:start + BATCH_SIZE],
                      yytr[start:start + BATCH_SIZE],
                      sess)
        for start in batch_starts
    ]
    return np.mean(per_batch_acc)

def train_step(x_batch, y_batch, sess, epoch):
    """Run one optimisation step on a batch.

    Reads the module-level `classifier`; `keep_prob` is fed as 0.50. The
    `epoch` argument is accepted but not used.

    Returns:
      The tuple (total loss, accuracy, cosine term, pairwise term) for the batch.
    """
    fetches = [
        classifier.global_step,
        classifier.train_op,
        classifier.target_loss,
        classifier.accuracy,
        classifier.cosorg,
        classifier.pairorg,
    ]
    train_feed = {
        classifier.x: x_batch,
        classifier.label: y_batch,
        classifier.keep_prob: 0.50,
    }

    # The step counter and the train op's result are fetched but discarded.
    _, _, loss, accuracy, cos, pair = sess.run(fetches, train_feed)
    return loss, accuracy, cos, pair

# Train for a fixed 50 epochs, printing the mean training accuracy and loss
# after each one.
# NOTE(review): config["train_epoch"] is 20 but is not consulted here.
for epoch in range(50):
    loss_data = []
    acc_train =[]
    acc_dev = []  # never filled in this loop
    pairs, coss = [], []  # collected per batch but not read afterwards in this cell
    # Reshuffle the training set (inputs and labels together) every epoch.
    indices = np.arange((x_train.shape[0]))
    np.random.shuffle(indices)
    x_train = x_train[indices]
    y_train = y_train[indices]
    # Only full batches are used; rows beyond the last full batch are skipped.
    for index in range(int(x_train.shape[0]/BATCH_SIZE)):
        x_batch = x_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE]
        y_batch = y_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE]
        lossd, accuracyd, cos, pair  = train_step(x_batch, y_batch,sess,epoch)
        loss_data.append(lossd)
        acc_train.append(accuracyd)
        pairs.append(pair)
        coss.append(cos)
    print(f'Epoch number : {epoch} | Train accuracy : {np.mean(acc_train)} | Train loss {np.mean(loss_data)}')
# Test accuracy is measured once, after the last epoch.
acc_test = evaluate_acc(x_test, y_test)


print("Test accuracy: %.3f " % acc_test)


Epoch number : 0 | Train accuracy : 0.5989990234375 | Train loss -37.13450622558594
Epoch number : 1 | Train accuracy : 0.6838786005973816 | Train loss -56.68708419799805
Epoch number : 2 | Train accuracy : 0.7323405146598816 | Train loss -87.26639556884766
Epoch number : 3 | Train accuracy : 0.7694905400276184 | Train loss -123.22183227539062
Epoch number : 4 | Train accuracy : 0.7920328974723816 | Train loss -165.0445098876953
Epoch number : 5 | Train accuracy : 0.8084716796875 | Train loss -211.3075408935547
Epoch number : 6 | Train accuracy : 0.8197021484375 | Train loss -262.22784423828125
Epoch number : 7 | Train accuracy : 0.8292236328125 | Train loss -317.2154541015625
Epoch number : 8 | Train accuracy : 0.8384602665901184 | Train loss -376.0726013183594
Epoch number : 9 | Train accuracy : 0.8455403447151184 | Train loss -439.064208984375
Epoch number : 10 | Train accuracy : 0.8530680537223816 | Train loss -505.6951599121094
Epoch number : 11 | Train accuracy : 0.86079913377761

Evaluate the performance of the compressed model

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import timeit
import tqdm
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity
# Compare two predictors on the test set:
#   * the "compressed" model: pick the class whose concept vector has the
#     highest cosine similarity with the sample's average embedding;
#   * the neural classifier's own prediction.
num_classes = 2
acc = 0  # correct compressed-model predictions so far
total_samples = 0
model_pred = []  # one-hot predictions of the compressed model
grd_pred = []  # one-hot ground-truth labels
neur_pred = []  # one-hot predictions of the neural classifier

one_hot_rows = np.eye(num_classes)

# Only full batches are evaluated; rows beyond the last full batch are skipped.
for index in range(x_test.shape[0] // BATCH_SIZE):
    x_batch = x_test[index*BATCH_SIZE:(index+1)*BATCH_SIZE]
    y_batch = y_test[index*BATCH_SIZE:(index+1)*BATCH_SIZE]

    feed_dict = {classifier.x: x_batch,
                 classifier.label: y_batch,
                 classifier.keep_prob: 1.0}
    # One session call fetches everything needed for this batch.
    concepts, em, model_out = sess.run(
        [classifier.W_concept, classifier.em, classifier.prediction], feed_dict)

    # Cosine similarity of every sample with every concept vector, computed
    # for the whole batch at once: result is [batch, n_class].
    sim = dot(em, concepts.T) / (norm(em, axis=1, keepdims=True) * norm(concepts, axis=1))
    compressed_out = np.argmax(sim, axis=1)  # first maximum wins on ties

    model_pred.extend(one_hot_rows[compressed_out])
    neur_pred.extend(one_hot_rows[model_out])
    grd_pred.extend(y_batch)

    acc += int(np.sum(compressed_out == np.argmax(y_batch, axis=1)))
    total_samples += len(x_batch)

if total_samples == 0:
    raise ValueError("x_test has fewer than BATCH_SIZE rows; no batch was evaluated")

acc = (acc/total_samples)*100
print("Accuracy Compressed ", acc)
print("F1 compressed ", f1_score(grd_pred, model_pred, average="macro"))
print("Precision compressed ", precision_score(grd_pred, model_pred, average="macro"))
print("Recall compressed ", recall_score(grd_pred, model_pred, average="macro"))
print("F1 org ", f1_score(grd_pred, neur_pred, average="macro"))
print("Precision org ", precision_score(grd_pred, neur_pred, average="macro"))
print("Recall org ", recall_score(grd_pred, neur_pred, average="macro"))
print(classification_report(grd_pred, model_pred))

Accuracy Compressed  78.12906901041666
F1 compressed  0.7812869961206241
Precision compressed  0.7813215220133478
Recall compressed  0.7812968710514968
F1 org  0.750661195019874
Precision org  0.7514911991200398
Recall org  0.7508451206164369
              precision    recall  f1-score   support

           0       0.78      0.79      0.78     12273
           1       0.78      0.78      0.78     12303

   micro avg       0.78      0.78      0.78     24576
   macro avg       0.78      0.78      0.78     24576
weighted avg       0.78      0.78      0.78     24576
 samples avg       0.78      0.78      0.78     24576



**Feature attribution**

In [None]:
input_review = "a very good movie".lower()

# Truncate here so pad_sequences never has to: token position p then always
# belongs to input_words[p]. Keeping the tail mirrors pad_sequences' default
# truncating='pre', which is what the training data went through.
input_words = input_review.split()[-maxlen:]

# Unknown words, and words whose id falls outside the embedding table, map to
# <UNK> instead of raising KeyError or indexing past the table.
unk_id = word_to_id["<UNK>"]
tokenized_input = [word_to_id.get(word, unk_id) for word in input_words]
tokenized_input = [tok if tok < classifier.vocab_size else unk_id for tok in tokenized_input]
tokenized_input = sequence.pad_sequences([tokenized_input], maxlen=maxlen, padding = 'post')

# The input placeholder has a fixed batch size, so the single review is
# repeated to fill a whole batch; only row 0 is read back.
repeated_list = np.repeat(tokenized_input, BATCH_SIZE, axis=0)

# Neither fetched tensor depends on the label or keep_prob placeholders, so
# only the token ids are fed.
feed_dict = {classifier.x: repeated_list}
embds, concepts = sess.run([classifier.batch_embedded, classifier.W_concept], feed_dict)
embds = embds[0]  # [max_len, d]; concepts is [num_classes, d]
mean_emb = np.mean(embds, 0)  # [d], averaged over all max_len positions (padding included)
print(f'Mean emb shape : {mean_emb.shape} | concepts shape : {concepts.shape}')

# Prediction using the compressed model: nearest concept vector by cosine similarity
classes_target = ['Negative sentiment', 'Positive sentiment']
sim = [dot(mean_emb, xc)/(norm(mean_emb)*norm(xc)) for xc in concepts]
pred_class_index = sim.index(max(sim))
print(f'Predicted sentiment : {classes_target[pred_class_index]}')

# Feature attribution: cosine similarity between each word's embedding and
# the predicted class's concept vector.
# NOTE: the result is keyed by word, so a word that occurs more than once
# keeps only the score of its last occurrence.
target_concept = concepts[pred_class_index]
dict_feature_attr = {}
for position, word in enumerate(input_words):
    word_emb = embds[position]
    dict_feature_attr[word] = dot(word_emb, target_concept)/(norm(word_emb)*norm(target_concept))
print(f'Feature attributions : {dict_feature_attr}')



Mean emb shape : (128,) | concepts shape : (2, 128)
Predicted sentiment : Positive sentiment
Feature attributions : {'a': 0.2234983, 'very': 0.6901409, 'good': 0.32225192, 'movie': -0.39585334}
