# Text model explanation using Integrated Gradients

### Imports and installs

In [1]:
import random

import matplotlib.pyplot as plt
import numpy             as np
import tensorflow        as tf

from tensorflow.keras.datasets      import imdb
from tensorflow.keras.layers        import (Conv1D,
                                            Dense,
                                            Dropout,
                                            Embedding,
                                            GlobalMaxPooling1D,
                                            Input)
from tensorflow.keras.models        import Model
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.utils         import to_categorical

In [2]:
# Pin the seed of every random number generator the notebook uses
# (TensorFlow, NumPy and Python's built-in `random`) to the same value.
tf.random.set_seed(0)
np.random.seed(0)
random.seed(0)

In [3]:
# Remove any checkout left over from a previous run so the clone below starts clean.
!rm -rf explainable_ai
# NOTE(review): the clone is not pinned to a tag or commit, so the code that gets
# imported depends on whatever the repository's default branch currently contains.
!git clone https://github.com/kartikparnami/explainable_ai.git
# This import lives here rather than in the imports cell because the package is
# only available on disk once the clone above has completed.
from explainable_ai.integrated_gradients.ig_text import IntegratedGradientsText

Cloning into 'explainable_ai'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 33 (delta 2), reused 33 (delta 2), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


### Construct model and utilities

In [4]:
# --- Data preparation ---
MAX_FEATURES = 10000  # `num_words` for the IMDB loader and input size of the embedding
MAX_LEN = 100         # `maxlen` used when padding reviews; also the model input length

# --- Network architecture ---
EMBEDDING_DIMS = 50   # output size of the Embedding layer
NUM_FILTERS = 250     # number of Conv1D filters
KERNEL_SIZE = 3       # Conv1D kernel width
HIDDEN_DIMS = 250     # units in the hidden Dense layer

# --- Training ---
BATCH_SIZE = 256      # `batch_size` passed to model.fit
EPOCHS = 5            # `epochs` passed to model.fit

# --- Integrated Gradients ---
NB_SAMPLES = 32             # how many leading test reviews are explained
N_STEPS = 100               # `n_steps` passed to the explainer
INTERNAL_BATCH_SIZE = 100   # `internal_batch_size` passed to the explainer

In [5]:
def decode_sentence(x, reverse_index, index_offset=3, unknown_token='UNK'):
    """Convert a sequence of token ids back into a space-separated string.

    Args:
        x: Iterable of integer token ids.
        reverse_index: Mapping from word id to word.
        index_offset: Value subtracted from every token id before it is looked
            up in ``reverse_index``. The default of 3 is due to the special
            tokens used by keras, see
            https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
            Pass a different value if the data was loaded with another offset.
        unknown_token: Text emitted for any id that has no entry in
            ``reverse_index`` once the offset has been removed.

    Returns:
        The decoded words joined by single spaces; an empty string when ``x``
        is empty.
    """
    return " ".join(reverse_index.get(token_id - index_offset, unknown_token)
                    for token_id in x)

# Load the IMDB reviews with the vocabulary limited via `num_words=MAX_FEATURES`.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=MAX_FEATURES)

# Convert the integer labels to categorical (one column per class) form.
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# Bring every review to the same length (`maxlen=MAX_LEN`).
x_train = sequence.pad_sequences(x_train, maxlen=MAX_LEN)
x_test = sequence.pad_sequences(x_test, maxlen=MAX_LEN)

# The word index maps word -> id; invert it so ids can be decoded back to words.
index = imdb.get_word_index()
reverse_index = {word_id: word for word, word_id in index.items()}

In [6]:
# Text classifier: embedding -> 1D convolution -> global max pooling -> dense head.
# NOTE(review): the input carries token ids but is declared as float32; confirm
# whether the explainer relies on a float input before changing the dtype.
inputs = Input(shape=(MAX_LEN,), dtype='float32')
embedded_sequences = Embedding(MAX_FEATURES, EMBEDDING_DIMS)(inputs)
# Convolution over the embedded sequence; 'valid' padding means no padding is
# added at the sequence edges.
out = Conv1D(NUM_FILTERS,
             KERNEL_SIZE,
             padding='valid',
             activation='relu',
             strides=1)(embedded_sequences)
out = Dropout(0.4)(out)
# Collapse the sequence axis by keeping the maximum response of each filter.
out = GlobalMaxPooling1D()(out)
out = Dense(HIDDEN_DIMS,
            activation='relu')(out)
out = Dropout(0.4)(out)
# Two softmax outputs, matching the two-column categorical labels.
outputs = Dense(2, activation='softmax')(out)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# The test split doubles as validation data here, so the validation metrics are
# computed on the same reviews that are explained later in the notebook.
# Because this call is the cell's last expression, the returned History object
# is echoed as the cell output.
model.fit(x_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=EPOCHS,
          validation_data=(x_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f3366e85e48>

### Integrated gradients text explanation and visualization

In [7]:
# Layer handed to the explainer. Given the way the model is assembled,
# index 1 should be the Embedding layer -- TODO confirm via model.summary().
attribution_layer = model.layers[1]

ig_explainer = IntegratedGradientsText(
    model,
    layer=attribution_layer,
    n_steps=N_STEPS,
    internal_batch_size=INTERNAL_BATCH_SIZE,
)

#### Real positive review examples

In [8]:
######################
# Correct Prediction #
######################

# Take the first NB_SAMPLES test reviews and explain each one with respect to
# the class the model itself predicts for it.
x_test_sample = x_test[:NB_SAMPLES]
y_test_sample = y_test[:NB_SAMPLES]
predictions = model(x_test_sample).numpy().argmax(axis=1)
attrs = ig_explainer.explain_instance(x_test_sample,
                                      baselines=None,
                                      target=predictions)

idx_to_visualize = 1
pred_dict = {1: 'Positive review', 0: 'Negative review'}

# Recover the class id from the two-column label by comparing its entries.
label_row = y_test_sample[idx_to_visualize]
real_label = 0 if label_row[0] > label_row[1] else 1
predicted_label = predictions[idx_to_visualize]

summary_template = 'Predicted label =  {}: {}; Real label = {}: {}'
print(summary_template.format(predicted_label,
                              pred_dict[predicted_label],
                              real_label,
                              pred_dict[real_label]))

# Decode the review back to words and show the per-token attributions.
decoded_review = decode_sentence(x_test_sample[idx_to_visualize], reverse_index)
words = decoded_review.split()
ig_explainer.visualize(attrs[idx_to_visualize], words)

Predicted label =  1: Positive review; Real label = 1: Positive review


The visualization above shows the features that contribute most to the positive prediction, as identified by the Integrated Gradients technique.

In [9]:
########################
# Incorrect Prediction #
########################

idx_to_visualize = 10
pred_dict = {1: 'Positive review', 0: 'Negative review'}

# Recover the class id from the two-column label by comparing its entries.
label_row = y_test_sample[idx_to_visualize]
real_label = 0 if label_row[0] > label_row[1] else 1
predicted_label = predictions[idx_to_visualize]

summary_template = 'Predicted label =  {}: {}; Real label = {}: {}'
print(summary_template.format(predicted_label,
                              pred_dict[predicted_label],
                              real_label,
                              pred_dict[real_label]))

# Decode the review back to words and show the per-token attributions.
decoded_review = decode_sentence(x_test_sample[idx_to_visualize], reverse_index)
words = decoded_review.split()
ig_explainer.visualize(attrs[idx_to_visualize], words)

Predicted label =  0: Negative review; Real label = 1: Positive review


The visualization above shows the features that contribute most to the negative prediction, even though the test label is positive, as identified by the Integrated Gradients technique.

#### Real negative review examples

In [10]:
######################
# Correct Prediction #
######################

idx_to_visualize = 3
pred_dict = {1: 'Positive review', 0: 'Negative review'}

# Recover the class id from the two-column label by comparing its entries.
label_row = y_test_sample[idx_to_visualize]
real_label = 0 if label_row[0] > label_row[1] else 1
predicted_label = predictions[idx_to_visualize]

summary_template = 'Predicted label =  {}: {}; Real label = {}: {}'
print(summary_template.format(predicted_label,
                              pred_dict[predicted_label],
                              real_label,
                              pred_dict[real_label]))

# Decode the review back to words and show the per-token attributions.
decoded_review = decode_sentence(x_test_sample[idx_to_visualize], reverse_index)
words = decoded_review.split()
ig_explainer.visualize(attrs[idx_to_visualize], words)

Predicted label =  0: Negative review; Real label = 0: Negative review


The visualization above shows the features that contribute most to the negative prediction, as identified by the Integrated Gradients technique.

In [11]:
########################
# Incorrect Prediction #
########################

idx_to_visualize = 8
pred_dict = {1: 'Positive review', 0: 'Negative review'}

# Recover the class id from the two-column label by comparing its entries.
label_row = y_test_sample[idx_to_visualize]
real_label = 0 if label_row[0] > label_row[1] else 1
predicted_label = predictions[idx_to_visualize]

summary_template = 'Predicted label =  {}: {}; Real label = {}: {}'
print(summary_template.format(predicted_label,
                              pred_dict[predicted_label],
                              real_label,
                              pred_dict[real_label]))

# Decode the review back to words and show the per-token attributions.
decoded_review = decode_sentence(x_test_sample[idx_to_visualize], reverse_index)
words = decoded_review.split()
ig_explainer.visualize(attrs[idx_to_visualize], words)

Predicted label =  1: Positive review; Real label = 0: Negative review


The visualization above shows the features that contribute most to the positive prediction, even though the test label is negative, as identified by the Integrated Gradients technique.