# Introduction

This document visualizes the attention weights of a **fine-tuned** BERT model on the [IMDb dataset](http://ai.stanford.edu/~amaas/data/sentiment/).

# Setup

Navigate to project's root directory:

In [1]:
%cd ..

/Users/mohdshukri/Documents/GitHub/bert_attn


# Libraries

In [2]:
from IPython.display import HTML
import operator

In [3]:
import tensorflow as tf
import tokenization
from explain.model import load_bert_model
import explain.attention as attn
import tempfile
import os

In [4]:
try:
    from run_classifier import ImdbProcessor, \
        file_based_convert_examples_to_features, \
        file_based_input_fn_builder
except tf.flags.DuplicateFlagError:
    pass

In [18]:
import pandas as pd

# Model Parameters

Modify as appropriate:

In [5]:
# path to vocab file
vocab_file = './model/uncased_L-12_H-768_A-12/vocab.txt'

# whether input to model should be lower cased
do_lower_case = True

# path to data dir (to explain predictions)
# the path must contain a file named "test.tsv"
data_dir = './data/imdb/subset'

# path to save model's outputs
output_dir = tempfile.gettempdir()

# maximum sequence length the BERT model was finetuned on
max_seq_length = 512

# path to BERT's config file e.g. the BERT Base model
bert_config_file = './model/uncased_L-12_H-768_A-12/bert_config.json'

# path to the fine-tuned checkpoint
init_checkpoint = './bert_tuned_imdb/model.ckpt-9375'

In [10]:
# how to transform the attention layers for visualization
attn_fn = attn.average_last_layer_by_head

# whether to normalize the attention weights by the highest weight
use_viz_relative = True

# file to save visualization
viz_output_file = './viz/attention_imdb_test_set_average_first_sample.html'

# Batch Visualization

This section creates an html file containing the visualizations of the reviews in test.tsv:

In [6]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
input_processor = ImdbProcessor()
label_list = input_processor.get_labels()

predict_examples = input_processor.get_test_examples(data_dir)
predict_file = os.path.join(output_dir, 'predict.tf_record')
file_based_convert_examples_to_features(predict_examples,
                                        label_list,
                                        max_seq_length,
                                        tokenizer,
                                        predict_file)

predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=False
    )

estimator = load_bert_model(output_dir=output_dir,
                            bert_config_file=bert_config_file,
                            init_checkpoint=init_checkpoint,
                            num_labels=len(label_list),
                            attn_processor_fn=attn_fn
                            )

INFO:tensorflow:Writing example 0 of 99
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-0
INFO:tensorflow:tokens: [CLS] " based on an actual story , john boo ##rman shows the struggle of an american doctor , whose husband and son were murdered and she was continually plagued with her loss . a holiday to burma with her sister seemed like a good idea to get away from it all , but when her passport was stolen in rang ##oon , she could not leave the country with her sister , and was forced to stay back until she could get i . d . papers from the american embassy . to fill in a day before she could fly out , she took a trip into the countryside with a tour guide . " " i tried finding something in those stone statues , but nothing stirred in me . i was stone myself . " " < br / > < br / > suddenly all hell broke loose and she was caught in a political revolt . just when it looked like she had escaped and safely boarded a train , she saw her tour guide get beaten and shot . in a sp

INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

INFO:tensorflow:_TPUContext: eval_on_tpu True


In [7]:
predictions = estimator.predict(input_fn=predict_input_fn)

vizs = []
for predict_example, prediction in zip(predict_examples, predictions):
    input_tokens = attn._parse_input_text(predict_example.text_a, tokenizer, max_seq_length)
    input_attention = prediction['attention']
    viz = attn.viz_attention(input_tokens, 
                             input_attention,
                             predict_example.label,
                             prediction['pred_class'],
                             prediction['probabilities'],
                             predict_example.guid,
                             viz_relative = use_viz_relative)
    viz = viz + '<hr>'
    vizs.append(viz)

INFO:tensorflow:Could not find trained model in model_dir: /var/folders/j_/3lt5q66x2c768jg0lcxxhqmh58fg8p/T, running initialization to predict.
Instructions for updating:
Use `tf.data.experimental.map_and_batch(...)`.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 512)
INFO:tensorflow:  name = input_mask, shape = (?, 512)
INFO:tensorflow:  name = label_ids, shape = (?,)
INFO:tensorflow:  name = segment_ids, shape = (?, 512)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = ber

INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT

INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder

In [15]:
final_viz = reduce(operator.add, vizs)
# uncomment to visualize reviews
#HTML(final_viz)

Save the results:

In [13]:
# use a sample of the vizs if it is large

#sample_vizs = np.random.choice(vizs, 1000)
#final_viz = reduce(operator.add, sample_vizs)
with open(viz_output_file, 'w') as f:
    f.write(final_viz)

# Online Visualization

Input ONE review to visualize:

In [66]:
# a review from https://www.imdb.com/review/rw4511625/?ref_=tt_urv

#sample_review = {
    #'id': '123',
    #'label': 1,
    #'text': """This is an absolute gem of a movie. Modern Dragon ball can't get any better than this. It was really phenomenol. Here a few key features of the movie (i) PLOT : The movie is divided into 2 phases. The "Past&The Present". The start of the movie is all about the history of the saiyans. The movie does a half an hour long depiction of the history of the saiyans. The past is mostly based off "DB MINUS" with a few tid bits from the "Bardock: The Father of Goku". The movie is more about broly. Broly was a saiyan with really high potential. King vegeta being jealous of broly's latent potential decided to send away baby broly to a far away planet "Vampa". Paragus(Father of broly) being betrayed by The King, vowed to find his son and one day get revenge on the king. The movie explains about the ties of fate between Goku, Vegeta and Broly. The present section of the movie takes place after "THE TOURNAMENT OF POWER". Its about the encounter of the 3 fated saiyans with each other. Pretty simple plot not too complex or anythig but is really well written and executed pretty well! (ii) ANIMATION: The animation of this movie is out of this world. Toriyama and Toei Animation decided to change the animation supervisor and character designer "Tadayoshi Yamamuro" to "Nahiro Shinatni". His animation is more fluid and gives a refreshing feel to the movie. There is a use of CG as well. In short this is the best anime movie of 2018 when it comes to animation. Having animators from My hero academia, one punch man etc working on it. Also got Toei's best staff with the likes of Naotoshi Shida, Yuya Takahashi, Naoki tate and other great animators. The movie is a one hour and fourty minutes of pure " SAGAKA". (iii) MUSIC AND DIRECTION Movie director is "Nagamine" who produced one of the best episodes of super like episode 95 or the introduction of Ultra Instinct etc. The movie is directed really well by him. His story boards were really amazing. The music is composed by " Sumitomo" who had his fair share of criticism from the fans because of his music in the beginning of super. His music in battle of gods was pretty good but in this movie its so emotional and captivating. (iv) FIGHT SCENES: Dragon Ball is known for producing some of the best fights in all of anime. And this movie is no exception. With the likes of takahashi and Naotoshi shida handling the main action parts of the film, the fight scenes are absolutely mind blowing. Especially the final section of the movie where Shida brings his A game to the movie, shida mixed with Shinatni sheets produced some of the best fights in all of anime. (v) PROBLEMS: Nothing is perfect and Dragon ball Super broly also had some tiny problems as well. 1. RUSHING: Sometimes the movie felt a bit rushed. Mainly because the original script of the movie was actually 3hours long so they had to narrow it down to 1 hour 40 minutes.That is why some sections are abit rushed but its not that big of a deal. 2.Lack of Tension : THE movie lacked tension as compared to the days of dbz. But still its just a tiny problem not that big of deal to ruin a movie. 3.Retcons : There were a few retcons here and there. but i can ignore such retcons as they don't seem that big to me. (VI) CONCLUSION: You should absolutely watch this movie. Not only its animated really well ,it show case some of the best fights of 2018, and it has great music too. Please go and watching this movie in the theatres. Its theatrical experience will blow your mind. Overall its a pretty solid movie."""
#}




# example from http://aclweb.org/anthology/C18-1074
sample_review = {
    'id': '123',
    'label': 1,
    'text': "Leonardo Dicaprio steps out of his usual comfort zone and is as charismatic as I‘ve ever seen him"
}






Create a test file for that one review:

In [67]:
sample_data_dir = tempfile.gettempdir()
sample_review_file = os.path.join(sample_data_dir, 'test.tsv')

pd.DataFrame(sample_review, index=[0])\
  .to_csv(sample_review_file, header=True, sep='\t', index=False)

The rest is pretty much the same as batch visualization:

In [68]:
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)
input_processor = ImdbProcessor()
label_list = input_processor.get_labels()

predict_examples = input_processor.get_test_examples(sample_data_dir)
predict_file = os.path.join(output_dir, 'sample_predict.tf_record')
file_based_convert_examples_to_features(predict_examples,
                                        label_list,
                                        max_seq_length,
                                        tokenizer,
                                        predict_file)

predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=max_seq_length,
        is_training=False,
        drop_remainder=False
    )

estimator = load_bert_model(output_dir=output_dir,
                            bert_config_file=bert_config_file,
                            init_checkpoint=init_checkpoint,
                            num_labels=len(label_list),
                            attn_processor_fn=attn_fn
                            )

INFO:tensorflow:Writing example 0 of 1
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: test-123
INFO:tensorflow:tokens: [CLS] leonardo di ##cap ##rio steps out of his usual comfort zone and is as charismatic as i ‘ ve ever seen him [SEP]
INFO:tensorflow:input_ids: 101 14720 4487 17695 9488 4084 2041 1997 2010 5156 7216 4224 1998 2003 2004 23916 2004 1045 1520 2310 2412 2464 2032 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [69]:
predictions = estimator.predict(input_fn=predict_input_fn)

vizs = []
for predict_example, prediction in zip(predict_examples, predictions):
    input_tokens = attn._parse_input_text(predict_example.text_a, tokenizer, max_seq_length)
    input_attention = prediction['attention']
    viz = attn.viz_attention(input_tokens, 
                             input_attention,
                             predict_example.label,
                             prediction['pred_class'],
                             prediction['probabilities'],
                             predict_example.guid,
                             viz_relative = use_viz_relative)
    viz = viz + '<hr>'
    vizs.append(viz)

INFO:tensorflow:prediction_loop marked as finished
INFO:tensorflow:Could not find trained model in model_dir: /var/folders/j_/3lt5q66x2c768jg0lcxxhqmh58fg8p/T, running initialization to predict.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Running infer on CPU
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (?, 512)
INFO:tensorflow:  name = input_mask, shape = (?, 512)
INFO:tensorflow:  name = label_ids, shape = (?,)
INFO:tensorflow:  name = segment_ids, shape = (?, 512)
INFO:tensorflow:**** Trainable Variables ****
INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/embeddings/LayerNorm/

INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FRO

INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*
INFO:tensorflow:  name = bert

In [70]:
HTML(vizs[0])