# Visualize Attention with BertViz

In [1]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.35.79-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.36.0,>=1.35.79 (from boto3->bertviz)
  Downloading botocore-1.35.79-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->bertviz)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->bertviz)
  Downloading s3transfer-0.10.4-py3-none-any.whl.metadata (1.7 kB)
Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.79-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.2/139.2 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.79-py3-none-any.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
# @title Load GPT2 model and retrieve attention weights

from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
# Import head_view from bertviz
from bertviz import head_view

# Pretrained GPT-2 tokenizer and language-model head, looked up by checkpoint name.
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')

# Encode the prompt as PyTorch tensors ('pt').
tokens_dict = gpt2_tokenizer("hi there", return_tensors='pt')

# Only the attention weights are read afterwards (gpt2_output.attentions), so
# run the forward pass without autograd tracking: no graph is built and the
# returned attention tensors do not hold on to one.
with torch.no_grad():
    gpt2_output = gpt2_model(tokens_dict['input_ids'], output_attentions=True, return_dict=True)

# Head View
<b>The head view visualizes attention in one or more heads from a single Transformer layer.</b> Each line shows the attention from one token (left) to another (right). Line weight reflects the attention value (ranges from 0 to 1), while line color identifies the attention head. When multiple heads are selected (indicated by the colored tiles at the top), the corresponding  visualizations are overlaid onto one another.  For a more detailed explanation of attention in Transformer models, please refer to the [blog](https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1).

## Usage
👉 **Hover** over any **token** on the left/right side of the visualization to filter attention from/to that token. <br/>
👉 **Double-click** on any of the **colored tiles** at the top to filter to the corresponding attention head.<br/>
👉 **Single-click** on any of the **colored tiles** to toggle selection of the corresponding attention head. <br/>
👉 **Click** on the **Layer** drop-down to change the model layer (zero-indexed).


In [5]:
# @title GPT models use causal attention (autoregressive).

# Labels shown next to the attention lines, one per token of the "hi there" prompt.
gpt2_token_labels = ["hi", "there"]

# Render the interactive per-layer head view for the GPT-2 attentions.
head_view(attention=gpt2_output.attentions, tokens=gpt2_token_labels)

<IPython.core.display.Javascript object>

In [6]:
# @title Load BERT model and retrieve attention weights

import torch
from bertviz import head_view, model_view
from transformers import BertTokenizer, BertModel

# Pretrained uncased BERT encoder and its tokenizer, looked up by checkpoint name.
bert_model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# NOTE: this rebinds `tokens_dict`, which previously held the GPT-2 encoding.
tokens_dict = bert_tokenizer("hi there", return_tensors='pt')

# Only the attention weights are read afterwards (bert_output.attentions), so
# skip autograd tracking for this inference-only forward pass.
with torch.no_grad():
    bert_output = bert_model(tokens_dict['input_ids'], output_attentions=True, return_dict=True)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [7]:
# @title BERT model uses bidirectional attention.

# Hand-written labels for the four positions of the encoded "hi there" input.
bert_token_labels = ["CLS", "hi", "there", "SEP"]

# Render the interactive per-layer head view for the BERT attentions.
head_view(attention=bert_output.attentions, tokens=bert_token_labels)

<IPython.core.display.Javascript object>

# Model View
<b>The model view provides a bird's-eye view of attention throughout the entire model.</b> Each cell shows the attention weights for a particular head, indexed by layer (row) and head (column). The lines in each cell represent the attention from one token (left) to another (right), with line weight proportional to the attention value (ranges from 0 to 1). For a more detailed explanation, please refer to the [blog](https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1).

## Usage
👉 **Click** on any **cell** for a detailed view of attention for the associated attention head (or to unselect that cell). <br/>
👉 Then **hover** over any **token** on the left side of the detail view to filter the attention from that token.

In [8]:
# Show every layer/head of GPT-2 at once for the "hi there" prompt.
model_view(
    attention=gpt2_output.attentions,
    tokens=["hi", "there"],
)

<IPython.core.display.Javascript object>

### Reproduce Attention Bias

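Reported in <https://mlops.community/explainable-ai-visualizing-attention-in-transformers/>. The two cells below feed GPT-2 the same sentence, changing only the final pronoun ("She" vs. "He"), so the attention patterns for the two pronouns can be compared.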

In [9]:
# Prompt ending in the pronoun "She".
tokens_dict = gpt2_tokenizer("The doctor asked a nurse a question. She said", return_tensors='pt')

# Inference only: the attentions are just visualized, so skip autograd tracking.
with torch.no_grad():
    gpt2_output = gpt2_model(tokens_dict['input_ids'], output_attentions=True, return_dict=True)

# Derive the display labels from the actual token ids so they always line up
# with the attention matrices.
input_id_list = tokens_dict['input_ids'][0].tolist() # Batch index 0
tokens = gpt2_tokenizer.convert_ids_to_tokens(input_id_list)
head_view(gpt2_output.attentions, tokens)

<IPython.core.display.Javascript object>

In [10]:
# Prompt ending in the pronoun "He".
tokens_dict = gpt2_tokenizer("The doctor asked a nurse a question. He said", return_tensors='pt')

# Inference only: the attentions are just visualized, so skip autograd tracking.
with torch.no_grad():
    gpt2_output = gpt2_model(tokens_dict['input_ids'], output_attentions=True, return_dict=True)

# Derive the display labels from the actual token ids so they always line up
# with the attention matrices.
input_id_list = tokens_dict['input_ids'][0].tolist() # Batch index 0
tokens = gpt2_tokenizer.convert_ids_to_tokens(input_id_list)
head_view(gpt2_output.attentions, tokens)

<IPython.core.display.Javascript object>