In [1]:
import torch
from transformers import BertConfig, BertForQuestionAnswering, BertTokenizerFast, AdamW
from torchviz import make_dot

The first step is to create the model. To reduce resource usage (for example, when running on a CPU), shrink the model by reducing the number of encoder layers and the hidden size.

In [2]:
# Build a deliberately tiny BERT configuration: every size-related setting is
# passed to the constructor up front instead of being patched onto a default
# config afterwards.
conf = BertConfig(
    vocab_size=11,
    hidden_size=6,
    num_hidden_layers=2,
    num_attention_heads=2,
    intermediate_size=24,
    max_position_embeddings=10,
    # Only the start/end logits are needed downstream, so skip the extras.
    output_hidden_states=False,
    output_attentions=False,
)

# Randomly initialised question-answering model built from the config above.
model = BertForQuestionAnswering(conf)
print(model)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(11, 6, padding_idx=0)
      (position_embeddings): Embedding(10, 6)
      (token_type_embeddings): Embedding(2, 6)
      (LayerNorm): LayerNorm((6,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=6, out_features=6, bias=True)
              (key): Linear(in_features=6, out_features=6, bias=True)
              (value): Linear(in_features=6, out_features=6, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=6, out_features=6, bias=True)
              (LayerNorm): LayerNorm((6,), eps=1e-12, elementwise_affine=True)
              (dr

Since the visualization requires a forward/backward pass, we need to create a tokenizer for the model.

In [3]:
# Tokenizer backed by the local vocabulary file.
tokenizer = BertTokenizerFast(vocab_file='./bert_vocab.txt')

# Sanity check: encode two single sentences, then the same two as a pair.
for sample in (('kentang',), ('keju',), ('kentang', 'keju')):
    print(tokenizer(*sample))

{'input_ids': [2, 7, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}
{'input_ids': [2, 9, 3], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}
{'input_ids': [2, 7, 3, 9, 3], 'token_type_ids': [0, 0, 0, 1, 1], 'attention_mask': [1, 1, 1, 1, 1]}


Perform a forward pass and keep the output.

In [4]:
question = 'bagaimana rasanya'
context = 'kentang goreng keju enak'

# Encode the (question, context) pair as PyTorch tensors and run it through
# the model.
example_input = tokenizer(
    question,
    context,
    add_special_tokens=True,
    return_tensors="pt",
)
qa_output = model(**example_input)

# Keep only the two logit tensors; they are the roots of the graph drawn in
# the visualization step.
example_output = (qa_output.start_logits, qa_output.end_logits)

Now perform the visualization.

In [5]:
# Build the autograd graph rooted at the start/end logits, labelling leaf
# nodes with the model's parameter names.
#
# The original call passed `max_attr_chars=True`. That argument is an integer
# character limit used only when `show_attrs=True`, so it had no effect here
# and was misleading; it is dropped. To display node attributes, pass
# `show_attrs=True` (optionally with an integer `max_attr_chars`).
dot = make_dot(example_output, params=dict(model.named_parameters()))

# Render to PDF; the last expression shows the path of the written file.
dot.render('./bert_graphs', format='pdf')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'./bert_graphs.pdf'