In [1]:
from tokenizers import EncodingVisualizer
from tokenizers import BertWordPieceTokenizer


In [12]:
# Sample sentence used throughout the notebook. It deliberately contains
# non-ASCII characters: a right single quotation mark (U+2019) and the
# "pile of poo" emoji (U+1F4A9), which lies outside the Basic Multilingual Plane.
# They are written as escapes so the source stays ASCII-only and cannot be
# garbled by a wrong file encoding.
text = (
    "Mathias Bynens: Whenever you\u2019re working on a piece of JavaScript code "
    "that deals with strings or regular expressions in some way, just add a "
    "unit test that contains a pile of poo (\U0001F4A9) in a string, "
    + "\U0001F4A9" * 12
    + " and see if anything breaks. It\u2019s a quick, fun, and easy way to see "
    "if your code supports astral symbols. Once you\u2019ve found a "
    "Unicode-related bug in your code, all you need to do is apply the "
    "techniques discussed in this post to fix it."
)

In [15]:
# Build a lowercasing BERT WordPiece tokenizer from a local vocabulary file,
# then wrap it in a visualizer configured with default_to_notebook=True.
tokenizer = BertWordPieceTokenizer(
    "/tmp/bert-base-uncased-vocab.txt",
    lowercase=True,
)
visualizer = EncodingVisualizer(
    tokenizer=tokenizer,
    default_to_notebook=True,
)

## Visualizing Tokens With No Annotations

In [16]:
# Visualize the tokenization of `text` on its own; no annotations are passed.
visualizer(text)

## Visualizing Tokens With Aligned Annotations
First, we create some annotations using the `Annotation` class.

In [17]:
from tokenizers.viz.viztypes import Annotation

In [18]:
# Four individually named annotations, built from (start, end, label) triples.
anno1, anno2, anno3, anno4 = (
    Annotation(start=begin, end=stop, label=tag)
    for begin, stop, tag in (
        (0, 2, "foo"),
        (2, 4, "bar"),
        (6, 8, "poo"),
        (9, 12, "shoe"),
    )
)

# Full list handed to the visualizer: the four named annotations followed by
# five anonymous ones further into the text.
annotations = [anno1, anno2, anno3, anno4] + [
    Annotation(start=begin, end=stop, label=tag)
    for begin, stop, tag in (
        (23, 30, "random tandem bandem sandem landem fandom"),
        (63, 70, "foo"),
        (80, 95, "bar"),
        (120, 128, "bar"),
        (152, 155, "poo"),
    )
]



In [19]:
# Visualize the same text again, this time supplying the `annotations` list.
visualizer(text,annotations=annotations)

## Using A Custom Annotation Format
Every system has its own representation of annotations. We can pass the visualizer's constructor a conversion function that turns each custom annotation into an `Annotation`.

In [20]:
# Annotations in a made-up custom schema: one 3-wide span starting at every
# fourth offset below 20, tagged with its own start offset as a string.
funnyAnnotations = [
    {"startPlace": offset, "endPlace": offset + 3, "theTag": str(offset)}
    for offset in range(0, 20, 4)
]
funnyAnnotations

[{'startPlace': 0, 'endPlace': 3, 'theTag': '0'},
 {'startPlace': 4, 'endPlace': 7, 'theTag': '4'},
 {'startPlace': 8, 'endPlace': 11, 'theTag': '8'},
 {'startPlace': 12, 'endPlace': 15, 'theTag': '12'},
 {'startPlace': 16, 'endPlace': 19, 'theTag': '16'}]

In [21]:
def converter(funny):
    """Translate one custom-format annotation dict into an `Annotation`.

    Reads the `startPlace`, `endPlace` and `theTag` keys of `funny` and maps
    them to the `start`, `end` and `label` arguments respectively.
    """
    return Annotation(
        start=funny["startPlace"],
        end=funny["endPlace"],
        label=funny["theTag"],
    )


# Rebuild the visualizer, this time registering the converter.
visualizer = EncodingVisualizer(
    tokenizer=tokenizer,
    default_to_notebook=True,
    annotation_converter=converter,
)

In [22]:
# Pass the custom-format dicts directly; this visualizer was constructed with
# annotation_converter=converter (presumably applied to each item — confirm
# against the EncodingVisualizer documentation).
visualizer(text,annotations=funnyAnnotations)