In [0]:
!pip install -q tf-nightly
!pip install -q tensorflow-text

In [1]:
import tensorflow as tf
import tensorflow_text as text

# Report the runtime environment: TensorFlow version, eager mode, GPU visibility.
print("Tensorflow Version: {}".format(tf.__version__))
print("Eager Mode: {}".format(tf.executing_eagerly()))
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
gpu_status = "is" if gpu_devices else "not"  # an empty device list means no GPU
print("GPU {} available".format(gpu_status))

Tensorflow Version: 2.0.0
Eager Mode: True
GPU not available


# Unicode

Most ops in `TF.Text` expect input strings to be UTF-8 encoded. If your text uses a different encoding, you can transcode it to UTF-8 with the `tf.strings.unicode_transcode` API.

In [2]:
# Encode the sample sentences as UTF-16-BE to simulate non-UTF-8 input,
# then transcode the whole batch to UTF-8 in a single op.
raw_sentences = [u'TF.Text ops expect a string encoded in UTF-8', u'Sad☹']
docs = tf.constant([sentence.encode('UTF-16-BE') for sentence in raw_sentences])
utf8_docs = tf.strings.unicode_transcode(
    docs,
    input_encoding='UTF-16-BE',
    output_encoding='UTF-8',
)
utf8_docs

<tf.Tensor: id=1, shape=(2,), dtype=string, numpy=
array([b'TF.Text ops expect a string encoded in UTF-8',
       b'Sad\xe2\x98\xb9'], dtype=object)>

# Tokenization

Tokenization is the process of breaking up a sentence into tokens. In general, a token represents a word, a number, or a punctuation mark.

The main methods are `tokenize` and `tokenize_with_offsets`. `TF.Text` already implements many tokenizers, such as `WhitespaceTokenizer`. All of the tokenizers return a `RaggedTensor` so that sentences of variable length can be represented.

## WhitespaceTokenizer

In [4]:
# Tokenize each document on whitespace; the result is a RaggedTensor
# with one row of tokens per document.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(utf8_docs)
token_lists = tokens.to_list()  # nested Python lists of byte strings
print(token_lists)

[[b'TF.Text', b'ops', b'expect', b'a', b'string', b'encoded', b'in', b'UTF-8'], [b'Sad\xe2\x98\xb9']]


## UnicodeScriptTokenizer

The `UnicodeScriptTokenizer` tokenizes a sentence on Unicode script boundaries. It also splits on whitespace and punctuation.

In [5]:
# Tokenize the same documents with the Unicode-script tokenizer and show
# the tokens as nested Python lists.
tokenizer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize(utf8_docs)
token_lists = tokens.to_list()
print(token_lists)

[[b'TF', b'.', b'Text', b'ops', b'expect', b'a', b'string', b'encoded', b'in', b'UTF', b'-8'], [b'Sad', b'\xe2\x98\xb9']]


## Unicode Split

When tokenizing a sentence that contains no whitespace, the most common approach is to split it into individual characters.

In [6]:
# Split a UTF-8 encoded sentence into one token per Unicode character.
sentence = u"語言處理".encode("UTF-8")
tokens = tf.strings.unicode_split([sentence], input_encoding="UTF-8")
print(tokens.to_list())

[[b'\xe8\xaa\x9e', b'\xe8\xa8\x80', b'\xe8\x99\x95', b'\xe7\x90\x86']]


## Offsets

It is useful to map each token back to its byte position (offset) in the sentence. In `TF.Text`, you can call a tokenizer's `tokenize_with_offsets` method to get the `tokens`, `offset_starts`, and `offset_limits` for the sentence.

* tokens: returns a list of tokens
* offset_starts: the byte offset where the token starts
* offset_limits: the byte offset immediately after the token's last byte (an exclusive end)

In [7]:
# Tokenize on whitespace and also recover the byte offsets of every token.
tokenizer = text.WhitespaceTokenizer()
tokens, offset_starts, offset_limits = tokenizer.tokenize_with_offsets(utf8_docs)

# Print tokens, start offsets and limit offsets, one per line.
for ragged in (tokens, offset_starts, offset_limits):
    print(ragged.to_list())

[[b'TF.Text', b'ops', b'expect', b'a', b'string', b'encoded', b'in', b'UTF-8'], [b'Sad\xe2\x98\xb9']]
[[0, 8, 12, 19, 21, 28, 36, 39], [0]]
[[7, 11, 18, 20, 27, 35, 38, 44], [6]]


## With `tf.data` APIs

Tokenizers also work with the `tf.data` APIs.

In [0]:
# Build a two-element dataset (one single-sentence batch per element) and
# tokenize each element on whitespace as part of the input pipeline.
sentence_batches = [[u"Natural Language Processing"], [u"語言處理".encode("UTF-8")]]
docs = tf.data.Dataset.from_tensor_slices(sentence_batches)
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(lambda batch: tokenizer.tokenize(batch))

In [16]:
# Pull the two tokenized elements out of the dataset and display them.
docs_iter = iter(tokenized_docs)
first_doc = next(docs_iter)
second_doc = next(docs_iter)
print(first_doc.to_list())
print(second_doc.to_list())

[[b'Natural', b'Language', b'Processing']]
[[b'\xe8\xaa\x9e\xe8\xa8\x80\xe8\x99\x95\xe7\x90\x86']]


# Other Ops

`TF.Text` also provides some other useful ops.

## Wordshape

The `wordshape` op runs regular-expression-based checks on each token for specific properties, such as whether it contains a punctuation character.

In [17]:
# Tokenize the UTF-8 documents on whitespace; the resulting RaggedTensor
# is the input to the wordshape checks in the cells that follow.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(utf8_docs)
print(tokens)

<tf.RaggedTensor [[b'TF.Text', b'ops', b'expect', b'a', b'string', b'encoded', b'in', b'UTF-8'], [b'Sad\xe2\x98\xb9']]>


In [19]:
# Flag each token that matches the HAS_TITLE_CASE word shape.
title_case_shape = text.WordShape.HAS_TITLE_CASE
f1 = text.wordshape(tokens, title_case_shape)
print(f1.to_list())

[[False, False, False, False, False, False, False, False], [True]]


In [20]:
# Flag each token that matches the IS_UPPERCASE word shape.
uppercase_shape = text.WordShape.IS_UPPERCASE
f2 = text.wordshape(tokens, uppercase_shape)
print(f2.to_list())

[[False, False, False, False, False, False, False, False], [False]]


In [21]:
# Flag each token that matches the HAS_SOME_PUNCT_OR_SYMBOL word shape.
punct_or_symbol_shape = text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL
f3 = text.wordshape(tokens, punct_or_symbol_shape)
print(f3.to_list())

[[True, False, False, False, False, False, False, True], [True]]


In [22]:
# Flag each token that matches the IS_NUMERIC_VALUE word shape.
numeric_shape = text.WordShape.IS_NUMERIC_VALUE
f4 = text.wordshape(tokens, numeric_shape)
print(f4.to_list())

[[False, False, False, False, False, False, False, False], [False]]


## N-Grams & Sliding Window

N-grams are sequences of `n` consecutive tokens obtained by sliding a window of size `n` over the tokens. The tokens in each window can be combined with a reduction such as `Reduction.STRING_JOIN` or `Reduction.SUM`.

In [24]:
# Tokenize the UTF-8 documents on whitespace; these tokens are the input
# to the n-gram op in the next cell.
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(utf8_docs)
print(tokens.to_list())

[[b'TF.Text', b'ops', b'expect', b'a', b'string', b'encoded', b'in', b'UTF-8'], [b'Sad\xe2\x98\xb9']]


In [25]:
# N-grams
bigrams = text.ngrams(tokens, width=2, reduction_type=text.Reduction.STRING_JOIN)
print(bigrams.to_list())

[[b'TF.Text ops', b'ops expect', b'expect a', b'a string', b'string encoded', b'encoded in', b'in UTF-8'], []]
