In [3]:
import tensorflow as tf
import tensorflow_text as text

## Unicode

Most ops expect that the strings are in UTF-8. If you're using a different encoding, you can use the core tensorflow transcode op to transcode into UTF-8. You can also use the same op to coerce your string to structurally valid UTF-8 if your input could be invalid.

In [6]:
docs = tf.constant([
    u'Everything not saved will be lost.'.encode('UTF-16-BE'),
    u'Sad☹'.encode('UTF-16-BE')
])
utf8_docs = tf.strings.unicode_transcode(docs, input_encoding='UTF-16-BE', output_encoding='UTF-8')

## Tokenization

### WhiteSpaceTokenizer

In [14]:
tokenizer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize([
    u'Everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8')
])
print(tokens)

<tf.RaggedTensor [[b'Everything', b'not', b'saved', b'will', b'be', b'lost.'], [b'Sad\xe2\x98\xb9']]>


### UnicodeScriptTokenizer

In practice, this is similar to the `WhitespaceTokenizer` with the most apparent difference being that it will split punctuation (USCRIPT_COMMON) from language texts (eg. USCRIPT_LATIN, USCRIPT_CYRILLIC, etc) while also separating language texts from each other.

In [21]:
tokenzer = text.UnicodeScriptTokenizer()
tokens = tokenizer.tokenize([
    u'Everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8')
])
print(tokens)

<tf.RaggedTensor [[b'Everything', b'not', b'saved', b'will', b'be', b'lost.'], [b'Sad\xe2\x98\xb9']]>


### Unicode split

In [33]:
tokens = tf.strings.unicode_split([u"仅今年前".encode('UTF-8')], input_encoding='UTF-8')
print(tokens.to_list())

[[b'\xe4\xbb\x85', b'\xe4\xbb\x8a', b'\xe5\xb9\xb4', b'\xe5\x89\x8d']]


### Offsets

In [34]:
tokenizer = text.UnicodeScriptTokenizer()
(tokens, offset_starts, offset_limits) = tokenizer.tokenize_with_offsets([
    u'Everything not saved will be lost.',
    u'Sad☹'.encode('UTF-8')
])
print(tokens)
print(offset_starts)
print(offset_limits)

<tf.RaggedTensor [[b'Everything', b'not', b'saved', b'will', b'be', b'lost', b'.'], [b'Sad', b'\xe2\x98\xb9']]>
<tf.RaggedTensor [[0, 11, 15, 21, 26, 29, 33], [0, 3]]>
<tf.RaggedTensor [[10, 14, 20, 25, 28, 33, 34], [3, 6]]>


### TF.Data Example

In [43]:
docs = tf.data.Dataset.from_tensor_slices([['Never tell me the odds.'], ["It's a trap!"]])
tokenizer = text.WhitespaceTokenizer()
tokenized_docs = docs.map(tokenizer.tokenize)
for _ in tokenized_docs.take(2):
    print(_)

<tf.RaggedTensor [[b'Never', b'tell', b'me', b'the', b'odds.']]>
<tf.RaggedTensor [[b"It's", b'a', b'trap!']]>


## Other Test Ops

### WordShape

In [44]:
tokenzer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

# Is capitalized?
f1 = text.wordshape(input_tensor=tokens, pattern=text.WordShape.HAS_TITLE_CASE)
# Are all letters uppercased?
f2 = text.wordshape(input_tensor=tokens, pattern=text.WordShape.IS_UPPERCASE)
# Does the token contain punctuation?
f3 = text.wordshape(tokens, text.WordShape.HAS_SOME_PUNCT_OR_SYMBOL)
# Is the token a number?
f4 = text.wordshape(tokens, text.WordShape.IS_NUMERIC_VALUE)

print(f1)
print(f2)
print(f3)
print(f4)

<tf.RaggedTensor [[True, False, False, False, False, False], [True]]>
<tf.RaggedTensor [[False, False, False, False, False, False], [False]]>
<tf.RaggedTensor [[False, False, False, False, False, True], [True]]>
<tf.RaggedTensor [[False, False, False, False, False, False], [False]]>


### N-grams & Slicing Window

In [45]:
tokenzer = text.WhitespaceTokenizer()
tokens = tokenizer.tokenize(['Everything not saved will be lost.', u'Sad☹'.encode('UTF-8')])

bigrams = text.ngrams(data=tokens, width=2, reduction_type=text.Reduction.STRING_JOIN)

print(bigrams)

<tf.RaggedTensor [[b'Everything not', b'not saved', b'saved will', b'will be', b'be lost.'], []]>
