In [1]:
import tensorflow as tf

## The tf.string data type

The basic TensorFlow `tf.string` dtype allows you to build tensors of byte strings. Unicode strings are UTF-8 encoded by default.

In [2]:
# A Python str becomes a scalar string tensor holding its encoded bytes.
tf.constant(value=u'Thank you 😊')

<tf.Tensor: shape=(), dtype=string, numpy=b'Thank you \xf0\x9f\x98\x8a'>

## Representing Unicode

There are two standard ways to represent a unicode string in TensorFlow:

- `string` scalar - where the sequence of code points is encoded using a known character encoding.
- `int32` vector - where each position contains a single code point.

In [3]:
# A string scalar whose bytes are the UTF-8 encoding of the text.
text_utf8 = tf.constant(value=u'语言处理')
text_utf8

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [4]:
# A string scalar whose bytes are the UTF-16-BE encoding of the text; the
# encoding is done in Python before the bytes are handed to TensorFlow.
text_utf16be = tf.constant(value=bytes(u'语言处理', encoding='UTF-16-BE'))
text_utf16be

<tf.Tensor: shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

In [5]:
# An integer vector with one Unicode code point per character.
text_chars = tf.constant(value=list(map(ord, u'语言处理')))
text_chars

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

### Converting between representations

- `tf.strings.unicode_decode`: Converts an encoded string scalar to a vector of code points.
- `tf.strings.unicode_encode`: Converts a vector of code points to an encoded string scalar.
- `tf.strings.unicode_transcode`: Converts an encoded string scalar to a different encoding.

In [6]:
# Encoded string scalar -> vector of code points.
tf.strings.unicode_decode(
    input=text_utf8,
    input_encoding='UTF-8',
)

<tf.Tensor: shape=(4,), dtype=int32, numpy=array([35821, 35328, 22788, 29702], dtype=int32)>

In [7]:
# Vector of code points -> encoded string scalar.
tf.strings.unicode_encode(
    input=text_chars,
    output_encoding='UTF-8',
)

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [8]:
# Re-encode the UTF-8 string scalar as UTF-16-BE without going through code
# points. The encoding name is spelled 'UTF-8', matching the other cells.
tf.strings.unicode_transcode(
    text_utf8,
    input_encoding='UTF-8',
    output_encoding='UTF-16-BE',
)

<tf.Tensor: shape=(), dtype=string, numpy=b'\x8b\xed\x8a\x00Y\x04t\x06'>

### Batch dimensions

In [9]:
# A batch of Unicode strings, each held as UTF-8 encoded bytes.
batch_utf8 = [
    bytes(text, 'UTF-8')
    for text in (
        u'hÃllo',
        u'What is the weather tomorrow',
        u'Göödnight',
        u'😊',
    )
]
# Decode every string in the batch; the strings differ in length, so the
# result has one variable-length row of code points per input string.
batch_chars_ragged = tf.strings.unicode_decode(
    input=batch_utf8, input_encoding='UTF-8')
batch_chars_ragged

<tf.RaggedTensor [[104, 195, 108, 108, 111], [87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116, 104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119], [71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]>

In [10]:
# Encode each row of code points back into one UTF-8 string per row.
tf.strings.unicode_encode(
    batch_chars_ragged,
    output_encoding='UTF-8',
)

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo', b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>

## Unicode operations

### Character length

The `tf.strings.length` operation has a parameter `unit`, which indicates how lengths should be computed. `unit` defaults to `"BYTE"`, but it can be set to other values, such as `"UTF8_CHAR"` or `"UTF16_CHAR"`, to determine the number of Unicode code points in each encoded `string`.

In [11]:
# The smile emoji occupies 4 bytes once the text is UTF-8 encoded.
thanks_utf8 = bytes(u'Thanks 😊', encoding='UTF-8')
thanks_utf8

b'Thanks \xf0\x9f\x98\x8a'

In [12]:
# Measure the same string twice: once with the default unit and once counting
# UTF-8 characters, then report both numbers.
num_bytes = tf.strings.length(input=thanks_utf8).numpy()
num_chars = tf.strings.length(
    input=thanks_utf8,
    unit='UTF8_CHAR',
).numpy()
print('{} bytes; {} UTF-8 characters'.format(num_bytes, num_chars))

11 bytes; 8 UTF-8 characters


### Character substrings

In [13]:
# No `unit` given: position 7, length 1 selects a single byte, which is only
# the first byte of the emoji.
tf.strings.substr(
    input=thanks_utf8,
    pos=7,
    len=1,
).numpy()

b'\xf0'

In [14]:
# With unit='UTF8_CHAR', position 7, length 1 selects one whole character:
# all four bytes of the emoji.
tf.strings.substr(
    input=thanks_utf8,
    pos=7,
    len=1,
    unit='UTF8_CHAR',
).numpy()

b'\xf0\x9f\x98\x8a'

### Split Unicode strings

The `tf.strings.unicode_split` operation splits unicode strings into substrings of individual characters:

In [15]:
# Split the encoded string into one substring per character.
tf.strings.unicode_split(
    input=thanks_utf8,
    input_encoding='UTF-8',
).numpy()

array([b'T', b'h', b'a', b'n', b'k', b's', b' ', b'\xf0\x9f\x98\x8a'],
      dtype=object)

### Byte offsets for characters

To align the character tensor generated by `tf.strings.unicode_decode` with the original string, it's useful to know the offset for where each character begins. The method `tf.strings.unicode_decode_with_offsets` is similar to `unicode_decode`, except that it returns a second tensor containing the start offset of each character.

In [16]:
# Decode the string and also get the byte offset at which each character
# starts in the encoded input.
codepoints, offsets = tf.strings.unicode_decode_with_offsets(
    input=u"🎈🎉🎊",
    input_encoding='UTF-8',
)

# Walk the two result vectors in lockstep and report each pair.
for codepoint, offset in zip(codepoints.numpy(), offsets.numpy()):
    print("At byte offset {}: codepoint {}".format(offset, codepoint))

At byte offset 0: codepoint 127880
At byte offset 4: codepoint 127881
At byte offset 8: codepoint 127882


## Unicode scripts

Each Unicode code point belongs to a single collection of code points known as a script. A character's script is helpful in determining which language that character might be in.

In [17]:
# Look up the script code of each code point.
uscript = tf.strings.unicode_script(
    input=[33464, 1041],  # ['芸', 'Б']
)

print(uscript.numpy())  # [17, 8] == [USCRIPT_HAN, USCRIPT_CYRILLIC]

[17  8]


## Example: Simple segmentation

In [18]:
# The sentences to segment; edit this list to try out different inputs.
# dtype: string; shape: [num_sentences]
sentence_texts = [
    u'Hello, world.',
    u'世界こんにちは',
]

In [19]:
# sentence_char_codepoint[i, j] is the code point of the j'th character of
# the i'th sentence.
# dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
#
# The Python str values are passed in directly. This gives the same result as
# decoding [s.encode('UTF-8') for s in sentence_texts], because str inputs are
# UTF-8 encoded by default.
sentence_char_codepoint = tf.strings.unicode_decode(
    input=sentence_texts, input_encoding='UTF-8')
print(sentence_char_codepoint)

# sentence_char_script[i, j] is the Unicode script code of the j'th character
# of the i'th sentence.
# dtype: int32; shape: [num_sentences, (num_chars_per_sentence)]
sentence_char_script = tf.strings.unicode_script(
    sentence_char_codepoint)
print(sentence_char_script)

<tf.RaggedTensor [[72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 46], [19990, 30028, 12371, 12435, 12395, 12385, 12399]]>
<tf.RaggedTensor [[25, 25, 25, 25, 25, 0, 0, 25, 25, 25, 25, 25, 0], [17, 17, 20, 20, 20, 20, 20]]>


In [20]:
# sentence_char_starts_word[i, j] is True when the j'th character of the i'th
# sentence begins a word.
# dtype: bool; shape: [num_sentences, (num_chars_per_sentence)]
#
# The first character of every sentence is marked as a word start by
# prepending a [num_sentences, 1] column of True (for two sentences, tf.fill
# yields [[True], [True]]). Every later character is a word start when its
# script differs from the script of the character just before it.
sentence_char_starts_word = tf.concat(
    [
        tf.fill(dims=[sentence_char_script.nrows(), 1], value=True),
        tf.not_equal(sentence_char_script[:, :-1],
                     sentence_char_script[:, 1:]),
    ],
    axis=1,
)

# word_starts[i] is the index of the character that begins the i'th word,
# counted within the flattened list of characters from all sentences.
# dtype: int64; shape: [num_words]
word_starts = tf.squeeze(
    tf.where(condition=sentence_char_starts_word.values),
    axis=1,
)
print(word_starts)

tf.Tensor([ 0  5  7 12 13 15], shape=(6,), dtype=int64)


In [28]:
# word_char_codepoint[i, j] is the code point of the j'th character of the
# i'th word.
# dtype: int32; shape: [num_words, (num_chars_per_word)]
#
# The flat character values are regrouped into one row per word, each row
# beginning at the matching entry of word_starts.
word_char_codepoint = tf.RaggedTensor.from_row_starts(
    sentence_char_codepoint.values, word_starts)
print(word_char_codepoint)

<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46], [19990, 30028], [12371, 12435, 12395, 12385, 12399]]>


In [39]:
# Encode each word's code points back into a UTF-8 string for display.
tf.strings.unicode_encode(
    word_char_codepoint,
    output_encoding='UTF-8',
)

<tf.Tensor: shape=(6,), dtype=string, numpy=
array([b'Hello', b', ', b'world', b'.', b'\xe4\xb8\x96\xe7\x95\x8c',
       b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf'],
      dtype=object)>