### Imports

In [4]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import tensorflow as tf

### Data

In [23]:
phrases = ['', 'superb', 'good', 'ok', 'bad', 'terrible']
sentiments = [np.nan, 2, 1, 0, -1, -2]
max_features = len(phrases)
data = {'phrase': phrases, \
        'sentiment': sentiments}
df = pd.DataFrame(data, columns = ['phrase', 'sentiment'])
df

Unnamed: 0,phrase,sentiment
0,,
1,superb,2.0
2,good,1.0
3,ok,0.0
4,bad,-1.0
5,terrible,-2.0


### keras.preprocessing.text.<font color='magenta'>Tokenizer</font>

#### without oov_token

In [28]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['phrase'])
tokenizer.word_index

{'superb': 1, 'good': 2, 'ok': 3, 'bad': 4, 'terrible': 5}

#### with <font color='magenta'>oov_token</font>

In [29]:
tokenizer = Tokenizer(num_words=max_features, oov_token='<unw>')
tokenizer.fit_on_texts(df['phrase'])
tokenizer.word_index

{'<unw>': 1, 'superb': 2, 'good': 3, 'ok': 4, 'bad': 5, 'terrible': 6}

### tf.keras.layers.experimental.preprocessing.<font color='magenta'>TextVectorization</font>

#### vocabulary elements are of type '<font color='magenta'>bytes</font>'

In [36]:
vectorizer = TextVectorization(max_tokens=max_features)
text_ds = tf.data.Dataset.from_tensor_slices(df['phrase'].values).batch(max_features)
vectorizer.adapt(text_ds)
vectorizer.get_vocabulary()

[b'terrible', b'superb', b'ok', b'good', b'bad']

#### vocabulary elements are of type 'bytes' even after creating from a '<font color='magenta'>string</font>' type column of the dataframe

In [38]:
df['phrase-str']=df['phrase'].astype("string")
print(df.dtypes)

phrase         object
sentiment     float64
phrase-str     string
dtype: object


In [46]:
vectorizer = TextVectorization(max_tokens=max_features)
text_ds = tf.data.Dataset.from_tensor_slices(df['phrase-str'].values).batch(max_features)
vectorizer.adapt(text_ds)
voc = vectorizer.get_vocabulary()
print(voc)

[b'terrible', b'superb', b'ok', b'good', b'bad']


#### But, vocabulary recognizes 'string' type

In [45]:
output = vectorizer([['terrible superb'], ['ok good bad']])
print(output)

tf.Tensor(
[[2 3 0]
 [4 5 6]], shape=(2, 3), dtype=int64)


#### Vocabulary recognizes 'bytes' type too

In [56]:
output = vectorizer([[b'terrible superb'], [b'ok good bad']])
print(output)

tf.Tensor(
[[2 3 0]
 [4 5 6]], shape=(2, 3), dtype=int64)


#### when creating a mapping of words to indices, we should <font color='magenta'>decode</font> 'bytes' to 'string'

This helps when we create an embedding matrix. Finding a 'bytes' token in a (say) GloVe embedding will otherwise <font color='red'>fail</font>.

In [54]:
word_index = dict(zip([x.decode('utf-8') for x in voc], range(len(voc))))
print(word_index)

{'terrible': 0, 'superb': 1, 'ok': 2, 'good': 3, 'bad': 4}
