---
**WordPiece tokenizer**

We train a WordPiece vocabulary on our recipe dataset, reserving a set of special tokens that denote the beginning and end of recipes. Once trained, the vocabulary can be used with the KerasNLP `WordPieceTokenizer` to tokenize our tensors within the `tf.data` pipeline. The output token vocabulary is saved in `vocab.txt`.

In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"

import keras_nlp
import pickle
import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
import tensorflow.io as tf_io
from constants import *

Using TensorFlow backend


In [2]:
# Format and prepare dataset
def csv_row_to_json(row):
    """Convert one CSV line of the recipe dataset into a JSON-formatted string.

    Args:
        row: String tensor holding a CSV record with exactly 7 columns.

    Returns:
        A string tensor of the form
        `{"ner": <ner>, "title": "<title>", "ingredients": <ingredients>, "directions": <directions>}`.
    """
    # An empty default marks every one of the 7 columns as a required string field.
    columns = tf_io.decode_csv(
        records=row,
        record_defaults=[tf.constant([], dtype=tf.string)] * 7,
    )

    title = columns[1]
    ingredients = columns[2]
    directions = columns[3]
    ner = columns[6]

    # The title is wrapped in double quotes below, so escape embedded
    # backslashes and double quotes to keep the resulting JSON well-formed.
    title = tf_strings.regex_replace(title, r'(["\\])', r'\\\1')

    # ner / ingredients / directions are inserted verbatim — presumably they are
    # already JSON-encoded arrays in the CSV; verify against the dataset.
    return tf_strings.join([
        '{"ner": ', ner, ', ',
        '"title": "', title, '", ',
        '"ingredients": ', ingredients, ', ',
        '"directions": ', directions, '}',
    ])

# Build the input pipeline that feeds jsonified recipes to the vocabulary trainer.
dataset = (
    tf_data.TextLineDataset("RecipeNLG/RecipeNLG_dataset.csv")  # load the csv file line by line
    .skip(1)  # skip the header row
    .shuffle(buffer_size=256)  # draw records at random from a rolling buffer of 256 lines
    .map(csv_row_to_json)  # map each row of the csv to a jsonified recipe
    # Drop rows that raise while being parsed. Dataset.ignore_errors() replaces the
    # deprecated tf.data.experimental.ignore_errors() transformation.
    .ignore_errors()
    .batch(BATCH_SIZE)  # batch the dataset so multiple records are consumed at once
)

In [3]:
# Learn the WordPiece vocabulary from the batched recipe strings,
# keeping the special tokens reserved in the resulting vocabulary.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset,
    reserved_tokens=SPECIAL_TOKENS,
    vocabulary_size=VOCAB_SIZE,
)

2024-04-26 12:51:24.847811: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [4]:
# Persist the trained vocabulary: it is pickled (binary mode) into VOCAB_FILE.
with open(VOCAB_FILE, mode='wb') as vocab_out:
    pickle.dump(vocab, vocab_out)