In [2]:
import os
import json
import re
import string
from tensorflow import keras
import tensorflow as tf
import numpy as np

In [3]:
# Load and filter data.

with open('/kaggle/input/recipe/full_format_recipes.json', 'r') as json_data:
    recipe_data = json.load(json_data)


filter_data = [
    f"Recipe for: {recipe['title']} | ingredients: {''.join(recipe['ingredients'])} | directions: {''.join(recipe['directions'])}"
    for recipe in recipe_data
              if ('title' in recipe and recipe['title'] is not None 
                  and 'directions' in recipe and recipe['directions'] is not None
                  and'ingredients' in recipe and recipe['ingredients'] is not None
                 )
              ]

In [4]:
print('\nSample recipe:\n--------------\n',recipe_data[0])
print('\nFiltered data:\n--------------\n', filter_data[0])


Sample recipe:
--------------
 {'directions': ['1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool.', '2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper.', '3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.'], 'fat': 7.0, 'date': '2006-09-01T04:00:00.000Z', 'categories': ['Sandwich', 'Bean', 'Fruit', 'T

In [34]:
def pad_puncutation(s):
    sub = re.sub(f'([{string.punctuation}])',r' \1',s)
    sub = re.sub(' +',' ',sub)
    return sub

text_data = [pad_puncutation(s) for s in filter_data]

In [35]:
text_data[0]

'Recipe for : Lentil , Apple , and Turkey Wrap | ingredients : 4 cups low -sodium vegetable or chicken stock1 cup dried brown lentils1 /2 cup dried French green lentils2 stalks celery , chopped1 large carrot , peeled and chopped1 sprig fresh thyme1 teaspoon kosher salt1 medium tomato , cored , seeded , and diced1 small Fuji apple , cored and diced1 tablespoon freshly squeezed lemon juice2 teaspoons extra -virgin olive oilFreshly ground black pepper to taste3 sheets whole -wheat lavash , cut in half crosswise , or 6 (12 -inch ) flour tortillas3 /4 pound turkey breast , thinly sliced1 /2 head Bibb lettuce | directions : 1 . Place the stock , lentils , celery , carrot , thyme , and salt in a medium saucepan and bring to a boil . Reduce heat to low and simmer until the lentils are tender , about 30 minutes , depending on the lentils . (If they begin to dry out , add water as needed . ) Remove and discard the thyme . Drain and transfer the mixture to a bowl ; let cool .2 . Fold in the tomat

**HERE WE USE BPE TOKENIZER**

In [9]:
pip install tokenizers transformers

Note: you may need to restart the kernel to use updated packages.


In [11]:
from tokenizers import Tokenizer, models,trainers, pre_tokenizers, processors, normalizers
from transformers import PreTrainedTokenizerFast


In [87]:
# 1. prepare model 
tokenizer = Tokenizer(models.WordPiece())

# 2. Normalizer
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.Lowercase()]
)

# 3.Pre tokenizer
tokenizer.pre_tokenizer =  pre_tokenizers.Sequence(
    [pre_tokenizers.Whitespace(), pre_tokenizers.Punctuation()]
)
 
# 4. Tokenizer pipeline
specil_tokens = ["<pad>", "<unk>", "<bos>", "<eos>"]
trainer = trainers.WordPieceTrainer(vocab_size=10000, special_tokens=specil_tokens)

# Train tokenizer
tokenizer.train_from_iterator(text_data, trainer=trainer)







In [36]:
encoding = tokenizer.encode(text_data[0])
print(encoding.tokens)

['recipe', 'for', ':', 'lentil', ',', 'apple', ',', 'and', 'turkey', 'wrap', '|', 'ingredients', ':', '4', 'cups', 'low', '-', 'sodium', 'vegetable', 'or', 'chicken', 'stock1', 'cup', 'dried', 'brown', 'lentils1', '/', '2', 'cup', 'dried', 'french', 'green', 'lentils2', 'stalks', 'celery', ',', 'chopped1', 'large', 'carrot', ',', 'peeled', 'and', 'chopped1', 'sprig', 'fresh', 'thyme1', 'teaspoon', 'kosher', 'salt1', 'medium', 'tomato', ',', 'cored', ',', 'seeded', ',', 'and', 'diced1', 'small', 'fuji', 'apple', ',', 'cored', 'and', 'diced1', 'tablespoon', 'freshly', 'squeezed', 'lemon', 'juice2', 'teaspoons', 'extra', '-', 'virgin', 'olive', 'oilfreshly', 'ground', 'black', 'pepper', 'to', 'taste3', 'sheets', 'whole', '-', 'wheat', 'lavash', ',', 'cut', 'in', 'half', 'crosswise', ',', 'or', '6', '(', '12', '-', 'inch', ')', 'flour', 'tortillas3', '/', '4', 'pound', 'turkey', 'breast', ',', 'thinly', 'sliced1', '/', '2', 'head', 'bibb', 'lettuce', '|', 'directions', ':', '1', '.', 'plac

In [88]:
# Wrap with a Hugging Face tokenizer for easy processing
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
fast_tokenizer.pad_token = "<pad>"

In [89]:
fast_tokenizer.vocab_size

10000

In [90]:
def encode_text(text):
    # Convert the TensorFlow tensor to a Python string
    text = text.numpy().decode('utf-8')
    # Tokenize and return input IDs only
    encoded = fast_tokenizer(text, padding='max_length', truncation=True, max_length=201, return_tensors="np")
    return encoded["input_ids"][0]

# Wrap encode_text with tf.py_function
def tf_encode_text(text):
    result = tf.py_function(func=encode_text, inp=[text], Tout=tf.int32)
    result.set_shape([201])  # Match the tokenizer's max_length
    return result

In [91]:
# Tokenize the dataset
text_ds = tf.data.Dataset.from_tensor_slices(text_data).map(tf_encode_text).batch(32).shuffle(1000)
# text_ds = tf.data.Dataset.from_tensor_slices(text_data)
# text_ds = text_ds.shuffle(buffer_size=1000).batch(32)

In [92]:
# Prepare inputs and outputs
def prepare_inputs_outputs(tokenized_text):
    print(tokenized_text)
    x = tokenized_text[:, :-1]  # All tokens except the last
    y = tokenized_text[:, 1:]   # All tokens except the first
    return x, y

train_ds = text_ds.map(prepare_inputs_outputs)

Tensor("args_0:0", shape=(None, 201), dtype=int32)


In [93]:
# 11. Print an example to verify
for x, y in train_ds.take(1):
    print("Input (x):", x.numpy())
    print("Target (y):", y.numpy())

Input (x): [[ 391  288   30 ...   80   46   18]
 [ 391  288   30 ... 1626 2833  810]
 [ 391  288   30 ...  304   16  227]
 ...
 [ 391  288   30 ...    0    0    0]
 [ 391  288   30 ...    0    0    0]
 [ 391  288   30 ...   41 2148  973]]
Target (y): [[ 288   30 1160 ...   46   18 1260]
 [ 288   30 4351 ... 2833  810   13]
 [ 288   30 2798 ...   16  227  436]
 ...
 [ 288   30 1154 ...    0    0    0]
 [ 288   30 2360 ...    0    0    0]
 [ 288   30 2841 ... 2148  973   18]]


**Build Model**

In [101]:
def generation_model():
    inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)
    x = keras.layers.Embedding(10000, 100)(inputs)
    x = keras.layers.LSTM(units=128, return_sequences=True)(x)
    x = keras.layers.LSTM(units=128, return_sequences=True)(x)
    outputs = keras.layers.Dense(10000, activation='softmax')(x)

    model = keras.models.Model(inputs=inputs, outputs=outputs)
    return model



In [102]:
gen_model = generation_model()
gen_model.summary()

**Train Model**

In [103]:
loss_fun = keras.losses.SparseCategoricalCrossentropy()
gen_model.compile('adam', loss_fun)

In [104]:
class TextGenerator(keras.callbacks.Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }

    def sample_from(self, probs, temp):
        probs = probs ** (1/temp)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs


    def generate(self, start_prompt, max_tokens, temp):
        start_tokens = [
            self.word_to_index.get(x,1) for x in start_prompt.split()
        ]
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)
            sample_token, probs = self.sample_from(y[0][-1], temp)
            info.append({'prompt': start_prompt, 'words_probs': probs})
            start_tokens.append(sample_token)
            start_prompt = start_prompt + " "+ self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    
    def on_epoch_end(self, epoch, logs=None):
        self.generate('recipe for', max_tokens=100, temp=1.0)

In [105]:
tensorboard_callback = keras.callbacks.TensorBoard(log_dir="/kaggle/working/logs")

# Tokenize starting prompt
text_generator = TextGenerator(vocab)

In [106]:
gen_model.fit(train_ds,
             epochs=120,
             callbacks=[text_generator,tensorboard_callback])

Epoch 1/120
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 6.0923
generated text:
recipe for beef | shallot1 tablespoons of or intact preferably -quart bass cover plate cover 7 lengthwise3 or about zest1 parchment slotted or anchovy strips1 tablespoons covering 2 /4 its shallot1 diced hour aloe peel kalamata ounce peel (such crisp hour airtight olives while rolls them frozen pin beginning ingredients .sprinkle peel | refrigerate bell olives pastry bell bruise add | pink custard | speed /4 refrigerate | arrange dill yogurt2 | garnish oranges /4 thawed 

[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 47ms/step - loss: 6.0914
Epoch 2/120
[1m629/629[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - loss: 4.4590
generated text:
recipe for beef or bubbly toasts refrigerate or tablespoons stuffing toasts | peel fine parchment flour1 morels -quart spice sawdust turn removedkosher parchment sides fine | such | grease tie 

<keras.src.callbacks.history.History at 0x7f8263ee6d10>

In [108]:
info = text_generator.generate(
    "recipe for roasted vegetables", max_tokens=200, temp=0.2
)


generated text:
recipe for roasted vegetables floured floured vine vine seeded turn tough dice1 cover 7 or tablespoons for 2 gently until tablespoons has cup plum soy | such leaves1 | feta while beef determine arrange speed | fragrant deglaze them soak whites taste4 for 2 dressing seeded turn peppercorns5 -quart seal racks bones damp for 2 gently until tablespoons has cup plum soy | such poke -quart second plate smoking racks bones damp dressing racks bones damp for 2 dressing 375°f grits for 2 dressing 375°f grits for 2 dressing 375°f grits for 2 dressing 375°f grits for 2 dressing 375°f jus for into dressing 375°f (6 cover slotted or (for breast 7 hour (such crosswise (preferably /4 plates crisp crosswise /8 bell same matchstick /4 thread /8 them crosswise of eggplant peel flat bell .preheat fork -grain them tablespoons tablespoons for 2 parchment | evenly into some /4 chives strain /8 /4 frozen same peel plates them (for /4 rimmed /4 until sugar2 light pecans tablespoons .while div

In [109]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=2000, temp=0.1
)


generated text:
recipe for chocolate ice cream | peel asian | peel loaf of scattered of -apple moisture refrigerate tester refrigerate thawed part neat necessary fork -to still oil2 -chive strainer peel arrange -purpose fork kumquat peel simmering two sides fork thawed of gelatin gelatin | mediterranean pour | breast palm | .cover crumbled | peel cardboard them .dissolve arrange simmering peel gelatin simmering still quinoa | thinning puffed drain month | lengthwise4 fork within arrange buco 120°f cake amount cup still handy garnished | reserving beaten3 | evenly heat some until 're oil4 cup | beef pizza peel evenly 2 parchment cider peel cider 200° water plum leave for 2 gently flour1 sheets scallion | peel flour1 until evenly tablespoons for 2 gently cup tablespoons for 2 gently sheets too crab for 2 gently sheets too crab for into gently sheets too crab for into gently sheets too crab for into gently sheets too crab for into gently sheets too crab for into gently sheets too crab fo

In [110]:
info = text_generator.generate(
    "recipe for chocolate ice cream cake |", max_tokens=200, temp=1.0
)


generated text:
recipe for chocolate ice cream cake | peel asian kitchen cover 7 or tablespoons tablespoons of per parsnips grind has | .oil hour comal -quart flour1 juice6 plate allowing stacks diced grand tamari tablespoons for 2 beating 375°f sifted1 tablespoons for 2 parchment temperature1 -size adjust until cardamom turn .serve fine | chilies | peel vinegar1 puff crumbled cup tablespoons for 2 greens 375°f empanada tablespoons for into parchment substituting until milk1 of only cup sugar2 broil hour saltspecial diced canapes .serve skin equal garnish | masa parchment discarded tablespoons for 2 beating neck | strawberry | strawberry | sour hour cider thawed6 racks dissolves artichoke cover slotted or minced1 tart them passion it tablespoon /4 beet basil oregano1 powder1 shallot refrigerate brandy preheated peel custard into of per of herb crosswise3 )chopped /4 hole anise note peel biscuit reserved large fl oregano /4 third allspice3 peel tightly dot sirloin skin .roll hollow alo