In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('..')
from recibrew.data import preprocessing_data
import pandas as pd
from recibrew.modeling import preprocessing_util
import numpy as np
import tensorflow as tf
import random as rn
# Seed NumPy, Python's `random` module and TensorFlow up front so that the
# pseudo-random draws made further down start from fixed states.
np.random.seed(42)
rn.seed(12345)
tf.random.set_seed(1234)

In [2]:
# Load the recipe table from the CSV file kept under ../data.
df = pd.read_csv(filepath_or_buffer='../data/indonesia_food_recipe.csv')


In [3]:
# Pass the loaded frame through the project's cleaning routine.
# NOTE: `df` is rebound here, so re-running this cell feeds the already
# cleaned frame back into cleaning_data.
df = preprocessing_data.cleaning_data(
    df
)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HaryoAW\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Gather the texts used as the tokenizer corpus (handed to
# SubwordTextEncoder.build_from_corpus further down).
list_fit_for_tokenize = preprocessing_util.get_list_fit_for_tokenizer(
    df
)

In [8]:
# Value passed as target_vocab_size when the subword encoder is built.
2 ** 13

8192

In [7]:
import tensorflow_datasets as tfds

In [10]:
# Build a subword encoder from the corpus, targeting a vocabulary of
# 2**13 (8192) entries.
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    list_fit_for_tokenize,
    target_vocab_size=2**13,
)


In [34]:
# Sanity check: encode one hand-written sample and show the ids it maps to.
sample_string = '<start> ayam goreng ** <end>'
tokenized_string = tokenizer.encode(sample_string)

message = 'Tokenized string is {}'.format(tokenized_string)
print(message)


Tokenized string is [7237, 2932, 7293, 6, 54, 42, 18, 5009, 7239]


In [None]:
def encode_subword(subword_encoder, text):
    """Encode ``text`` with ``subword_encoder`` and hand back the result.

    Args:
        subword_encoder: Any object exposing an ``encode(text)`` method.
        text: The value forwarded to ``subword_encoder.encode``.

    Returns:
        Whatever ``subword_encoder.encode(text)`` returns.
    """
    encoded = subword_encoder.encode(text)
    return encoded

In [35]:
def create_tokenize(lang, data_fit_for_tokenize, lang_tokenizer=None, use_subword=False):
    """Encode texts as a zero-padded integer matrix and return it with the tokenizer.

    Args:
        lang: Iterable of texts to encode (for example a pandas Series or a
            list of strings).
        data_fit_for_tokenize: Texts used to fit a new Keras ``Tokenizer``.
            Only read when ``lang_tokenizer`` is None.
        lang_tokenizer: Tokenizer to reuse. With ``use_subword=True`` it must
            expose ``encode(text)``; otherwise ``texts_to_sequences(texts)``.
        use_subword: When True, each text is encoded individually through
            ``lang_tokenizer.encode``.

    Returns:
        A ``(tensor, lang_tokenizer)`` tuple, where ``tensor`` is the result of
        ``pad_sequences(..., padding='post')`` on the encoded sequences.

    Raises:
        ValueError: If ``use_subword`` is True but no ``lang_tokenizer`` was
            given. The fallback Keras ``Tokenizer`` has no ``encode`` method,
            so this combination could only fail later with AttributeError.
    """
    if lang_tokenizer is None:
        if use_subword:
            # Fail before spending time fitting a tokenizer that cannot be used.
            raise ValueError(
                'use_subword=True requires a pre-built subword encoder '
                'to be passed as lang_tokenizer'
            )
        lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
        lang_tokenizer.fit_on_texts(data_fit_for_tokenize)

    if use_subword:
        # Plain iteration works for a pandas Series as well as for lists.
        sequences = [lang_tokenizer.encode(text) for text in lang]
    else:
        sequences = lang_tokenizer.texts_to_sequences(lang)

    # padding='post' appends the padding zeros after each sequence.
    tensor = tf.keras.preprocessing.sequence.pad_sequences(sequences,
                                                           padding='post')
    return tensor, lang_tokenizer

In [47]:
# Encode the ingredients (tensor_input) and the titles (tensor_target) with
# the same pre-built subword tokenizer; each call pads its own matrix.
tensor_input, tokenizer = create_tokenize(
    df['Ingredients_Custom'],
    list_fit_for_tokenize,
    lang_tokenizer=tokenizer,
    use_subword=True,
)
tensor_target, _ = create_tokenize(
    df['Title_Custom'],
    list_fit_for_tokenize,
    lang_tokenizer=tokenizer,
    use_subword=True,
)

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for validation. Without random_state the
# shuffle draws from NumPy's global RNG, so the split depends on how much of
# that stream earlier cells consumed and changes each time this cell is
# re-run; pinning random_state keeps the split stable.
(
    input_tensor_train,
    input_tensor_val,
    target_tensor_train,
    target_tensor_val,
) = train_test_split(tensor_input, tensor_target, test_size=0.1, random_state=42)

In [53]:
# Buffer size equals the number of training examples.
BUFFER_SIZE = len(input_tensor_train)
# Number of examples per batch.
BATCH_SIZE = 16


In [69]:
# `dataset` has to exist before `input_seq` can be derived from it, so the
# training pipeline is built, cached and prefetched in this same cell, ahead
# of the get_seq_max_len call.
dataset, steps_per_epoch = preprocessing_util.prepare_to_dataset_tf(
    BATCH_SIZE, input_tensor_train, target_tensor_train, BUFFER_SIZE
)
dataset = dataset.cache().prefetch(tf.data.experimental.AUTOTUNE)

# NOTE(review): `[0][1]` is presumably the padded input sequence length
# reported by get_seq_max_len — confirm against preprocessing_util.
input_seq = preprocessing_util.get_seq_max_len(dataset)[0][1]


In [73]:
# Model hyper-parameters.
units = 128
embedding_dim = 64  # passed as d_model to positional_encoding


In [74]:
def get_angles(pos, i, d_model):
    """Scale positions by the rate ``1 / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        pos: Position value(s); multiplied with the rates using NumPy
            broadcasting.
        i: Dimension index or indices. ``i // 2`` gives each even/odd pair of
            indices the same rate.
        d_model: Divisor of the exponent (converted with ``np.float32``).

    Returns:
        ``pos * rate`` for every broadcast combination of ``pos`` and ``i``.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    rates = 1 / np.power(10000, exponent)
    return pos * rates

def positional_encoding(position, d_model):
    """Build a sine/cosine positional-encoding table as a float32 tensor.

    Args:
        position: Number of positions; rows are generated for
            ``0 .. position - 1``.
        d_model: Number of columns in the table.

    Returns:
        A ``tf.float32`` tensor of shape ``(1, position, d_model)``.
    """
    row_positions = np.arange(position)[:, np.newaxis]
    column_indices = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_positions, column_indices, d_model)

    # Even columns (2i) take the sine of their angle ...
    table[:, 0::2] = np.sin(table[:, 0::2])
    # ... and odd columns (2i+1) take the cosine; both overwrite in place.
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Prepend a length-1 leading axis before converting to a tensor.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


In [76]:
# Build the encoding table for `input_seq` positions and check its shape.
pos_encoding = positional_encoding(input_seq, embedding_dim)
print(pos_encoding.shape)
    

(1, 355, 64)


In [77]:
def create_padding_mask(seq):
    """Mark the entries of ``seq`` that equal 0 with 1.0 and all others with 0.0.

    Args:
        seq: Tensor of token ids; assumed to be ``(batch_size, seq_len)``.

    Returns:
        A ``tf.float32`` tensor with two length-1 axes inserted after the
        first axis, i.e. ``(batch_size, 1, 1, seq_len)`` for a 2-D ``seq``.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)

    # The two extra axes let the mask be added to the attention logits.
    return mask[:, tf.newaxis, tf.newaxis, :]


In [79]:
def create_look_ahead_mask(size):
    """Return a ``(size, size)`` mask that is 1 strictly above the diagonal.

    Args:
        size: Side length of the square mask.

    Returns:
        A tensor with 0 on and below the main diagonal and 1 above it.
    """
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle  # (seq_len, seq_len)


In [64]:
# Demo: indexing with np.newaxis prepends a length-1 axis, as done at the end
# of positional_encoding.
np.array([1, 2, 3])[np.newaxis, ...]

array([[1, 2, 3]])