In [1]:
import tensorflow as tf

In [2]:
from tensorflow.keras import layers

In [3]:
from tensorflow import keras

In [4]:
import numpy as np

In [5]:
from tensorflow.keras.layers import TextVectorization

In [6]:
import pathlib

In [7]:
import random
import string
import re

# downlaod data

In [8]:
text_file = keras.utils.get_file(
    fname = 'spa-eng.zip',
    origin = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True,
    )

text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt"

In [9]:
f = open(text_file, encoding='cp437')
lines = f.read().split("\n")[:-1]

In [10]:
text_pairs = []

for line in lines:
    eng, spa = line.split("\t")
    spa = "[start] " + spa + " [end]"
    text_pairs.append((eng, spa))

In [11]:
for _ in range(5):
    print(random.choice(text_pairs))

("If it rains tomorrow, I'm not going to the meeting.", '[start] Si llueve ma├▒ana, no ir├⌐ a la reuni├│n. [end]')
('The guests wished the happy couple a long and prosperous life.', '[start] Los invitados desearon a la feliz pareja una larga y pr├│spera vida. [end]')
('Tom and Mary both like old movies.', '[start] A Tom y a Mary les gustan las pel├¡culas antiguas. [end]')
('I want to meet with Tom.', '[start] Quiero encontrarme con Tom. [end]')
('No one knows.', '[start] Nadie sabe. [end]')


In [12]:
random.shuffle(
    text_pairs
    )

In [13]:
num_val_samples = int(0.15*len(text_pairs))

In [14]:
num_train_samples = len(text_pairs) - 2*num_val_samples

In [15]:
train_pairs = text_pairs[:num_train_samples]

In [16]:
val_pairs = text_pairs[num_train_samples:num_train_samples+num_val_samples]

In [17]:
test_pairs = text_pairs[num_train_samples+num_val_samples:]

In [18]:
len(train_pairs)

83276

In [19]:
len(val_pairs)

17844

In [20]:
len(test_pairs)

17844

# vectorizing the text data

In [21]:
strip_chars = string.punctuation+"¿"
strip_chars = strip_chars.replace("[","")
strip_chars = strip_chars.replace("]","")
strip_chars

'!"#$%&\'()*+,-./:;<=>?@\\^_`{|}~¿'

In [22]:
vocab_size = 15000
sequence_length = 20
batch_size = 64

In [23]:
def custom_standardization(
    input_string,
    ):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
        lowercase,
        "[%s]"% re.escape(strip_chars),
        ""
        )

In [24]:
custom_standardization("hi, how are you?")

<tf.Tensor: shape=(), dtype=string, numpy=b'hi how are you'>

In [25]:
eng_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length,
    )

In [26]:
spa_vectorization = TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length+1,
    standardize=custom_standardization
    )

In [27]:
train_eng_texts = [pair[0] for pair in train_pairs]
train_spa_texts = [pair[1] for pair in train_pairs]

In [28]:
eng_vectorization.adapt(train_eng_texts)

In [29]:
spa_vectorization.adapt(train_spa_texts)

In [30]:
eng_vectorization(["This is a test"])

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[ 16,   8,   7, 881,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]], dtype=int64)>

In [31]:
eng_vectorization.vocabulary_size()

12042

In [32]:
eng_vectorization.get_vocabulary()

['',
 '[UNK]',
 'the',
 'i',
 'to',
 'you',
 'tom',
 'a',
 'is',
 'he',
 'in',
 'of',
 'it',
 'that',
 'was',
 'do',
 'this',
 'me',
 'have',
 'my',
 'for',
 'she',
 'dont',
 'are',
 'what',
 'his',
 'we',
 'mary',
 'your',
 'on',
 'be',
 'with',
 'want',
 'not',
 'im',
 'and',
 'at',
 'like',
 'know',
 'him',
 'can',
 'go',
 'her',
 'will',
 'has',
 'there',
 'its',
 'they',
 'time',
 'as',
 'how',
 'very',
 'did',
 'were',
 'had',
 'all',
 'here',
 'about',
 'up',
 'think',
 'didnt',
 'get',
 'when',
 'out',
 'from',
 'cant',
 'if',
 'an',
 'no',
 'doesnt',
 'one',
 'would',
 'going',
 'by',
 'why',
 'see',
 'come',
 'good',
 'ill',
 'please',
 'youre',
 'who',
 'just',
 'been',
 'so',
 'need',
 'more',
 'but',
 'help',
 'tell',
 'now',
 'where',
 'never',
 'than',
 'us',
 'am',
 'got',
 'some',
 'last',
 'something',
 'take',
 'ive',
 'should',
 'too',
 'could',
 'much',
 'car',
 'day',
 'money',
 'home',
 'people',
 'work',
 'well',
 'really',
 'many',
 'said',
 'told',
 'back',
 '

# make the data

In [33]:
def format_dataset(
    eng,
    spa,
    ):
    eng = eng_vectorization(eng)
    spa = spa_vectorization(spa)
    return ({
        "encoder_inputs": eng,
        "decoder_inputs":spa[:,:-1]
    }, spa[:,1:])

In [34]:
x, y = format_dataset(
    train_eng_texts,
    train_spa_texts,
    )

In [35]:
train_eng_texts[1]

'We elected him captain of our team.'

In [36]:
train_spa_texts[1]

'[start] Lo elegimos capit├ín de nuestro equipo. [end]'

In [37]:
x['encoder_inputs'][1]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  26, 1962,   39, 1713,   11,  120,  636,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int64)>

In [38]:
x['decoder_inputs'][1]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([   2,   16, 6035, 1920,    4,  228,  579,    3,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int64)>

In [39]:
y[1]

<tf.Tensor: shape=(20,), dtype=int64, numpy=
array([  16, 6035, 1920,    4,  228,  579,    3,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0], dtype=int64)>

# make data set

In [40]:
e, s = zip(*train_pairs)

In [41]:
e[0]

'It snowed in Osaka.'

In [42]:
s[0]

'[start] Nev├│ en Osaka. [end]'

In [43]:
def make_dataset(
    pairs,
    ):
    
    eng_texts, spa_texts = zip(*pairs)
    
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    
    dataset = tf.data.Dataset.from_tensor_slices((
        eng_texts,
        spa_texts,
        ))
    
    dataset = dataset.batch(batch_size)    
    dataset = dataset.map(format_dataset)
    dataset = dataset.shuffle(2048).prefetch(16).cache()
    
    return dataset

In [44]:
train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [45]:
for inputs, targets in train_ds.take(1):
    print(inputs['encoder_inputs'])
    print()
    print(inputs['decoder_inputs'])
    print()
    print(targets)   

tf.Tensor(
[[  3 137  26 ...   0   0   0]
 [  6   8 497 ...   0   0   0]
 [ 15   5 131 ...   0   0   0]
 ...
 [  6  44   4 ...   0   0   0]
 [ 19 202  44 ...   0   0   0]
 [ 91   8   2 ...   0   0   0]], shape=(64, 20), dtype=int64)

tf.Tensor(
[[   2  170    5 ...    0    0    0]
 [   2    8   12 ...    0    0    0]
 [   2 3812 5973 ...    0    0    0]
 ...
 [   2    8   43 ...    0    0    0]
 [   2   22  166 ...    0    0    0]
 [   2  147   23 ...    0    0    0]], shape=(64, 20), dtype=int64)

tf.Tensor(
[[ 170    5 2339 ...    0    0    0]
 [   8   12  360 ...    0    0    0]
 [3812 5973    3 ...    0    0    0]
 ...
 [   8   43    5 ...    0    0    0]
 [  22  166   43 ...    0    0    0]
 [ 147   23   10 ...    0    0    0]], shape=(64, 20), dtype=int64)


# build the model 

In [46]:
embed_dim = 256
latent_dim = 2048
num_heads = 8
sequence_length = 20
vocab_size = 1500
batch_size = 64

In [47]:
from keras_nlp import layers as nlp_layers

https://keras.io/api/keras_nlp/layers/token_and_position_embedding/

In [48]:
encoder_inputs = keras.Input(
    shape = (sequence_length,),
    dtype = 'int64', 
    name = 'encoder_inputs',
    )

x = nlp_layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=sequence_length,
    embedding_dim=embed_dim,
    )(encoder_inputs)

encoder_outputs = nlp_layers.TransformerEncoder(
    intermediate_dim = embed_dim,
    num_heads = num_heads,
    )(x)

encoder = keras.Model(
    encoder_inputs,
    encoder_outputs
    )

encoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_inputs (InputLayer)  [(None, 20)]             0         
                                                                 
 token_and_position_embeddin  (None, 20, 256)          389120    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_encoder (Transf  (None, 20, 256)          395776    
 ormerEncoder)                                                   
                                                                 
Total params: 784,896
Trainable params: 784,896
Non-trainable params: 0
_________________________________________________________________


In [49]:
x_encoder = np.random.randint(vocab_size, size = (7, 20))

encoder(x_encoder)

<tf.Tensor: shape=(7, 20, 256), dtype=float32, numpy=
array([[[ 0.6090026 , -0.28974518,  1.5200865 , ...,  1.6366849 ,
         -0.58401585,  0.7567553 ],
        [-0.89278877, -2.0578983 , -0.9273504 , ...,  0.5028951 ,
         -0.592074  ,  1.5509577 ],
        [ 0.11033611, -1.4313439 ,  0.610609  , ...,  1.0276299 ,
          0.44753313, -1.5974877 ],
        ...,
        [-1.7670937 , -0.48264557,  1.140495  , ..., -0.19191268,
         -0.9674746 , -0.15440458],
        [ 0.8025947 , -2.5771644 ,  0.86330795, ...,  0.36362442,
         -0.23435518,  1.4771689 ],
        [ 0.15296464, -1.2160792 ,  0.45247155, ...,  0.9900129 ,
         -0.2541578 , -0.9574549 ]],

       [[ 0.19869548,  0.17622136,  1.2201    , ...,  1.4798869 ,
         -0.32290393,  0.20093115],
        [-1.4758697 , -2.3357096 , -1.5564101 , ...,  0.2837574 ,
         -1.247156  ,  2.1127117 ],
        [ 0.17186421, -0.57441986, -0.25159815, ...,  0.5740387 ,
          0.16137673, -1.3327318 ],
        ...,


In [50]:
decoder_inputs = keras.Input(
    shape = (sequence_length,),
    dtype = 'int64', 
    name = 'decoder_inputs',
    )

encoder_seq_inputs = keras.Input(
    shape = (sequence_length,embed_dim),
    name = 'decoder_state_inputs',
    )

x = nlp_layers.TokenAndPositionEmbedding(
    vocabulary_size=vocab_size,
    sequence_length=sequence_length,
    embedding_dim=embed_dim,
    )(decoder_inputs)

x = nlp_layers.TransformerDecoder(
    intermediate_dim = embed_dim,
    num_heads = num_heads,
    )(x, encoder_seq_inputs)

x = layers.Dropout(0.5)(x)

decoder_outputs = layers.Dense(
    units=vocab_size, activation="softmax"
    )(x)

decoder = keras.Model(
    [decoder_inputs, encoder_seq_inputs],
    decoder_outputs
    )

decoder.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_inputs (InputLayer)    [(None, 20)]         0           []                               
                                                                                                  
 token_and_position_embedding_1  (None, 20, 256)     389120      ['decoder_inputs[0][0]']         
  (TokenAndPositionEmbedding)                                                                     
                                                                                                  
 decoder_state_inputs (InputLay  [(None, 20, 256)]   0           []                               
 er)                                                                                              
                                                                                            

In [51]:
decoder.inputs

[<KerasTensor: shape=(None, 20) dtype=int64 (created by layer 'decoder_inputs')>,
 <KerasTensor: shape=(None, 20, 256) dtype=float32 (created by layer 'decoder_state_inputs')>]

In [52]:
x_decoder = np.random.randint(vocab_size, size = (7, 20))
x_encoder_seq_inputs = np.random.rand(7,20,embed_dim)
decoder([x_decoder, x_encoder_seq_inputs])

<tf.Tensor: shape=(7, 20, 1500), dtype=float32, numpy=
array([[[0.00071319, 0.00040582, 0.00023623, ..., 0.00065811,
         0.00090779, 0.00070867],
        [0.00105402, 0.00044944, 0.00097697, ..., 0.00042982,
         0.00055488, 0.00012992],
        [0.00054012, 0.00037152, 0.00029332, ..., 0.0012637 ,
         0.00057611, 0.00031714],
        ...,
        [0.00042763, 0.00094388, 0.000403  , ..., 0.00032183,
         0.00051202, 0.00044714],
        [0.00040532, 0.00035947, 0.002458  , ..., 0.00080073,
         0.00028309, 0.00095612],
        [0.00036497, 0.00072565, 0.00065403, ..., 0.0002926 ,
         0.00070892, 0.00055062]],

       [[0.00093265, 0.00028962, 0.0002669 , ..., 0.00056267,
         0.00064656, 0.0006684 ],
        [0.00123452, 0.00037137, 0.00118945, ..., 0.00042334,
         0.00068799, 0.0001947 ],
        [0.00043764, 0.00033832, 0.00025299, ..., 0.0013302 ,
         0.00043955, 0.00032315],
        ...,
        [0.00030283, 0.00096133, 0.00050102, ..., 0.0

In [53]:
decoder_outputs = decoder(
    [
        decoder_inputs,
        encoder_outputs,        
    ])

In [54]:
transformer = keras.Model(
    [
        encoder_inputs,
        decoder_inputs,
    ],
    decoder_outputs
    )

In [55]:
transformer.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 20)]         0           []                               
                                                                                                  
 token_and_position_embedding (  (None, 20, 256)     389120      ['encoder_inputs[0][0]']         
 TokenAndPositionEmbedding)                                                                       
                                                                                                  
 decoder_inputs (InputLayer)    [(None, 20)]         0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, 20, 256)     395776      ['token_and_position_embedd

In [56]:
epochs = 10
transformer.compile(
    'rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics = ['accuracy'],
    )

In [57]:
label = np.array([1,3])
prediction = np.array([
    [0,0.9,0,0.1],
    [0.1,0.1,0,0.8],
])

loss = tf.keras.losses.sparse_categorical_crossentropy(
    label,
    prediction
    )

In [58]:
transformer.inputs

[<KerasTensor: shape=(None, 20) dtype=int64 (created by layer 'encoder_inputs')>,
 <KerasTensor: shape=(None, 20) dtype=int64 (created by layer 'decoder_inputs')>]

In [59]:
transformer.outputs

[<KerasTensor: shape=(None, 20, 1500) dtype=float32 (created by layer 'model_1')>]

In [63]:
transformer.fit(
    train_ds,
    epochs = epochs,
    validation_data=val_ds,
    )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 302/1302 [=====>........................] - ETA: 16s - loss: nan - accuracy: 0.6468

KeyboardInterrupt: 

In [60]:
transformer.inputs

[<KerasTensor: shape=(None, 20) dtype=int64 (created by layer 'encoder_inputs')>,
 <KerasTensor: shape=(None, 20) dtype=int64 (created by layer 'decoder_inputs')>]

In [64]:
y = transformer([x_encoder, x_decoder])
y

<tf.Tensor: shape=(7, 20, 1500), dtype=float32, numpy=
array([[[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        ...,
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan, nan, nan]],

       ...,

       [[nan, nan, nan, ..., nan, nan, nan],
        [nan, nan, nan, ..., nan

<tf.Tensor: shape=(7, 20, 1500), dtype=float32, numpy=
array([[[0.00072278, 0.0004059 , 0.00023396, ..., 0.00067888,
         0.00090478, 0.00070985],
        [0.00108344, 0.00044992, 0.00095819, ..., 0.00043884,
         0.00055725, 0.00013007],
        [0.00054503, 0.00037395, 0.0002882 , ..., 0.00126961,
         0.00057907, 0.00031796],
        ...,
        [0.00043449, 0.00095431, 0.00039809, ..., 0.00032437,
         0.00051126, 0.00043628],
        [0.00041308, 0.00036313, 0.00238976, ..., 0.00080563,
         0.00028065, 0.0009566 ],
        [0.00037461, 0.00071781, 0.00064475, ..., 0.00029844,
         0.00070316, 0.00053848]],

       [[0.00094932, 0.00029017, 0.00025541, ..., 0.00057945,
         0.00065116, 0.00067201],
        [0.00127722, 0.00037604, 0.00114088, ..., 0.00043365,
         0.00070051, 0.00019531],
        [0.000437  , 0.00034575, 0.00024672, ..., 0.00133965,
         0.00044261, 0.00032507],
        ...,
        [0.00030118, 0.00097513, 0.00048197, ..., 0.0