In [41]:
# Enable IPython's autoreload extension so edits to imported local modules are
# picked up without restarting the kernel.
# NOTE: re-running this cell only prints the "already loaded" notice recorded
# below. A reload can also fail part-way (see the "autoreload of model failed"
# traceback in the Decoder cell's output); restart the kernel if that happens.
%load_ext autoreload
# Mode 2: reload all modules before executing each cell (per the IPython
# autoreload documentation).
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
import tensorflow as tf
from tensorflow.keras import layers
import pandas as pd
from matplotlib import pyplot as plt

In [43]:
from preprocessing import load_non_breaking_prefixes, sentence_boundary_disambiguation
from model import Transformer, CustomSchedule, main_train
import config

In [44]:
# Read the tab-separated parallel corpus; only the first two columns are kept
# and they are named "eng" and "spa".
df = pd.read_csv(
    config.TRAIN_PATH,
    sep="\t",
    names=["eng", "spa"],
    usecols=[0, 1],
)

# Per-language non-breaking prefix lists consumed by the sentence-boundary step.
nonbreaking_prefixes_spa = load_non_breaking_prefixes(config.NONBREAKING_SPA_PATH)
nonbreaking_prefixes_eng = load_non_breaking_prefixes(config.NONBREAKING_ENG_PATH)

# Apply sentence-boundary disambiguation to every sentence, one column per
# language; the prefix list is forwarded as the second positional argument.
df["spa"] = df["spa"].apply(
    sentence_boundary_disambiguation, args=(nonbreaking_prefixes_spa,)
)
df["eng"] = df["eng"].apply(
    sentence_boundary_disambiguation, args=(nonbreaking_prefixes_eng,)
)

In [45]:
def tokenize_text(corpus, config, vocab=None):
    """Turn raw sentences into fixed-length integer token sequences.

    Args:
        corpus: The sentences to vectorize (also used to fit the vocabulary
            when ``vocab`` is not supplied).
        config: Object exposing ``VOCAB_SIZE`` and ``MAX_TOKENS``.
        vocab: Optional pre-built vocabulary. When given, the layer is
            initialised with it and is not adapted to ``corpus``.

    Returns:
        A ``(token_ids, vocabulary)`` tuple: the vectorized corpus and the
        vocabulary reported by the layer.
    """
    # One extra position beyond MAX_TOKENS is requested from the layer.
    vectorizer = layers.TextVectorization(
        max_tokens=config.VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=config.MAX_TOKENS + 1,
        vocabulary=vocab,
    )

    # Only learn a vocabulary from the corpus when the caller did not bring one.
    must_fit = vocab is None
    if must_fit:
        vectorizer.adapt(corpus)

    token_ids = vectorizer(corpus)
    return token_ids, vectorizer.get_vocabulary()

# Vectorize each language independently; every call fits its own vocabulary
# because no pre-built vocabulary is passed.
eng, eng_vocab = tokenize_text(corpus=df["eng"], config=config)
spa, spa_vocab = tokenize_text(corpus=df["spa"], config=config)

In [46]:
def prepare_batch(eng, spa):
    """Split a batch of token ids into model inputs and shifted labels.

    Args:
        eng: 2-D batch of English token ids (rows are sentences).
        spa: 2-D batch of Spanish token ids (rows are sentences).

    Returns:
        ``((eng_input, spa_input), spa_labels)`` where ``eng_input`` keeps the
        first ``config.MAX_TOKENS`` columns of ``eng``, and ``spa_input`` /
        ``spa_labels`` are the first ``config.MAX_TOKENS + 1`` columns of
        ``spa`` without the last column and without the first column
        respectively (labels are the inputs shifted one step left).
    """
    limit = config.MAX_TOKENS

    # Keep one extra Spanish column so both shifted views have the same width.
    spa_window = spa[:, :limit + 1]
    model_inputs = (eng[:, :limit], spa_window[:, :-1])

    return model_inputs, spa_window[:, 1:]

def make_batches(ds):
    """Build the input pipeline on top of ``ds``.

    The dataset is shuffled with a buffer of ``config.BUFFER_SIZE``, grouped
    into batches of ``config.BATCH_SIZE``, passed through ``prepare_batch``
    and finally prefetched. Both the map parallelism and the prefetch buffer
    are set to ``tf.data.AUTOTUNE``.

    Args:
        ds: Dataset whose elements are accepted by ``prepare_batch`` once
            batched.

    Returns:
        The shuffled, batched, mapped and prefetched dataset.
    """
    shuffled = ds.shuffle(config.BUFFER_SIZE)
    batched = shuffled.batch(config.BATCH_SIZE)
    prepared = batched.map(prepare_batch, num_parallel_calls=tf.data.AUTOTUNE)
    return prepared.prefetch(buffer_size=tf.data.AUTOTUNE)

In [47]:
# Pair every tokenized English row with its Spanish counterpart and run the
# shuffle/batch/prepare/prefetch pipeline over the pairs.
# NOTE: this expects `eng` and `spa` to still hold the full tokenized corpora.
# The next cell rebinds both names to a single batch, so re-running this cell
# after it would build the dataset from that one batch instead.
dataset = make_batches(tf.data.Dataset.from_tensor_slices((eng, spa)))

In [48]:
# Grab a single batch to sanity-check the pipeline's tensor shapes.
# NOTE: this rebinds `eng` and `spa`, which previously held the full tokenized
# corpora, to one batch each; the cells below rely on them being batch tensors.
for (eng, spa), spa_labels in dataset.take(1):
    break

print(eng.shape, spa.shape, spa_labels.shape, sep="\n")

(256, 64)
(256, 64)
(256, 64)


In [49]:
from model import PositionalEmbedding
# One embedding layer per language, both sized from the shared config values.
embed_eng = PositionalEmbedding(vocab_size=config.VOCAB_SIZE, d_model=config.D_MODEL)
embed_spa = PositionalEmbedding(vocab_size=config.VOCAB_SIZE, d_model=config.D_MODEL)

# Embed the sample batch pulled above; the following cells reuse these tensors
# as inputs for the individual layer shape checks.
eng_emb = embed_eng(eng)
spa_emb = embed_spa(spa)

In [50]:
from model import CrossAttention

# Shape check for the cross-attention block on the sample embeddings.
sample_ca = CrossAttention(num_heads=2, key_dim=512)

# NOTE(review): eng_emb is passed first and spa_emb second, while prepare_batch
# treats Spanish as the target side. Harmless for a shape check since both
# tensors have the same shape, but confirm against CrossAttention's signature
# in model.py which argument is meant to be the query.
print(eng_emb.shape, spa_emb.shape, sample_ca(eng_emb, spa_emb).shape, sep="\n")

(256, 64, 64)
(256, 64, 64)
(256, 64, 64)


In [51]:
from model import GlobalSelfAttention

# Shape check: the self-attention block should return a tensor with the same
# shape as the embedding it is given.
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=64)

print(spa_emb.shape, sample_gsa(spa_emb).shape, sep="\n")

(256, 64, 64)
(256, 64, 64)


In [52]:
from model import CausalSelfAttention

# Shape check: the causal self-attention block should return a tensor with the
# same shape as the embedding it is given.
sample_csa = CausalSelfAttention(num_heads=2, key_dim=64)

print(eng_emb.shape, sample_csa(eng_emb).shape, sep="\n")

(256, 64, 64)
(256, 64, 64)


In [53]:
# Causality check: run the layer on the first token alone and on the whole
# sequence, then compare the first output position of both runs.
out1 = sample_csa(embed_eng(eng[:, :1]))
out2 = sample_csa(embed_eng(eng))[:, :1]

# Largest element-wise difference between the two results; the recorded
# output for this cell is 0.0.
tf.reduce_max(tf.abs(out1 - out2)).numpy()

0.0

In [55]:
from model import FeedForward

# Size the feed-forward block from the shared config rather than hard-coded
# literals: eng_emb is built with config.D_MODEL, so the block's model width
# has to follow it (the recorded output shows the last dimension is preserved).
# The hidden width uses config.FFN_DIM, matching the Encoder/Decoder cells.
sample_ffn = FeedForward(config.D_MODEL, config.FFN_DIM)

print(eng_emb.shape)
print(sample_ffn(eng_emb).shape)

(256, 64, 64)
(256, 64, 64)


In [71]:
from model import EncoderLayer

# Take the layer hyper-parameters from the shared config, like the
# DecoderLayer/Encoder/Decoder cells do, instead of hard-coding them: spa_emb
# is built with config.D_MODEL, so a literal d_model would silently go stale
# if the config changes.
sample_encoder_layer = EncoderLayer(d_model=config.D_MODEL,
                                    num_heads=config.N_HEADS,
                                    dff=config.FFN_DIM)

print(spa_emb.shape)
print(sample_encoder_layer(spa_emb).shape)

(256, 64, 64)
(256, 64, 64)


In [74]:
from model import Encoder

# Instantiate the encoder from the shared config.
sample_encoder = Encoder(num_layers=config.N_LAYERS,
                         d_model=config.D_MODEL,
                         num_heads=config.N_HEADS,
                         dff=config.FFN_DIM,
                         vocab_size=config.VOCAB_SIZE)

# Feed the English batch: prepare_batch uses Spanish for the labels, so
# English is the source language that the encoder consumes.
sample_encoder_output = sample_encoder(eng, training=False)

# Print the shape.
print(eng.shape)
print(sample_encoder_output.shape)  # Shape `(batch_size, input_seq_len, d_model)`.

(256, 64)
(256, 64, 64)


In [76]:
from model import DecoderLayer

sample_decoder_layer = DecoderLayer(d_model=config.D_MODEL, num_heads=config.N_HEADS, dff=config.FFN_DIM)

# Spanish is the target side (prepare_batch derives the labels from it), so its
# embedding is the decoder input `x`; the English embedding is the context the
# layer attends to.
sample_decoder_layer_output = sample_decoder_layer(
    x=spa_emb, context=eng_emb)

print(spa_emb.shape)
print(eng_emb.shape)
print(sample_decoder_layer_output.shape)  # `(batch_size, seq_len, d_model)`

(256, 64, 64)
(256, 64, 64)
(256, 64, 64)


In [77]:
from model import Decoder

# Instantiate the decoder.
sample_decoder = Decoder(num_layers=config.N_LAYERS,
                         d_model=config.D_MODEL,
                         num_heads=config.N_HEADS,
                         dff=config.FFN_DIM,
                         vocab_size=config.VOCAB_SIZE)

# Spanish token ids are the decoder input (prepare_batch derives the labels
# from Spanish); the English embedding stands in for the encoder output here.
output = sample_decoder(x=spa, context=eng_emb)

# Print the shapes.
print(spa.shape)
print(eng_emb.shape)
print(output.shape)

[autoreload of model failed: Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 349, in update_class
    if update_generic(old_obj, new_obj):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/opt/conda/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 309, in update_function
    setattr(old, name, getattr(new, name))
ValueError: build() requires a code object 

(256, 64)
(256, 64, 64)
(256, 64, 64)


In [78]:
# Inspect the attention scores stored on the decoder after the forward pass in
# the previous cell. The recorded shape is (256, 8, 64, 64) — presumably
# (batch, heads, target_len, context_len); confirm against model.py.
sample_decoder.last_attn_scores.shape

TensorShape([256, 8, 64, 64])