In [5]:
import numpy  as np

import typing
from typing import Any, Tuple

import tensorflow as tf
import tensorflow_text as tf_text

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Retrieve data

In [6]:
import pathlib

# Data from http://www.manythings.org/anki/.
path_to_file = pathlib.Path("/tf/code/piplinjen/notebooks/deu-eng/deu.txt")

def load_data(path):
    text = path.read_text(encoding='utf-8')

    lines = text.splitlines()
    pairs = [line.split('\t') for line in lines]
    inp = [inp for targ, inp, _ in pairs]
    targ = [targ for targ, inp, _ in pairs]

    return targ, inp

targ, inp = load_data(path_to_file)

print("Example data:")
print("Deutsch: ", inp[-1])
print("English: ", targ[-1])



Example data:
Deutsch:  Ohne Zweifel findet sich auf dieser Welt zu jedem Mann genau die richtige Ehefrau und umgekehrt; wenn man jedoch in Betracht zieht, dass ein Mensch nur Gelegenheit hat, mit ein paar hundert anderen bekannt zu sein, von denen ihm nur ein Dutzend oder weniger nahesteht, darunter höchstens ein oder zwei Freunde, dann erahnt man eingedenk der Millionen Einwohner dieser Welt leicht, dass seit Erschaffung ebenderselben wohl noch nie der richtige Mann der richtigen Frau begegnet ist.
English:  Doubtless there exists in this world precisely the right woman for any given man to marry and vice versa; but when you consider that a human being has the opportunity of being acquainted with only a few hundred people, and out of the few hundred that there are but a dozen or less whom he knows intimately, and out of the dozen, one or two friends at most, it will easily be seen, when we remember the number of millions who inhabit this world, that probably, since the earth was crea

In [7]:
# Create a tensorflow dataset.
BUFFER_SIZE = len(inp)
BATCH_SIZE = 64

dataset = tf.data.Dataset.from_tensor_slices((inp, targ)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE)

for inp_batch, targ_batch in dataset.take(1):
    print("Input: ", inp_batch[:5])
    print("Target: ", targ_batch[:5])
    break

Input:  tf.Tensor(
[b'Ihr h\xc3\xa4ttet weglaufen k\xc3\xb6nnen.' b'Ich reise oft.'
 b'Wir sind hier, um ihnen zu helfen.'
 b'Tom ist ein h\xc3\xb6flicher Junge.' b'Sie schaute zur Decke hoch.'], shape=(5,), dtype=string)
Target:  tf.Tensor(
[b"You could've run away." b'I often travel.' b"We're here to help them."
 b'Tom is a kind boy.' b'She looked up at the ceiling.'], shape=(5,), dtype=string)


# Text preprocessing

In [8]:
example_text = tf.constant("Verkrümele dich! Mach ’ne Fliege! Mir ist heiß.")

print(example_text.numpy())
print(tf_text.normalize_utf8(example_text, 'NFKD').numpy())

def tf_lower_and_split_punct(text):
    # Split accecented characters,
    text = tf_text.normalize_utf8(text, "NFKD")
    text = tf.strings.lower(text)
    # Replace special characters.
    text = tf.strings.regex_replace(text, 'ß', 'ss')
    # Keep space, a to z, and select punctuation.
    text = tf.strings.regex_replace(text, '[^ a-z.?!,]', '')
    # Add spaces around punctuation.
    text = tf.strings.regex_replace(text, '[.?!,]', r' \0 ')
    # Strip whitespace.
    text = tf.strings.strip(text)

    text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
    return text

print(example_text.numpy().decode())
print(tf_lower_and_split_punct(example_text).numpy().decode())

b'Verkr\xc3\xbcmele dich! Mach \xe2\x80\x99ne Fliege! Mir ist hei\xc3\x9f.'
b'Verkru\xcc\x88mele dich! Mach \xe2\x80\x99ne Fliege! Mir ist hei\xc3\x9f.'
Verkrümele dich! Mach ’ne Fliege! Mir ist heiß.
[START] verkrumele dich !  mach ne fliege !  mir ist heiss . [END]


In [9]:
# Text vectorization.

max_vocab_size = 5000

input_text_processor = tf.keras.layers.TextVectorization(standardize=tf_lower_and_split_punct,
max_tokens=max_vocab_size)

In [10]:
input_text_processor.adapt(inp)

# First 10 words from the vocabulary.
input_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', ',', 'ich', 'tom', '?', 'nicht']

In [11]:
output_text_processor = tf.keras.layers.TextVectorization(standardize=tf_lower_and_split_punct, max_tokens=max_vocab_size)

output_text_processor.adapt(targ)
output_text_processor.get_vocabulary()[:10]

['', '[UNK]', '[START]', '[END]', '.', 'tom', 'to', 'you', 'the', 'i']

In [12]:
# These layers can convert a batch of strings into a batch of token IDs that are zero-padded.
example_tokens = input_text_processor(inp_batch)
print(example_tokens[0])
input_vocab = np.array(input_text_processor.get_vocabulary())
tokens = input_vocab[example_tokens[0].numpy()]
print(' '.join(tokens))

tf.Tensor(
[   2   38 1126    1  100    4    3    0    0    0    0    0    0    0
    0    0    0    0], shape=(18,), dtype=int64)
[START] ihr hattet [UNK] konnen . [END]           


# Debug utils

In [17]:
# Useful class that enforces the right tensor dimensions.
class ShapeChecker():
    def __init__(self):
        # Keep a cache of every axis-name seen.
        self.shapes = {}

    def __call__(self, tensor, names, broadcast=False):
        if not tf.executing_eagerly():
            return
        
        if isinstance(names, str):
            names = (names,)

        shape = tf.shape(tensor)
        rank = tf.rank(tensor)

        if rank != len(names):
            raise ValueError(f"Rank mismatch:\n"
                             f"     found {rank}: {shape.numpy()}\n"
                             f"     expected {len(names)}: {names}\n")

        for i, name in enumerate(names):
            if isinstance(name, int):
                old_dim = name
            else:
                old_dim = self.shapes.get(name, None)
            new_dim = shape[i]

            if (broadcast and new_dim == 1):
                continue

            if old_dim is None:
                # If the axis name is new, add its length to the cache.
                self.shapes[name] = new_dim
                continue

            if new_dim != old_dim:
                raise ValueError(f"Shape mismatch for dimension: '{name}'\n"
                                f"     found: {new_dim}\n"
                                f"     expected: {old_dim}\n")

# NMT Model

In [18]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, input_vocab_size, embedding_dim, enc_units):
        super(Encoder, self).__init__()
        self.enc_units = enc_units
        self.input_vocab_size = input_vocab_size

        # The embedding layer converts tokens to vectors.
        self.embedding = tf.keras.layers.Embedding(self.input_vocab_size, embedding_dim)

        # The GRU RNN layer processes those vectors sequentially.
        self.gru = tf.keras.layers.GRU(self.enc_units,
        return_sequences=True,
        return_state=True,
        recurrent_initializer='glorot_uniform')

    def call(self, tokens, state=None):
        shape_checker = ShapeChecker() 
        shape_checker(tokens, ('batch', 's'))

        # Look up the embedding for each token.
        vectors = self.embedding(tokens)
        shape_checker(vectors, ('batch', 's', 'embed_dim'))

        # The GRU processes the embedding sequence.
        #   ouput shape: (batch, s, enc_units)
        #   state shape: (batch, enc_units)
        output, state = self.gru(vectors, initial_state=state)
        shape_checker(output, ('batch', 's', 'enc_units'))
        shape_checker(state, ('batch', 'enc_units'))

        return output, state


In [19]:
# Example usage of the encoder.

embedding_dim = 256
units = 1024

example_tokens = input_text_processor(inp_batch)

encoder = Encoder(input_text_processor.vocabulary_size(),
embedding_dim, units)
example_enc_output, example_enc_state = encoder(example_tokens)

print(f'Input batch shape: (batch): {inp_batch.shape}')
print(f'Input batch tokens shape: (batch, s): {example_tokens.shape}')
print(f'Encoder output shape: (batch, s, units): {example_enc_output.shape}')
print(f'Encoder state shape: (batch, units): {example_enc_state.shape}')


Input batch shape: (batch): (64,)
Input batch tokens shape: (batch, s): (64, 18)
Encoder output shape: (batch, s, units): (64, 18, 1024)
Encoder state shape: (batch, units): (64, 1024)


The attention layer first calculates the attention weights, 
$$\alpha_{ts} = \frac{\exp(\text{score}(\bf{h}_t, \bar{h}_s))}{\sum_{s'}\exp(\text{score}(\bf{h}_t, \bar{h}_{s'}))},$$
where the score is $\bold{v}_a^T \text{tanh}(\bold{W}_1 \bold{h}_t + \bold{W}_2\bold{\bar{h}}_s)$.
And then the context vector,
$$\bold{c}_t = \sum_s \alpha_{ts}\bar{\bold{h}}_s.$$

In [20]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units, use_bias=False)
        self.W2 = tf.keras.layers.Dense(units, use_bias=False)

        self.attention = tf.keras.layers.AdditiveAttention()

    def call(self, query, value, mask):
        """
        The query is generated by the decoder.
        The value is the output of the encoder.
        The mask is to exclude padding.
        """
        shape_checker = ShapeChecker()
        shape_checker(query, ('batch', 't', 'query_units'))
        shape_checker(value, ('batch', 's', 'value_units'))
        shape_checker(mask, ('batch', 's'))


        w1_query = self.W1(query)
        shape_checker(w1_query, ('batch', 't', 'attn_units'))
        w2_key = self.W2(value)
        shape_checker(w2_key, ('batch', 's', 'attn_units'))

        query_mask = tf.ones(tf.shape(query)[:-1], dtype=bool)
        value_mask = mask

        context_vector, attention_weights = self.attention(inputs = [w1_query, value, w2_key], mask=[query_mask, value_mask], return_attention_score = True)
        shape_checker(context_vector, ('batch', 't', 'value_units'))
        shape_checker(attention_weights, ('batch', 't', 's'))

        return context_vector, attention_weights

In [21]:
# Test of attention layer.
attention_layer = BahdanauAttention(units)
(example_tokens != 0).shape

TensorShape([64, 18])

In [22]:
# The decoder will generate this.
example_attention_query = tf.random.normal(shape=[len(example_tokens), 2, 10])

context_vector, attention_weights = attention_layer(
    query=example_attention_query,
    value = example_enc_output,
    mask=(example_tokens != 0)
)

print(f"Attention result shape: (batch_size, query_seq_length, units):  {context_vector.shape}")
print(f"Attention weights shape: (batch_size, query_seq_length, value_seq_length):  {attention_weights.shape}")

SyntaxError: invalid syntax (<ipython-input-22-142c56d06877>, line 7)