In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

#!pip install keras-nlp
#from keras_nlp.layers import PositionEmbedding, TransformerEncoder, TransformerDecoder

In [2]:
small = pd.read_csv("/kaggle/input/bert-nmt/pairs.tsv", sep='\t', usecols=[1,3], names=['English', 'Gaeilge'])
dcep = pd.read_csv("/kaggle/input/dcep-bisentences/EN-GA-bisentences.txt", sep='\t', names=['English', 'Gaeilge'])

pairs = pd.concat([small, dcep])
pairs.sample(5)

Unnamed: 0,English,Gaeilge
11529,coordination of the Union's structural instrum...,"comhordú ar ionstraimí struchtúracha an Aontais,"
31679,Only officials of the European Parliament and ...,Ní bheidh gá ach amháin ag oifigigh de chuid P...
9093,74 c,74 c
22904,Procedure for the consideration and adoption o...,Nós imeachta maidir le breithniú agus glacadh ...
42596,Verification of financial compatibility,Comhréireacht airgeadais a fhíorú


In [3]:
train = pairs.sample(frac=0.8)
val = pairs.drop(train.index)

print(f"{len(pairs)} total pairs")
print(f"{len(train)} training pairs")
print(f"{len(val)} validation pairs")

48806 total pairs
39045 training pairs
9035 validation pairs


In [4]:
size = 15000
seq_len = 20
batch = 64

en_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len)
ga_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len+1)

en_vec.adapt(pairs["English"])
ga_vec.adapt(pairs["Gaeilge"])

In [5]:
def format_dataset(en, ga):
    en = en_vec(en)
    ga = ga_vec(ga)
    return ({"encoder_inputs": en, "decoder_inputs": ga[:, :-1],}, ga[:, 1:])

def make_dataset(en, ga):
    dataset = tf.data.Dataset.from_tensor_slices((en, ga))
    dataset = dataset.batch(batch)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

ds = make_dataset(train["English"], train["Gaeilge"])
val_ds = make_dataset(val["English"], val["Gaeilge"])

A chained-together TransformerEncoder and TransformerDecoder makes up our sequence-to-sequence Transformer. A PositionalEmbedding layer is also used to inform the model of word order.

The TransformerEncoder will receive the original sequence and create a new representation. The TransformerDecoder will then receive this modified representation and the current target sequence (target words 0 to N). The TransformerDecoder will next try to anticipate the following words (up to N+1) in the target sequence.

Causal masking is a crucial component that enables this (see TransformerDecoder function get causal attention mask() for more information). We must ensure that the TransformerDecoder only takes data from target tokens 0 to N when predicting token N+1 because it sees the full sequences at once.

In [6]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [7]:
def create_model(embed_dim, latent_dim, num_heads):
    encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    pos = PositionalEmbedding(seq_len, size, embed_dim)(encoder_inputs)
    encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(pos)
    encoder = Model(encoder_inputs, encoder_outputs)

    decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = layers.Input(shape=(None, embed_dim), name="decoder_state_inputs")
    x = PositionalEmbedding(seq_len, size, embed_dim)(decoder_inputs)
    x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(size, activation="softmax")(x)
    decoder = Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
    
    return transformer

In [8]:
epochs = 30  # This should be at least 30 for convergence
embed_dim = 256
latent_dim = 2048
num_heads = 64

transformer = create_model(embed_dim, latent_dim, num_heads)
transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(ds, epochs=epochs, validation_data=val_ds)

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3845120     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   17878528    ['positional_embedding[

<keras.callbacks.History at 0x7f8493807410>

In [14]:
ga_vocab = ga_vec.get_vocabulary()
ga_index_lookup = dict(zip(range(len(ga_vocab)), ga_vocab))
max_decoded_sentence_length = 20


def decode_sequence(model, input_sentence):
    tokenized_input_sentence = en_vec([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ga_vec([decoded_sentence])[:, :-1]
        predictions = model([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ga_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


for _ in range(10):
    input_sentence = random.choice(pairs["English"])
    translated = decode_sequence(transformer, input_sentence)
    print(input_sentence, "\n", translated, "\n")

Delegated acts 
 [start] an coiste freagrach as páirtithe polaitiúla               

In the event of a crisis or at the request of the President of the European Parliament or of the Chairman of the Committee on Foreign Affairs, Human Rights, Common Security and Defence Policy, such information shall be provided at the earliest opportunity. 
 [start] cur chun feidhme nó cathaoirleach an choiste um an iarraidh ar an iarraidh ar an eolas nó ar aghaidh chuig 

The President shall obtain the agreement of Parliament to putting such amendments to the vote. 
 [start] an tuachtarán a fháil ar a lorg sula ndéanfaidh sé an leasuithe         

The questions and answers shall be published in the 
 [start] na ceisteanna agus na freagraí in               

Committee on Budgets 
 [start] an coiste um buiséid                 

Committee on Petitions 
 [start] an coiste um achainíocha                 

Disputes on voting 
 [start] an tuachtarán an vótáil                 

Managers shall have the duty of

In [32]:
transformer.save_weights("weights")

In [33]:
new_model = create_model(embed_dim, latent_dim, num_heads)
new_model.load_weights("weights")
new_model

<keras.engine.functional.Functional at 0x7f83c00f4e90>

In [36]:
inp = "The parliament decided on the regulation"

translated = decode_sequence(new_model, inp)
print(inp, "\n", translated, "\n")

The parliament decided on the regulation 
 [start] an pharlaimint cinneadh ar an moladh ón gcomhairle             

