In [1]:
import re
import nltk
import random
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

from transformers import BertTokenizer, BertModel

In [5]:
cgpt = pd.read_csv("data/CGPT-corpus.csv", usecols=[1,2], dtype=str)
dcep = pd.read_csv("data/DCEP.txt", sep='\t', names=['English', 'Gaeilge'], dtype=str)
dgt = pd.read_csv("data/DGT.csv", names=['English', 'Gaeilge'], header=0, dtype=str)
tatoeba = pd.read_csv("data/tatoeba.tsv", sep='\t', usecols=[1,3], names=['English', 'Gaeilge'], dtype=str)
aug = pd.read_csv("data/augmented.csv", usecols=[1,2], dtype=str)

pairs = pd.concat([cgpt, tatoeba, dcep, dgt, aug], ignore_index=True).sample(frac=1)
pairs

Unnamed: 0,English,Gaeilge
40810,Such a proposal shall require for adoption the...,"Chun go nglacfar le togra den sórt sin, beidh ..."
15986,Where the committee decides to submit the prop...,I gcás ina gcinnfidh an coiste an togra a chur...
63356,"For programme-specific result indicators, whic...","Maidir le táscairí toraidh clár-shonracha, a b..."
29884,Characteristics/remarks: Companies or groups o...,Saintréithe/barúlacha: Cuideachtaí nó grúpaí c...
41734,The intention to move inadmissibility shall be...,Tabharfar fógra don Uachtarán 24 huaire an chl...
...,...,...
124908,RESULT OF VERIFICATION,TORADH AN FHÍORAITHE
20414,Nomination of Judges and Advocates-General at ...,Ainmniú na mBreithiúna agus na nAbhcóidí Ginea...
51036,technical services which supervise the tests r...,seirbhísí teicniúla a dhéanann maoirseacht ar ...
123342,none of the percentages given in the list for ...,nach sárófar aon cheann de na céatadán atá lea...


## Preprocessing

In [None]:
def preproc_en(text):
    x = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-záéíóú ])|(\w+:\/\/\S+)|^rt|http.+?", "", str(text))
    return " ".join(nltk.word_tokenize(contractions.fix(x.lower())))

def preproc_ga(text):
    x = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-záéíóú ])|(\w+:\/\/\S+)|^rt|http.+?", "", str(text))
    return "[start] " + " ".join(nltk.word_tokenize(x.lower())) + " [end]"

#pairs["English"] = pairs["English"].apply(preproc_en)
#pairs["Gaeilge"] = pairs["Gaeilge"].apply(preproc_ga)

pairs

In [6]:
train = pairs.sample(frac=0.8)
val = pairs.drop(train.index)

print(f"{len(pairs)} total pairs")
print(f"{len(train)} training pairs")
print(f"{len(val)} validation pairs")

139625 total pairs
111700 training pairs
27925 validation pairs


Vectorise text and pickle output for later use

In [8]:
size = 15000
seq_len = 30
batch = 64

en_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len)
ga_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len+1)

en_vec.adapt(pairs["English"])
ga_vec.adapt(pairs["Gaeilge"])

pickle.dump({'config': en_vec.get_config(),
             'weights': en_vec.get_weights()}
            , open("en_vec.pkl", "wb"))

pickle.dump({'config': ga_vec.get_config(),
             'weights': ga_vec.get_weights()}
            , open("ga_vec.pkl", "wb"))

Load pickled configs and weights onto new TextVectorization layers

In [9]:
en_pkl = pickle.load(open("en_vec.pkl", "rb"))
ga_pkl = pickle.load(open("ga_vec.pkl", "rb"))

en_vec = layers.TextVectorization.from_config(en_pkl['config'])
ga_vec = layers.TextVectorization.from_config(ga_pkl['config'])

en_vec.set_weights(en_pkl['weights'])
ga_vec.set_weights(ga_pkl['weights'])

In [10]:
def format_dataset(en, ga):
    en = en_vec(en)
    ga = ga_vec(ga)
    return ({"encoder_inputs": en, "decoder_inputs": ga[:, :-1],}, ga[:, 1:])

def make_dataset(en, ga):
    dataset = tf.data.Dataset.from_tensor_slices((en, ga))
    dataset = dataset.batch(batch)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

ds = make_dataset(train["English"], train["Gaeilge"])
val_ds = make_dataset(val["English"], val["Gaeilge"])

This sequence-to-sequence (Seq2Seq) Transformer comprises of a TransformerEncoder and a TransformerDecoder that are connected in a chain. The model also includes a PositionalEmbedding layer to incorporate information about the order of words.

The TransformerEncoder takes the original sequence as input and generates a new representation of the sequence. The TransformerDecoder then takes this modified representation and the current target sequence, which includes words from 0 to N. The objective of the TransformerDecoder is to predict the next words in the target sequence, up to N+1.

To accomplish this, the TransformerDecoder applies causal masking. This is crucial because it ensures that the model only uses data from target tokens 0 to N while predicting token N+1. Causal masking is necessary because the TransformerDecoder sees the entire sequence at once and must be prevented from using information from future tokens when making predictions.

In [11]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [12]:
embed_dim = 256
latent_dim = 2048
num_heads = 64

def create_model(embed_dim, latent_dim, num_heads):
    encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    pos = PositionalEmbedding(seq_len, size, embed_dim)(encoder_inputs)
    encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(pos)
    encoder = Model(encoder_inputs, encoder_outputs)

    decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = layers.Input(shape=(None, embed_dim), name="decoder_state_inputs")
    x = PositionalEmbedding(seq_len, size, embed_dim)(decoder_inputs)
    x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(size, activation="softmax")(x)
    decoder = Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
    
    return transformer

# Model Training

In [14]:
# TRAINING MODEL

epochs = 30  # This should be at least 30 for convergence

transformer = create_model(embed_dim, latent_dim, num_heads)
transformer.summary()
transformer.compile(
    optimizer=Adam(), loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(ds, epochs=epochs, validation_data=val_ds)
transformer.save_weights("weights")

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 positional_embedding (Position  (None, None, 256)   3847680     ['encoder_inputs[0][0]']         
 alEmbedding)                                                                                     
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 transformer_encoder (Transform  (None, None, 256)   17878528    ['positional_embedding[

In [15]:
ga_vocab = ga_vec.get_vocabulary()
ga_index_lookup = dict(zip(range(len(ga_vocab)), ga_vocab))
max_decoded_sentence_length = seq_len


def decode_sequence(model, input_sentence):
    tokenized_input_sentence = en_vec([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ga_vec([decoded_sentence])[:, :-1]
        predictions = model([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ga_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token

        if sampled_token == "[end]":
            break
    return decoded_sentence


# Load pre-saved weights to model

In [20]:
new_model = create_model(embed_dim, latent_dim, num_heads)
new_model.load_weights("/kaggle/input/seq2seqweights/weights")
new_model

<keras.engine.functional.Functional at 0x7d4a8c332d50>

In [16]:
for _ in range(5):
    try:
        inp = " ".join(nltk.word_tokenize(random.choice(pairs["English"])))
        translated = decode_sequence(transformer, inp)
        print(inp, "\n", "--> ", translated[8:], "\n")
    except:
        continue

from the Chamber 
 -->  na gceisteanna                             

The President shall forward the preliminary draft estimates to the committee responsible , which shall draw up the draft estimates and report to Parliament . 
 -->  an tuachtarán an rún as na meastacháin ar aghaidh chuig an gcoiste freagrach a tharraingt suas ar an dréacht den chlár oibre       i  

Between two and eight Members chosen by lot shall count the votes cast in a secret ballot , unless an electronic vote is taken . 
 -->  le bhunú idir na feisirí a tharraingt siar go sealadach ar a chaitear agus beidh an dara ballóid rúnda a chomhaireamh  sa  i don don don sa  

Each Member of the Commission shall make sure that there is a regular and direct flow of information between the Member of the Commission and the chair of the relevant parliamentary committee . 
 -->  le linn an choimisiúin agus na caestóirí a bheidh a bheidh a bheidh a bheidh a chur ar an choimisiúin a chur ar aghaidh chuig an gcoimisiún  i a 

It 

In [20]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

sample = val.sample(30)
bleu = []
for en, ga in zip(sample["English"], sample["Gaeilge"]):
    if len(en) < 60:
        try:
            en_tk = " ".join(nltk.word_tokenize(en))
            ga_tk = nltk.word_tokenize(ga.lower())

            smoothie = SmoothingFunction().method7
            translation = decode_sequence(transformer, en_tk)[8:]
            score = sentence_bleu([ga_tk], translation.split(), smoothing_function=smoothie)

            print(en_tk, "\n", " ".join(ga_tk), "\n", translation, "\n", score, "\n")
            bleu.append(score)
        except:
            continue

np.mean(bleu)

Committee on Culture and Education 
 an coiste um chultúr agus um oideachas 
 coiste um chultúr agus um oideachas  sa          sa    um um        
 0.6089767658837411 

Requirements for the drafting of legislative acts 
 ceanglais maidir le dréachtú gníomhartha reachtacha 
 maidir le dréachtú gníomhartha reachtacha  reachtacha   reachtacha           reachtacha reachtacha reachtacha      i  
 0.4326725975388602 

Rule 68 ( 2 ) , ( 4 ) , ( 5 ) , ( 7 ) and ( 8 ) shall apply . 
 beidh feidhm ag riail 68 ( 2 ) , ( 4 ) , ( 5 ) , ( 7 ) agus ( 8 ) . 
 feidhm ag parlaimint na nÓsanna imeachta seo i gcomhréir le rialacha nós imeachta seo    i          don i  
 0.15732700150487738 

Provisional Chair 
 an cathaoirleach sealadach 
 cathaoirleach sealadach                             
 0.29224668520773955 

Formation of political groups 
 bunú na ngrúpaí polaitiúla 
 na ngrúpaí polaitiúla                            
 0.5410492696833055 

Duties of the Bureau 
 dualgais an bhiúró 
 an bhiúró        

0.29732299566747333