In [None]:
# !pip install contractions

import re
import nltk
import random
import pickle
import langid
import contractions
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow import keras
from keras import layers
from keras.callbacks import ModelCheckpoint
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

In [2]:
paracrawl = pd.read_csv("data/paracrawl.txt", encoding='utf-8', on_bad_lines='skip', sep='\t', names=['English', 'Gaeilge'], dtype=str)
paracrawl.head(5)

Unnamed: 0,English,Gaeilge
8234,"Should the offence be repeated, the President ...",Beidh díospóireacht ghinearálta ann i ndiaidh ...
41449,The Bureau shall adopt the guidelines for the ...,Glacfaidh an Biúró treoirlínte do na Caestóirí...
41154,PUBLIC RECORD OF PROCEEDINGS,TAIFEAD POIBLÍ AR NA hIMEACHTAÍ
81314,In order to ensure that the conditions for rec...,Chun a áirithiú go gcomhlíonfar na coinníollac...
53640,The entry summary declaration shall be lodged ...,Ní mór an dearbhú iontrála achomair a thaiscea...
...,...,...
52514,Provision of information,Faisnéis a chur ar fáil
148007,Help me move this weekend.,Cuidigh liom bogadh an deireadh seachtaine seo.
74336,RULES CONCERNING MARKETING AND PRODUCER ORGANI...,RIALACHA MAIDIR LE MARGAÍOCHT AGUS EAGRAÍOCHTA...
15705,"the President of Parliament,",- Uachtarán na Parlaiminte


## Cleaning

https://github.com/snguyenthanh/better_profanity/blob/master/better_profanity/profanity_wordlist.txt

In [None]:
profanity = open("profanity.txt").read().splitlines()

def filtering(en, ga):
    en, ga = str(en), str(ga)
    i = len(en)
    j = len(ga)
    
    # length matching as per the Speechmatics Parallel Corpus Filtering System for WMT18
    if len(en) < 100:
        if 6 * i > j and i < 6 * j:
            if i < 3 or j < 3 or (i < 2.2 * j and j < 2.2 * i):
                if i < 10 or j < 10 or (i < 2 * j and j < 2 * i):

                    # removing profanity 
                    if not any([word in en for word in profanity]):

                            # is it Irish?
                            if langid.classify(ga)[0] == 'ga':

                                return True
    return False

In [1]:
pairs = [[en, ga] for en, ga in zip(paracrawl["English"], paracrawl["Gaeilge"]) if filtering(en, ga)]

SyntaxError: did you forget parentheses around the comprehension target? (3172227818.py, line 1)

In [11]:
def preproc_en(text):
    x = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-záéíóú ])|(\w+:\/\/\S+)|^rt|http.+?", "", str(text))
    return " ".join(nltk.word_tokenize(contractions.fix(x.lower())))

def preproc_ga(text):
    x = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-záéíóú ])|(\w+:\/\/\S+)|^rt|http.+?", "", str(text))
    return "[start] " + " ".join(nltk.word_tokenize(x.lower())) + " [end]"

pairs["English"] = pairs["English"].apply(preproc_en)
pairs["Gaeilge"] = pairs["Gaeilge"].apply(preproc_ga)

pairs.head(5)

Unnamed: 0,English,Gaeilge
2429244,if you are selected as a member of the rt audi...,[start] má roghnaítear tú ar bhall de chomhair...
1557228,with the announcement today of government depa...,[start] bhí súil ag an gcomhdháil agus na rann...
1104445,you can buypurchase stth112 right here right now,[start] is féidir leat a cheannach a cheannach...
2314715,tim has three sons and his wife karla of 21 years,[start] tá trí mhac agus a bhean chéile karla ...
1140669,room service goradias lords inn shirdi shirdi ...,[start] goradias lords inn shirdi shirdi bia d...
...,...,...
2292948,corporate authors directorategeneral for clima...,[start] dar corparáideach an coimisiún eorpach...
2734072,construction world cwindia 7h7 hours ago more ...,[start] construction world cwindia 7 u7 nuaire...
2521924,this could be when you have a family crisis wh...,[start] dfhéadfadh géarchéim a bheith ann i do...
1327842,ag péinteáil an tí is a picture book in the co...,[start] pictiúrleabhar is ea fiacail mháire sa...


In [3]:
train = pairs.sample(frac=0.8)
extra = pairs.drop(train.index)
val = extra.sample(frac=0.75)
ver = extra.drop(val.index)

print(f"{len(pairs)} total pairs")
print(f"{len(train)} training pairs")
print(f"{len(val)} validation pairs")
print(f"{len(ver)} verification pairs")

3092956 total pairs
2474365 training pairs
463943 validation pairs
154648 verification pairs


Vectorise text and pickle output for later use

In [None]:
size = 15000
seq_len = 50
batch = 64

en_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len)
ga_vec = layers.TextVectorization(max_tokens=size, output_mode="int", output_sequence_length=seq_len+1)

en_vec.adapt(pairs["English"])
ga_vec.adapt(pairs["Gaeilge"])

pickle.dump({'config': en_vec.get_config(),
             'weights': en_vec.get_weights()}
            , open("en_vec_filtered.pkl", "wb"))

pickle.dump({'config': ga_vec.get_config(),
             'weights': ga_vec.get_weights()}
            , open("ga_vec_filtered.pkl", "wb"))

Load pickled configs and weights onto new TextVectorization layers

In [4]:
size = 15000
seq_len = 50
batch = 64

en_pkl = pickle.load(open("/kaggle/input/para-weights/en_vec.pkl", "rb"))
ga_pkl = pickle.load(open("/kaggle/input/para-weights/ga_vec.pkl", "rb"))

en_vec = layers.TextVectorization.from_config(en_pkl['config'])
ga_vec = layers.TextVectorization.from_config(ga_pkl['config'])

en_vec.set_weights(en_pkl['weights'])
ga_vec.set_weights(ga_pkl['weights'])

In [5]:
def format_dataset(en, ga):
    en = en_vec(en)
    ga = ga_vec(ga)
    return ({"encoder_inputs": en, "decoder_inputs": ga[:, :-1],}, ga[:, 1:])

def make_dataset(en, ga):
    dataset = tf.data.Dataset.from_tensor_slices((en, ga))
    dataset = dataset.batch(batch)
    dataset = dataset.map(format_dataset)
    return dataset.shuffle(2048).prefetch(16).cache()

ds = make_dataset(train["English"], train["Gaeilge"])
val_ds = make_dataset(val["English"], val["Gaeilge"])

This sequence-to-sequence (Seq2Seq) Transformer comprises of a TransformerEncoder and a TransformerDecoder that are connected in a chain. The model also includes a PositionalEmbedding layer to incorporate information about the order of words.

The TransformerEncoder takes the original sequence as input and generates a new representation of the sequence. The TransformerDecoder then takes this modified representation and the current target sequence, which includes words from 0 to N. The objective of the TransformerDecoder is to predict the next words in the target sequence, up to N+1.

To accomplish this, the TransformerDecoder applies causal masking. This is crucial because it ensures that the model only uses data from target tokens 0 to N while predicting token N+1. Causal masking is necessary because the TransformerDecoder sees the entire sequence at once and must be prevented from using information from future tokens when making predictions.

In [6]:
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(latent_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32")
            padding_mask = tf.minimum(padding_mask, causal_mask)

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = tf.range(sequence_length)[:, tf.newaxis]
        j = tf.range(sequence_length)
        mask = tf.cast(i >= j, dtype="int32")
        mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = tf.concat(
            [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)],
            axis=0,
        )
        return tf.tile(mask, mult)

In [7]:
embed_dim = 256
latent_dim = 2048
num_heads = 64

def create_model(embed_dim, latent_dim, num_heads):
    encoder_inputs = layers.Input(shape=(None,), dtype="int64", name="encoder_inputs")
    pos = PositionalEmbedding(seq_len, size, embed_dim)(encoder_inputs)
    encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(pos)
    encoder = Model(encoder_inputs, encoder_outputs)

    decoder_inputs = layers.Input(shape=(None,), dtype="int64", name="decoder_inputs")
    encoded_seq_inputs = layers.Input(shape=(None, embed_dim), name="decoder_state_inputs")
    x = PositionalEmbedding(seq_len, size, embed_dim)(decoder_inputs)
    x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
    x = layers.Dropout(0.5)(x)
    decoder_outputs = layers.Dense(size, activation="softmax")(x)
    decoder = Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

    decoder_outputs = decoder([decoder_inputs, encoder_outputs])
    transformer = Model([encoder_inputs, decoder_inputs], decoder_outputs, name="transformer")
    
    return transformer

# Next block trains model!

In [None]:
# TRAINING MODEL

epochs = 10

transformer = create_model(embed_dim, latent_dim, num_heads)

transformer.summary()
transformer.compile(
    optimizer=Adam(), loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

cb = ModelCheckpoint("check",
                    save_weights_only=True, 
                    save_best_only=True, 
                    verbose=1, 
                    monitor="accuracy", 
                    save_freq="epoch", 
                    mode="max" )

transformer.fit(ds, epochs=epochs, validation_data=val_ds, callbacks = [cb])

transformer.save_weights("PARAweights")

In [8]:
ga_vocab = ga_vec.get_vocabulary()
ga_index_lookup = dict(zip(range(len(ga_vocab)), ga_vocab))
max_decoded_sentence_length = seq_len


def decode_sequence(model, input_sentence):
    tokenized_input_sentence = en_vec([preproc_en(input_sentence)])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = ga_vec([decoded_sentence])[:, :-1]
        predictions = model([tokenized_input_sentence, tokenized_target_sentence])

        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = ga_index_lookup[sampled_token_index]
        
        if sampled_token == "end":
            break
            
        decoded_sentence += " " + sampled_token

    return decoded_sentence[8:]


# Run this to load pre-saved weights to model

In [9]:
new_model = create_model(embed_dim, latent_dim, num_heads)
new_model.load_weights("/kaggle/input/para-weights/para-base-check")
new_model

<keras.engine.functional.Functional at 0x7a64f2c16710>

In [20]:
for _ in range(5):
    try:
        inp = " ".join(nltk.word_tokenize(random.choice(pairs["English"])))
        translated = decode_sequence(new_model, inp)
        print(inp, "\n", "--> ", translated[8:], "\n")
    except:
        continue

county council minutes 8 october 2001 
 -->  riscí na comhairle contae fómhair 2001 

meet the 10 most affluent countries in the world rich happy and healthy 
 -->   ar na tíortha is mó san domhan is mó agus an domhain 

amendment of section 5 of the shannon fisheries act 1938 
 -->   alt 5 den acht um [UNK] 1936 

evolution of the price of the currency czech koruna compared with the currency price latvian lats 
 -->  an praghas ar an airgeadra dong i gcomparáid leis an bpraghas airgeadra lats laitvis 

house of commons parliamentary papers 
 -->   [UNK] [UNK] 



In [13]:
evaluation = pd.read_csv("/kaggle/input/nmt-data/eval.csv", usecols=[1,2])

eval_en = list(evaluation["English"])

references = [[preproc_ga(ga)[8:].split(".")[0][:-1]] for ga in list(evaluation["Gaeilge"])]

translations = [decode_sequence(new_model, en) for en in eval_en]

translations[:5]

['ag an [UNK] go dtí an [UNK]',
 'níl mé ag mé i mo mhúinteoir',
 'tá muid',
 'tá mé ag foghlaim',
 'tá mé teaghlaigh']

In [19]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

bleu = corpus_bleu(references, translations, smoothing_function=SmoothingFunction().method7)

bleu

0.4140798100529444

In [15]:
open('r-para-cleaned.txt', 'w').write("\n".join([ref[0] for ref in references]))

open('t-para-cleaned.txt', 'w').write("\n".join(translations))

30142

In [12]:
decode_sequence(new_model, "I went to the shop")

'chuaigh mé go dtí an siopa'