# DOWNLOADING DATA

In [None]:
# !kaggle datasets download -d devicharith/language-translation-englishfrench

In [None]:
# !unzip language-translation-englishfrench.zip -d language-translation-englishfrench

# IMPORTS

In [64]:
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers

# DATA EXPROLATION

In [65]:
english_french_data = pd.read_csv(
    "language-translation-englishfrench/eng_-french.csv", encoding="utf-8"
)
print(f"Number of sentences: {len(english_french_data)}")
english_sentence_words = english_french_data["English words/sentences"].apply(
    lambda x: len(str(x).split())
)
french_avg_sentence_words = english_french_data["French words/sentences"].apply(
    lambda x: len(str(x).split())
)
print(f"Min words in english sentences: {english_sentence_words.min()}")
print(f"Min words in french sentences: {french_avg_sentence_words.min()}")
print(f"Max words in english sentences: {english_sentence_words.max()}")
print(f"Max words in french sentences: {french_avg_sentence_words.max()}")
print(f"Average words in english sentences: {english_sentence_words.mean()}")
print(f"Average words in french sentences: {french_avg_sentence_words.mean()}")
english_french_data.head()

Number of sentences: 175621
Min words in english sentences: 1
Min words in french sentences: 1
Max words in english sentences: 44
Max words in french sentences: 55
Average words in english sentences: 6.161552433934438
Average words in french sentences: 6.706669475746067


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [66]:
english_french_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [67]:
# shuffle the dataset as it was grouped from shortest to longest
english_french_data = english_french_data.sample(frac=1).reset_index(drop=True)

# Data pipeline

In [None]:
sentences_en = english_french_data["English words/sentences"].to_numpy()
sentences_fr = english_french_data["French words/sentences"].to_numpy()
valid_len = int(0.1 * len(english_french_data))

sentences_en_train = sentences_en[:-valid_len]
sentences_fr_train = sentences_fr[:-valid_len]

sentences_en_valid = sentences_en[-valid_len:]
sentences_fr_valid = sentences_fr[-valid_len:]

In [69]:
def prepare_input_and_target(sentences_en, sentences_fr):
    return (sentences_en, b"<SOS> " + sentences_fr), sentences_fr + b" <EOS>"


def from_sentences_dataset(
    sentences_en,
    sentences_fr,
    batch_size,
):
    dataset = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_fr))
    dataset = dataset.map(prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(batch_size)

In [74]:
train_dataset = from_sentences_dataset(sentences_en_train, sentences_fr_train, batch_size=128)
validation_dataset = from_sentences_dataset(sentences_en_valid, sentences_fr_valid,  batch_size=128)

In [None]:
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    epochs,
    init_lr,
):
    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0]
        )
    )
    model.vectorization_fr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b" <EOS>"
        )
    )

    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target))
    ).prefetch(tf.data.AUTOTUNE)

    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target))
    ).prefetch(tf.data.AUTOTUNE)

    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=5,
        decay_rate=0.1,
    )

    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr),
        metrics=["accuracy"],
    )

    return model.fit(
        train_dataset_prepared,
        epochs=epochs,
        validation_data=valid_dataset_prepared,
    )

gpu doesnt work for tf in this docker, if training is rlly slow, look into it

# Model

In [None]:
class PositionalEncoding(layers.Layer):
    def __init__(self, max_length, embedding_dim):
        super().__init__()
        if embedding_dim % 2 != 0:
            raise ValueError("Embedding dimension must be an even number")

        positions = np.arange(max_length)
        dimensions = np.arange(embedding_dim // 2)
        angles = positions[:, np.newaxis] / (10000 ** (2 * dimensions / embedding_dim))
        pos_encoding = np.zeros((max_length, embedding_dim))
        pos_encoding[:, 0::2] = np.sin(angles)
        pos_encoding[:, 1::2] = np.cos(angles)
        self.positional_encoding = tf.constant(pos_encoding[np.newaxis, :, :], dtype=tf.float32)
    
    def call(self, inputs):
        sequence_length = tf.shape(inputs)[1]
        return inputs + self.positional_encoding[:, :sequence_length, :]

In [None]:
class Encoder(layers.Layer):
    def __init__(
        self,
        embed_size,
        attention_heads,
        dense_dim,
        dropout_rate,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.multi_head_attention = layers.MultiHeadAttention(
            attention_heads, embed_size, dropout=dropout_rate
        )
        self.feed_forward = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_size),
                layers.Dropout(dropout_rate),
            ]
        )
        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()

    def call(self, x, mask=None):
        skip_x= x
        x = self.multi_head_attention(x, value=x, attention_mask=mask)
        x = self.normalization(self.add([x, skip_x]))
        skip_x = x
        x = self.feed_forward(x)
        return self.normalization(self.add([x, skip_x]))


class Decoder(layers.Layer):
    def __init__(
        self,
        embed_size,
        attention_heads,
        dense_dim,
        dropout_rate,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.masked_multi_head_attention = layers.MultiHeadAttention(
            attention_heads, embed_size, dropout=dropout_rate
        )
        self.cross_attention = layers.MultiHeadAttention(
            attention_heads, embed_size, dropout=dropout_rate
        )
        self.feed_forward = keras.Sequential(
            [
                layers.Dense(
                    dense_dim, activation="relu"),
                layers.Dense(embed_size),
                layers.Dropout(dropout_rate),
            ]
        )
        self.add = layers.Add()
        self.normalization = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        decoder_mask, encoder_mask = mask
        x, encoder_output = inputs
        x_skip = x
        x = self.masked_multi_head_attention(x, value=x, attention_mask=decoder_mask)
        x = self.normalization(self.add([x, x_skip]))
        x_skip = x
        x = self.cross_attention(
            x, value=encoder_output, attention_mask=encoder_mask
        )
        x = self.normalization(self.add([x, x_skip]))
        x_skip = x
        x = self.feed_forward(x)
        return self.normalization(self.add([x, x_skip]))

In [None]:
class Transformer(keras.Model):
    def __init__(
        self,
        vocab_size=5000,
        max_seq_len=50,
        embed_size=256,
        encoder_decoder=1,
        attention_heads=8,
        dense_dim=256,
        dropout_rate=0.2,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_seq_len = max_seq_len

        self.vectorization_en = layers.TextVectorization(
            vocab_size, output_sequence_length=max_seq_len
        )
        self.vectorization_fr = layers.TextVectorization(
            vocab_size, output_sequence_length=max_seq_len
        )
        self.encoder_embedding = layers.Embedding(
            vocab_size, embed_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocab_size, embed_size, mask_zero=True
        )
        self.positional_encoding = PositionalEncoding(max_seq_len, embed_size)
        self.encoder_blocks = [
            Encoder(embed_size, attention_heads, dense_dim, dropout_rate)
            for _ in range(encoder_decoder)
        ]
        self.decoder_blocks = [
            Decoder(embed_size, attention_heads, dense_dim, dropout_rate)
            for _ in range(encoder_decoder)
        ]
        self.output_layer = layers.Dense(vocab_size, activation="softmax")

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs

        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_fr(decoder_inputs)
        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)
        encoder_pos_embeddings = self.positional_encoding(encoder_embeddings)
        decoder_pos_embeddings = self.positional_encoding(decoder_embeddings)

        encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]
        decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]
        batch_max_len_decoder = tf.shape(decoder_embeddings)[1]
        decoder_causal_mask = tf.linalg.band_part(
            tf.ones((batch_max_len_decoder, batch_max_len_decoder), tf.bool), -1, 0
        )
        decoder_mask = decoder_causal_mask & decoder_pad_mask

        x = encoder_pos_embeddings
        for encoder_block in self.encoder_blocks:
            x = encoder_block(x, mask=encoder_pad_mask)

        encoder_output = x
        x = decoder_pos_embeddings
        for decoder_block in self.decoder_blocks:
            x = decoder_block(
                [x, encoder_output], mask=[decoder_mask, encoder_pad_mask]
            )

        return self.output_layer(x)

In [81]:
transformer = Transformer()
transformer_history = adapt_compile_and_fit(
    transformer, train_dataset, validation_dataset, 10, 1e-3
)

Epoch 1/10




[1m1235/1235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m607s[0m 490ms/step - accuracy: 0.8443 - loss: 3.7114 - val_accuracy: 0.8500 - val_loss: 3.6558
Epoch 2/10
[1m 948/1235[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m2:14[0m 467ms/step - accuracy: 0.8495 - loss: 3.6584

KeyboardInterrupt: 