# Translator with Transformer

In [1]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [2]:
def open_files():
    """Read the English and Portuguese corpus files into memory.

    Both files are opened as UTF-8 text and read in full, English first.

    Returns:
        tuple: ``(data_en, data_pt)``, the complete text of each file.
    """
    corpus_paths = (
        '../../../datasets/pt-en/europarl-v7.pt-en.en',
        '../../../datasets/pt-en/europarl-v7.pt-en.pt',
    )
    raw_texts = []
    for corpus_path in corpus_paths:
        with open(corpus_path, mode='r', encoding='utf-8') as handle:
            raw_texts.append(handle.read())
    data_en, data_pt = raw_texts
    return data_en, data_pt

## Cleaning data

In [3]:
def get_clean_corpus(data):
    """Normalise raw corpus text and split it into a list of lines.

    Two clean-ups are applied before splitting on newlines:

    * a period immediately followed by an ASCII letter or digit
      (e.g. ``a.m``, ``1.5``, ``U.S``) is removed, so that only periods
      followed by whitespace or end-of-text remain;
    * runs of spaces are collapsed into a single space.

    Args:
        data: The whole corpus as one string, one sentence per line.

    Returns:
        list[str]: The cleaned lines, in their original order.
    """
    # One well-formed character class covers digits and both letter cases.
    # The previous pattern nested brackets inside the class, which made it
    # miss periods before uppercase letters.
    corpus = re.sub(r"\.(?=[0-9a-zA-Z])", "", data)
    corpus = re.sub(r" +", " ", corpus)
    return corpus.split("\n")

## Tokenizing

In [4]:
def tokenizing():
    """Build one subword tokenizer per language.

    Reads ``corpus_en`` and ``corpus_pt`` from the enclosing (module) scope;
    they are not parameters of this function.

    Returns:
        tuple: ``(tokenizer_en, tokenizer_pt)``.
    """
    build_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus
    target_size = 2**13
    tokenizer_en = build_tokenizer(corpus_en, target_vocab_size=target_size)
    tokenizer_pt = build_tokenizer(corpus_pt, target_vocab_size=target_size)
    return tokenizer_en, tokenizer_pt

In [5]:
def get_vocab_sizes():
    """Return each tokenizer's vocabulary size enlarged by two ids.

    Reads ``tokenizer_en`` and ``tokenizer_pt`` from the enclosing (module)
    scope.

    Returns:
        tuple: ``(VOCAB_SIZE_EN, VOCAB_SIZE_PT)``.
    """
    extra_ids = 2
    return (
        tokenizer_en.vocab_size + extra_ids,
        tokenizer_pt.vocab_size + extra_ids,
    )

In [6]:
def get_inputs_and_outputs():
    """Encode every sentence and wrap it in two marker ids.

    Each encoded sentence is prefixed with ``vocab_size - 2`` and suffixed
    with ``vocab_size - 1`` of its own language. The corpora, tokenizers and
    vocabulary sizes are read from the enclosing (module) scope.

    Returns:
        tuple: ``(inputs, outputs)``, lists of id lists for the English and
        Portuguese corpora respectively.
    """
    def _encode_all(sentences, tokenizer, vocab_size):
        # The two highest ids of the enlarged vocabulary act as the markers.
        first_id, last_id = vocab_size - 2, vocab_size - 1
        encoded = []
        for sentence in sentences:
            encoded.append([first_id] + tokenizer.encode(sentence) + [last_id])
        return encoded

    inputs = _encode_all(corpus_en, tokenizer_en, VOCAB_SIZE_EN)
    outputs = _encode_all(corpus_pt, tokenizer_pt, VOCAB_SIZE_PT)
    return inputs, outputs

## Removing too long sentences

In [7]:
MAX_LENGTH = 20


def remove_too_long_sentences():
    """Drop every sentence pair in which either side exceeds MAX_LENGTH ids.

    Works on the module-level parallel lists ``inputs`` and ``outputs``.
    Both lists are filtered in place (their identity is preserved) and the
    surviving pairs keep their relative order.

    Returns:
        tuple: ``(inputs, outputs)``, the same list objects after filtering.

    Raises:
        ValueError: If the two lists do not have the same length, since
            index ``k`` of one must correspond to index ``k`` of the other.
    """
    if len(inputs) != len(outputs):
        raise ValueError(
            "inputs and outputs must be parallel lists of equal length"
        )

    # Single pass instead of repeated `del lst[idx]`: each delete shifts the
    # list tail, which is quadratic on a corpus with many long sentences.
    kept_pairs = [
        (source, target)
        for source, target in zip(inputs, outputs)
        if len(source) <= MAX_LENGTH and len(target) <= MAX_LENGTH
    ]

    # Slice assignment mutates the existing lists rather than rebinding them.
    inputs[:] = [source for source, _ in kept_pairs]
    outputs[:] = [target for _, target in kept_pairs]
    return inputs, outputs

## Saving input and output data for further processing

In [8]:
import pickle

def save_data(data, name='data.file'):
    """Pickle *data* into the file called *name*.

    The file is opened in binary write mode, so existing content is replaced.

    Args:
        data: Any picklable object.
        name: Destination path; defaults to ``'data.file'``.
    """
    with open(name, 'wb') as sink:
        pickle.Pickler(sink).dump(data)
        
def load_data(name='data.file'):
    """Unpickle and return the object stored in the file called *name*.

    Args:
        name: Source path; defaults to ``'data.file'``.

    Returns:
        The object that was pickled into the file.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(name, 'rb') as source:
        return pickle.Unpickler(source).load()

In [9]:
def process_and_save():
    """Run the whole preprocessing pipeline and pickle its result.

    Steps: read both corpus files, clean them, build the tokenizers, derive
    the vocabulary sizes, encode every sentence, drop over-long pairs, then
    write the encoded sentences to ``inputs.dat`` and ``outputs.dat``.

    Side effects:
        Binds ``corpus_en``, ``corpus_pt``, ``tokenizer_en``,
        ``tokenizer_pt``, ``VOCAB_SIZE_EN``, ``VOCAB_SIZE_PT``, ``inputs``
        and ``outputs`` at module level, and writes the two pickle files.
    """
    # The helpers called below take no arguments and look these names up in
    # module scope. Binding them as function locals would leave the helpers
    # with a NameError (or stale notebook state), so publish them as globals.
    global corpus_en, corpus_pt, tokenizer_en, tokenizer_pt
    global VOCAB_SIZE_EN, VOCAB_SIZE_PT, inputs, outputs

    data_en, data_pt = open_files()
    corpus_en = get_clean_corpus(data_en)
    corpus_pt = get_clean_corpus(data_pt)
    tokenizer_en, tokenizer_pt = tokenizing()
    VOCAB_SIZE_EN, VOCAB_SIZE_PT = get_vocab_sizes()
    inputs, outputs = get_inputs_and_outputs()
    inputs, outputs = remove_too_long_sentences()
    save_data(inputs, 'inputs.dat')
    save_data(outputs, 'outputs.dat')

In [10]:
# Manual switch: set to True to rebuild 'inputs.dat' / 'outputs.dat' by
# running the full preprocessing pipeline; with False the pipeline is skipped.
is_not_saved = False
if is_not_saved:
    process_and_save()

In [11]:
# Reload the encoded sentence lists from the pickle files and report how
# many entries each one holds.
rec_inputs = load_data('inputs.dat')
rec_outputs = load_data('outputs.dat')
print('recovered sizes (inputs) (oututps) ', len(rec_inputs), len(rec_outputs))

recovered sizes (inputs) (oututps)  396085 396085


## Padding

In [12]:
# Pad each encoded sentence at the end ('post') with zeros to 20 ids,
# the same value as MAX_LENGTH used by the length filter.
pad_to_fixed_width = tf.keras.preprocessing.sequence.pad_sequences

inputs = pad_to_fixed_width(rec_inputs, maxlen=20, padding='post', value=0)
outputs = pad_to_fixed_width(rec_outputs, maxlen=20, padding='post', value=0)

## Build Dataset

In [13]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Pipeline order: slice into (input, output) pairs -> cache -> shuffle with a
# BUFFER_SIZE buffer -> group into BATCH_SIZE batches -> prefetch.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## Build Model

### Embedding

### Positional encoding formulae:

### $PE_{(pos,2i)} = \sin(pos/10000^{2i/d_{model}})$ (even dimensions)

### $PE_{(pos,2i+1)} = \cos(pos/10000^{2i/d_{model}})$ (odd dimensions)

In [14]:
class PositionalEncoding(layers.Layer):
    """Layer that adds a fixed sinusoidal positional encoding to its input.

    For position ``pos`` and dimension pair ``(2k, 2k + 1)`` the encoding is
    ``sin(pos / 10000**(2k / d_model))`` on the even dimension and
    ``cos(pos / 10000**(2k / d_model))`` on the odd one.
    """

    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, pos, i, d_model):
        """Return the sinusoid arguments ``pos / 10000**(2*(i//2) / d_model)``.

        Args:
            pos: Array of positions; ``call`` passes a column vector.
            i: Array of integer dimension indices; ``call`` passes a row
                vector.
            d_model: Size of the last (feature) axis.

        Returns:
            The broadcast product of ``pos`` and the per-dimension rates.
        """
        # Floor division maps dimensions 2k and 2k+1 onto the same exponent
        # 2k/d_model, so each sin/cos pair shares one frequency. True
        # division here would reduce to i/d_model and break that pairing.
        angles = 1 / np.power(
            10000.,
            (2 * (i // 2)) / np.float32(d_model)
        )
        return pos * angles

    def call(self, inputs):
        """Return ``inputs`` plus the positional encoding.

        The sequence length and model width are read from the static shape
        of ``inputs`` (its last two axes), so both must be known.
        """
        inputs_shape_list = inputs.shape.as_list()
        seq_length = inputs_shape_list[-2]
        d_model = inputs_shape_list[-1]
        pos = np.arange(seq_length)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]
        angles = self.get_angles(pos, i, d_model)
        # Even columns take the sine, odd columns the cosine.
        angles[:, 0::2] = np.sin(angles[:, 0::2])
        angles[:, 1::2] = np.cos(angles[:, 1::2])
        # Leading axis of size 1 lets the encoding broadcast over the batch.
        pos_encoding = angles[np.newaxis, ...]
        return inputs + tf.cast(pos_encoding, tf.float32)
            

### Attention

In [15]:
def scaled_dot_product_attention(queries, keys, values, mask):
    """Compute ``softmax(Q K^T / sqrt(d_k)) V``.

    Args:
        queries: Query tensor.
        keys: Key tensor; its last axis size is used as ``d_k``.
        values: Value tensor.
        mask: Optional tensor added to the scores as ``mask * -1e9`` before
            the softmax, or ``None`` for no masking. Presumably 1 marks
            positions to hide; confirm against the mask builder.

    Returns:
        The attention-weighted combination of ``values``.
    """
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dimension = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_product = product / tf.math.sqrt(keys_dimension)
    if mask is not None:
        # A large negative score becomes ~0 probability after the softmax.
        scaled_product += (mask * -1e9)
    # Normalise over the last axis (the key positions) so that each query's
    # weights sum to 1; any other axis would mix heads or queries.
    attention_weights = tf.nn.softmax(scaled_product, axis=-1)
    attention = tf.matmul(attention_weights, values)
    return attention

In [16]:
class MultiHeadAttention(layers.Layer):
    """Multi-head attention built on ``scaled_dot_product_attention``.

    Queries, keys and values are each passed through a dense layer, split
    into ``number_of_projection`` heads, attended per head, merged back and
    passed through a final dense layer.

    Args:
        number_of_projection: Number of attention heads; must divide the
            size of the last input axis.
    """

    def __init__(self, number_of_projection):
        super(MultiHeadAttention, self).__init__()
        self.number_of_projection = number_of_projection

    # Keras calls this once, before the first `call`.
    def build(self, input_shape):
        """Create the dense layers once the model width is known."""
        self.d_model = input_shape[-1]
        # The model width must split evenly across the heads.
        assert self.d_model % self.number_of_projection == 0, (
            "d_model must be divisible by number_of_projection"
        )
        # Integer division: width of a single head.
        self.d_proj = self.d_model // self.number_of_projection
        self.query_linear_func = layers.Dense(units=self.d_model)
        self.keys_linear_func = layers.Dense(units=self.d_model)
        self.value_linear_func = layers.Dense(units=self.d_model)
        self.final_linear_func = layers.Dense(units=self.d_model)

    def split_projections(self, values, batch_size):
        """Split the last axis of ``values`` into separate heads.

        Args:
            values: Tensor whose last axis has size ``d_model``.
            batch_size: Size of the leading (batch) axis.

        Returns:
            Tensor reshaped to
            ``(batch_size, number_of_projection, seq_length, d_proj)``.
        """
        shape = (batch_size,
                 -1,
                 self.number_of_projection,
                 self.d_proj)
        # Reshape the argument itself: (batch, seq_length, heads, d_proj).
        splited_inputs = tf.reshape(values, shape=shape)
        # Swap the sequence and head axes: (batch, heads, seq_length, d_proj).
        splited_inputs = tf.transpose(splited_inputs, perm=[0, 2, 1, 3])
        return splited_inputs

    def call(self, queries, keys, values, mask):
        """Apply multi-head attention and return the combined result.

        Args:
            queries: Query tensor.
            keys: Key tensor.
            values: Value tensor.
            mask: Mask forwarded to ``scaled_dot_product_attention``, or
                ``None``.

        Returns:
            Tensor whose last axis has size ``d_model``.
        """
        # Only the leading dimension is the batch size; the full shape tensor
        # cannot be used as a single reshape dimension.
        batch_size = tf.shape(queries)[0]
        # Linear projections.
        queries = self.query_linear_func(queries)
        keys = self.keys_linear_func(keys)
        values = self.value_linear_func(values)
        # Split each projection into heads.
        queries = self.split_projections(queries, batch_size)
        keys = self.split_projections(keys, batch_size)
        values = self.split_projections(values, batch_size)
        attention = scaled_dot_product_attention(queries, keys, values, mask)
        # Back to (batch, seq_length, heads, d_proj) before merging heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, shape=(
            batch_size,
            -1,
            self.d_model
        ))
        outputs = self.final_linear_func(concat_attention)
        return outputs