# Translator with Tranformer

In [1]:
import numpy as np
import math
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [2]:
with open(
    '../../../datasets/pt-en/europarl-v7.pt-en.en',
    mode = 'r',
    encoding = 'utf-8' 
) as file:
    data_en = file.read()

with open(
    '../../../datasets/pt-en/europarl-v7.pt-en.pt',
    mode = 'r',
    encoding = 'utf-8' 
) as file:
    data_pt = file.read()

## Cleaning data

In [4]:
def get_clean_corpus(data):
    corpus = data
    corpus = re.sub(r"\.(?=[0-9|[a-z]|[A-Z]])", ".###", corpus)
    corpus = re.sub(r"\.###","", corpus)
    corpus = re.sub(r" +", " ", corpus )
    corpus = corpus.split("\n")
    return corpus

In [5]:
corpus_en = get_clean_corpus(data_en)
corpus_pt = get_clean_corpus(data_pt)

## Tokenizing

In [6]:
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_en, target_vocab_size=2**13)
tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_pt, target_vocab_size=2**13)

In [7]:
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2
VOCAB_SIZE_PT = tokenizer_pt.vocab_size + 2

In [10]:
inputs = [
    [VOCAB_SIZE_EN - 2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN - 1]
    for sentence in corpus_en
         ]

outputs = [
    [VOCAB_SIZE_PT - 2] + tokenizer_pt.encode(sentence) + [VOCAB_SIZE_PT - 1]
    for sentence in corpus_pt
         ]

## Revmoving too long sentences

In [12]:
MAX_LENGTH = 20
idx_to_remove = [
    count for count, sentence in enumerate(inputs)
    if len(sentence) > MAX_LENGTH
]

for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

idx_to_remove = [
    count for count, sentence in enumerate(outputs)
    if len(sentence) > MAX_LENGTH
]

for idx in reversed(idx_to_remove):
    del inputs[idx]
    del outputs[idx]

## Saving input and output data for futher process

In [15]:
import pickle

def save_data (data, name = 'data.file'):
    with open(name, 'wb') as data_file:
        pickle.dump(data, data_file)
        
def load_data(name = 'data.file'):
    with open(name, 'rb') as data_file:
        data = pickle.load(data_file)
    return data

In [17]:
print('original sizes (inputs) (oututps) ', len(inputs), len(outputs))
save_data(inputs, 'inputs.dat')
save_data(outputs, 'outputs.dat')
rec_inputs = load_data('inputs.dat')
rec_outputs = load_data('outputs.dat')
print('recovered sizes (inputs) (oututps) ', len(rec_inputs), len(rec_outputs))

original sizes (inputs) (oututps)  396085 396085
recovered sizes (inputs) (oututps)  396085 396085
