Load the dataset. Each line pairs an English phrase with its Spanish translation, so swap the order of each pair to get Spanish inputs and English targets. Also wrap every target sequence in `\t` and `\n`, which serve as the start and stop tokens.

In [1]:
# Markers wrapped around every target sequence so the decoder knows where a
# translation begins and ends.
start_token = "\t"
stop_token = "\n"

# Only pairs whose Spanish phrase is shorter than this many characters are kept.
max_es_len = 45

# Each non-blank line is "<English>\t<Spanish>". Only the first two
# tab-separated fields are read, so any extra trailing columns are ignored
# instead of breaking the unpacking below.
sentence_pairs = []
with open("data/spa.txt", "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        en, es = line.split("\t")[:2]
        sentence_pairs.append((en, es))

# Swap to (Spanish input, English target) and add the start/stop tokens.
samples = [(es, start_token + en + stop_token)
           for en, es in sentence_pairs if len(es) < max_es_len]

In [2]:
len(samples)

99423

In [3]:
print(samples[:5])

[('Ve.', '\tGo.\n'), ('Vete.', '\tGo.\n'), ('Vaya.', '\tGo.\n'), ('Váyase.', '\tGo.\n'), ('Hola.', '\tHi.\n')]


Split data into train and validation sets.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for validation; the fixed random_state keeps the
# split (and therefore the training vocabulary below) reproducible.
train_samples, valid_samples = train_test_split(samples, train_size=.8, random_state=42)

In [5]:
len(train_samples)

79538

In [6]:
len(valid_samples)

19885

Determine the training vocabulary. Those are the only tokens you can trust the model will know how to handle. 

In [7]:
# Character-level vocabularies, collected from the training split only.
in_vocab = {ch for in_text, _ in train_samples for ch in in_text}
out_vocab = {ch for _, out_text in train_samples for ch in out_text}

in_vocab_size, out_vocab_size = len(in_vocab), len(out_vocab)
print("Input vocab size:", in_vocab_size)
print("Output vocab size:", out_vocab_size)

Input vocab size: 101
Output vocab size: 87


In [8]:
print(sorted(in_vocab))

[' ', '!', '"', '$', '%', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '°', 'º', '»', '¿', 'Á', 'É', 'Ó', 'Ú', 'á', 'è', 'é', 'í', 'ñ', 'ó', 'ö', 'ú', 'ü', 'ś', 'с', '—', '€']


In [9]:
print(sorted(out_vocab))

['\t', '\n', ' ', '!', '"', '$', '%', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'á', 'ã', 'è', 'é', 'ö', '‘', '’', '₂', '€']


Go through the validation set and remove any tokens (characters) that are not present in the training vocabulary.

In [10]:
# Rebuild every validation pair keeping only characters that exist in the
# corresponding training vocabulary, so encoding never hits an unknown token.
valid_samples = [
    ("".join(ch for ch in in_text if ch in in_vocab),
     "".join(ch for ch in out_text if ch in out_vocab))
    for in_text, out_text in valid_samples
]

Build sequence-to-sequence model.

In [11]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, LSTM, Masking

2022-01-02 16:24:27.277502: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-02 16:24:27.277519: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [12]:
# Size of the LSTM hidden/cell state shared by encoder and decoder.
latent_dim = 256

# Encoder: variable-length sequence of one-hot input characters.
encoder_in = Input(shape=(None, in_vocab_size), name="encoder_in")
# Batches are zero-padded to the longest sequence (see encode_batch); the
# Masking layer is presumably here so those all-zero steps are skipped.
encoder_mask = Masking(name="encoder_mask")(encoder_in)
encoder_lstm = LSTM(latent_dim, return_state=True, recurrent_dropout=0.3, name="encoder_lstm")
# Only the final hidden and cell states are kept; they seed the decoder.
_, encoder_h, encoder_c = encoder_lstm(encoder_mask)

2022-01-02 16:24:28.034158: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-01-02 16:24:28.034172: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-01-02 16:24:28.034185: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (guest): /proc/driver/nvidia/version does not exist
2022-01-02 16:24:28.034293: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# Decoder: variable-length sequence of one-hot target characters
# (teacher forcing during training).
decoder_in = Input(shape=(None, out_vocab_size), name="decoder_in")

decoder_mask = Masking(name="decoder_mask")(decoder_in)
# return_sequences=True yields one output per time step; return_state=True
# exposes the final states, which the inference decoder reuses later.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                    dropout=0.2, recurrent_dropout=0.3, name="decoder_lstm")
# Start decoding from the encoder's final hidden and cell states.
decoder_lstm_out, _, _ = decoder_lstm(decoder_mask, initial_state=[encoder_h, encoder_c])
# Per-step probability distribution over the output vocabulary.
decoder_dense = Dense(out_vocab_size, activation="softmax", name="decoder_out")
decoder_out = decoder_dense(decoder_lstm_out)

In [14]:
# Training model: (encoder input, decoder input) -> per-step next-character
# probabilities. Targets are one-hot, hence categorical cross-entropy.
seq2seq_model = Model([encoder_in, decoder_in], decoder_out)
seq2seq_model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [15]:
seq2seq_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_in (InputLayer)        [(None, None, 101)]  0           []                               
                                                                                                  
 decoder_in (InputLayer)        [(None, None, 87)]   0           []                               
                                                                                                  
 encoder_mask (Masking)         (None, None, 101)    0           ['encoder_in[0][0]']             
                                                                                                  
 decoder_mask (Masking)         (None, None, 87)     0           ['decoder_in[0][0]']             
                                                                                              

Create maps to convert characters to and from ints. 

In [16]:
# Sort the vocabularies so each character always gets the same index.
in_tokens, out_tokens = sorted(in_vocab), sorted(out_vocab)

in_token2int = dict(zip(in_tokens, range(len(in_tokens))))
out_token2int = dict(zip(out_tokens, range(len(out_tokens))))
# Reverse lookup used to turn predicted indices back into characters.
out_int2token = dict(enumerate(out_tokens))

Create helper functions for one-hot encoding sequences for use with the model.

In [17]:
import numpy as np

def make_batch_storage(batch_size, in_seq_len, out_seq_len):
    """Allocate the zero-filled one-hot buffers for one batch.

    Args:
        batch_size: Number of samples in the batch.
        in_seq_len: Number of time steps in the encoder input.
        out_seq_len: Number of time steps in the decoder input and target.

    Returns:
        A tuple ``(enc_in_seqs, dec_in_seqs, dec_out_seqs)`` of float32 arrays
        shaped ``(batch_size, in_seq_len, in_vocab_size)`` for the encoder and
        ``(batch_size, out_seq_len, out_vocab_size)`` for both decoder arrays.
    """
    encoder_shape = (batch_size, in_seq_len, in_vocab_size)
    decoder_shape = (batch_size, out_seq_len, out_vocab_size)

    enc_in_seqs = np.zeros(encoder_shape, dtype=np.float32)
    # Decoder input and decoder target are two independent arrays of the
    # same shape.
    dec_in_seqs, dec_out_seqs = (
        np.zeros(decoder_shape, dtype=np.float32) for _ in range(2))

    return enc_in_seqs, dec_in_seqs, dec_out_seqs

In [18]:
def encode_batch(samples):
    """One-hot encode a batch of (input text, target text) pairs.

    Every sequence is padded with all-zero steps up to the longest sequence
    of its kind in the batch.

    Args:
        samples: Sequence of ``(in_seq, out_seq)`` string pairs whose
            characters are keys of ``in_token2int`` / ``out_token2int``.

    Returns:
        A tuple ``(enc_in_seqs, dec_in_seqs, dec_out_seqs)``. ``dec_out_seqs``
        is ``dec_in_seqs`` shifted one step earlier: the target at step ``t``
        is the character the decoder receives at step ``t + 1``, so the last
        step of each target stays all zeros.
    """
    in_texts = [in_text for in_text, _ in samples]
    out_texts = [out_text for _, out_text in samples]

    enc_in_seqs, dec_in_seqs, dec_out_seqs = make_batch_storage(
        len(samples), max(map(len, in_texts)), max(map(len, out_texts)))

    for row, (in_text, out_text) in enumerate(zip(in_texts, out_texts)):
        for step, token in enumerate(in_text):
            enc_in_seqs[row, step, in_token2int[token]] = 1

        for step, token in enumerate(out_text):
            token_id = out_token2int[token]
            dec_in_seqs[row, step, token_id] = 1
            # The first target character (the start token) is never a
            # prediction target; everything after it moves one step earlier.
            if step > 0:
                dec_out_seqs[row, step - 1, token_id] = 1

    return enc_in_seqs, dec_in_seqs, dec_out_seqs

Train model

In [19]:
from seq2seq_util import Seq2SeqBatchGenerator

batch_size = 64

# One generator per split, built the same way: each batch of raw text pairs
# is passed through encode_batch.
train_generator, valid_generator = (
    Seq2SeqBatchGenerator(split_samples, batch_size, encode_batch)
    for split_samples in (train_samples, valid_samples)
)

In [20]:
from keras.callbacks import EarlyStopping

# Stop once validation loss has failed to improve for 5 consecutive epochs and
# restore the best weights seen, so `epochs` only acts as an upper bound.
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# `Model.fit` consumes batch generators directly; `Model.fit_generator` is
# deprecated in TF 2.x Keras and only emits a warning before forwarding here.
seq2seq_model.fit(train_generator, epochs=500,
                  validation_data=valid_generator,
                  callbacks=[early_stopping])

  seq2seq_model.fit_generator(train_generator, epochs=500,


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500


<keras.callbacks.History at 0x7fe3d86fa670>

Create encoder/decoder models for inference

In [22]:
# Inference encoder: reuses the trained encoder layers and outputs the final
# hidden and cell states that seed the decoder.
inf_encoder = Model(encoder_in, [encoder_h, encoder_c])

In [23]:
inf_encoder.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_in (InputLayer)     [(None, None, 101)]       0         
                                                                 
 encoder_mask (Masking)      (None, None, 101)         0         
                                                                 
 encoder_lstm (LSTM)         [(None, 256),             366592    
                              (None, 256),                       
                              (None, 256)]                       
                                                                 
Total params: 366,592
Trainable params: 366,592
Non-trainable params: 0
_________________________________________________________________


In [24]:
# At inference time the decoder state is fed in explicitly, so translation can
# advance one character at a time and carry the state between calls.
inf_dec_h_in = Input(shape=(latent_dim,), name="decoder_h_in")
inf_dec_c_in = Input(shape=(latent_dim,), name="decoder_c_in")

# Reuses the trained decoder_lstm layer (same weights). Note that it is called
# on decoder_in directly, bypassing decoder_mask.
inf_dec_lstm_out, inf_dec_h_out, inf_dec_c_out = decoder_lstm(
    decoder_in, initial_state=[inf_dec_h_in, inf_dec_c_in])

# Same trained softmax layer as the training model.
inf_dec_out = decoder_dense(inf_dec_lstm_out)

# Inputs: (one-hot character(s), h, c) -> outputs: (probabilities, new h, new c).
inf_decoder = Model(
    [decoder_in, inf_dec_h_in, inf_dec_c_in],
    [inf_dec_out, inf_dec_h_out, inf_dec_c_out])

In [25]:
inf_decoder.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_in (InputLayer)        [(None, None, 87)]   0           []                               
                                                                                                  
 decoder_h_in (InputLayer)      [(None, 256)]        0           []                               
                                                                                                  
 decoder_c_in (InputLayer)      [(None, 256)]        0           []                               
                                                                                                  
 decoder_lstm (LSTM)            [(None, None, 256),  352256      ['decoder_in[0][0]',             
                                 (None, 256),                     'decoder_h_in[0][0]',     

Test trained model on the first 100 samples from both the training and validation sets.

In [26]:
# Index of the token that starts decoding and of the one that ends it.
start_token_idx = out_token2int[start_token]
stop_token_idx = out_token2int[stop_token]

# Longest target in the whole dataset (start/stop tokens included); used as a
# hard cap on generated text in case the model never emits the stop token.
max_out_seq_len = max(len(out_text) for _, out_text in samples)
print("Max output length: ", max_out_seq_len)

Max output length:  87


In [27]:
def translate_sequence(one_hot_seq, encoder, decoder):
    """Greedily decode a translation for one encoded input sequence.

    Args:
        one_hot_seq: Encoder input passed unchanged to ``encoder.predict``.
        encoder: Model whose ``predict`` returns the list ``[h, c]`` of
            initial decoder states.
        decoder: Model whose ``predict`` takes ``[char_input, h, c]`` and
            returns ``(probabilities, h, c)``.

    Returns:
        The decoded text, without the start and stop tokens. Decoding ends
        when the stop token is predicted or once the text reaches
        ``max_out_seq_len`` characters.
    """
    state = encoder.predict(one_hot_seq)

    # Single reusable (batch=1, step=1) one-hot buffer for the previous char.
    step_input = np.zeros((1, 1, out_vocab_size), dtype=np.float32)

    translated_text = ''
    prev_idx = start_token_idx
    while True:
        step_input[0, 0, prev_idx] = 1
        probs, h, c = decoder.predict([step_input] + state)
        state = [h, c]
        # Clear the bit again so the buffer is all zeros for the next step.
        step_input[0, 0, prev_idx] = 0

        # Greedy choice: most probable next character.
        prev_idx = np.argmax(probs[0, -1, :])
        if prev_idx == stop_token_idx:
            break

        translated_text += out_int2token[prev_idx]
        if len(translated_text) >= max_out_seq_len:
            break

    return translated_text

Validation samples:

In [28]:
from seq2seq_util import test_predictions

test_predictions(valid_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

-----------------------------------------
Input sentence: A todos nos gusta montar en bici.
Dataset translation: 	We all like cycling.

Model output: Everyone likes a lot of the party.
-----------------------------------------
Input sentence: Tom se rió de todos los chistes de Mary.
Dataset translation: 	Tom laughed at all of Mary's jokes.

Model output: Tom got the start of Mary a dog.
-----------------------------------------
Input sentence: Tom es un asqueroso.
Dataset translation: 	Tom is a creep.

Model output: Tom is a good to me.
-----------------------------------------
Input sentence: ¿Cuál es tu meta en la vida?
Dataset translation: 	What's your aim in life?

Model output: What's your favorite the party?
-----------------------------------------
Input sentence: Ella le escucha, aunque nadie más lo haga.
Dataset translation: 	She listens to him even though no one else does.

Model output: She lives a lot of the studing in the book.
-----------------------------------------
Inp

-----------------------------------------
Input sentence: Todo el mundo estuvo de acuerdo.
Dataset translation: 	Everybody was in agreement.

Model output: Everyone like the book.
-----------------------------------------
Input sentence: Si quieres tu dinero de vuelta, solo dilo.
Dataset translation: 	If you want your money back, just say so.

Model output: If you want to see you what I was a book.
-----------------------------------------
Input sentence: Nadie me dijo nada.
Dataset translation: 	No one said anything to me.

Model output: Nobody was a lot the stor.
-----------------------------------------
Input sentence: A Tom le gusta estar rodeado de gente.
Dataset translation: 	Tom likes having people around.

Model output: Tom likes to stay to the book.
-----------------------------------------
Input sentence: Tu madre se encuentra en estado crítico.
Dataset translation: 	Your mother is in critical condition.

Model output: Your father is a lot of the book.
-----------------------

-----------------------------------------
Input sentence: Sé que no es una broma.
Dataset translation: 	I know it's not a joke.

Model output: I know it's not a lot the stor.
-----------------------------------------
Input sentence: Yo cocinaré.
Dataset translation: 	I'll cook.

Model output: I'll come.
-----------------------------------------
Input sentence: Fuera de mi propiedad.
Dataset translation: 	Get off my property.

Model output: Stop is the book.
-----------------------------------------
Input sentence: ¿Puedes venir un momento?
Dataset translation: 	Would you come here a moment?

Model output: Can you see you a book?
-----------------------------------------
Input sentence: Tom se ve aburrido.
Dataset translation: 	Tom looks bored.

Model output: Tom looks the party.
-----------------------------------------
Input sentence: ¡Mira! Hay un avión despegando.
Dataset translation: 	Look! There's a plane taking off.

Model output: Look at a book at the book.
---------------------

Training samples:

In [29]:
test_predictions(train_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

-----------------------------------------
Input sentence: Después de una larga espera pudimos entrar.
Dataset translation: 	We got in after a long wait.

Model output: After is a lot of the book the way.
-----------------------------------------
Input sentence: Lo siento, pero es imposible.
Dataset translation: 	I'm sorry, but it's impossible.

Model output: I'm sorry, I want to stay him.
-----------------------------------------
Input sentence: Parecía satisfecho.
Dataset translation: 	He looked pleased.

Model output: It seems to see the sturs.
-----------------------------------------
Input sentence: Saqué el pastel del horno.
Dataset translation: 	I took the cake out of the oven.

Model output: I know him to the book.
-----------------------------------------
Input sentence: Es un trabajo muy difícil.
Dataset translation: 	That's a very tough job.

Model output: It's a bad a book the party.
-----------------------------------------
Input sentence: Dijiste que no entendías.
Dataset 

-----------------------------------------
Input sentence: La mayoría de la gente no lo haría así.
Dataset translation: 	Most people wouldn't do that that way.

Model output: The mother is not a lot of the book.
-----------------------------------------
Input sentence: Leí el libro entero.
Dataset translation: 	I read the entire book.

Model output: I read the book the sturs.
-----------------------------------------
Input sentence: Tienes que responder a la pregunta.
Dataset translation: 	You need to answer the question.

Model output: You must be a lot of the book.
-----------------------------------------
Input sentence: ¿Has vivido aquí?
Dataset translation: 	Did you live here?

Model output: Have you ever see the sture?
-----------------------------------------
Input sentence: He decidido hacer eso solo.
Dataset translation: 	I've decided to do that by myself.

Model output: I've decided to stay the stud.
-----------------------------------------
Input sentence: Ella le vio comerse

-----------------------------------------
Input sentence: Tu ayuda nos va a ahorrar mucho trabajo.
Dataset translation: 	Your help will save us a lot of work.

Model output: Her house is not a book and study to do.
-----------------------------------------
Input sentence: El niño se ensució las manos.
Dataset translation: 	The boy got his hands dirty.

Model output: The boy is a lot of the book.
-----------------------------------------
Input sentence: Usted debe ser el nuevo profesor.
Dataset translation: 	You must be the new teacher.

Model output: You must be a lot of the book.
-----------------------------------------
Input sentence: ¿Quién se robó mi canasto con la carne?
Dataset translation: 	Who stole my basket with the meat?

Model output: Who took the book to the study?
-----------------------------------------
Input sentence: No estaba manejando tan rápido.
Dataset translation: 	I wasn't driving all that fast.

Model output: I wasn't about to the book.
-----------------------

Export model in Core ML format.

In [30]:
# Rebuild the encoder for export: same input shape and LSTM size as the trained
# encoder, but without the Masking layer and without recurrent dropout.
coreml_enc_in = Input(shape=(None, in_vocab_size), name="encoder_in")
coreml_enc_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
# Only the LSTM output is wired as the model output; the returned states are
# discarded here.
coreml_enc_out, _, _ = coreml_enc_lstm(coreml_enc_in)

coreml_encoder_model = Model(coreml_enc_in, coreml_enc_out)
# Expose the private `_output_layers` under the public attribute name
# (presumably what the converter looks up — TODO confirm).
coreml_encoder_model.output_layers = coreml_encoder_model._output_layers

# Copy the trained weights into the rebuilt model via a weights file.
inf_encoder.save_weights("Es2EnCharEncoderWeights.h5")
coreml_encoder_model.load_weights("Es2EnCharEncoderWeights.h5")

In [None]:
import coremltools

# Convert the rebuilt Keras encoder to Core ML and write it to disk.
# NOTE(review): `input_names` / `output_names` look like the keyword names of
# the legacy Keras converter — confirm the converter called here accepts them.
coreml_encoder = coremltools.converters.convert(
    coreml_encoder_model,
    input_names="encodedSeq",
    output_names="ignored")

coreml_encoder.save("Es2EnCharEncoder.mlmodel")

In [33]:
# Rebuild the decoder for export: same LSTM and softmax sizes as the trained
# decoder, but without dropout and without explicit state inputs.
coreml_dec_in = Input(shape=(None, out_vocab_size))

coreml_dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
# The returned states are discarded; only the per-step output feeds the softmax.
coreml_dec_lstm_out, _, _ = coreml_dec_lstm(coreml_dec_in)
coreml_dec_dense = Dense(out_vocab_size, activation="softmax")
coreml_dec_out = coreml_dec_dense(coreml_dec_lstm_out)

coreml_decoder_model = Model(coreml_dec_in, coreml_dec_out)
# Expose the private `_output_layers` under the public attribute name
# (presumably what the converter looks up — TODO confirm).
coreml_decoder_model.output_layers = coreml_decoder_model._output_layers

# Copy the trained weights into the rebuilt model via a weights file.
inf_decoder.save_weights("Es2EnCharDecoderWeights.h5")
coreml_decoder_model.load_weights("Es2EnCharDecoderWeights.h5")

In [34]:
# Convert the rebuilt Keras decoder to Core ML and write it to disk.
# NOTE(review): `input_names` / `output_names` look like the keyword names of
# the legacy Keras converter — confirm the converter called here accepts them.
coreml_decoder = coremltools.converters.convert(
    coreml_decoder_model,
    input_names="encodedChar",
    output_names="nextCharProbs")

coreml_decoder.save("Es2EnCharDecoder.mlmodel")

2022-01-02 18:07:15.479782: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2022-01-02 18:07:15.479918: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2022-01-02 18:07:15.488659: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1149] Optimization results for grappler item: graph_to_optimize
  function_optimizer: Graph size after: 113 nodes (60), 133 edges (69), time = 1.774ms.
  function_optimizer: Graph size after: 113 nodes (0), 133 edges (0), time = 0.974ms.
Optimization results for grappler item: while_body_664747
  function_optimizer: function_optimizer did nothing. time = 0.003ms.
  function_optimizer: function_optimizer did nothing. time = 0.001ms.
Optimization results for grappler item: while_cond_664746
  function_optimizer: function_optimizer did nothing. time = 0.002ms.
  function_optimizer: function_optimizer did nothing. time = 0.001ms.

2022-01-02 18:07:15.536896: I t

Convert the weights to 16-bit floats. This shouldn't hurt performance much, if at all, and it reduces the app's download size.

In [35]:
from coremltools.models.neural_network import quantization_utils

def convert_to_fp16(mlmodel_filename):
    """Write a copy of a Core ML model with its weights quantized to 16 bits.

    The copy is saved as ``<name>16Bit.mlmodel``, where ``<name>`` is
    ``mlmodel_filename`` without its ``.mlmodel`` extension.

    Args:
        mlmodel_filename: Path of the ``.mlmodel`` file to quantize.
    """
    extension = ".mlmodel"
    # Strip the extension first so the result is "Foo16Bit.mlmodel" rather
    # than "Foo.mlmodel16Bit.mlmodel".
    if mlmodel_filename.endswith(extension):
        basename = mlmodel_filename[:-len(extension)]
    else:
        basename = mlmodel_filename
    out_filename = f"{basename}16Bit{extension}"

    model_fp32 = coremltools.models.MLModel(mlmodel_filename)
    model_16bit = quantization_utils.quantize_weights(model_fp32, nbits=16)

    # Per the coremltools docs, quantize_weights returns an MLModel on macOS
    # and a bare model specification elsewhere; each is saved through its
    # own API.
    if isinstance(model_16bit, coremltools.models.MLModel):
        model_16bit.save(out_filename)
    else:
        coremltools.utils.save_spec(model_16bit, out_filename)

In [36]:
# Produce 16-bit copies of both exported Core ML models.
convert_to_fp16("Es2EnCharEncoder.mlmodel")
convert_to_fp16("Es2EnCharDecoder.mlmodel")

Quantizing using linear quantization
Quantizing layer tf_make_list_0_condition_re_initialize
Quantizing layer tf_make_list_0_condition
Quantizing layer model_4/encoder_lstm/PartitionedCall/while_renamed
Quantizing layer model_4/encoder_lstm/PartitionedCall/while/while_body_663418/while/MatMul_1
Quantizing layer model_4/encoder_lstm/PartitionedCall/while/while_body_663418/while/MatMul
Quantizing layer model_4/encoder_lstm/PartitionedCall/while_1_condition_re_initialize
Quantizing layer model_4/encoder_lstm/PartitionedCall/while_1_condition
Quantizing using linear quantization
Quantizing layer tf_make_list_0_condition_re_initialize
Quantizing layer tf_make_list_0_condition
Quantizing layer model_5/decoder_lstm/PartitionedCall/while_renamed
Quantizing layer model_5/decoder_lstm/PartitionedCall/while/while_body_665431/while/MatMul_1
Quantizing layer model_5/decoder_lstm/PartitionedCall/while/while_body_665431/while/MatMul
Quantizing layer model_5/decoder_lstm/PartitionedCall/while_1_condit

Save the maps so you can transform text to and from ints. You'll need them later in the iOS app.

In [None]:
import json

# Input side maps Spanish characters to indices; output side maps predicted
# indices back to English characters.
for json_path, token_map in (("esCharToInt.json", in_token2int),
                             ("intToEnChar.json", out_int2token)):
    with open(json_path, "w") as json_file:
        json.dump(token_map, json_file)