Load the dataset. It contains translations from English to Spanish, but this model translates from Spanish to English, so swap the order of the phrases. Also add `\t` and `\n` as the start and stop tokens in the target sequences.

In [1]:
# Sentinel characters marking the start and end of every target (English) sequence.
start_token = "\t"
stop_token = "\n"

# Pairs whose Spanish sentence has this many characters or more are dropped.
max_es_length = 45

with open("data/spa.txt", "r", encoding="utf-8") as f:
    raw_lines = f.read().split("\n")

# Each non-blank line is tab-separated with the English phrase first and the
# Spanish phrase second.
fields_per_line = [line.strip().split("\t")
                   for line in raw_lines if len(line.strip()) > 0]

# Build (Spanish input, English target) pairs. Only the first two fields are
# used, so a line carrying extra trailing columns no longer breaks tuple
# unpacking, and a line with fewer than two fields is skipped instead of
# raising ValueError.
samples = [(fields[1], start_token + fields[0] + stop_token)
           for fields in fields_per_line
           if len(fields) >= 2 and len(fields[1]) < max_es_length]

In [2]:
len(samples)

99423

In [3]:
print(samples[:2])

[('Ve.', '\tGo.\n'), ('Vete.', '\tGo.\n')]


Split data into train and validation sets.

In [4]:
from sklearn.model_selection import train_test_split

# Keep 80% of the pairs for training and hold out the rest for validation;
# the fixed random_state makes the split reproducible across runs.
train_samples, valid_samples = train_test_split(samples, train_size=.8, random_state=42)



In [5]:
len(train_samples)

79538

In [6]:
len(valid_samples)

19885

Determine the training vocabulary. Those are the only tokens you can trust the model will know how to handle. 

In [7]:
# Gather every distinct character seen on each side of the training pairs.
# Sequences are plain strings, so converting one to a set yields its characters.
in_vocab, out_vocab = set(), set()

for in_seq, out_seq in train_samples:
    in_vocab |= set(in_seq)
    out_vocab |= set(out_seq)

# The vocabulary sizes fix the width of the one-hot vectors used by the model.
in_vocab_size, out_vocab_size = len(in_vocab), len(out_vocab)
print("Input vocab size:", in_vocab_size)
print("Output vocab size:", out_vocab_size)

Input vocab size: 101
Output vocab size: 87


In [8]:
print(sorted(in_vocab))

[' ', '!', '"', '$', '%', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '¡', '«', '°', 'º', '»', '¿', 'Á', 'É', 'Ó', 'Ú', 'á', 'è', 'é', 'í', 'ñ', 'ó', 'ö', 'ú', 'ü', 'ś', 'с', '—', '€']


In [9]:
print(sorted(out_vocab))

['\t', '\n', ' ', '!', '"', '$', '%', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '°', 'á', 'ã', 'è', 'é', 'ö', '‘', '’', '₂', '€']


Go through the validation set and remove any tokens not present in the training set.

In [10]:
def _strip_unknown_tokens(text, vocab):
    """Return `text` with every character that is not in `vocab` removed."""
    return "".join(char for char in text if char in vocab)


# Rebuild the validation pairs so they contain only characters that appear in
# the training vocabularies.
valid_samples = [
    (_strip_unknown_tokens(in_seq, in_vocab),
     _strip_unknown_tokens(out_seq, out_vocab))
    for in_seq, out_seq in valid_samples
]

Build sequence-to-sequence model.

In [11]:
import keras
from keras.layers import Dense, Input, LSTM, Masking
from keras.models import Model

Using TensorFlow backend.


In [12]:
# Size of the LSTM state vectors; the decoder LSTM is built with the same value
# so it can be initialised from the encoder's states.
latent_dim = 256

# Encoder input: variable-length sequences of one-hot vectors over the input vocabulary.
encoder_in = Input(shape=(None, in_vocab_size), name="encoder_in")
# Masking (default mask_value of 0.0) makes the LSTM skip all-zero padding timesteps.
encoder_mask = Masking(name="encoder_mask")(encoder_in)
encoder_lstm = LSTM(latent_dim, return_state=True, recurrent_dropout=0.3, name="encoder_lstm")
# Only the final hidden (h) and cell (c) states are kept; the LSTM output is discarded.
_, encoder_h, encoder_c = encoder_lstm(encoder_mask)

In [13]:
# Decoder input: variable-length sequences of one-hot vectors over the output vocabulary.
decoder_in = Input(shape=(None, out_vocab_size), name="decoder_in")

# As in the encoder, all-zero (padding) timesteps are masked out.
decoder_mask = Masking(name="decoder_mask")(decoder_in)
# return_sequences=True yields an output per timestep; return_state=True also
# exposes the final states, which the inference decoder built later relies on.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True,
                    dropout=0.2, recurrent_dropout=0.3, name="decoder_lstm")
# During training the decoder starts from the encoder's final states; its own
# final states are not needed here.
decoder_lstm_out, _, _ = decoder_lstm(decoder_mask, initial_state=[encoder_h, encoder_c])
# Softmax over the output vocabulary at every timestep.
decoder_dense = Dense(out_vocab_size, activation="softmax", name="decoder_out")
decoder_out = decoder_dense(decoder_lstm_out)

In [14]:
# Training model: takes the encoder and decoder input sequences and predicts a
# distribution over the output vocabulary at each decoder timestep.
seq2seq_model = Model([encoder_in, decoder_in], decoder_out)
# Targets are one-hot vectors, hence categorical cross-entropy.
seq2seq_model.compile(optimizer="rmsprop", loss="categorical_crossentropy")

In [15]:
seq2seq_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
encoder_in (InputLayer)         (None, None, 101)    0                                            
__________________________________________________________________________________________________
decoder_in (InputLayer)         (None, None, 87)     0                                            
__________________________________________________________________________________________________
encoder_mask (Masking)          (None, None, 101)    0           encoder_in[0][0]                 
__________________________________________________________________________________________________
decoder_mask (Masking)          (None, None, 87)     0           decoder_in[0][0]                 
__________________________________________________________________________________________________
encoder_ls

Create maps to convert characters to and from ints. 

In [16]:
# Sorting the vocabularies gives every character a stable index between runs.
in_tokens = sorted(in_vocab)
out_tokens = sorted(out_vocab)

# Character -> index for model inputs, and both directions for model outputs.
in_token2int = dict(zip(in_tokens, range(len(in_tokens))))
out_int2token = dict(enumerate(out_tokens))
out_token2int = {token: index for index, token in out_int2token.items()}

Create helper functions for one-hot encoding sequences for use with the model.

In [17]:
import numpy as np

def make_batch_storage(batch_size, in_seq_len, out_seq_len):
    """Allocate zero-filled float32 arrays for one batch of one-hot sequences.

    Args:
        batch_size: Number of samples in the batch.
        in_seq_len: Number of timesteps in the encoder input array.
        out_seq_len: Number of timesteps in the decoder input/output arrays.

    Returns:
        A tuple `(enc_in_seqs, dec_in_seqs, dec_out_seqs)` with shapes
        `(batch_size, in_seq_len, in_vocab_size)`,
        `(batch_size, out_seq_len, out_vocab_size)` and
        `(batch_size, out_seq_len, out_vocab_size)` respectively.
    """
    encoder_shape = (batch_size, in_seq_len, in_vocab_size)
    decoder_shape = (batch_size, out_seq_len, out_vocab_size)

    # Decoder input and decoder target share a shape but must be separate arrays.
    return (
        np.zeros(encoder_shape, dtype=np.float32),
        np.zeros(decoder_shape, dtype=np.float32),
        np.zeros(decoder_shape, dtype=np.float32),
    )

In [18]:
def encode_batch(samples):
    """One-hot encode a batch of (input sequence, output sequence) pairs.

    Arrays are sized to the longest input and longest output sequence in the
    batch; shorter sequences are left zero-padded at the end.

    Args:
        samples: Sequence of `(in_seq, out_seq)` pairs whose tokens are keys of
            `in_token2int` and `out_token2int` respectively.

    Returns:
        A tuple `(enc_in_seqs, dec_in_seqs, dec_out_seqs)`. `dec_out_seqs`
        holds the output sequence shifted one timestep earlier than
        `dec_in_seqs`, i.e. without its first token.

    Raises:
        ValueError: If `samples` is empty.
        KeyError: If a token is missing from the lookup tables.
    """
    longest_in = max(len(source) for source, _ in samples)
    longest_out = max(len(target) for _, target in samples)

    enc_in_seqs, dec_in_seqs, dec_out_seqs = make_batch_storage(
        len(samples), longest_in, longest_out)

    for row, (source, target) in enumerate(samples):
        for step, token in enumerate(source):
            enc_in_seqs[row, step, in_token2int[token]] = 1

        for step, token in enumerate(target):
            token_idx = out_token2int[token]
            dec_in_seqs[row, step, token_idx] = 1
            if step > 0:
                # The target at step t-1 is the decoder input at step t.
                dec_out_seqs[row, step - 1, token_idx] = 1

    return enc_in_seqs, dec_in_seqs, dec_out_seqs

Train model

In [19]:
from seq2seq_util import Seq2SeqBatchGenerator

# Number of samples handed to the generators per batch.
batch_size = 64
# Seq2SeqBatchGenerator comes from the project-local seq2seq_util module (not
# shown here); presumably it slices the samples into batches and encodes each
# one with encode_batch -- TODO confirm against seq2seq_util.
train_generator = Seq2SeqBatchGenerator(train_samples, batch_size, encode_batch)
valid_generator = Seq2SeqBatchGenerator(valid_samples, batch_size, encode_batch)

In [20]:
from keras.callbacks import EarlyStopping

# Stop once validation loss has not improved for 5 consecutive epochs and roll
# the model back to the weights from its best epoch.
early_stopping = EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

# epochs=500 is only an upper bound; early stopping is expected to end training sooner.
seq2seq_model.fit_generator(train_generator, epochs=500,
                            validation_data=valid_generator,
                            callbacks=[early_stopping])

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500


<keras.callbacks.History at 0x7fd84cb76e10>

Create encoder/decoder models for inference

In [21]:
# Inference encoder: reuses the trained encoder layers and outputs the final
# hidden and cell states that seed the decoder.
inf_encoder = Model(encoder_in, [encoder_h, encoder_c])

In [22]:
inf_encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder_in (InputLayer)      (None, None, 101)         0         
_________________________________________________________________
encoder_mask (Masking)       (None, None, 101)         0         
_________________________________________________________________
encoder_lstm (LSTM)          [(None, 256), (None, 256) 366592    
Total params: 366,592
Trainable params: 366,592
Non-trainable params: 0
_________________________________________________________________


In [23]:
# At inference time the decoder's initial states are supplied explicitly so
# decoding can proceed step by step, feeding each step's states into the next.
inf_dec_h_in = Input(shape=(latent_dim,), name="decoder_h_in")
inf_dec_c_in = Input(shape=(latent_dim,), name="decoder_c_in")

# Reuse the trained decoder LSTM. It is applied to decoder_in directly, without
# the Masking layer used in the training graph.
inf_dec_lstm_out, inf_dec_h_out, inf_dec_c_out = decoder_lstm(
    decoder_in, initial_state=[inf_dec_h_in, inf_dec_c_in])

# Reuse the trained softmax layer to turn LSTM outputs into token probabilities.
inf_dec_out = decoder_dense(inf_dec_lstm_out)

# Inputs: decoder sequence plus h/c states. Outputs: probabilities plus updated h/c states.
inf_decoder = Model(
    [decoder_in, inf_dec_h_in, inf_dec_c_in],
    [inf_dec_out, inf_dec_h_out, inf_dec_c_out])

In [24]:
inf_decoder.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_in (InputLayer)         (None, None, 87)     0                                            
__________________________________________________________________________________________________
decoder_h_in (InputLayer)       (None, 256)          0                                            
__________________________________________________________________________________________________
decoder_c_in (InputLayer)       (None, 256)          0                                            
__________________________________________________________________________________________________
decoder_lstm (LSTM)             [(None, None, 256),  352256      decoder_in[0][0]                 
                                                                 decoder_h_in[0][0]               
          

Test trained model on the first 100 samples from both the training and validation sets.

In [25]:
# Longest target sequence in the whole dataset (start/stop tokens included);
# used as a hard cap on how many characters decoding may produce.
max_out_seq_len = max(len(target) for _, target in samples)
print("Max output length: ", max_out_seq_len)

# One-hot indices of the sentinel tokens that begin and end a translation.
start_token_idx, stop_token_idx = out_token2int[start_token], out_token2int[stop_token]

Max output length:  87


In [26]:
def translate_sequence(one_hot_seq, encoder, decoder):
    """Greedily decode a translation for one encoded input sequence.

    Starting from the start token, the decoder is run one timestep at a time.
    At each step the most probable token is chosen, appended to the result,
    and fed back in as the next input along with the updated states.

    Args:
        one_hot_seq: Encoded input passed straight to `encoder.predict`.
        encoder: Model whose `predict` returns the list of initial decoder
            states.
        decoder: Model whose `predict` takes `[token_input] + states` and
            returns `(probabilities, h, c)`.

    Returns:
        The decoded text. Decoding stops when the stop token is predicted
        (it is not included in the result) or once the text reaches
        `max_out_seq_len` characters.
    """
    states = encoder.predict(one_hot_seq)

    # Single-token one-hot buffer, reused for every decoding step.
    step_input = np.zeros((1, 1, out_vocab_size), dtype=np.float32)

    pieces = []
    decoded_len = 0
    current_idx = start_token_idx
    while True:
        # Set the current token's slot, run one step, then clear the slot again.
        step_input[0, 0, current_idx] = 1
        probs, h, c = decoder.predict([step_input] + states)
        states = [h, c]
        step_input[0, 0, current_idx] = 0

        current_idx = np.argmax(probs[0, -1, :])
        if current_idx == stop_token_idx:
            break

        token = out_int2token[current_idx]
        pieces.append(token)
        decoded_len += len(token)
        if decoded_len >= max_out_seq_len:
            break

    return "".join(pieces)

Validation samples:

In [27]:
from seq2seq_util import test_predictions

test_predictions(valid_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

-----------------------------------------
Input sentence: A todos nos gusta montar en bici.
Dataset translation: 	We all like cycling.

Model output: We all like to go on his complain.

-----------------------------------------
Input sentence: Tom se rió de todos los chistes de Mary.
Dataset translation: 	Tom laughed at all of Mary's jokes.

Model output: Tom laughed at Mary's eyes every morning.

-----------------------------------------
Input sentence: Tom es un asqueroso.
Dataset translation: 	Tom is a creep.

Model output: Tom is a good story.

-----------------------------------------
Input sentence: ¿Cuál es tu meta en la vida?
Dataset translation: 	What's your aim in life?

Model output: What's your favorite in the window?

-----------------------------------------
Input sentence: Ella le escucha, aunque nadie más lo haga.
Dataset translation: 	She listens to him even though no one else does.

Model output: She listened him not to help me anything.

-----------------------------

-----------------------------------------
Input sentence: Ellos son vuestros.
Dataset translation: 	They're yours.

Model output: They're your friends.

-----------------------------------------
Input sentence: Mañana, él alunizará.
Dataset translation: 	Tomorrow he lands on the moon.

Model output: Tomorrow will be a little car.

-----------------------------------------
Input sentence: Yo jugaré contigo.
Dataset translation: 	I'll play with you.

Model output: I'll play with you.

-----------------------------------------
Input sentence: Tom está gordo.
Dataset translation: 	Tom is fat.

Model output: Tom is complaining.

-----------------------------------------
Input sentence: ¿Puedo reservar un vuelo a Chicago?
Dataset translation: 	Can I reserve a flight to Chicago?

Model output: Can I get a good control Chinese?

-----------------------------------------
Input sentence: Él asistió a muchas ceremonias.
Dataset translation: 	He attended many ceremonies.

Model output: He asked me

Training samples:

In [28]:
test_predictions(train_samples[:100], inf_encoder, inf_decoder, encode_batch, translate_sequence)

-----------------------------------------
Input sentence: Después de una larga espera pudimos entrar.
Dataset translation: 	We got in after a long wait.

Model output: After a lot of watching a country.

-----------------------------------------
Input sentence: Lo siento, pero es imposible.
Dataset translation: 	I'm sorry, but it's impossible.

Model output: I'm sorry, is it is important.

-----------------------------------------
Input sentence: Parecía satisfecho.
Dataset translation: 	He looked pleased.

Model output: He seemed surprised.

-----------------------------------------
Input sentence: Saqué el pastel del horno.
Dataset translation: 	I took the cake out of the oven.

Model output: I took the party the bank.

-----------------------------------------
Input sentence: Es un trabajo muy difícil.
Dataset translation: 	That's a very tough job.

Model output: It's a very difficult discountry.

-----------------------------------------
Input sentence: Dijiste que no entendías.
Da

-----------------------------------------
Input sentence: Él se marchó hace diez minutos.
Dataset translation: 	He left ten minutes ago.

Model output: He has destroved the months ago.

-----------------------------------------
Input sentence: ¿Sabes por qué Tom vino aquí hoy?
Dataset translation: 	Do you know the reason Tom came here today?

Model output: Do you know why Tom came here for you?

-----------------------------------------
Input sentence: ¿Me podrías pasar la sal, por favor?
Dataset translation: 	Could you pass me the salt, please?

Model output: Could you please stay my parent?

-----------------------------------------
Input sentence: No le gustan las cosas dulces.
Dataset translation: 	He doesn't care for sweet things.

Model output: He doesn't like sports.

-----------------------------------------
Input sentence: Hoy es jueves.
Dataset translation: 	Today is Thursday.

Model output: Today's tennis today.

-----------------------------------------
Input sentence: Tom 

-----------------------------------------
Input sentence: Ella hizo un viaje a Europa el mes pasado.
Dataset translation: 	She made a trip to Europe last month.

Model output: She made a problem in the complain in the complain.

-----------------------------------------
Input sentence: No tengo tiempo para juegos.
Dataset translation: 	I don't have time for games.

Model output: I don't have time to be a lot.

-----------------------------------------
Input sentence: Compré esta cámara por 25.000 yenes.
Dataset translation: 	I bought this camera for 25,000 yen.

Model output: I bought this cold of 200 police of 20.



Export model in Core ML format.

In [29]:
# Rebuild the encoder for export without the Masking layer and without
# recurrent dropout. The LSTM keeps the name "encoder_lstm" used in training.
coreml_enc_in = Input(shape=(None, in_vocab_size), name="encoder_in")
coreml_enc_lstm = LSTM(latent_dim, return_state=True, name="encoder_lstm")
coreml_enc_out, _, _ = coreml_enc_lstm(coreml_enc_in)

# Unlike inf_encoder, this model outputs the LSTM output rather than the h/c states.
coreml_encoder_model = Model(coreml_enc_in, coreml_enc_out)
# NOTE(review): aliases the private `_output_layers` attribute to `output_layers`,
# presumably because the coremltools Keras converter reads the latter -- confirm.
coreml_encoder_model.output_layers = coreml_encoder_model._output_layers

# Transfer the trained weights to the export model via an HDF5 file.
inf_encoder.save_weights("Es2EnCharEncoderWeights.h5")
coreml_encoder_model.load_weights("Es2EnCharEncoderWeights.h5")

In [30]:
import coremltools

# Convert the export encoder to Core ML, naming its input "encodedSeq".
# NOTE(review): the output is named "ignored", which suggests the app is meant
# to consume the LSTM state outputs instead -- confirm against the app code.
coreml_encoder = coremltools.converters.keras.convert(
    coreml_encoder_model,
    input_names="encodedSeq",
    output_names="ignored")

coreml_encoder.save("Es2EnCharEncoder.mlmodel")

0 : encoder_in, <keras.engine.input_layer.InputLayer object at 0x7fe03a50ff60>
1 : encoder_lstm, <keras.layers.recurrent.LSTM object at 0x7fe03a50fa58>


In [31]:
# Rebuild the decoder for export without Masking or dropout, and without
# explicit state Input layers (the LSTM is called with no initial_state).
coreml_dec_in = Input(shape=(None, out_vocab_size))

coreml_dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm")
coreml_dec_lstm_out, _, _ = coreml_dec_lstm(coreml_dec_in)
coreml_dec_dense = Dense(out_vocab_size, activation="softmax")
coreml_dec_out = coreml_dec_dense(coreml_dec_lstm_out)

coreml_decoder_model = Model(coreml_dec_in, coreml_dec_out)
# NOTE(review): aliases the private `_output_layers` attribute to `output_layers`,
# presumably because the coremltools Keras converter reads the latter -- confirm.
coreml_decoder_model.output_layers = coreml_decoder_model._output_layers

# Transfer the trained decoder weights to the export model via an HDF5 file.
inf_decoder.save_weights("Es2EnCharDecoderWeights.h5")
coreml_decoder_model.load_weights("Es2EnCharDecoderWeights.h5")

In [32]:
# Convert the export decoder to Core ML: the input is one encoded character and
# the output is the probability distribution over the next character.
coreml_decoder = coremltools.converters.keras.convert(
    coreml_decoder_model,
    input_names="encodedChar",
    output_names="nextCharProbs")

coreml_decoder.save("Es2EnCharDecoder.mlmodel")

0 : input_1, <keras.engine.input_layer.InputLayer object at 0x7fe038de9e10>
1 : decoder_lstm, <keras.layers.recurrent.LSTM object at 0x7fe038de9e48>
2 : dense_1, <keras.layers.core.Dense object at 0x7fe038de9eb8>
3 : dense_1__activation__, <keras.layers.core.Activation object at 0x7fe038de9630>


Convert weights to 16-bit floats. This shouldn't hurt performance much, if at all, and it reduces the app's download size.

In [33]:
def convert_to_fp16(mlmodel_filename):
    """Write a copy of a Core ML model with its weights converted to 16-bit floats.

    The converted spec is saved next to the original as `<basename>16Bit.mlmodel`,
    where `<basename>` is `mlmodel_filename` without its ".mlmodel" suffix.

    Args:
        mlmodel_filename: Path to the source model file.
    """
    suffix = ".mlmodel"
    # Strip the suffix only when it is actually present. Blindly slicing off
    # len(suffix) characters would mangle a path that lacks the extension.
    if mlmodel_filename.endswith(suffix):
        basename = mlmodel_filename[:-len(suffix)]
    else:
        basename = mlmodel_filename

    spec = coremltools.utils.load_spec(mlmodel_filename)
    spec_16bit = \
      coremltools.utils.convert_neural_network_spec_weights_to_fp16(spec)
    coremltools.utils.save_spec(spec_16bit, f"{basename}16Bit.mlmodel")

In [34]:
convert_to_fp16("Es2EnCharEncoder.mlmodel")
convert_to_fp16("Es2EnCharDecoder.mlmodel")

Save the maps so you can transform text to and from ints. You'll need them later in the iOS app.

In [35]:
import json

# Persist the lookup tables: Spanish character -> index for encoding input,
# and index -> English character for decoding output.
for json_path, mapping in (("esCharToInt.json", in_token2int),
                           ("intToEnChar.json", out_int2token)):
    with open(json_path, "w") as f:
        json.dump(mapping, f)