- I was able to get a working model for LSTM translation in part 4
- But so far, I have not been using pre-trained embeddings
- In this notebook, I build an LSTM translation model using pre-trained weights from spacy

In [3]:
import pandas as pd
import regex as re
import string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
from tensorflow.keras.models import Model

In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [4]:
# Load the tab-separated sentence pairs, discard the third (attribution) column
# and give the two text columns descriptive names
df_en_de = (
    pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])
      .drop(columns='attr')
      .rename(columns={'eng': 'english', 'deu': 'german'})
)

- pre-process the sentences

In [5]:
# Set of all special characters to strip (ASCII punctuation)
exclude = set(string.punctuation)

# German-only transliteration of umlauts and sharp s
_umlaut_map = {'ü': 'ue', 'ä': 'ae', 'ö': 'oe', 'ß': 'ss'}


def _clean_english(text):
    """Lowercase an English sentence, then drop quotes and all special characters."""
    lowered = re.sub("'", '', text.lower())
    return ''.join(filter(lambda ch: ch not in exclude, lowered))


def _clean_german(text):
    """Lowercase a German sentence, drop quotes, transliterate umlauts / sharp s,
    drop all special characters and wrap the result in "START_" / "_END" tokens."""
    lowered = re.sub("'", '', text.lower())
    for char, replacement in _umlaut_map.items():
        lowered = lowered.replace(char, replacement)
    stripped = ''.join(filter(lambda ch: ch not in exclude, lowered))
    # The target (German) sentences carry explicit start and end markers
    return 'START_ ' + stripped + ' _END'


df_en_de['english'] = df_en_de['english'].map(_clean_english)
df_en_de['german'] = df_en_de['german'].map(_clean_german)


- Further pre-processing + only select sentences with 10 words or fewer

In [6]:
# Refer to the pre-processed frame under a shorter name (same object, not a copy)
pairs = df_en_de

# Number of words per sentence, counted by splitting on single spaces.
# The German count is taken on the column that already holds the START_ / _END tokens.
for column in ('english', 'german'):
    pairs[column + '_length'] = pairs[column].map(lambda text: len(text.split(' ')))

# Copies of the sentences with every non-ASCII character dropped
for column in ('english', 'german'):
    pairs[column + '_cleaned'] = pairs[column].map(
        lambda text: text.encode("ascii", "ignore").decode()
    )

# Maximum number of words allowed per sentence
max_len = 10

# Keep only the rows where both sentences have max_len words or fewer
within_limit = (pairs['english_length'] <= max_len) & (pairs['german_length'] <= max_len)
pairs = pairs[within_limit]

# Continue with a 10% sample of the rows (to check the code works)
pairs = pairs.sample(frac = 0.1, random_state = 1)

- Tokenize sentences (word-based) using spacy modules

In [7]:
#!python -m spacy download en_core_web_lg
import en_core_web_lg

#!python -m spacy download de_core_news_sm
import de_core_news_sm


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Sentences used to build the vocabularies: English is the source, German the target
text_source = pairs['english_cleaned']
text_target = pairs['german_cleaned']

# spaCy pipelines that supply the word vectors for each language
nlp_source = en_core_web_lg.load()
nlp_target = de_core_news_sm.load()

# One Keras vectorizer per language (both created before either is adapted)
Vectorizer_source, Vectorizer_target = TextVectorization(), TextVectorization()

# Fit each vectorizer on the sentences of its own language
for vectorizer, text in ((Vectorizer_source, text_source), (Vectorizer_target, text_target)):
    vectorizer.adapt(text)

# Vocabularies of the source (English) and target (German) languages,
# converted into lists of plain Python strings
vocab_source = list(map(str, Vectorizer_source.get_vocabulary()))
vocab_target = list(map(str, Vectorizer_target.get_vocabulary()))

# Drop the empty-string entry from each vocabulary
vocab_source.remove('')
vocab_target.remove('')

2022-09-25 11:45:06.655098: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [9]:
# Neither the spaCy embeddings nor the vocabulary list contain our 'START_' and '_END' tokens.
# NOTE(review): presumably TextVectorization's default standardization lowercases them and
# strips the underscore, so they show up in the vocabulary as 'start' / 'end' -- confirm.
# Add them to the end of "vocab_target", but only when missing, so that re-running this
# cell does not append duplicates and shift the vocabulary size.
for special_token in ('START_', '_END'):
    if special_token not in vocab_target:
        vocab_target.append(special_token)

In [10]:
# Number of words in each vocabulary (source first, then target)
vocab_len_source, vocab_len_target = len(vocab_source), len(vocab_target)

print(vocab_len_source, vocab_len_target)

6446 9992


- Create embedding matrices for source and target languages

In [11]:
def _build_embedding_matrix(vocab, nlp, embedding_dim, random_tokens=(), rng=None):
    """Build an embedding matrix plus word<->index lookups for a vocabulary.

    Row 0 of the matrix stays all-zero: no word is assigned index 0, which is kept
    free for zero padding in the Embedding layer. The word at position i of `vocab`
    gets index i + 1.

    Arguments
    vocab         -- list of words
    nlp           -- callable; nlp(word).vector is used as the word's embedding
    embedding_dim -- length of one embedding vector
    random_tokens -- words that get a random vector instead of nlp(word).vector
    rng           -- numpy random Generator used for the random vectors
                     (a fresh, unseeded one is created when omitted)

    Returns
    matrix   -- numpy array of shape (len(vocab) + 1, embedding_dim)
    word_idx -- dict, word -> index
    idx_word -- dict, index -> word
    """
    if rng is None:
        rng = np.random.default_rng()

    matrix = np.zeros((len(vocab) + 1, embedding_dim))
    word_idx = {}
    idx_word = {}
    for idx, word in enumerate(vocab, start=1):
        if word in random_tokens:
            # No pre-trained vector exists for this token: use uniform random values in [0, 1)
            matrix[idx] = rng.random(embedding_dim)
        else:
            matrix[idx] = nlp(word).vector
        word_idx[word] = idx
        idx_word[idx] = word
    return matrix, word_idx, idx_word


# generate the embedding matrix for source vocab

# add 1 to size of vocab for zero padding (this is for the Embedding layer later)
num_tokens_source = len(vocab_source) + 1

# source language embedding dimensions
embedding_dim_source = len(nlp_source('The').vector)

# embedding matrix, word-to-index and index-to-word mappings for source language
embedding_matrix_source, word_idx_source, idx_word_source = _build_embedding_matrix(
    vocab_source, nlp_source, embedding_dim_source
)


# generate the embedding matrix for target vocab

# add 1 for zero padding (for Embedding layer)
num_tokens_target = len(vocab_target) + 1

# target language embedding dimensions
embedding_dim_target = len(nlp_target('Der').vector)

# NOTE(review): spaCy's small ("sm") pipelines do not ship static word vectors, so
# nlp_target(word).vector is presumably a context-sensitive tensor rather than a
# pre-trained word embedding -- consider a "md"/"lg" German pipeline; confirm.

# "START_" and "_END" have no pre-trained vectors and get random ones.
# The generator is seeded so that these vectors are identical on every run.
embedding_matrix_target, word_idx_target, idx_word_target = _build_embedding_matrix(
    vocab_target,
    nlp_target,
    embedding_dim_target,
    random_tokens=('START_', '_END'),
    rng=np.random.default_rng(0),
)

- Run time of the cell above on the entire dataset: 3 minutes

In [14]:
# sanity checks -- each printed line shows the index stored for a word, followed by
# the word stored at a hard-coded index; the two mappings agree when they match.
# The hard-coded indices belong to the current vocabulary and change if the data sample changes.
source_checks = [('hi', 1725), ('go', 46), ('market', 1952)]
target_checks = [('kann', 35), ('folgte', 1725)]

for word, idx in source_checks:
    print(word_idx_source[word], idx_word_source[idx])
for word, idx in target_checks:
    print(word_idx_target[word], idx_word_target[idx])
# word_idx_target['START_'], idx_word_target[8167]
#word_idx_target['_END'], idx_word_target[8168]
#word_idx_target.keys()

1725 hi
46 go
1952 market
35 kann
1725 folgte


In [15]:
# sanity checks -- the row count of each embedding matrix should equal its num_tokens value
for matrix, expected_rows in ((embedding_matrix_source, num_tokens_source),
                              (embedding_matrix_target, num_tokens_target)):
    print(matrix.shape[0], expected_rows)

6447 6447
9993 9993


In [16]:
# Note on dimensions
# embedding_matrix_source.shape = (num_tokens_source, embedding_dim_source)
# embedding_matrix_target.shape = (num_tokens_target, embedding_dim_target)
# num_tokens_source = len(vocab_source) + 1
# num_tokens_target = len(vocab_target) + 1

# sanity checks -- each num_tokens value should be one larger than its vocabulary size
for token_count, vocab in ((num_tokens_source, vocab_source),
                           (num_tokens_target, vocab_target)):
    print(token_count, len(vocab))

6447 6446
9993 9992


- Split data into train and test sets

In [17]:
# Inputs and labels both come from the CLEANED columns
X = pairs['english_cleaned']
y = pairs['german_cleaned']

# Hold out 20% of the sentence pairs as the test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101, test_size=0.2)

In [18]:
# The training inputs and labels should have the same number of sentences
print (X_train.shape, y_train.shape)

(16745,) (16745,)


In [19]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate batches of data, cycling over the dataset indefinitely.

    Arguments
    X          -- source sentences (strings of space-separated words)
    y          -- target sentences, each starting with "START_" and ending with "_END"
    batch_size -- maximum number of sentence pairs per batch

    Yields
    ([encoder_input_data, decoder_input_data], decoder_target_data) where, for a batch of m pairs,
    encoder_input_data  -- (m, max_len) word indices of the source sentences
    decoder_input_data  -- (m, max_len) word indices of the target sentences without the final word
    decoder_target_data -- (m, max_len, num_tokens_target) one-hot target words without "START_",
                           i.e. the decoder input shifted one time-step ahead

    The last batch of a pass holds only the remaining pairs (m <= batch_size);
    it is not padded with empty rows, so every row of a batch is a real example.
    '''
    while True:
        # for every batch starting at position j
        for j in range(0, len(X), batch_size):
            batch_X = X[j:j+batch_size]
            batch_y = y[j:j+batch_size]
            n_examples = len(batch_X)

            # initialize numpy arrays with zeros (index 0 is the padding value)
            encoder_input_data = np.zeros((n_examples, max_len), dtype='float32')
            decoder_input_data = np.zeros((n_examples, max_len), dtype='float32')
            decoder_target_data = np.zeros((n_examples, max_len, num_tokens_target), dtype='float32')

            # for every example sentence i
            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_y)):
                # for every time-step t, insert index for encoder input
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = word_idx_source[word]

                # split once instead of once per time-step
                target_words = target_text.split()
                last_step = len(target_words) - 1
                for t, word in enumerate(target_words):
                    target_index = word_idx_target[word]
                    # decoder input: every word except the final one
                    if t < last_step:
                        decoder_input_data[i, t] = target_index
                    # decoder target: one-hot vector for every word except the first ("START_"),
                    # offset by one time-step
                    if t > 0:
                        decoder_target_data[i, t - 1, target_index] = 1.

            yield [encoder_input_data, decoder_input_data], decoder_target_data

<h3> Model for training </h3>

In [20]:
# Dimension of the LSTM state vectors
n_h = 256

# Embedding lengths are read from the second axis of each embedding matrix;
# notice that the source and target languages have different lengths
emb_dim_source, emb_dim_target = (
    matrix.shape[1] for matrix in (embedding_matrix_source, embedding_matrix_target)
)

In [21]:
# All layer objects are global variables. 
# Their weights are remembered when we call on them in a later model.

In [22]:
### ENCODER ###

# Define Input()
# Batch-size is automatically "None".
# We set the time-step dimension as "None", which allows time-step dimension of varying length.
# This will be useful during the prediction stage, when we will feed one word at a time.
# encoder_inputs: (None, None) -- (m, Tx)
encoder_inputs = Input(shape=(None,))

# Create Embedding layer for encoder, load pre-trained embeddings, freeze weights
# (mask_zero = True treats index 0 as padding), then pass the input through it.
# enc_emb: (None, None, emb_dim_source) -- (m, Tx, input embedding dimensions)
enc_emb =  Embedding(num_tokens_source, 
                    emb_dim_source, 
                    mask_zero = True,
                    embeddings_initializer = Constant(embedding_matrix_source),
                    trainable = False)(encoder_inputs)

# Create LSTM layer
# return_state = True: the layer returns (output, final hidden state, final cell state).
# Since return_sequences is not set, the output is the hidden state of the final time-step.
encoder_lstm = LSTM(n_h, return_state=True)

# Pass embedding through Encoder LSTM layer
# encoder_outputs: (None, n_h) -- (m, state vector dimensions)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# Discard "encoder_outputs" and only keep the states.
# state_h, state_c: (None, n_h) -- (m, state vector dimensions)
encoder_states = [state_h, state_c]


### DECODER ###

# Input layer
# decoder_inputs: (None, None) -- (m, Ty)
decoder_inputs = Input(shape=(None,))

# Create Embedding layer for decoder, load pre-trained embeddings, freeze weights.
# The layer object is kept in its own variable so it can be called again in a later model.
dec_emb_layer = Embedding(num_tokens_target, 
                        emb_dim_target, 
                        mask_zero = True,
                        embeddings_initializer = Constant(embedding_matrix_target),
                        trainable = False)

# Pass input through Embedding layer
# dec_emb: (None, None, emb_dim_target) -- (m, Ty, target embedding dimensions)
dec_emb = dec_emb_layer(decoder_inputs)

# Create LSTM layer
# return_sequences = True and return_state = True, which means the output will be:
# (hidden state for every time-step, hidden state for final time-step, cell state for final time-step)
decoder_lstm = LSTM(n_h, return_sequences=True, return_state=True)

# Pass embedding through Decoder LSTM layer,
# using the Encoder's final states as the Decoder's initial states.
# decoder_outputs: (None, None, n_h) -- (m, Ty, state vector dimensions)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)

# Create Dense layer with softmax activation (one output unit per target token index)
decoder_dense = Dense(num_tokens_target, activation='softmax')

# Pass Decoder LSTM outputs through Dense layer
# decoder_outputs: (None, None, num_tokens_target) -- (m, Ty, decoder vocab size + 1)
decoder_outputs = decoder_dense(decoder_outputs)



# Define the model
# inputs = [encoder_inputs, decoder_inputs]     -- (m, Tx) and (m, Ty)
# outputs = decoder_outputs                     -- (m, Ty, decoder vocab size + 1)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
                                                                                            # decoder_inputs = (None, None) -- (m, Ty)
                                                                                            # decoder_outputs = (None, None, decoder vocab size + 1) 

In [23]:
# Compile the model: Adam optimizer, categorical cross-entropy loss
# (the targets are one-hot vectors over the target token indices), accuracy reported as "acc"
model.compile(optimizer= 'Adam', loss='categorical_crossentropy', metrics=['acc'])

In [24]:
# Training hyper-parameters
batch_size = 128
epochs = 20

# Total number of training and validation samples
train_samples, val_samples = len(X_train), len(X_test)

In [25]:
# Display the shape of the training inputs; its first entry is the number of training sentences
X_train.shape

(16745,)

In [50]:
# Train model
# Both generators are given their data and the batch_size explicitly, so the batches always
# match the batch_size used for the step counts below (instead of relying on the
# generator's built-in defaults for the training data).
history = model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples // batch_size,
                    epochs = epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size,
                    verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<h3> Predicting with the model </h3>

- At this stage, the LSTMs in our Encoder and Decoder have been trained (their weights/variables have been optimized)
- Inference / prediction now takes place in two steps. 
- 1 ) Pass source sequence through Encoder's LSTMs to get the final hidden and cell state vectors
- 2 ) We will predict the target sequence one time-step (one LSTM cell) at a time. All LSTM cells share the same weights.
- 2.0) Pass in the Encoder's final hidden and cell states as the Decoder's initial hidden and cell states
- 2.1) Pass in the predicted output from previous Decoder LSTM cell as the input. 
- 2.2) Pass in the hidden and cell states from the previous Decoder LSTM cell as the initial states.

In [51]:
### ENCODER ###

# Create an encoder_model, using the same
# "encoder_inputs" and "encoder_states" that we trained above (global variables)
# encoder_inputs -- (None, None) -- (m, Tx)
# encoder_states = [state_h, state_c] -- [(None, n_h), (None, n_h)]
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)

### DECODER ###

# Define Inputs for Decoder's hidden and cell states
# each of shape (None, n_h) -- (m, state vector dims)
decoder_state_input_h = Input(shape=(n_h,))
decoder_state_input_c = Input(shape=(n_h,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]

# Pass Decoder input through the same Embedding layer object used by the training model
# dec_emb2: (None, None, emb_dim_target) -- (m, Ty, embedding dims)
dec_emb2 = dec_emb_layer(decoder_inputs)

# To predict the next word in the sequence,
# pass the Decoder states from the previous time-step as the initial states.
# The same decoder_lstm layer object as in the training model is called here.
# decoder_outputs2 = (None, None, n_h) -- (m, Ty, state vector dims)
# state_h2, state_c2 = (None, n_h) -- (m, state vector dims)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
                                            inputs = dec_emb2, 
                                            initial_state = decoder_state_input
                                            )

# The new states are returned so they can be fed back in at the next time-step
decoder_states2 = [state_h2, state_c2]

# Pass Decoder outputs through Dense layer with softmax activation
# to get probability distribution over the target vocabulary
# decoder_outputs2: (None, None, num_tokens_target) -- (m, Ty, target vocab size + 1)
decoder_outputs2 = decoder_dense(decoder_outputs2)

# Final decoder model
# inputs:  decoder_inputs (m, Ty) followed by the two state inputs, each (m, state vector dims)
# outputs: decoder_outputs2 (m, Ty, target vocab size + 1) followed by the two new states
decoder_model = Model(
    inputs = [decoder_inputs] + decoder_state_input,
    outputs = [decoder_outputs2] + decoder_states2)
                                                                        # decoder_state_input = [(m, state vector dims), (m, state vector dims)]
                                                                        # decoder_outputs2 = (m, Ty, target vocab size + 1)
                                                                        # decoder_states2 = [(m, state vector dims), (m, state vector dims)]

- Functions for making inference

In [52]:
def sentence_to_seq(sentence, word_idx=None, seq_len=None):
    """
    Converts sentence (string) into sequence of integers

    The sentence is lowercased and stripped of ASCII punctuation (as the training
    sentences were), then split on whitespace. Words beyond `seq_len` are dropped.
    A word missing from `word_idx` is mapped to the index of '[UNK]' if that entry
    exists, otherwise to 0 (the padding index).

    Arguments
    sentence -- string
    word_idx -- dict mapping word -> integer index; defaults to the notebook's word_idx_source
    seq_len  -- length of the returned sequence; defaults to the notebook's max_len
    Returns
    encoder_input_data -- numpy array of shape (1, seq_len), zero-padded on the right
    """
    # Resolve the defaults at call time so the current notebook globals are used
    if word_idx is None:
        word_idx = word_idx_source
    if seq_len is None:
        seq_len = max_len

    # Initialise numpy array with zeros
    encoder_input_data = np.zeros((1, seq_len))

    # Normalise the text, then convert it into a list of words
    cleaned = ''.join(ch for ch in sentence.lower() if ch not in string.punctuation)
    words = cleaned.split()

    # Index used for words that are not in the vocabulary
    unknown_idx = word_idx.get('[UNK]', 0)

    # Place the index of every j'th word into j'th position of "encoder_input_data"
    for j, word in enumerate(words[:seq_len]):
        encoder_input_data[0, j] = word_idx.get(word, unknown_idx)
    return encoder_input_data

In [53]:
def decode_sequence(input_seq):

    """ 
    Translates an encoded source sequence into a target sentence. 
    This function predicts one sentence at a time. 

    Arguments
    input_seq        -- array of word indices for one source sentence,
                        e.g. the (1, max_len) array returned by sentence_to_seq
    
    Returns
    decoded_sentence -- string; every predicted word is preceded by a space, and the
                        "_END" token is included when the decoder produced it
    """

    # Pass source sequence through the encoder_model to get the final hidden and cell states.
    # states_value = [state_h, state_c] -- [(m, state vector dims), (m, state vector dims)]
    states_value = encoder_model.predict(input_seq)

    # Generate empty target sequence of length 1 and
    # populate its first position with the index for the "START_" token.
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = word_idx_target['START_']

    # Now we will predict the target sequence one time-step at a time.
    # For the Decoder's initial hidden and cell states, use the Encoder's final hidden and cell states.
    decoded_words = []

    # Note: target_seq will always hold a single integer
    while True:
        # output_tokens = softmax output = (m, Ty, num_tokens_target)
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Find index with max. probability.
        # We always predict one time-step at a time, so Ty = 1; "-1" is used for generality.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Map index to word. Index 0 (padding) has no word: stop decoding instead of failing.
        sampled_word = idx_word_target.get(sampled_token_index)
        if sampled_word is None:
            break

        decoded_words.append(sampled_word)

        # Exit condition: either hit max_len words or sampled_word = "_END"
        if sampled_word == '_END' or len(decoded_words) >= max_len:
            break

        # Update the target sequence (of length 1) with the word just predicted
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update Decoder states
        states_value = [h, c]

    # Same format as before: a space in front of every word
    return ''.join(' ' + word for word in decoded_words)



- Translate a sample of input sentences from training data

In [60]:
# Number of training examples to translate
N = 10

# First N source sentences of the training data, as a plain list
sentences = X_train.iloc[:N].values.tolist()

for i, sentence in enumerate(sentences):
    # Encode the sentence as word indices, then let the model translate it
    seq = sentence_to_seq(sentence)
    translation = decode_sequence(seq)
    # Reference translation at the same position in the training labels
    true_translation = y_train.iloc[i]
    print(f'Source: {sentence} \n Predicted Translation: {translation} \n True Translation: {true_translation} \n')

Source: why didnt tom visit boston 
 Predicted Translation:  warum hat tom nicht nach boston _END 
 True Translation: START_ warum hat tom boston nicht besucht _END 

Source: please dont waste electricity 
 Predicted Translation:  bitte keine sorge das wasser _END 
 True Translation: START_ bitte keinen strom verschwenden _END 

Source: tom insulted the waiter 
 Predicted Translation:  tom beleidigte den kellner _END 
 True Translation: START_ tom beleidigte den kellner _END 

Source: theres nothing to worry about 
 Predicted Translation:  es gibt keine angst von euch _END 
 True Translation: START_ es gibt keinen grund zur aufregung _END 

Source: they appointed him manager 
 Predicted Translation:  sie ernannten ihn zum manager _END 
 True Translation: START_ sie ernannten ihn zum manager _END 

Source: one more bottle of wine please 
 Predicted Translation:  noch eine flasche wein bitte _END 
 True Translation: START_ noch eine flasche wein bitte _END 

Source: do you know what tom 

- Translate a sample of input sentences from test data

In [61]:
# Number of test examples to translate
N = 10

# First N source sentences of the test data, as a plain list
sentences = X_test.iloc[:N].values.tolist()

for i, sentence in enumerate(sentences):
    # Encode the sentence as word indices, then let the model translate it
    seq = sentence_to_seq(sentence)
    translation = decode_sequence(seq)
    # Reference translation at the same position in the test labels
    true_translation = y_test.iloc[i]
    print(f'Source: {sentence} \n Predicted Translation: {translation} \n True Translation: {true_translation} \n')

Source: i could never give up meat 
 Predicted Translation:  ich haette nie aufgehoert dass sie selten haben _END 
 True Translation: START_ ich koennte niemals ohne fleisch auskommen _END 

Source: he often plays guitar 
 Predicted Translation:  er spielt manchmal an tom _END 
 True Translation: START_ er spielt oft gitarre _END 

Source: tom and mary didnt have a choice 
 Predicted Translation:  tom und maria haben keine kinder _END 
 True Translation: START_ tom und maria hatten keine wahl _END 

Source: the water is nice and cool 
 Predicted Translation:  das wasser ist sehr heiss _END 
 True Translation: START_ das wasser ist angenehm kuehl _END 

Source: which browser are you using 
 Predicted Translation:  welche kandidatin macht dich _END 
 True Translation: START_ welchen netzgucker gebrauchst du _END 

Source: do you have one that is a little smaller 
 Predicted Translation:  hast du das ziemlich eine menge auch nicht _END 
 True Translation: START_ hast du eins welches klein