- Data source: English–German sentence pairs from http://www.manythings.org/anki/ (read below from `deu-eng/deu.txt`)

In [1]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, RNN, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import string 
import regex as re

from sklearn.model_selection import train_test_split

import io
import spacy

import warnings 
warnings.filterwarnings('ignore')

# from tensorflow.python.framework.ops import disable_eager_execution
# disable_eager_execution()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the tab-separated sentence pairs; the third column (named 'attr' here) is dropped in the next cell
df_en_de = pd.read_csv(
    'deu-eng/deu.txt',
    sep='\t',
    names=['eng', 'deu', 'attr'],
)

In [3]:
# Discard the unused 'attr' column and switch to descriptive column names
df_en_de = (
    df_en_de
    .drop(columns='attr')
    .rename(columns={'eng': 'english', 'deu': 'german'})
)

In [4]:
# Characters stripped from every sentence: ASCII punctuation
exclude = set(string.punctuation)

# Both languages get the same clean-up: lowercase -> drop apostrophes -> drop punctuation
for column in ('english', 'german'):
    cleaned = df_en_de[column].apply(lambda text: text.lower())
    cleaned = cleaned.apply(lambda text: re.sub("'", '', text))
    df_en_de[column] = cleaned.apply(
        lambda text: ''.join(ch for ch in text if ch not in exclude)
    )

# Wrap every target sentence in the start / end markers used by the decoder
df_en_de['german'] = df_en_de['german'].apply(lambda text: 'START_ ' + text + ' _END')

In [5]:
# Maximum number of space-separated tokens kept per sentence.
# For German this count includes the START_ and _END markers added above.
max_len = 10

pairs = df_en_de
pairs['english_length'] = pairs['english'].apply(lambda x: len(x.split(' ')))
pairs['german_length'] = pairs['german'].apply(lambda x: len(x.split(' ')))

# Keep only pairs where both sides fit into max_len tokens
pairs = pairs[(pairs['english_length'] <= max_len) & (pairs['german_length'] <= max_len)]
print(len(pairs))

# Work on a 10% sample. The seed is fixed because the vocabularies (and therefore
# the embedding / output layer sizes) are derived from whichever rows are drawn here.
pairs = pairs.sample(frac = 0.1, random_state = 101)
print(len(pairs))

209317
20932


In [6]:
# Vocabulary of English: every distinct whitespace-separated token
all_en_words = {word for sentence in pairs['english'] for word in sentence.split()}

# Vocabulary of German (contains the START_ / _END markers as ordinary tokens)
all_de_words = {word for sentence in pairs['german'] for word in sentence.split()}

# Longest source / target sentence, counted in single-space-separated tokens
max_length_src = np.max([len(sentence.split(' ')) for sentence in pairs['english']])
max_length_tar = np.max([len(sentence.split(' ')) for sentence in pairs['german']])

# Sorting gives every word a deterministic position for a given vocabulary
input_words = sorted(all_en_words)
target_words = sorted(all_de_words)

# Vocab size for both source and target; +1 because index 0 is reserved for zero padding
num_encoder_tokens = len(all_en_words) + 1
num_decoder_tokens = len(all_de_words) + 1

# word -> index dictionaries (indices start at 1)
input_word_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_word_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# index -> word dictionaries (inverse of the two mappings above)
input_index_word = {idx: word for word, idx in input_word_index.items()}
target_index_word = {idx: word for word, idx in target_word_index.items()}

In [7]:
# Hold out 20% of the sentence pairs; the fixed seed keeps the split repeatable
X = pairs['english']
y = pairs['german']
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=101,
)

In [8]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    """Yield batches of index-encoded sentence pairs forever, cycling over the data in order.

    Args:
        X: source sentences (anything sliceable with a length, e.g. a pandas Series of str).
        y: target sentences aligned with X.
        batch_size: maximum number of sentence pairs per batch.

    Yields:
        ([encoder_input_data, decoder_input_data], decoder_target_data), where m is the
        number of pairs in the batch (smaller than batch_size only for the final slice):
          encoder_input_data  -- (m, max_length_src) source word indices, zero padded
          decoder_input_data  -- (m, max_length_tar) target word indices without the last token
          decoder_target_data -- (m, max_length_tar, num_decoder_tokens) one-hot targets,
                                 shifted one step left (the first token is not a target)

    Raises:
        ValueError: if X is empty (the generator could never yield).
        KeyError: if a word is missing from input_word_index / target_word_index.
    """
    if len(X) == 0:
        raise ValueError('generate_batch needs at least one sentence pair')

    while True:
        for j in range(0, len(X), batch_size):          # j = offset of the first pair in this batch
            batch_inputs = X[j:j+batch_size]
            batch_targets = y[j:j+batch_size]
            # Size the arrays to the real number of pairs so the final, shorter slice
            # does not carry all-zero phantom samples into training.
            m = min(len(batch_inputs), len(batch_targets))

            encoder_input_data = np.zeros((m, max_length_src), dtype='float32')                       # (m, max_length_src)
            decoder_input_data = np.zeros((m, max_length_tar), dtype='float32')                       # (m, max_length_tar)
            decoder_target_data = np.zeros((m, max_length_tar, num_decoder_tokens), dtype='float32')  # (m, max_length_tar, num_decoder_tokens)

            for i, (input_text, target_text) in enumerate(zip(batch_inputs, batch_targets)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_word_index[word]   # encoder input seq

                target_tokens = target_text.split()
                last = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_word_index[word]
                    if t < last:
                        # decoder input seq: everything except the final token
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # decoder target seq (one-hot), offset by one timestep:
                        # everything except the first token
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [9]:
# Decoder vocabulary size: number of distinct German tokens + 1 for the padding index 0
num_decoder_tokens

10250

In [11]:
# Source / target sequence lengths the model is built for.
# NOTE(review): generate_batch sizes its arrays with max_length_src / max_length_tar,
# so training only works when those equal max_len -- confirm for the sampled data.
Tx = Ty = max_len

# Layers shared by every call to one_step_attention (one set of attention weights)
repeator = RepeatVector(Tx)                       # copies s_prev Tx times
concatenator = Concatenate(axis=-1)               # joins h and repeated s_prev on the feature axis
densor1 = Dense(units=10, activation='tanh')      # hidden layer of the scoring network
densor2 = Dense(units=1, activation='relu')       # one energy per source position
dotor = Dot(axes=1)                               # weighted sum over the Tx axis

In [12]:
def one_step_attention (h, s_prev):
    """Compute the attention context vector for one decoder time-step.

    Args:
        h: encoder hidden states, expected shape (m, Tx, n_h).
        s_prev: previous decoder hidden state, expected shape (m, n_s).

    Returns:
        context: attention-weighted sum of h over the Tx axis, shape (m, 1, n_h).
        The caller concatenates it with the embedded previous target token and
        feeds the result to the decoder LSTM.
    """
    s_repeated = repeator(s_prev)                          # (m, Tx, n_s)
    scorer_input = concatenator([h, s_repeated])           # (m, Tx, n_h + n_s)
    energies = densor2(densor1(scorer_input))              # (m, Tx, 10) -> (m, Tx, 1)
    alphas = tf.nn.softmax(energies, axis = 1)             # normalise over source positions: (m, Tx, 1)
    return dotor([alphas, h])                              # (m, Tx, 1) . (m, Tx, n_h) -> (m, 1, n_h)

In [13]:
# Embedding sizes for source (x) and target (y) tokens
x_emb_dim, y_emb_dim = 300, 300

# Hidden sizes of the encoder (n_h) and decoder (n_s) LSTMs. They must match because
# the encoder's final states are used as the decoder's initial states.
n_h, n_s = 200, 200


In [14]:
# Decoder-side layers, created once so the training graph and the inference
# decoder built later reuse the same weights.
post_attention_LSTM_cell = LSTM(units=n_s, return_state=True)
output_layer = Dense(units=num_decoder_tokens, activation='softmax')

2022-08-01 11:10:22.114934: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
# Training graph: embed source -> encoder LSTM -> (per target step) attention + decoder LSTM -> softmax.
# h = hidden state of pre-attention RNN layer
# s = hidden state of post-attention RNN layer

x_inputs = Input(shape = (Tx,))                         # (None, Tx) = (m, Tx)
x_emb_layer = Embedding(
                            num_encoder_tokens, 
                            x_emb_dim, 
                            mask_zero = True            # index 0 is the padding index
                            )      
x_emb = x_emb_layer(x_inputs)                         # (None, Tx, x_emb_dim) = (m, Tx, x_emb_dim)

y_inputs = Input(shape = (Ty,))                         # (None, Ty) = (m, Ty)
y_emb_layer = Embedding(
                          num_decoder_tokens,
                          y_emb_dim,
                          mask_zero = True              # index 0 is the padding index
                          )
y_emb = y_emb_layer(y_inputs)                         # (None, Ty, y_emb_dim) = (m, Ty, y_emb_dim)


# pass x embeddings through pre-attention LSTM layer
# here, we will use the final hidden-state as the initial post-attention LSTM hidden state

enc_lstm_layer = LSTM(n_h, return_sequences=True, return_state = True)
h_enc, s_enc, c_enc = enc_lstm_layer(x_emb)                             # h_enc = (None, Tx, n_h) = (m, Tx, n_h)
                                                                        # s_enc = (None, n_h) = (m, n_h)
                                                                        # c_enc = (None, n_h) = (m, n_h)
# s<0> and c<0> for decoder = final hidden state and cell state of the encoder
s_dec = s_enc
c_dec = c_enc                                                              

# Unroll the decoder for Ty steps. Step t is fed the embedding of y_inputs[:, t]
# (the ground-truth token supplied by the data generator) together with the attention context.
outputs = []
for t in range(Ty):
    context = one_step_attention(h_enc, s_dec)                                            # context = (m, 1, n_h)
    concat = Concatenate(axis = -1)([context, tf.expand_dims(y_emb[:,t,:],1)])            # concat = (m, 1, n_h + y_emb_dim)
    
    # update decoder LSTM hidden state (s) and cell state (c)
    _, s_dec, c_dec = post_attention_LSTM_cell (initial_state = [s_dec, c_dec], inputs = concat)        # s_dec, c_dec = (m, n_s)
    
    # pass decoder LSTM hidden state (s) through output layer to get y prediction
    out = output_layer(s_dec)                                                                   # out = (m, num_decoder_tokens)
    outputs.append(out)
                                                            
# Ty tensors of shape (m, num_decoder_tokens) -> one tensor (m, Ty, num_decoder_tokens)
outputs = tf.stack(outputs, axis = 1)
print(num_decoder_tokens, outputs.shape)
model = Model(inputs = [x_inputs, y_inputs], outputs = outputs)

10250 (None, 10, 10250)


In [16]:
# The generator yields one-hot targets, hence categorical cross-entropy
model.compile(
    optimizer='Adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [17]:
# Training configuration
batch_size = 128                  # sentence pairs per batch
epochs = 5                        # passes over the training set
train_samples = len(X_train)      # total training samples
val_samples = len(X_test)         # total validation samples

In [57]:
# Train from the batch generators. Every size below comes from the configuration cell
# (batch_size, epochs), so the generators and the step counts cannot drift apart.
history = model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples // batch_size,
                    epochs = epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size,
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [90]:
# Inference-time models. Both are built from layers already used in the training graph
# (x_inputs / encoder outputs, y_emb_layer, the attention layers inside one_step_attention,
# post_attention_LSTM_cell, output_layer), so they run with the trained weights.

# x_inputs defined above as Input
# h_enc, s_enc, c_enc also defined above
encoder_model = Model(inputs = x_inputs, outputs = [h_enc, s_enc, c_enc])

# x_inputs = (None, Tx)

# h_enc = (None, Tx, n_h)
# s_enc = (None, n_s) 
# c_enc = (None, n_s) 

# The below three decoder inputs will come from encoder_model.predict()
# (decoder_input_s / decoder_input_c are replaced by the decoder's own outputs after the first step)
decoder_input_h = Input(shape = (Tx, n_h))               # (None, Tx, n_h) 
decoder_input_s = Input(shape=(n_s, ))                    # (None, n_s) 
decoder_input_c = Input(shape = (n_s,))                   # (None, n_s) 

# y_inp_2 carries the previously predicted token index (y_pred at t-1);
# only its last timestep is used below
y_inp_2 = Input(shape = (None,))                        # (None, None) = (m, any length)
y_emb_2 = y_emb_layer(y_inp_2)                          # (None, None, y_emb_dim)

# Use decoder_input_s and decoder_input_h to compute context vector
context = one_step_attention(decoder_input_h, decoder_input_s)    # (m, 1, n_h)

# concatenate context with the embedding of the last token in y_inp_2
concat2 = Concatenate(axis = -1)([context, tf.expand_dims(y_emb_2[:,-1,:],1)])                       
                                                        # concat2 = (None, 1, n_h + y_emb_dim)

# Feed concat2 as input; decoder_input_s and decoder_input_c as initial state
_, decoder_output_s, decoder_output_c = post_attention_LSTM_cell (
                                                        initial_state = [decoder_input_s, decoder_input_c], 
                                                        inputs = concat2
                                                        )     
                                            # decoder_output_s = (None, n_s) 
                                            # decoder_output_c = (None, n_s) 

#decoder_output_s = tf.expand_dims(decoder_output_s, 1)      # decoder_output_s = (None, 1, n_s)
#decoder_output_c = tf.expand_dims(decoder_output_c, 1)      # decoder_output_c = (None, 1, n_s)
# Token distribution for this single step; the states are returned un-expanded so they
# can be fed straight back in as decoder_input_s / decoder_input_c
decoder_output_y = output_layer(tf.expand_dims(decoder_output_s,1))           # (None, 1, num_decoder_tokens)

decoder_model = Model(inputs = [decoder_input_h, decoder_input_s, decoder_input_c, y_inp_2],
                         outputs = [decoder_output_y, decoder_output_s, decoder_output_c])

In [99]:
def decode_sequence(input_sequence):
    """Greedily translate one index-encoded source sentence.

    Args:
        input_sequence: array of shape (1, max_len) holding source word indices
            (as produced by sentence_to_seq).

    Returns:
        The decoded words joined by spaces, each preceded by a space (so the string
        starts with ' '). A trailing '_END' is included when the model emitted it.
        Decoding stops at '_END', once more than max_len words were produced, or
        when the model predicts the padding index 0, which has no word.
    """
    # hidden states for attention + final hidden / cell state from the encoder
    h_enc_pred, s_enc_pred, c_enc_pred = encoder_model.predict(input_sequence)

    # y_pred at time 0 is the start marker
    target_seq = np.zeros((1,1))
    target_seq[0,0] = target_word_index['START_']

    decoded_sentence = ''

    # the decoder starts from the encoder's final states
    decoder_s_pred = s_enc_pred                         # (1, n_h)
    decoder_c_pred = c_enc_pred                         # (1, n_h)

    while True:
        decoder_y_pred, decoder_s_pred, decoder_c_pred = decoder_model.predict(
            [h_enc_pred, decoder_s_pred, decoder_c_pred, target_seq]
        )
        y_index = int(np.argmax(decoder_y_pred[0,-1,:]))

        # Index 0 is reserved for padding and is absent from target_index_word;
        # treat it as "nothing more to say" instead of failing with KeyError.
        y_word = target_index_word.get(y_index)
        if y_word is None:
            break

        decoded_sentence += ' ' + y_word

        # Exit condition: either find the stop token or hit max length
        if y_word == '_END' or len(decoded_sentence.split()) > max_len:
            break

        # The predicted token becomes the next decoder input (sequence of length 1)
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = y_index

    return decoded_sentence


In [101]:
def sentence_to_seq(sentence):
    """Encode a raw English sentence as a zero-padded row of word indices.

    The sentence receives the same clean-up as the training data (lowercasing and
    removal of ASCII punctuation, which includes apostrophes), so input such as
    "Tell me a joke." maps to the same indices as the training sentence "tell me a joke".

    Args:
        sentence: the source sentence as a string.

    Returns:
        Array of shape (1, max_len); unused trailing positions stay 0 (padding).

    Raises:
        KeyError: if a word is not in input_word_index.
        IndexError: if the sentence has more than max_len words.
    """
    encoder_input_data = np.zeros((1, max_len))     # (1, max_len)

    # Mirror the training-time normalisation before looking words up
    cleaned = ''.join(ch for ch in sentence.lower() if ch not in string.punctuation)

    for j, word in enumerate(cleaned.split()):
        encoder_input_data[0,j] = input_word_index[word]

    return encoder_input_data                       # (1, max_len)


In [87]:
# Translate the first ten training sentences as a quick sanity check
sentences = X_train.iloc[:10].tolist()          # list of sentences

# Encode each sentence to shape (1, max_len), then decode it greedily
translations = [decode_sequence(sentence_to_seq(text)) for text in sentences]

# Show each (source, translation) pair as a tuple
for pair in zip(sentences, translations):
    print (pair)

('poets write poems', ' lass den tisch gegessen _END')
('he admired my new car', ' er sieht mein auto hat _END')
('tom was offended', ' tom war nervös _END')
('tom is a friend of a friend of mine', ' tom ist ein bisschen ein guter freund _END')
('plutonium239 has a halflife of 24100 years', ' plutonium239 hat eine hohe von des gegessen _END')
('i think tom is drunk', ' ich glaube tom ist gerade glücklich _END')
('cardboard is stronger than paper', ' pappe ist schwerer als meine mutter _END')
('a good idea occurred to him', ' ein buch ist von ihm sehr geholfen _END')
('tell me a joke', ' gib mir ein hund _END')
('i just want to sleep', ' ich will nur im bett _END')
