In [1]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import string 
import regex as re

from sklearn.model_selection import train_test_split

import io
import spacy

import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [2]:
# Load the sentence pairs from a path relative to the notebook.
# pd.read_table splits on tabs by default; the three columns are named here.
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [3]:
# Drop the 'attr' column and give the two text columns descriptive names.
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [4]:
# Preview the raw sentence pairs before any cleaning.
df_en_de

Unnamed: 0,english,german
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!
...,...,...
251715,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
251716,If someone who doesn't know your background sa...,"Wenn jemand, der nicht weiß, woher man kommt, ..."
251717,It may be impossible to get a completely error...,"Es ist wohl unmöglich, einen vollkommen fehler..."
251718,I know that adding sentences only in your nati...,"Ich weiß wohl, dass das ausschließliche Beitra..."


In [5]:
# Characters stripped from both languages: every ASCII punctuation mark.
# The apostrophe is one of them, so quotes need no separate removal pass.
exclude = set(string.punctuation)
_strip_punctuation = str.maketrans('', '', string.punctuation)


def _normalise(text):
    """Lowercase `text` and delete every character contained in `exclude`."""
    return text.lower().translate(_strip_punctuation)


# Source side: cleaned text only.
df_en_de['english'] = df_en_de['english'].apply(_normalise)

# Target side: cleaned text wrapped in the start / end markers the decoder uses.
df_en_de['german'] = df_en_de['german'].apply(lambda x: 'START_ ' + _normalise(x) + ' _END')



In [6]:
# Check the cleaned pairs: lowercase, punctuation removed, German wrapped in START_ / _END.
df_en_de.head()

Unnamed: 0,english,german
0,go,START_ geh _END
1,hi,START_ hallo _END
2,hi,START_ grüß gott _END
3,run,START_ lauf _END
4,run,START_ lauf _END


In [7]:
# Shorter alias for convenience. No copy is made: `pairs` and `df_en_de`
# refer to the same DataFrame object until `pairs` is reassigned.
pairs = df_en_de

In [8]:
# Work on a copy so the length columns are not written into df_en_de and the
# cell gives the same result every time it is re-run.
pairs = df_en_de.copy()

# Sentence length in tokens, splitting on single spaces.
pairs['english_length'] = pairs['english'].apply(lambda x: len(x.split(' ')))
pairs['german_length'] = pairs['german'].apply(lambda x: len(x.split(' ')))

# Keep only pairs where both sides have at most max_len tokens
# (the German count includes the START_ and _END markers).
max_len = 10
pairs = pairs[(pairs['english_length'] <= max_len) & (pairs['german_length'] <= max_len)]
print (len(pairs))

# Train on a 1% subsample. The seed is fixed because the vocabulary and the
# token ids built later depend on exactly which rows are drawn; without it
# every run yields different ids.
pairs = pairs.sample(frac = 0.01, random_state = 101)
print(len(pairs))

209317
2093


In [36]:
# Vocabularies: every distinct whitespace-separated token on each side.
all_en_words = {word for eng in pairs['english'] for word in eng.split()}
all_de_words = {word for de in pairs['german'] for word in de.split()}

# Longest source / target sequence, counted by splitting on single spaces.
max_length_src = np.max([len(l.split(' ')) for l in pairs['english']])
length_list = [len(l.split(' ')) for l in pairs['german']]
max_length_tar = np.max(length_list)

# Sorted word lists give every word a stable position for a given vocabulary.
input_words = sorted(all_en_words)
target_words = sorted(all_de_words)

# Vocabulary sizes. The "+ 1" accounts for id 0, which is never assigned to a
# word below: it is the value left in zero-initialised (padded) positions.
num_encoder_tokens = len(all_en_words) + 1
num_decoder_tokens = len(all_de_words) + 1

# word -> token id, with ids starting at 1
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

# token id -> word
reverse_input_token_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_token_index = {idx: word for word, idx in target_token_index.items()}

In [37]:
# Token ids assigned to the decoder's start and end markers.
target_token_index['START_'], target_token_index['_END']

(17, 18)

In [38]:
# Longest source / target length and the two vocabulary sizes (each includes the extra id 0).
max_length_src, max_length_tar, num_encoder_tokens, num_decoder_tokens

(10, 10, 2082, 2659)

In [39]:
# Inspect the sampled English sentences.
pairs['english']

4004                                  i felt naked
86533                     tom got up and went away
74469                       its hard to please tom
69958                       you arrived a bit late
100436                  he put the key in the lock
                            ...                   
181154           you both love each other dont you
206124    tom poured some apple juice into a glass
134011               tom nervously opened the door
9284                                 ill go see it
94423                    she is a very poor driver
Name: english, Length: 2093, dtype: object

In [40]:
# English sentences are the model input, German sentences the target.
X, y = pairs['english'], pairs['german']
# Hold out 10% of the pairs; random_state fixes the split for a given `pairs`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state=101)

In [41]:
# Number of training sentences.
X_train.shape

(1883,)

In [42]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    '''Yield training batches for the encoder-decoder model, forever.

    The data is walked in consecutive slices of `batch_size` rows; after the
    last slice the walk restarts from the beginning.

    Parameters
    ----------
    X : sequence of str
        Source sentences; every word must be a key of `input_token_index`.
    y : sequence of str
        Target sentences of the form 'START_ ... _END'; every word must be a
        key of `target_token_index`.
    batch_size : int
        Maximum number of rows per batch. The last slice of a pass may hold
        fewer rows.

    Yields
    ------
    ([encoder_input_data, decoder_input_data], decoder_target_data)
        encoder_input_data  : (rows, max_length_src) token ids, 0 = padding
        decoder_input_data  : (rows, max_length_tar) token ids of the target
                              without its last token, 0 = padding
        decoder_target_data : (rows, max_length_tar, num_decoder_tokens)
                              one-hot target without its first token, i.e.
                              the decoder input shifted one step ahead

    Raises
    ------
    ValueError
        If `X` is empty or `X` and `y` differ in length.
    '''
    if len(X) == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError('generate_batch needs at least one sample')
    if len(X) != len(y):
        raise ValueError('X and y must have the same number of samples')

    while True:
        for start in range(0, len(X), batch_size):
            batch_X = X[start:start + batch_size]
            batch_y = y[start:start + batch_size]

            # Size the arrays by the rows actually present, so the final
            # partial batch is not filled up with all-zero samples.
            rows = len(batch_X)
            encoder_input_data = np.zeros((rows, max_length_src), dtype='float32')
            decoder_input_data = np.zeros((rows, max_length_tar), dtype='float32')
            decoder_target_data = np.zeros((rows, max_length_tar, num_decoder_tokens), dtype='float32')

            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_y)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word]  # encoder input seq

                # Look the ids up once per sentence instead of re-splitting per token.
                target_ids = [target_token_index[word] for word in target_text.split()]
                last = len(target_ids) - 1
                for t, token_id in enumerate(target_ids):
                    if t < last:
                        # decoder input: everything except the final token
                        decoder_input_data[i, t] = token_id
                    if t > 0:
                        # decoder target: everything except START_, one step
                        # ahead of the input. The one-hot column is the token
                        # id itself; column 0 belongs to padding, matching the
                        # ids used by reverse_target_token_index when decoding.
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

<h3> Model for training </h3>

In [43]:
# Tx = max_len
# Ty = Tx

# # Modify these later based on spacy's word vectors
# input_embedding_dims = 100
# output_embedding_dims = 100

# n_a = 64

train_samples = len(X_train)  # number of training pairs
val_samples = len (X_test)    # number of held-out pairs
batch_size = 128
epochs = 50                   # NOTE: reassigned to 100 in a later cell before model.fit is called
latent_dim = 256              # used as both the embedding size and the LSTM state size below

In [None]:
# For the Input layers, we define "time-step" number as None
# As such: we can input variables of different time-step lengths
# This will be useful during the prediction stage, where we will feed one word at a time

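# The layer objects created below are kept in notebook-level variables so that
# the inference models defined later can call the same objects again.
# Calling an existing layer object reuses its weights, so the inference models
# share the weights learned by the training model.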

In [44]:
# Training model: an LSTM encoder condenses the source sentence into its final
# states; an LSTM decoder, started from those states, receives the target
# sentence and predicts a distribution over the target vocabulary at every step.
# Shape notes use m = batch size, Tx / Ty = source / target time steps.

# Define an input sequence and process it.
# encoder_inputs: (m, Tx) token ids; the time axis is None, so any length is accepted.
encoder_inputs = Input(shape=(None,))
# enc_emb: (m, Tx, latent_dim); mask_zero=True treats id 0 as padding.
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
# encoder_outputs, state_h and state_c are each (m, latent_dim).
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
# decoder_inputs: (m, Ty) token ids.
decoder_inputs = Input(shape=(None,))
# The embedding layer is kept in its own variable so the inference decoder can reuse it.
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
# dec_emb: (m, Ty, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
# decoder_outputs: (m, Ty, latent_dim)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
# decoder_outputs: (m, Ty, num_decoder_tokens) -- one softmax over the target vocabulary per step
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that takes encoder and decoder input
# to output decoder_outputs:
#   inputs  [encoder_inputs (m, Tx), decoder_inputs (m, Ty)]
#   output  decoder_outputs (m, Ty, num_decoder_tokens)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [45]:
# The targets yielded by generate_batch are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer= 'rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [46]:
# These repeat the values set in the earlier hyperparameter cell, except for
# `epochs`, which is raised from 50 to 100 here; this is the value model.fit uses.
train_samples = len(X_train) # Total training samples
val_samples = len(X_test)    # Total validation or test samples
batch_size = 128
epochs = 100

In [47]:
# Re-check the training-set size right before fitting.
X_train.shape

(1883,)

In [48]:
# Train on batches from generate_batch(), called with its defaults
# (X_train, y_train, batch_size=128). steps_per_epoch uses floor division, so
# each epoch draws train_samples // batch_size batches from the never-ending
# generator. No validation data is passed to fit.
history = model.fit(generate_batch(), steps_per_epoch=train_samples//batch_size, epochs = epochs, verbose = 1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [105]:
# Inference models. Both are built from the tensors and layer objects created
# for the training model, so they use the same weights.
# Shape notes use m = batch size, Ty = number of decoder time steps.

# Encode the input sequence to get the "Context vectors"
#   input   encoder_inputs (m, Tx)
#   output  encoder_states = [state_h, state_c], each (m, latent_dim)
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step, each (m, latent_dim)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
# Get the embeddings of the decoder sequence: (m, Ty, latent_dim)
dec_emb2 = dec_emb_layer(decoder_inputs)
# To predict the next word in the sequence, set the initial states
# to the states from the previous time step.
#   decoder_outputs2    (m, Ty, latent_dim)
#   state_h2, state_c2  (m, latent_dim) each
decoder_outputs2, state_h2, state_c2 = decoder_lstm(
                                            dec_emb2, 
                                            initial_state=decoder_state_input
                                            )
decoder_states2 = [state_h2, state_c2]
# A dense softmax layer to generate prob dist. over the target vocabulary:
# (m, Ty, num_decoder_tokens)
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Final decoder model
#   inputs   [decoder_inputs (m, Ty), state_h (m, latent_dim), state_c (m, latent_dim)]
#   outputs  [decoder_outputs2 (m, Ty, num_decoder_tokens), state_h2, state_c2]
decoder_model = Model(
    [decoder_inputs] + decoder_state_input,
    [decoder_outputs2] + decoder_states2)

In [107]:
# Inspect the symbolic output shape of the inference decoder.
decoder_outputs2.shape

TensorShape([None, None, 2659])

In [188]:
def sentence_to_seq(sentence):
    """Convert one English sentence into the encoder's integer input array.

    The sentence is normalised the same way as the training data (lowercased,
    ASCII punctuation removed) before its words are looked up in
    `input_token_index`.

    Parameters
    ----------
    sentence : str
        Raw source sentence.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, width) holding the token ids, zero-padded on the
        right. `width` is `max_length_src`, or the number of words if the
        sentence is longer than that (the encoder input has no fixed length).

    Raises
    ------
    KeyError
        If any word is missing from the source vocabulary. The vocabulary has
        no "unknown" token, so such a sentence cannot be encoded.
    """
    # Same cleaning as the training text, so "Where is Tom?" maps to known words.
    cleaned = ''.join(ch for ch in sentence.lower() if ch not in string.punctuation)
    words = cleaned.split()

    unknown = [word for word in words if word not in input_token_index]
    if unknown:
        raise KeyError(f"words not in the source vocabulary: {unknown}")

    # Never narrower than the training width; widened only for long input,
    # which previously overflowed the array with an IndexError.
    width = max(max_length_src, len(words))
    encoder_input_data = np.zeros((1, width))
    for j, word in enumerate(words):
        encoder_input_data[0, j] = input_token_index[word]
    return encoder_input_data

In [130]:
def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence.

    The encoder model turns `input_seq` into its two state vectors; the
    decoder model is then run one token at a time, starting from 'START_',
    and the most probable word of each step is fed back in as the next input.

    Parameters
    ----------
    input_seq : numpy.ndarray
        Encoder input of shape (1, Tx) holding token ids (a batch of one).

    Returns
    -------
    str
        The generated words, each preceded by a single space. The string ends
        with ' _END' unless the length limit was reached first.
    """
    # Encode the input as state vectors: [state_h, state_c].
    states_value = encoder_model.predict(input_seq)

    # The decoder input is always a single token id, shape (1, 1);
    # the first one is the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    decoded_words = []
    while True:
        # output_tokens: (1, 1, num_decoder_tokens); h, c: updated decoder states
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Pick the most probable word. Column 0 is the padding id and has no
        # entry in reverse_target_token_index, so it is excluded from the search.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, 1:])) + 1
        sampled_word = reverse_target_token_index[sampled_token_index]
        decoded_words.append(sampled_word)

        # Stop at the end marker, or once as many tokens have been produced as
        # the longest target sequence contains (counted in tokens, not characters).
        if sampled_word == '_END' or len(decoded_words) >= max_length_tar:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return ''.join(' ' + word for word in decoded_words)



In [201]:
# Translate the first ten training sentences as a quick sanity check.
sentences = list(X_train.iloc[:10].values)

# Encode each sentence, then decode it greedily.
translations = [decode_sequence(sentence_to_seq(sentence)) for sentence in sentences]

# Print one (source, translation) tuple per line.
sentence_translation_pairs = zip(sentences, translations)
for elem in sentence_translation_pairs:
    print(elem)

('we need to know what happened', ' wir müssen wissen was passiert ist _END')
('tom promised it would never happen again', ' tom versprach dass es nie wieder vorkäme _END')
('tom knows how to milk a cow', ' tom weiß wie man eine kuh melkt _END')
('can we fix this', ' können wir das beheben _END')
('is this your letter', ' ist dies dein brief _END')
('my assistant will be able to handle that', ' mein assistent wird damit schon klarkommen _END')
('tom isnt new here', ' tom ist nicht neu hier _END')
('tom is wearing someone elses coat', ' tom trägt jemandes anderen mantel _END')
('tom is living in the past', ' tom lebt in der vergangenheit _END')
('tom was untidy', ' tom war unordentlich _END')
