In [3]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import string 
import regex as re

from sklearn.model_selection import train_test_split

import io
import spacy

import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

# df_en_de = pd.read_table('/content/gdrive/MyDrive/deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [4]:
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [5]:
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [6]:
df_en_de

Unnamed: 0,english,german
0,Go.,Geh.
1,Hi.,Hallo!
2,Hi.,Grüß Gott!
3,Run!,Lauf!
4,Run.,Lauf!
...,...,...
251715,If someone who doesn't know your background sa...,"Wenn jemand Fremdes dir sagt, dass du dich wie..."
251716,If someone who doesn't know your background sa...,"Wenn jemand, der nicht weiß, woher man kommt, ..."
251717,It may be impossible to get a completely error...,"Es ist wohl unmöglich, einen vollkommen fehler..."
251718,I know that adding sentences only in your nati...,"Ich weiß wohl, dass das ausschließliche Beitra..."


In [7]:
# Lowercase all characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: x.lower())
df_en_de['german'] = df_en_de['german'].apply(lambda x: x.lower())

# Remove quotes
df_en_de['english'] = df_en_de['english'].apply(lambda x: re.sub("'", '', x))
df_en_de['german'] = df_en_de['german'].apply(lambda x: re.sub("'", '', x))

# Set of all special characters
exclude = set(string.punctuation) 

# Remove all the special characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df_en_de['german']=df_en_de['german'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Add start and end tokens to target sequences
df_en_de['german'] = df_en_de['german'].apply(lambda x : 'START_ '+ x + ' _END')



In [8]:
df_en_de.head()

Unnamed: 0,english,german
0,go,START_ geh _END
1,hi,START_ hallo _END
2,hi,START_ grüß gott _END
3,run,START_ lauf _END
4,run,START_ lauf _END


In [9]:
#rename dataframe for convenience
pairs = df_en_de

In [51]:
pairs = df_en_de
pairs['english_length'] = pairs['english'].apply(lambda x: len(x.split(' ')))
pairs['german_length'] = pairs['german'].apply(lambda x: len(x.split(' ')))
max_len = 10
pairs = pairs[pairs['english_length'] <= max_len]
pairs = pairs[pairs['german_length'] <= max_len]
print (len(pairs))
pairs = pairs.sample(frac = 0.01)
print(len(pairs))

209317
2093


In [92]:
# Vocabulary of English
all_en_words=set()
for eng in pairs['english']:
    for word in eng.split():
        if word not in all_en_words:
            all_en_words.add(word)

# Vocabulary of German 
all_de_words=set()
for de in pairs['german']:
    for word in de.split():
        if word not in all_de_words:
            all_de_words.add(word)

# Max Length of source sequence
length_list=[]
for l in pairs['english']:
    length_list.append(len(l.split(' ')))
max_length_src = np.max(length_list)

# Max Length of target sequence
length_list=[]
for l in pairs['german']:
    length_list.append(len(l.split(' ')))
max_length_tar = np.max(length_list)


input_words = sorted(list(all_en_words))
target_words = sorted(list(all_de_words))

# Calculate Vocab size for both source and target
num_encoder_tokens = len(all_en_words)
num_decoder_tokens = len(all_de_words) + 1

#""" find out why you add 1"""
#num_decoder_tokens += 1 # For zero padding 

# Create word to token dictionary for both source and target
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# Create token to word dictionary for both source and target
reverse_input_token_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_token_index = dict((i, word) for word, i in target_token_index.items())

In [93]:
target_token_index['START_'], target_token_index['_END']

(10, 11)

In [95]:
max_length_src, max_length_tar, num_encoder_tokens, num_decoder_tokens

(10, 10, 2032, 2595)

In [55]:
pairs['english']

213570    tom and mary are getting married on monday
133727                 tom is much stronger than you
166242             tom got home just before daylight
98876                      you will do no such thing
177253           i wish you would let tom go with us
                             ...                    
151409               tom took the horses to the barn
171586            nothing is achieved without effort
58221                          tom and mary saw john
154943               i dont think we need to do that
171548            no further discussion is necessary
Name: english, Length: 2093, dtype: object

In [57]:
# Try with a small dataset first, due to problems with dealing with too large a batch at a time
X_train, X_test, y_train, y_test = train_test_split(pairs['english'], pairs['german'], test_size=0.1, random_state=101)

In [58]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [59]:
X_train

array(['tom is doing great in school', 'dont take our word for it',
       'tom wasnt alone on the island', ...,
       'would you like to play with tom', 'tom says im beautiful',
       'what a cute baby may i hold her'], dtype=object)

In [60]:
X_train.shape

(1883,)

In [96]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):          # j = batch number
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32')

            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')
            
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1. 
                        """ This should be target_token_index[word] - 1"""
            # decoder_target_data = np.transpose(decoder_target_data, axes = [1, 0, 2])
            # decoder_target_data = list(decoder_target_data)
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

<h3> Model for training </h3>

In [62]:
Tx = max_len
Ty = Tx

# Modify these later based on spacy's word vectors
input_embedding_dims = 100
output_embedding_dims = 100

n_a = 64

In [63]:
# layers as global variables

# Create layer objects LSTM_cell and densor
decoder_LSTM_cell = LSTM (n_a, return_state = True)        
densor = Dense(num_decoder_tokens, activation='softmax')

# Create reshaper object – will be used in function "learning_model"
reshaper = Reshape((1, output_embedding_dims))

<h3> Training Model </h3>

In [64]:
### Encoder part ###

# Input layer for encoder (English)
encoder_inputs = Input(shape=(Tx,), dtype = 'int32')            # (None, Tx) -- sequence of integers

# Embedding layer for encoder (English)
enc_emb_layer = Embedding (input_dim = num_encoder_tokens + 1, output_dim = input_embedding_dims, mask_zero = True)

# Turn input sequence (English) into embedding vectors
encoder_embeddings = enc_emb_layer(encoder_inputs)                 # (None, Tx, input_embedding_dims)

# LSTM layer for encoder (English)
encoder_lstm = LSTM(units = n_a, return_state = True, name = 'encoder_LSTM')           # Note that this LSTM layer computes on ALL Tx values at once.

# Pass input embedding vectors (English) through encoder LSTM
encoder_outputs, a, c = encoder_lstm(encoder_embeddings)

# Save state vectors from encoder LSTM
encoder_states = [a,c]


### Decoder part ###

# Input layer for decoder (German)
decoder_inputs = Input(shape = (Ty,), dtype = 'int32')      # (None, Ty)

# Embedding layer for decoder (German)
dec_emb_layer = Embedding(input_dim = num_decoder_tokens + 1, output_dim = output_embedding_dims, mask_zero = True)

# Turn decoder input sequence (German) into embedding vectors
decoder_embeddings = dec_emb_layer(decoder_inputs)  # (None, Ty, output_embedding_dims)

# Initialise list of outputs
decoder_outputs = []

# Loop over each time-step of decoder input (German) 
for t in range (Ty):
    # Select embedding vector for time-step t
    dec_emb_t = decoder_embeddings[:,t,:]       # (None, output_embedding_dims)

    # Reshape embedding vector for time-step t       
    dec_emb_t = reshaper(dec_emb_t)             # (None, 1, output_embedding_dims)

    # Get a and c for time-step t from decoder LSTM 
    a, _, c = decoder_LSTM_cell(inputs = dec_emb_t, initial_state = encoder_states)
    encoder_states = [a,c]

    # Pass a for time-step t from decoder LSTM through Dense layer
    out = densor(a)
    decoder_outputs.append(out)

# encoder_inputs is numpy array
# decoder_inputs is numpy array
# decoder_outputs is list of numpy arrays (one-hot vectors)
model = Model (inputs = [encoder_inputs, decoder_inputs], outputs = decoder_outputs)

In [65]:
opt = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, decay=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

In [85]:
# encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states_2)

epochs = 200

#(encoder_input_data, decoder_input_data), decoder_target_data = generate_batch()
history = model.fit(generate_batch(), steps_per_epoch=X_train.shape[0]//128, epochs = epochs, verbose = 1)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<h3> Inference Model </h3>

In [86]:
#### Get a and c vectors from encoder ####

# Create embedding vectors for encoder input sequence (English)
                                                                     # encoder_inputs = (None, Tx)
encoder_embeddings_2 = enc_emb_layer(encoder_inputs)                 # (None, Tx, input_embedding_dims)

# Pass embedding vectors through encoder LSTM
encoder_outputs_2, a_2, c_2 = encoder_lstm(encoder_embeddings_2)     # Use same encoding LSTM layer

# Store a and c vectors from encoder
encoder_states_2 = [a_2,c_2]                                         

# Create model. 
# inputs = encoder input sequence; outputs = a and c vectors
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states_2)          #encoder_inputs = (None, Tx)
                                                                                    # encoder_states_2 = [a_2, c_2], a_2 & c_2 are numpy arrays        


### Use a and c vectors from encoder to generate prediction ###

# Input layers for a0 and c0 vectors into decoder
#decoder_input2 = Input(shape = (None,))
decoder_state_input_a = Input(shape = (n_a))                # (None, n_a)
decoder_state_input_c = Input(shape = (n_a))                # (None, n_a)
decoder_states_inputs = [decoder_state_input_a, decoder_state_input_c]

# Convert decoder input sequence (German) into embedding vectors
                                                                # decoder_inputs = (None, Ty)
decoder_embeddings_2 = dec_emb_layer(decoder_inputs)            # (None, Ty, output_embedding_dims)

# Select embedding vector for time-step 0 (START_ token)
decoder_embeddings_2 = decoder_embeddings_2[:,0,:]              # (None, output_embedding_dims)
# Reshape into correct dimensions
decoder_embeddings_2 = reshaper(decoder_embeddings_2)           # (None, 1, output_embedding_dims)

outputs2 = []

# look over all time-steps in the decoder input sequence
# number of time_steps will increase by 1 each time we make a prediction later
for t in range(decoder_inputs.shape[1]):                 

    # pass in a, c, and decoder input vectors (time-step t-1) into decoder LSTM
    # Note, inputs (for decoder_LSTM_cell) = (None, 1, output_embedding_dims)
    # Get updated a, c vectors
    decoder_state_input_a, _, decoder_state_input_c = decoder_LSTM_cell(inputs = decoder_embeddings_2, initial_state = decoder_states_inputs)
    decoder_state_inputs = [decoder_state_input_a, decoder_state_input_c]

    # Pass "a" vector through Dense layer
    out = densor(decoder_state_input_a)                  # out = (None, num_decoder_tokens) 

    
    # Find index for prediction
    max_idx = tf.math.argmax(out, -1)                       # max_idx = (None, 1)
    # Convert prediction into embedding vector
    decoder_embeddings_2 = dec_emb_layer(max_idx+1)           # decoder_embeddings_2 = (None, output_embedding_dims)

    # Reshape embedding vector
    decoder_embeddings_2 = reshaper(decoder_embeddings_2)   # (None, 1, output_embedding_dims)

# decoder_inputs = numpy array
# decoder_states_inputs = list of numpy arrays
# [decoder_inputs] + decoder+states_inputs = list of numpy arrays
decoder_model = Model(inputs = [decoder_inputs]+ decoder_states_inputs, 
                    outputs = out                           # "out" = softmax output for final time-step inspected
                                                            # out = (None, num_decoder_tokens)
                    )                                           


In [87]:
def decode_sequence(input_seq):
    """
    input_seq = numpy array of sequences (1,Ty)
    """

    # Encode the input as state vectors.
    states_values = encoder_model.predict(input_seq)         # input_seq = (1,Ty)
                                                             # states_values = [a, c] from encoder, where a,c are numpy arrays
    

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,Ty))                            
    
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']          # One integer

    
    
    
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    j = 1
    while not stop_condition and j <Ty:
        print(target_seq.shape, target_seq)
        output_tokens = decoder_model.predict([target_seq] + states_values)     
        # inputs = [decoder_inputs]+ decoder_states_inputs = list of three numpy arrays

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[:, 1])
        sampled_char = reverse_target_token_index[sampled_token_index + 1]
        decoded_sentence += ' '+sampled_char
        print(decoded_sentence, len(decoded_sentence))
        
        # Exit condition: either hit max length or find stop token.
        if (sampled_char == '_END' or len(decoded_sentence.split()) > Ty):
            stop_condition = True
        
        # Update the target sequence (of length 1).
        target_seq[0,j] = sampled_token_index+1
        
        j+=1
        print(j)
        
    
    return decoded_sentence


In [88]:
inpt_sq = X_train[1].split()
inpt_sq = [input_token_index[word] for word in inpt_sq]
#inpt_sq = np.array(inpt_sq).reshape(1,5)
#inpt_sq.shape

max_len = 10
input_data = np.zeros((1,max_len))
for i, num in enumerate(inpt_sq):
  input_data[0,i] = num
input_data.shape

(1, 10)

In [89]:
X_train[1]

'dont take our word for it'

In [91]:
decode_sequence(input_data)

(1, 10) [[10.  0.  0.  0.  0.  0.  0.  0.  0.  0.]]
 1945 5
2
(1, 10) [[10.  1.  0.  0.  0.  0.  0.  0.  0.  0.]]
 1945 1945 10
3
(1, 10) [[10.  1.  1.  0.  0.  0.  0.  0.  0.  0.]]
 1945 1945 1945 15
4
(1, 10) [[10.  1.  1.  1.  0.  0.  0.  0.  0.  0.]]
 1945 1945 1945 1945 20
5
(1, 10) [[10.  1.  1.  1.  1.  0.  0.  0.  0.  0.]]
 1945 1945 1945 1945 1945 25
6
(1, 10) [[10.  1.  1.  1.  1.  1.  0.  0.  0.  0.]]
 1945 1945 1945 1945 1945 1945 30
7
(1, 10) [[10.  1.  1.  1.  1.  1.  1.  0.  0.  0.]]
 1945 1945 1945 1945 1945 1945 1945 35
8
(1, 10) [[10.  1.  1.  1.  1.  1.  1.  1.  0.  0.]]
 1945 1945 1945 1945 1945 1945 1945 1945 40
9
(1, 10) [[10.  1.  1.  1.  1.  1.  1.  1.  1.  0.]]
 1945 1945 1945 1945 1945 1945 1945 1945 1945 45
10


' 1945 1945 1945 1945 1945 1945 1945 1945 1945'