- data source: http://www.manythings.org/anki/

In [1]:
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, RNN, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda, Embedding, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np
import pandas as pd
import string 
import regex as re

from sklearn.model_selection import train_test_split

import io
import spacy

import warnings 
warnings.filterwarnings('ignore')

# from tensorflow.python.framework.ops import disable_eager_execution
# disable_eager_execution()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_en_de = pd.read_table('deu-eng/deu.txt', names=['eng', 'deu', 'attr'])

In [3]:
df_en_de = df_en_de.drop('attr',axis = 1).rename(columns = {'eng':'english', 'deu':'german'})

In [4]:
# Lowercase all characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: x.lower())
df_en_de['german'] = df_en_de['german'].apply(lambda x: x.lower())

# Remove quotes
df_en_de['english'] = df_en_de['english'].apply(lambda x: re.sub("'", '', x))
df_en_de['german'] = df_en_de['german'].apply(lambda x: re.sub("'", '', x))

# Set of all special characters
exclude = set(string.punctuation) 

# Remove all the special characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df_en_de['german']=df_en_de['german'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Add start and end tokens to target sequences
df_en_de['german'] = df_en_de['german'].apply(lambda x : 'START_ '+ x + ' _END')

In [5]:
max_len = 10

pairs = df_en_de
pairs['english_length'] = pairs['english'].apply(lambda x: len(x.split(' ')))
pairs['german_length'] = pairs['german'].apply(lambda x: len(x.split(' ')))

pairs = pairs[pairs['english_length'] <= max_len]
pairs = pairs[pairs['german_length'] <= max_len]
print(len(pairs))
pairs = pairs.sample(frac = 0.1)
print(len(pairs))

209317
20932


In [6]:
# Vocabulary of English
all_en_words=set()
for eng in pairs['english']:
    for word in eng.split():
        if word not in all_en_words:
            all_en_words.add(word)

# Vocabulary of German 
all_de_words=set()
for de in pairs['german']:
    for word in de.split():
        if word not in all_de_words:
            all_de_words.add(word)

# Max Length of source sequence
length_list=[]
for l in pairs['english']:
    length_list.append(len(l.split(' ')))
max_length_src = np.max(length_list)

# Max Length of target sequence
length_list=[]
for l in pairs['german']:
    length_list.append(len(l.split(' ')))
max_length_tar = np.max(length_list)


input_words = sorted(list(all_en_words))
target_words = sorted(list(all_de_words))

# Calculate Vocab size for both source and target
# Add 1 for zero padding
num_encoder_tokens = len(all_en_words) + 1
num_decoder_tokens = len(all_de_words) + 1



# Create word to token dictionary for both source and target
#input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
#target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
input_word_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_word_index = dict([(word, i+1) for i, word in enumerate(target_words)])

# Create token to word dictionary for both source and target
# reverse_input_token_index = dict((i, word) for word, i in input_token_index.items())
# reverse_target_token_index = dict((i, word) for word, i in target_token_index.items())
input_index_word = dict((i, word) for word, i in input_word_index.items())
target_index_word = dict((i, word) for word, i in target_word_index.items())

In [7]:
X, y = pairs['english'], pairs['german']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=101)

In [8]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):          # j = batch number
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32') # (m, max_len)
            
            decoder_input_data = np.zeros((batch_size, max_length_tar),dtype='float32') # (m, max_len)

            decoder_target_data = np.zeros((batch_size, max_length_tar, num_decoder_tokens),dtype='float32')    # (m, max_len, num_decoder_tokens)
            
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_word_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_word_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_word_index[word]] = 1. 
                        """ This should be target_token_index[word] - 1"""
            # decoder_target_data = np.transpose(decoder_target_data, axes = [1, 0, 2])
            # decoder_target_data = list(decoder_target_data)
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [9]:
num_decoder_tokens

10250

In [10]:
iter_batch = generate_batch()
np.argmax(next(iter_batch)[1][:,:])

1838

In [11]:
Tx = max_len
Ty = max_len
repeator = RepeatVector(Tx)
concatenator = Concatenate(axis = -1)
densor1 = Dense(10, activation = 'tanh')
densor2 = Dense (1, activation = 'relu')
dotor = Dot(axes = 1)

In [12]:
def one_step_attention (h, s_prev):

    # calculate the Context vector for one time-step of decoder

    # h = (m, Tx, n_h)
    # s_prev = (m, n_s)
    # returns: context – we will then use [context; y_prev] as input of Decoder

    s_prev = repeator(s_prev)                   # (m, Tx, n_s)
    concat = concatenator([h, s_prev])          # (m, Tx, n_h + n_s)
    e = densor1 (concat)                        # (m, Tx, 10)
    energies = densor2 (e)                      # (m, Tx, 1)
    alphas = tf.nn.softmax(energies, axis = 1)  # (m, Tx, 1)
    context = dotor([alphas, h])                # alphas = (m, Tx, 1)
                                                # h = (m, Tx, n_h)
                                                # (m, 1, n_h)
    return context

In [13]:
x_emb_dim = 300
y_emb_dim = 300

n_h = 200
n_s = 200


In [14]:
post_attention_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(num_decoder_tokens, activation='softmax')

2022-08-01 11:10:22.114934: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [15]:
# h = hidden state of pre-attention RNN layer
# s = hidden state of post-attention RNN layer

x_inputs = Input(shape = (Tx,))                         # (None, Tx) = (m, Tx)
x_emb_layer = Embedding(
                            num_encoder_tokens, 
                            x_emb_dim, 
                            mask_zero = True
                            )      
x_emb = x_emb_layer(x_inputs)                         # (None, Tx, x_emb_dim) = (m, Tx, x_emb_dim)

y_inputs = Input(shape = (Ty,))                         # (None, Ty) = (m, Ty)
y_emb_layer = Embedding(
                          num_decoder_tokens,
                          y_emb_dim,
                          mask_zero = True
                          )
y_emb = y_emb_layer(y_inputs)                         # (None, Ty, y_emb_dim) = (m, Ty, y_emb_dim)


# pass x embeddings through pre-attention LSTM layer
# here, we will use the final hidden-state as the initial post-attention LSTM hidden state

enc_lstm_layer = LSTM(n_h, return_sequences=True, return_state = True)
h_enc, s_enc, c_enc = enc_lstm_layer(x_emb)                             # h_enc = (None, Tx, n_h) = (m, Tx, n_h)
                                                                        # s_enc = (None, n_h) = (m, n_h)
                                                                        # c_enc = (None, n_h) = (m, n_h)
# s<0> and c<0> for decoder = s<ty> and c<ty> for encoder
s_dec = s_enc
c_dec = c_enc                                                              

outputs = []
for t in range(Ty):
    context = one_step_attention(h_enc, s_dec)                                            # context = (m, 1, n_h)
    concat = Concatenate(axis = -1)([context, tf.expand_dims(y_emb[:,t,:],1)])            # concat = (m, 1, n_h + y_emb_dim)
    
    # update decoder LSTM hidden state (s) and cell state (c)
    _, s_dec, c_dec = post_attention_LSTM_cell (initial_state = [s_dec, c_dec], inputs = concat)        # s = (None, Ty, n_s)
    
    # pass decoder LSTM hidden state (s) through output layer to get y prediction
    out = output_layer(s_dec)                                                                   # out = (m, num_decoder_tokens)
    outputs.append(out)
                                                            
outputs = tf.stack(outputs, axis = 1)
print(num_decoder_tokens, outputs.shape)
model = Model(inputs = [x_inputs, y_inputs], outputs = outputs)

10250 (None, 10, 10250)


In [16]:
model.compile(optimizer= 'Adam', loss='categorical_crossentropy', metrics=['acc'])

In [17]:
train_samples = len(X_train) # Total Training samples
val_samples = len(X_test) # total validation samples
batch_size = 128
epochs = 5

In [57]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [63]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [74]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [81]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [89]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [95]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
history = model.fit(generate_batch(), 
                    steps_per_epoch=train_samples//batch_size, 
                    epochs = 5,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples // batch_size, 
                    verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [90]:
# x_inputs defined above as Input
# h_enc, s_enc, c_enc also defined above
encoder_model = Model(inputs = x_inputs, outputs = [h_enc, s_enc, c_enc])

# x_inputs = (None, Tx)

# h_enc = (None, Tx, n_h)
# s_enc = (None, n_s) 
# c_enc = (None, n_s) 

# The below three decoder inputs will come from encoder_model.predict()
decoder_input_h = Input(shape = (Tx, n_h))               # (None, Tx, n_h) 
decoder_input_s = Input(shape=(n_s, ))                    # (None, n_s) 
decoder_input_c = Input(shape = (n_s,))                   # (None, n_s) 

# y_emb_2 will be our y_pred at t-1
y_inp_2 = Input(shape = (None,))                        # (None, None) = (m, Ty)
y_emb_2 = y_emb_layer(y_inp_2)                          # (None, None, y_emb_dim) = (m, Ty, y_emb_dim)

# Use decoder_input_s and decoder_input_h to compute context vector
context = one_step_attention(decoder_input_h, decoder_input_s)    # (m, 1, n_h)

# concatenate context with y_emb_2
concat2 = Concatenate(axis = -1)([context, tf.expand_dims(y_emb_2[:,-1,:],1)])                       
                                                        # concat2 = (None, 1, n_h + y_emb_dim)

# Feed concat2 as input; decoder_input_s and decoder_input_c as initial state
_, decoder_output_s, decoder_output_c = post_attention_LSTM_cell (
                                                        initial_state = [decoder_input_s, decoder_input_c], 
                                                        inputs = concat2
                                                        )     
                                            # decoder_output_s = (None, n_s) 
                                            # decoder_output_c = (None, n_s) 

#decoder_output_s = tf.expand_dims(decoder_output_s, 1)      # decoder_output_s = (None, 1, n_s)
#decoder_output_c = tf.expand_dims(decoder_output_c, 1)      # decoder_output_c = (None, 1, n_s)
decoder_output_y = output_layer(tf.expand_dims(decoder_output_s,1))           # (None, 1, num_decoder_tokens)

decoder_model = Model(inputs = [decoder_input_h, decoder_input_s, decoder_input_c, y_inp_2],
                         outputs = [decoder_output_y, decoder_output_s, decoder_output_c])

In [91]:
# encoder_model -> inputs: x_inputs = (None, Tx)
#                  outputs: [h_enc, s_enc, c_enc]
# decoder_model –> inputs: decoder_input_h, decoder_input_s, decoder_input_c, y_inp_2
                #  outputs: decoder_output_y, decoder_output_s, decoder_output_c

sen1 = X_train.iloc[0]
seq1 = np.zeros((1, 10))
for i, word in enumerate(sen1.split()):
    seq1[0,i] = input_word_index[word]
print(seq1, seq1.shape)
aaa, bbb, ccc = encoder_model.predict(seq1)
target_y = np.zeros((1,1))
target_y[0,0] = target_word_index['START_']
decoder_outs = decoder_model.predict([aaa, bbb, ccc ,target_y])
idx = np.argmax(decoder_outs[0])
sen1, target_index_word[idx]

[[4255. 6444. 4253.    0.    0.    0.    0.    0.    0.    0.]] (1, 10)


('poets write poems', 'dichter')

In [99]:
def decode_sequence(input_sequence):

    # input_sequence = (1, max_len)
    
    # get hidden states + final hidden state + final cell state from encoder 
    h_enc_pred, s_enc_pred, c_enc_pred = encoder_model.predict(input_sequence)
    # print(f's_enc_pred: {s_enc_pred.shape}')
    # define y_pred at time 0    
    target_seq = np.zeros((1,1))
    target_seq[0,0] = target_word_index['START_']

    #decoder_y_pred = np.zeros((1,1,num_decoder_tokens))

    stop_condition = False
    decoded_sentence = ''

    # initialise hidden state and cell state input for decoder
    decoder_s_pred = s_enc_pred                         # (None, n_h) = (m, n_h)
    decoder_c_pred = c_enc_pred                         # (None, n_h) = (m, n_h)
    
    #print('h_enc_pred, decoder_s_pred, decoder_c_pred, target_seq:')
    #print(h_enc_pred.shape, decoder_s_pred.shape, decoder_c_pred.shape, target_seq.shape)
    count = 0
    
    while not stop_condition:
        
        decoder_y_pred, decoder_s_pred, decoder_c_pred = decoder_model.predict([h_enc_pred, decoder_s_pred, decoder_c_pred, target_seq])
        #print(f'decoder_s_pred: {decoder_s_pred.shape}')   
        y_index = np.argmax(decoder_y_pred[0,-1,:])
        #print(y_index)
        y_word = target_index_word[y_index]
        decoded_sentence += ' ' + y_word

        # Exit condition: either hit max length
        # or find stop character.
        if (y_word == '_END' or
           len(decoded_sentence.split()) > max_len):
            stop_condition = True
        
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = y_index
        count += 1
        #print(f'count: {count}')
    return decoded_sentence


In [100]:
decode_sequence(seq1)

' dichter schreiben gedichte _END'

In [101]:
def sentence_to_seq(sentence):
    """
    sentence = string
    """
    
    encoder_input_data = np.zeros((1, max_len))     # (1, max_len)
    
    sentence = sentence.lower().split()
    #print(sentence)
    for j, word in enumerate(sentence):
        encoder_input_data[0,j] = input_word_index[word]        # (1, max_len)
    # print(encoder_input_data.shape)
    return encoder_input_data                           # (1, max_len)


In [87]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

('poets write poems', ' lass den tisch gegessen _END')
('he admired my new car', ' er sieht mein auto hat _END')
('tom was offended', ' tom war nervös _END')
('tom is a friend of a friend of mine', ' tom ist ein bisschen ein guter freund _END')
('plutonium239 has a halflife of 24100 years', ' plutonium239 hat eine hohe von des gegessen _END')
('i think tom is drunk', ' ich glaube tom ist gerade glücklich _END')
('cardboard is stronger than paper', ' pappe ist schwerer als meine mutter _END')
('a good idea occurred to him', ' ein buch ist von ihm sehr geholfen _END')
('tell me a joke', ' gib mir ein hund _END')
('i just want to sleep', ' ich will nur im bett _END')


In [62]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
1740
count: 1
decoder_s_pred: (1, 200)
4554
count: 2
decoder_s_pred: (1, 200)
74
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
2377
count: 1
decoder_s_pred: (1, 200)
3998
count: 2
decoder_s_pred: (1, 200)
2109
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
8224
count: 1
decoder_s_pred: (1, 200)
4554
count: 2
decoder_s_pred: (1, 200)
8224
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
8224
count: 1
decoder_s_pred: (1, 200)
4554
count: 2
decoder_s_pred: (1, 200)
2098
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
4433
count: 1
decoder_s_pred: (1, 200)
3928
count: 2
decoder_s_pred: (1, 200)
1845
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
4433
count: 1
decoder_s_pred: (1, 200)
3928
count: 2
decoder_s_pred: (1, 200)
8224
count: 3
s_enc_pred: (1, 200)
decoder_s_pred: (1, 200)
1740
count: 1
decoder_s_pred: (1, 200)
4554
count: 2
decoder_s_pred: (1, 200)
2098
count: 3
s_enc_pred: (1, 200)
decoder_s_pred:

In [73]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
('poets write poems', ' lass den tisch _END')
('he admired my new car', ' er hat sich sehr gut zu _END')
('tom was offended', ' tom war ziemlich _END')
('tom is a friend of a friend of mine', ' tom ist ein ein guter guter guter freund _END')
('plutonium239 has a halflife of 24100 years', ' gib mir eine paar minuten _END')
('i think tom is drunk', ' ich habe tom das geld _END')
('cardboard is stronger than paper', ' ist die tür aus _END')
('a good idea occurred to him', ' ein hund ist sehr gut _END')
('tell me a joke', ' darf ich mich ein bisschen _END')
('i just want to sleep', ' ich möchte dass du etwas _END')


In [80]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
s_enc_pred: (1, 200)
('poets write poems', ' lass den tisch _END')
('he admired my new car', ' er hat mein auto gestohlen zu _END')
('tom was offended', ' tom war nervös _END')
('tom is a friend of a friend of mine', ' tom ist ein ein guter freund für ein neues mädchen _END')
('plutonium239 has a halflife of 24100 years', ' gib mir eine stück von den tisch _END')
('i think tom is drunk', ' ich glaube tom ist ziemlich langsam _END')
('cardboard is stronger than paper', ' schau nur ein wenig wert _END')
('a good idea occurred to him', ' ein buch ist sehr stolz als tom _END')
('tell me a joke', ' gib mir mir ein hund _END')
('i just want to sleep', ' ich möchte dass du etwas gehen _END')


In [88]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

('poets write poems', ' lass den tisch gegessen _END')
('he admired my new car', ' er sieht mein auto hat _END')
('tom was offended', ' tom war nervös _END')
('tom is a friend of a friend of mine', ' tom ist ein bisschen ein guter freund _END')
('plutonium239 has a halflife of 24100 years', ' plutonium239 hat eine hohe von des gegessen _END')
('i think tom is drunk', ' ich glaube tom ist gerade glücklich _END')
('cardboard is stronger than paper', ' pappe ist schwerer als meine mutter _END')
('a good idea occurred to him', ' ein buch ist von ihm sehr geholfen _END')
('tell me a joke', ' gib mir ein hund _END')
('i just want to sleep', ' ich will nur im bett _END')


In [94]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

('poets write poems', ' dichter gehen sie _END')
('he admired my new car', ' er bewunderte mein auto _END')
('tom was offended', ' tom war beleidigt _END')
('tom is a friend of a friend of mine', ' tom ist ein bisschen von meiner freundin _END')
('plutonium239 has a halflife of 24100 years', ' plutonium239 hat eine hohe von 24100 jahren _END')
('i think tom is drunk', ' ich glaube tom ist ziemlich langsam _END')
('cardboard is stronger than paper', ' pappe ist fester als papier _END')
('a good idea occurred to him', ' ein buch ist von ihm nicht gut _END')
('tell me a joke', ' gib mir bitte einen hund _END')
('i just want to sleep', ' ich will nur etwas schlafen _END')


In [96]:
sentences = list(X_test.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

('shes thirtythree', ' sie ist _END')
('it wasnt funny at all', ' das war immer noch nicht lustig _END')
('that black bird is not a blackbird', ' diese maschine ist nicht mehr wert _END')
('does tom want coffee', ' trinkt tom einen kaffee _END')
('our visitors are at the door', ' unsere tür sind auf dem fenster _END')
('mary likes japan doesnt she', ' maria sieht nicht mehr als _END')
('you can go to the bus station', ' du musst mit dem abendessen gehen _END')
('our plans are taking shape', ' unsere familienehre ist unsere parks _END')
('can i use your telephone', ' kann ich gerne hören _END')
('id like to help you', ' ich möchte gerne ihnen helfen _END')


In [103]:
sentences = list(X_train.iloc[:10].values)          # list of sentences

translations = []
for sentence in sentences:
    seq = sentence_to_seq(sentence)             # seq = (1, max_len)
    #print(seq, seq.shape)
    translation = decode_sequence(seq)
    translations.append(translation)

sentence_translation_pairs = zip (sentences, translations)
for elem in sentence_translation_pairs:
    print (elem)

('poets write poems', ' dichter schreiben gedichte _END')
('he admired my new car', ' er bewunderte mein neues auto _END')
('tom was offended', ' tom war beleidigt _END')
('tom is a friend of a friend of mine', ' tom ist ein freund von mir freund ein _END')
('plutonium239 has a halflife of 24100 years', ' plutonium239 hat eine halbwertszeit von 24100 jahren _END')
('i think tom is drunk', ' ich glaube tom hat betrunken _END')
('cardboard is stronger than paper', ' pappe ist fester als papier _END')
('a good idea occurred to him', ' ein guter film ist ihm nicht schlecht _END')
('tell me a joke', ' erzähle mir einen witz _END')
('i just want to sleep', ' ich will einfach warten _END')


In [4]:
# Lowercase all characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: x.lower())
df_en_de['german'] = df_en_de['german'].apply(lambda x: x.lower())

# Remove quotes
df_en_de['english'] = df_en_de['english'].apply(lambda x: re.sub("'", '', x))
df_en_de['german'] = df_en_de['german'].apply(lambda x: re.sub("'", '', x))

# Set of all special characters
exclude = set(string.punctuation) 

# Remove all the special characters
df_en_de['english'] = df_en_de['english'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df_en_de['german']=df_en_de['german'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Add start and end tokens to target sequences
df_en_de['english'] = df_en_de['english'].apply(lambda x : '<start> '+ x + ' <end>')
df_en_de['german'] = df_en_de['german'].apply(lambda x : '<start> '+ x + ' <end>')



In [5]:
#rename dataframe for convenience
pairs = df_en_de

In [6]:
max_len = 10

pairs = df_en_de
pairs['english_length'] = pairs['english'].apply(lambda x: len(x.split(' ')))
pairs['german_length'] = pairs['german'].apply(lambda x: len(x.split(' ')))

pairs = pairs[pairs['english_length'] <= max_len]
pairs = pairs[pairs['german_length'] <= max_len]
print(len(pairs))
pairs = pairs.sample(frac = 0.1)
print(len(pairs))

199379
19938


In [7]:
pairs.head(3)

Unnamed: 0,english,german,english_length,german_length
78158,<start> two plus two makes four <end>,<start> zwei plus zwei ist vier <end>,7,7
28780,<start> you cant fire me <end>,<start> sie können mir nicht kündigen <end>,6,7
35875,<start> why dont you stop <end>,<start> warum hältst du nicht an <end>,6,7


In [8]:
english_text = list(pairs['english'])
german_text = list(pairs['german'])
both = list(map(list, zip(english_text, german_text)))
cleaned_pairs = both

In [9]:
cleaned_pairs[:5]

[['<start> two plus two makes four <end>',
  '<start> zwei plus zwei ist vier <end>'],
 ['<start> you cant fire me <end>',
  '<start> sie können mir nicht kündigen <end>'],
 ['<start> why dont you stop <end>', '<start> warum hältst du nicht an <end>'],
 ['<start> where did you get all this from <end>',
  '<start> woher hast du das alles <end>'],
 ['<start> were you in boston last week <end>',
  '<start> warst du letzte woche in boston <end>']]

In [10]:
# This class creates a word -> index mapping (e.g,. "dad" -> 5) and vice-versa 
# (e.g., 5 -> "dad") for each language,
class LanguageIndex():
    def __init__(self, lang):
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()

        self.create_index()                 # Run the function when you initialise class
                                            # So when you initialise the class, 
                                            # the variable has word2idx and idx2word dictionaries already


    def create_index(self):
        for phrase in self.lang:
            self.vocab.update(phrase.split(' '))

        self.vocab = sorted(self.vocab)

        self.word2idx['<pad>'] = 0
        for index, word in enumerate(self.vocab):
            self.word2idx[word] = index + 1

        for word, index in self.word2idx.items():
            self.idx2word[index] = word

# Function to calculate maximum length of the sequence
def max_length(tensor):
    return max(len(t) for t in tensor)

In [11]:
def load_dataset(pairs, num_examples):

    inp_lang = LanguageIndex(en for en, de in cleaned_pairs)
    targ_lang = LanguageIndex(de for en, de in cleaned_pairs)

    # English sentences
    input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, de in cleaned_pairs]
    # German sentences
    target_tensor = [[targ_lang.word2idx[s] for s in de.split(' ')] for en, de in cleaned_pairs]

    # Calculate max_length of input and output tensor
    # Here, we'll set those to the longest sentence in the dataset
    max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

    # Padding the input and output tensor to the maximum length
    # First argument of pad_sequences = list of sequences, where each sequence is a list of integers
    # second argument = number of integers per sequence
    # make sure to set "padding" = "post" to append zeros at end and not beginning
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, 
                                                                    maxlen=max_length_inp,
                                                                    padding='post')

    target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, 
                                                                    maxlen=max_length_tar, 
                                                                    padding='post')

    return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar


In [12]:
# Create the tensors
input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_targ = load_dataset(cleaned_pairs, len(cleaned_pairs))

In [13]:
# Creating training and validation sets using an 80-20 split
# each training set is 2D numpy arary
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state = 101)

In [23]:
input_tensor_train.shape, target_tensor_train.shape

((17944, 10), (17944, 10))

In [25]:
input_tensor_train[0]
target_tensor_train[0]

array([  62, 1758, 9037, 6570, 1658,   61,    0,    0,    0,    0],
      dtype=int32)

In [15]:
# Set the parameters of the model
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
N_BATCH = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)

# Create batch generator to be used by modle to load data in batches
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

2022-07-30 08:26:07.286937: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
train_np = np.stack(list(dataset))
print(type(train_np), train_np.shape)


<class 'numpy.ndarray'> (280, 2, 64, 10)


In [103]:
def gru(units):
  
    return tf.keras.layers.GRU(units, 
                                return_sequences=True, 
                                return_state=True, 
                                recurrent_activation='sigmoid',             # recurrent_activation refers to the "update gate"
                                recurrent_initializer='glorot_uniform')

class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.enc_units)
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)        
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))



In [104]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = gru(self.dec_units)
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        # used for attention
        self.W1 = tf.keras.layers.Dense(self.dec_units)
        self.W2 = tf.keras.layers.Dense(self.dec_units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        
        # hidden shape == (batch_size, hidden size)
        # hidden_with_time_axis shape == (batch_size, 1, hidden size)
        # we are doing this to perform addition to calculate the score
        hidden_with_time_axis = tf.expand_dims(hidden, 1)
        
        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying tanh(FC(EO) + FC(H)) to self.V
        # this is the step 1 described in the blog to compute scores s1, s2, ...
        score = self.V(tf.nn.tanh(self.W1(enc_output) + self.W2(hidden_with_time_axis)))
        
        # attention_weights shape == (batch_size, max_length, 1)
        # this is the step 2 described in the blog to compute attention weights e1, e2, ...
        attention_weights = tf.nn.softmax(score, axis=1)
        
        # context_vector shape after sum == (batch_size, hidden_size)
        # this is the step 3 described in the blog to compute the context_vector = e1*h1 + e2*h2 + ...
        context_vector = attention_weights * enc_output
        context_vector = tf.reduce_sum(context_vector, axis=1)
        
        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)
        
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        # this is the step 4 described in the blog to concatenate the context vector with the output of the previous time step
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        
        # passing the concatenated vector to the GRU
        output, state = self.gru(x)
        
        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))
        
        # output shape == (batch_size * 1, vocab)
        # this is the step 5 in the blog, to compute the next output word in the sequence
        x = self.fc(output)
        
        # return current output, current state and the attention weights
        return x, state, attention_weights
        
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.dec_units))

In [105]:
# Create objects of Class Encoder and Class Decoder
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)