In [34]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re
import os
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense, Bidirectional
from keras.layers import Activation, dot, concatenate
from keras.models import Model
from keras.callbacks import TensorBoard, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from gensim.models import Word2Vec
from tqdm import tqdm_notebook
import nltk
%matplotlib inline  
import matplotlib
import matplotlib.pyplot as plt
import seaborn

seaborn.set(font=['AppleMyungjo'], font_scale=1)

In [4]:
lines = pd.read_csv('dataset/english-german-train.csv')
lines.shape

(154304, 3)

In [5]:
lines.english = lines.english.apply(lambda x: x.lower())
lines.german = lines.german.apply(lambda x: str(x).lower())

In [6]:
exclude = set(string.punctuation)
lines.english = lines.english.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.german = lines.german.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

In [7]:
lines.english = lines.english.apply(lambda x: x.strip())
lines.german = lines.german.apply(lambda x: x.strip())

lines.english = lines.english.apply(lambda x: re.sub(" +", " ", x))
lines.german = lines.german.apply(lambda x: re.sub(" +", " ", x))

In [8]:
lines.english = lines.english.apply(lambda x: re.sub("\?\?", '', x))
lines.german = lines.german.apply(lambda x: re.sub("\?\?", '', x))

In [10]:
lines.german = lines.german.apply(lambda x : 'START_ '+ x + '_END')

In [15]:
lines.columns

Index(['english', 'german'], dtype='object')

In [13]:
del lines['Unnamed: 0']

In [14]:
lines.sample(10)

Unnamed: 0,english,german
74580,you know where to find me if you want to talk,START_ du weit ja wo ich zu finden bin wenn du...
14873,tom is no longer studying french,START_ tom lernt nicht mehr franzosisch_END
15322,im awfully afraid of heights,START_ ich habe groe hohenangst_END
69292,i thought tom loved mary,START_ ich dachte tom liebe maria_END
17142,he is unquestionably the oldest man in the vil...,START_ er ist ohne zweifel der alteste mann im...
82625,there is a fly on the ceiling,START_ da ist eine fliege auf der zimmerdecke_END
92468,i know tom has been hurt,START_ ich wei dass tom verletzt wurde_END
70032,ill give that to tom,START_ ich werde das tom geben_END
19522,my grandmother had a stroke,START_ meine gromutter hatte einen schlaganfal...
49097,youve got the wrong number,START_ sie haben sich verwahlt_END


In [17]:
vocab_english = set()
for sent in lines.english:
    for word in sent.split():
        if word not in vocab_english:
            vocab_english.add(word)

vocab_german = set()
for sent in lines.german:
    for word in sent.split():
        if word not in vocab_german:
            vocab_german.add(word)

In [18]:
length_list=[]
for l in lines.german:
    length_list.append(len(l.split(' ')))
max_length_german = np.max(length_list)
max_length_german, np.average(length_list)

(76, 7.4321534114475325)

In [21]:
length_list=[]
for l in lines.english:
    length_list.append(len(l.split(' ')))
max_length_english = np.max(length_list)
max_length_english, np.average(length_list)

(101, 6.399587826627955)

In [22]:
input_words = sorted(list(vocab_english))
target_words = sorted(list(vocab_german))
num_encoder_tokens = len(vocab_english)
num_decoder_tokens = len(vocab_german)
num_encoder_tokens, num_decoder_tokens

(14693, 39096)

In [23]:
num_decoder_tokens += 1 # For zero padding
num_decoder_tokens

39097

In [24]:
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [25]:
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())

In [26]:
lines = shuffle(lines)
lines.head(10)

Unnamed: 0,english,german
85539,were you crying,START_ hast du geweint_END
48004,tom looks just like john,START_ tom sieht genau aus wie john_END
52460,tom does not know the difference between a dia...,START_ tom kennt nicht den unterschied zwische...
148762,are you in a bad mood,START_ sind sie schlechter laune_END
133611,have you ever lost,START_ haben sie schon einmal verloren_END
55401,you should learn how to ride a bicycle,START_ du solltest fahrrad fahren lernen_END
57232,tom will cook,START_ tom wird kochen_END
126572,tom didnt know what to do with it,START_ tom wusste nicht was er damit anfangen ...
103428,i always thought that a stroke was one of natu...,START_ ich dachte immer dass ein schlaganfall ...
88035,does anyone have a lighter,START_ hat jemand ein feuerzeug_END


In [27]:
english = list(lines.english)
german = list(lines.german)

x = []
y = []

for i in range(len(english)):
    x.append(str(english[i]))
    y.append(str(german[i]))

x = np.array(x)
y = np.array(y)
len(x), len(y)

(154304, 154304)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)
X_train.shape, X_test.shape

((138873,), (15431,))

In [29]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_length_term2),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_length_term2, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1:
                        decoder_input_data[i, t] = target_token_index[word] # decoder input seq
                    if t>0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, target_token_index[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [31]:
latent_dim = 256

In [32]:
english_embedding = Word2Vec.load('embeddings/skipgram-english-256.model')
german_embedding = Word2Vec.load('embeddings/skipgram-german-256.model')

In [35]:
eng_tok = Tokenizer()
ger_tok = Tokenizer()

eng_tok.fit_on_texts(english)
ger_tok.fit_on_texts(german)

In [36]:
# create a weight matrix for words in training docs
encoder_embedding_matrix = np.zeros((num_encoder_tokens, latent_dim))
for word, i in eng_tok.word_index.items():
    try:
        embedding_vector = english_embedding[word]
        if embedding_vector is not None:
            encoder_embedding_matrix[i] = embedding_vector
    except Exception as e:
        print(e)
# create a weight matrix for words in training docs
decoder_embedding_matrix = np.zeros((num_decoder_tokens, latent_dim))
for word, i in ger_tok.word_index.items():
    try:
        embedding_vector = german_embedding[word]
        if embedding_vector is not None:
            decoder_embedding_matrix[i] = embedding_vector
    except Exception as e:
        pass

  """
  


index 14693 is out of bounds for axis 0 with size 14693


In [37]:
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True, weights=[encoder_embedding_matrix])(encoder_inputs)

encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]

print('encoder_outputs: ', encoder_outputs)
print('state_h: ', state_h)
print('state_c: ', state_c)

encoder_outputs:  Tensor("lstm_1/transpose_2:0", shape=(?, ?, 256), dtype=float32)
state_h:  Tensor("lstm_1/while/Exit_2:0", shape=(?, 256), dtype=float32)
state_c:  Tensor("lstm_1/while/Exit_3:0", shape=(?, 256), dtype=float32)


In [38]:
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True, weights=[decoder_embedding_matrix])
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)

attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2])

attention = Activation('softmax', name='attention')(attention)

context = dot([attention, encoder_outputs], axes=[2, 1], name='context')

decoder_combined_context = concatenate([context, decoder_outputs], name='decoder_combined_context')

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_combined_context)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [39]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 256)    3761408     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, None, 256)    10008832    input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

In [40]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 128
epochs = 500

In [41]:
class TrainValTensorBoard(TensorBoard):
    def __init__(self, log_dir='./logs', **kwargs):
        # Make the original `TensorBoard` log to a subdirectory 'training'
        training_log_dir = os.path.join(log_dir, 'training')
        super(TrainValTensorBoard, self).__init__(training_log_dir, **kwargs)

        # Log the validation metrics to a separate subdirectory
        self.val_log_dir = os.path.join(log_dir, 'validation')

    def set_model(self, model):
        # Setup writer for validation metrics
        self.val_writer = tf.summary.FileWriter(self.val_log_dir)
        super(TrainValTensorBoard, self).set_model(model)

    def on_epoch_end(self, epoch, logs=None):
        # Pop the validation logs and handle them separately with
        # `self.val_writer`. Also rename the keys so that they can
        # be plotted on the same figure with the training metrics
        logs = logs or {}
        val_logs = {k.replace('val_', ''): v for k, v in logs.items() if k.startswith('val_')}
        for name, value in val_logs.items():
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value.item()
            summary_value.tag = name
            self.val_writer.add_summary(summary, epoch)
        self.val_writer.flush()

        # Pass the remaining logs to `TensorBoard.on_epoch_end`
        logs = {k: v for k, v in logs.items() if not k.startswith('val_')}
        super(TrainValTensorBoard, self).on_epoch_end(epoch, logs)

    def on_train_end(self, logs=None):
        super(TrainValTensorBoard, self).on_train_end(logs)
        self.val_writer.close()


In [None]:
log_dir = 'eng_ger_nmt_weights'
logging = TrainValTensorBoard(log_dir=log_dir)
checkpoint = ModelCheckpoint(os.path.join(log_dir, 'ep{epoch:03d}-val_loss{val_loss:.3f}-val_acc{val_acc:.3f}.h5'),
        monitor='val_loss', save_weights_only=True, save_best_only=True, period=3)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1)

In [None]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size,
                    callbacks=[logging, checkpoint, reduce_lr, early_stopping])

In [None]:
model.load_weights('lstm_attn_nmt_model/ep060-val_loss3.489-val_acc0.610.h5')

In [None]:
# Encode the input sequence to get the "thought vectors"
encoder_model = Model(encoder_inputs, [encoder_outputs] + encoder_states)

# Decoder setup
# Below tensors will hold the states of the previous time step
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))

encoder_inf_input = Input(shape=(None, latent_dim))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2= dec_emb_layer(decoder_inputs) # Get the embeddings of the decoder sequence

# To predict the next word in the sequence, set the initial states to the states from the previous time step
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]


attention = dot([decoder_outputs2, encoder_inf_input], axes=[2, 2])
print('Attention: ', attention)

attention = Activation('softmax', name='attention')(attention)
print('Softmax: ', attention)

context = dot([attention, encoder_inf_input], axes=[2, 1])
print('Context: ', context)

decoder_combined_context = concatenate([context, decoder_outputs2])
print('Combined Context: ', decoder_combined_context)




decoder_outputs2 = decoder_dense(decoder_combined_context) # A dense softmax layer to generate prob dist. over the target vocabulary

# Final decoder model
decoder_model = Model(
    [decoder_inputs, encoder_inf_input] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

In [None]:
encoder_model.summary()
encoder_model.output

In [None]:
decoder_model.summary()
decoder_model.input, decoder_model.output

In [None]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    [encoder_output, h, c] = encoder_model.predict(input_seq)
    states_value = [h, c]
    
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1,1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    
    #return
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq, encoder_output] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += ' '+sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '_END' or
           len(decoded_sentence) > 100):
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence


In [None]:
layers = decoder_model.layers
for l in layers:
    print('%s\tname:%s' % (str(l), l.name))
    
assert(model.layers[7] == model.get_layer('attention'))

In [None]:
attention_layer = decoder_model.get_layer('attention') # or model.layers[7]
attention_model = Model(inputs=decoder_model.inputs, outputs=decoder_model.outputs + [attention_layer.output])

print(attention_model)
print(attention_model.output_shape, attention_model.input_shape)
attention_model.summary()

In [None]:
def attent_and_generate(input_seq):
    decoded_sentence = []
    
    [encoder_output, h, c] = encoder_model.predict(input_seq)
    states_value = [h, c]
    
    target_seq = np.zeros((1,1))
    target_seq[0, 0] = target_token_index['START_']
    
    stop_condition = False
    attention_density = []
    index = []
    
    while not stop_condition:
        output_tokens, h, c, attention = attention_model.predict([target_seq, encoder_output] + states_value)
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence.append(sampled_char)
    
        if (sampled_char == '_END') or len(decoded_sentence) > 50:
            stop_condition = True
            
        states_value = [h, c]
        target_seq = np.zeros((1,1))
        target_seq[0, 0] = sampled_token_index
        attention_density.append((sampled_char, attention[0][0]))
      
    return np.array(attention_density), ' '.join(decoded_sentence)


def visualize(text, encoder_input):
    attention_weights, decoded_sent = attent_and_generate(encoder_input)
    
    plt.clf()
    plt.figure(figsize=(10,10))
    
    mats = []
    dec_inputs = []
    for dec_ind, attn in attention_weights:
        mats.append(attn[:len(text[0].split(' '))].reshape(-1))
        dec_inputs.append(dec_ind)
        
    attention_mat = np.transpose(np.array(mats))
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(attention_mat)
    ax.set_xticks(np.arange(attention_mat.shape[1]))
    ax.set_yticks(np.arange(attention_mat.shape[0]))

    ax.set_xticklabels([inp for inp in dec_inputs])
    ax.set_yticklabels([w for w in str(text[0]).split(' ')])

    ax.tick_params(labelsize=15)
    ax.tick_params(axis='x', labelrotation=90)

    plt.show()
    return decoded_sent

In [None]:
train_gen = generate_batch(X_train, y_train, batch_size = 1)
k=-1

In [None]:
k+=1
(encoder_input, actual_output), _ = next(train_gen)
print(X_train[k:k+1])
decoded_sent = visualize(X_train[k:k+1], encoder_input)
print('Input English Sentence:', X_train[k:k+1][0])
print('Actual German:', y_train[k:k+1][0][6:-4])
print('Predicted German:', decoded_sent[:-4])