In [None]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

In [None]:
# Relative path to the text file holding the sentence pairs. The loading
# code reads it as UTF-8, one pair per line, with tab-separated columns.
german_english = 'data/deu.txt'

In [None]:
# Containers for the raw sentence pairs: input-side sentences are collected
# in input_seq, their target-side counterparts in target_seq.
input_seq, target_seq = [], []

In [None]:
# Reading Input Seq & Target Seq
# The file holds one sentence pair per line with tab-separated columns.
# Both lists are rebuilt from scratch here, so re-running this cell does not
# append a second copy of the corpus to input_seq / target_seq.
with open(german_english, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# Split every line once and keep only rows with at least two columns; this
# also discards blank lines (e.g. the empty string after a trailing newline).
pairs = [fields for fields in (line.split('\t') for line in lines) if len(fields) > 1]

input_seq = [fields[0] for fields in pairs]   # column 0 -> input sentence
target_seq = [fields[1] for fields in pairs]  # column 1 -> target sentence

print('Number of Train Input Sequence', len(input_seq))
print('Number of Train Target Sequence', len(target_seq))

In [None]:
# Build a tokenizer whose vocabulary is learned from the given texts
def create_tokenizer(lines):
    """Return a fresh ``Tokenizer`` that has been fitted on ``lines``.

    lines: the texts passed to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count among ``lines``.

    lines: iterable of strings.

    Returns 0 when ``lines`` is empty (instead of raising ``ValueError`` from
    ``max``), so an empty data split does not crash the notebook.
    """
    return max((len(line.split()) for line in lines), default=0)

In [None]:
# Turn texts into fixed-width integer sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad them to ``length``.

    tokenizer: object providing ``texts_to_sequences``.
    length: forwarded to ``pad_sequences`` as ``maxlen``.
    lines: the texts to encode.

    Returns the result of ``pad_sequences`` with ``padding='post'``, i.e. the
    padding is placed after the encoded tokens.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [None]:
# One-hot encode every token id of the target sequences
def encode_output(sequences, vocab_size):
    """One-hot encode a 2-D array of token ids.

    sequences: array with a ``shape`` of at least two dimensions; it is
        iterated row by row.
    vocab_size: number of classes for ``to_categorical`` and the size of the
        last axis of the result.

    Returns an array reshaped to
    ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    stacked = array(one_hot_rows)
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return stacked.reshape(n_rows, n_steps, vocab_size)

In [None]:
# TODO: Remove this block
# Cap the corpus at its first 12,000 sentence pairs.
input_seq, target_seq = input_seq[:12000], target_seq[:12000]

# Create train Test Split
# The split is positional (no shuffling happens here): the first 10,000
# pairs are used for training, everything after them is held out for testing.
train_input_seq = input_seq[:10000]
test_input_seq = input_seq[10000:]
train_target_seq = target_seq[:10000]
test_target_seq = target_seq[10000:]

In [None]:
# Preparing Training Data
# Vocabulary size is len(word_index) + 1: Keras Tokenizer ids start at 1,
# so the extra slot accounts for id 0, which is used for padding.
train_input_tokenizer = create_tokenizer(train_input_seq)
train_input_vocab_size = len(train_input_tokenizer.word_index) + 1

print('Vocabulary size Train Input Sequence: ', train_input_vocab_size)

train_target_tokenizer = create_tokenizer(train_target_seq)
train_target_vocab_size = len(train_target_tokenizer.word_index) + 1

print('Vocabulary size Train Target Sequence: ', train_target_vocab_size)

# Preparing Testing Data
# NOTE(review): these tokenizers are fitted on the test split only, so their
# word ids are assigned independently of the training tokenizers. Data meant
# for a model trained on the training ids should be encoded with the
# training tokenizers instead.
test_input_tokenizer = create_tokenizer(test_input_seq)
test_input_vocab_size = len(test_input_tokenizer.word_index) + 1

print('Vocabulary size Test Input Sequence: ', test_input_vocab_size)

test_target_tokenizer = create_tokenizer(test_target_seq)
# Store the target vocabulary size under its own name. It was previously
# written to test_input_vocab_size, which clobbered the input size and left
# test_target_vocab_size undefined.
test_target_vocab_size = len(test_target_tokenizer.word_index) + 1

print('Vocabulary size Test Target Sequence: ', test_target_vocab_size)

In [None]:
# Preparing Training Data
# Sequences are padded to the longest training sentence (counted in words).
# Padding to the vocabulary size, as before, would create sequences thousands
# of steps long and a one-hot target of size N x vocab x vocab.
train_input_length = max_length(train_input_seq)
train_target_length = max_length(train_target_seq)

train_x = encode_sequences(train_input_tokenizer, train_input_length, train_input_seq)
train_y = encode_sequences(train_target_tokenizer, train_target_length, train_target_seq)
train_y = encode_output(train_y, train_target_vocab_size)

# Drop the raw training text to free memory. Because of this, re-running
# this cell requires re-running the train/test split cell first.
train_input_seq = []
train_target_seq = []

# Preparing Testing Data
# The test split is encoded with the *training* tokenizers, lengths and
# target vocabulary size, so that token ids, sequence width and one-hot width
# all match what the model is trained on. Words that the training tokenizers
# have not seen are skipped by texts_to_sequences, and sentences longer than
# the training maximum are truncated by pad_sequences.
test_x = encode_sequences(train_input_tokenizer, train_input_length, test_input_seq)
test_y = encode_sequences(train_target_tokenizer, train_target_length, test_target_seq)
test_y = encode_output(test_y, train_target_vocab_size)

# Drop the raw test text as well.
test_input_seq = []
test_target_seq = []

In [None]:
# Create NMT Model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Assemble the (uncompiled) Sequential translation model.

    src_vocab: input dimension of the Embedding layer.
    tar_vocab: number of units of the softmax Dense output layer.
    src_timesteps: ``input_length`` of the Embedding layer.
    tar_timesteps: repeat count of the RepeatVector layer.
    n_units: Embedding output dimension and unit count of both LSTMs.

    Layers are added in this order: Embedding (``mask_zero=True``), LSTM,
    RepeatVector, LSTM (``return_sequences=True``), and a TimeDistributed
    softmax Dense layer.
    """
    model = Sequential()
    stack = (
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    )
    for layer in stack:
        model.add(layer)
    return model

In [None]:
# define model
# Take the timestep counts from the encoded training arrays rather than
# hard-coding them, so the model's input and output lengths always agree with
# the data it is fitted on. Axis 1 is the padded sequence length of both
# train_x (samples, steps) and the one-hot train_y (samples, steps, vocab).
src_timesteps = train_x.shape[1]
tar_timesteps = train_y.shape[1]

model = define_model(train_input_vocab_size, train_target_vocab_size, src_timesteps, tar_timesteps, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')

# summarize defined model
model.summary()