In [1]:
import re
import string
from pickle import load,dump
from unicodedata import normalize

from numpy import argmax
from numpy import array
from numpy.random import rand
from numpy.random import seed
from numpy.random import shuffle

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint

from nltk.translate.bleu_score import corpus_bleu

In [2]:
def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string.

    Args:
        filename: path of the text file to read.

    Returns:
        The complete decoded text of the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the content is not valid UTF-8.
    """
    # the context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [3]:
def to_pairs(doc):
    """Split a tab-separated document into one list of fields per line.

    Leading and trailing whitespace of the whole document is stripped first,
    then every newline-separated row is split on tab characters.

    Args:
        doc: the full text, one record per line with tab-separated fields.

    Returns:
        A list with one entry per line, each entry being the list of fields.
    """
    pairs = []
    for row in doc.strip().split('\n'):
        pairs.append(row.split('\t'))
    return pairs

In [4]:
# clean a list of lines
def clean_pairs(lines):
    """Normalise every sentence of every pair and return them as an array.

    Each sentence is reduced to ASCII, split on whitespace, lower-cased,
    stripped of punctuation and non-printable characters, and tokens that
    are not purely alphabetic afterwards are dropped.

    Args:
        lines: iterable of pairs (iterables of sentence strings).

    Returns:
        A numpy array holding the cleaned sentences in the same layout.
    """
    # regex matching every character that is not in string.printable
    non_printable = re.compile('[^%s]' % re.escape(string.printable))
    # translation table that deletes ASCII punctuation
    strip_punct = str.maketrans('', '', string.punctuation)

    def clean_text(text):
        # decompose accented characters, then drop whatever is not ASCII
        ascii_text = normalize('NFD', text).encode('ascii', 'ignore').decode('UTF-8')
        kept = []
        for token in ascii_text.split():
            # lowercase, then remove punctuation and non-printable chars from the token
            token = non_printable.sub('', token.lower().translate(strip_punct))
            # discard tokens that still contain anything but letters (e.g. digits)
            if token.isalpha():
                kept.append(token)
        return ' '.join(kept)

    return array([[clean_text(text) for text in pair] for pair in lines])

In [5]:
def save_clean_data(sentences, filename):
    """Pickle ``sentences`` to ``filename`` and print a confirmation line.

    Args:
        sentences: the object to serialise (any picklable value).
        filename: destination path; an existing file is overwritten.

    Raises:
        OSError: if the destination cannot be opened for writing.
    """
    # use a context manager so the file is flushed and closed deterministically;
    # passing a bare open() to dump() left the handle for the garbage collector
    with open(filename, 'wb') as handle:
        dump(sentences, handle)
    print('Saved: %s' % filename)

In [6]:
# load dataset
filename = 'deu.txt'
doc = load_doc(filename)
# split into english-german pairs
pairs = to_pairs(doc)
# clean sentences
# NOTE: the result gets its own name; assigning it to `clean_pairs` would
# replace the function with an array and make this cell fail when re-run
cleaned_pairs = clean_pairs(pairs)
# save clean pairs to file
save_clean_data(cleaned_pairs, 'english-german.pkl')

Saved: english-german.pkl


In [7]:
# load a clean dataset
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: path of a pickle file.

    Returns:
        The unpickled object.

    Raises:
        OSError: if the file cannot be opened.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that were produced by this notebook or another trusted source
    # the context manager closes the handle, which a bare open() did not
    with open(filename, 'rb') as handle:
        return load(handle)

In [8]:
# load dataset
raw_dataset = load_clean_sentences('english-german.pkl')

# reduce dataset size
n_sentences = 10000
# number of shuffled pairs used for training; the remainder becomes the test split
n_train = 9000
# copy the slice: slicing a numpy array yields a view, so shuffling it in
# place would silently reorder raw_dataset as well
dataset = raw_dataset[:n_sentences, :].copy()
# random shuffle, seeded so that re-running the cell gives the same split
seed(42)
shuffle(dataset)
# split into train/test
train, test = dataset[:n_train], dataset[n_train:]
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')

Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


In [9]:
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: the texts used to build the vocabulary.

    Returns:
        The fitted tokenizer.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [10]:
def max_length(lines):
    """Return the largest number of whitespace-separated tokens in any line.

    Args:
        lines: non-empty iterable of strings.

    Returns:
        The token count of the longest line.

    Raises:
        ValueError: if ``lines`` is empty.
    """
    token_counts = [len(sentence.split()) for sentence in lines]
    return max(token_counts)

In [11]:
def encode_sequences(tokenizer, length, lines):
    """Turn texts into integer sequences padded to a fixed length.

    Args:
        tokenizer: fitted tokenizer providing ``texts_to_sequences``.
        length: the ``maxlen`` every sequence is padded to.
        lines: the texts to encode.

    Returns:
        The result of ``pad_sequences`` for the encoded texts.
    """
    # pad sequences with 0 values, added after the tokens (padding='post')
    return pad_sequences(tokenizer.texts_to_sequences(lines), maxlen=length, padding='post')

def encode_output(sequences, vocab_size):
    """One-hot encode every integer of every sequence.

    Args:
        sequences: 2-D integer array (its two ``shape`` entries are used
            for the final reshape).
        vocab_size: number of classes of the one-hot encoding.

    Returns:
        An array of shape
        ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    return array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)

In [12]:
from keras.layers import Bidirectional, Attention

In [13]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder translation model.

    Args:
        src_vocab: size of the source vocabulary (embedding input dimension).
        tar_vocab: size of the target vocabulary (softmax output dimension).
        src_timesteps: length of the padded source sequences.
        tar_timesteps: number of output steps produced by the decoder.
        n_units: embedding size and number of units per LSTM direction.

    Returns:
        A ``Sequential`` model emitting a softmax over ``tar_vocab`` for each
        of the ``tar_timesteps`` steps.
    """
    layers = [
        # encoder: embed the source tokens (index 0 is masked) and summarise them
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        Bidirectional(LSTM(n_units)),
        # feed the encoder summary to every decoder step
        RepeatVector(tar_timesteps),
        # decoder: one output vector per target step, then a per-step softmax
        Bidirectional(LSTM(n_units, return_sequences=True)),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layers)

In [14]:
# hyperparameters of the run
n_units = 256
n_epochs = 20
batch_size = 64

# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')

# prepare english tokenizer (column 0 of each pair)
eng_tokenizer = create_tokenizer(dataset[:, 0])
# +1 because index 0 is used for padding (see encode_sequences)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare german tokenizer (column 1 of each pair)
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data: German is the model input, English the one-hot target
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, n_units)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model
# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" to the output
model.summary()
plot_model(model, to_file='English2German.png', show_shapes=True)
# fit model
filename = 'English2German.h5'

# keep only the weights of the epoch with the lowest validation loss on disk
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=n_epochs, batch_size=batch_size, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

English Vocabulary Size: 2404
English Max Length: 5
German Vocabulary Size: 3856
German Max Length: 10
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 10, 256)           987136    
                                                                 
 bidirectional (Bidirection  (None, 512)               1050624   
 al)                                                             
                                                                 
 repeat_vector (RepeatVecto  (None, 5, 512)            0         
 r)                                                              
                                                                 
 bidirectional_1 (Bidirecti  (None, 5, 512)            1574912   
 onal)                                                           
                                                                 
 time_distributed (

  saving_api.save_model(



Epoch 2: val_loss improved from 3.51618 to 3.15598, saving model to English2German.h5
141/141 - 9s - loss: 3.2870 - val_loss: 3.1560 - 9s/epoch - 64ms/step
Epoch 3/20

Epoch 3: val_loss improved from 3.15598 to 2.86638, saving model to English2German.h5
141/141 - 9s - loss: 2.8320 - val_loss: 2.8664 - 9s/epoch - 65ms/step
Epoch 4/20

Epoch 4: val_loss improved from 2.86638 to 2.68299, saving model to English2German.h5
141/141 - 9s - loss: 2.4447 - val_loss: 2.6830 - 9s/epoch - 66ms/step
Epoch 5/20

Epoch 5: val_loss improved from 2.68299 to 2.50870, saving model to English2German.h5
141/141 - 9s - loss: 2.1109 - val_loss: 2.5087 - 9s/epoch - 65ms/step
Epoch 6/20

Epoch 6: val_loss improved from 2.50870 to 2.37203, saving model to English2German.h5
141/141 - 9s - loss: 1.7942 - val_loss: 2.3720 - 9s/epoch - 64ms/step
Epoch 7/20

Epoch 7: val_loss improved from 2.37203 to 2.25995, saving model to English2German.h5
141/141 - 9s - loss: 1.5055 - val_loss: 2.2600 - 9s/epoch - 65ms/step
Epo

<keras.src.callbacks.History at 0x107813bd0>

In [15]:
def word_for_id(integer, tokenizer):
    """Map a vocabulary index back to its word.

    Args:
        integer: the index to look up.
        tokenizer: object exposing a ``word_index`` mapping (word -> index)
            and, optionally, the reverse ``index_word`` mapping.

    Returns:
        The word stored under ``integer``, or ``None`` when the index is
        unknown (e.g. the padding index 0).
    """
    # prefer the reverse mapping when the tokenizer provides one: a dict
    # lookup instead of scanning the whole vocabulary on every call
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # fallback for tokenizer-like objects that only expose word_index
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [16]:
def predict_sequence(model, tokenizer, source):
    """Decode the model's prediction for one encoded source into text.

    Args:
        model: object whose ``predict`` returns per-step score vectors; only
            the first item of the returned batch is used.
        tokenizer: tokenizer used to map predicted indices back to words.
        source: encoded input passed to ``model.predict``.

    Returns:
        The predicted words joined by single spaces. Decoding stops at the
        first index that has no word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    # pick the highest-scoring index of every step
    best_ids = [argmax(scores) for scores in step_scores]
    words = []
    for token_id in best_ids:
        decoded = word_for_id(token_id, tokenizer)
        if decoded is None:
            # an index without a word ends the sentence
            break
        words.append(decoded)
    return ' '.join(words)

In [17]:
def evaluate_model(model, tokenizer, sources, raw_dataset):
    """Translate every source, print a few samples and the corpus BLEU scores.

    Args:
        model: the trained translation model.
        tokenizer: target-language tokenizer used to decode predictions.
        sources: array of encoded source sequences, one row per example.
        raw_dataset: the text pairs matching ``sources`` row by row; the
            first field is the reference translation, the second the source.
    """
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        # translate encoded source text
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer that was passed in, not a notebook global
        translation = predict_sequence(model, tokenizer, source)
        # only the first two fields are used, so rows carrying extra columns
        # do not break the unpacking
        raw_target, raw_src = raw_dataset[i][:2]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        # corpus_bleu expects a list of reference token lists per hypothesis
        actual.append([raw_target.split()])
        predicted.append(translation.split())
    # calculate BLEU score
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    # NOTE(review): these weights sum to 0.9 rather than 1 - confirm this is intended
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [18]:
# evaluate the in-memory model on the training sequences first, then on the
# test sequences, printing the split name before each report
for split_name, split_inputs, split_pairs in (('train', trainX, train), ('test', testX, test)):
    print(split_name)
    evaluate_model(model, eng_tokenizer, split_inputs, split_pairs)

train
src=[es ist nicht dumm], target=[its not stupid], predicted=[its not stupid]
src=[ich wunsche ihnen einen schonen tag], target=[have a nice day], predicted=[have a nice day]
src=[vertraust du ihr], target=[do you trust her], predicted=[do you trust her]
src=[ich nehme geschenke an], target=[i accept gifts], predicted=[i accept gifts]
src=[ich habe ein pferd], target=[i have a horse], predicted=[i have a horse]
src=[zeig ihn uns], target=[show it to us], predicted=[show it to us]
src=[er ist ein komodiant], target=[hes a comedian], predicted=[hes a comedian]
src=[geht und wartet drauen], target=[go wait outside], predicted=[go wait outside]
src=[pack dich], target=[go away], predicted=[get away]
src=[gehort das hier euch], target=[is this yours], predicted=[is this yours]
BLEU-1: 0.945676
BLEU-2: 0.926988
BLEU-3: 0.871600
BLEU-4: 0.639888
test
src=[alle waren sich einig], target=[everyone agreed], predicted=[everybody finished]
src=[mir geht es schlecht], target=[im not well], pre

In [58]:
# export the notebook file English2German.ipynb to HTML with nbconvert
!jupyter nbconvert --to html 'English2German.ipynb'

[NbConvertApp] Converting notebook English2German.ipynb to html
[NbConvertApp] Writing 327602 bytes to English2German.html
