In [1]:
import string, re
from unicodedata import normalize
from numpy.random import shuffle
from numpy import argmax
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from pickle import load, dump
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, RepeatVector, TimeDistributed
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


## Load Dataset

In [2]:
# load doc into memory
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
filename = '/content/drive/My Drive/German English Translation/deu.txt'
doc = load_doc(filename)

In [5]:
doc

## Document into Sentence

In [4]:
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in lines]
    return pairs

In [5]:
pairs = to_pairs(doc)

In [6]:
#printing five list
for i in range(5):
  print(pairs[i])

['Go.', 'Geh.', 'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)']
['Hi.', 'Hallo!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)']
['Hi.', 'Grüß Gott!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)']
['Run!', 'Lauf!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)']
['Run.', 'Lauf!', 'CC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)']


In [7]:
len(pairs)

208486

## Clean Dataset

In [8]:
# clean a list of lines
def clean_pairs(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    
    for pair in lines:
        clean_pair = list()
        for line in pair:
            
            # normalize unicode characters
            line = normalize('NFD', line).encode('ascii', 'ignore')
            line = line.decode('UTF-8')
            line = line.split()
            line = [word.lower() for word in line]
            
            # remove punctuation from each token
            line = [re_punc.sub('', w) for w in line]
            
            # remove non-printable chars form each token
            line = [re_print.sub('', w) for w in line]
            line = [word for word in line if word.isalpha()]
            # store as string
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return np.array(cleaned)


In [9]:
clean_pairs = clean_pairs(pairs)


In [10]:
clean_pairs

array([['go', 'geh', 'ccby france attribution tatoebaorg cm roujin'],
       ['hi', 'hallo', 'ccby france attribution tatoebaorg cm cburgmer'],
       ['hi', 'gru gott',
        'ccby france attribution tatoebaorg cm esperantostern'],
       ...,
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand der deine herkunft nicht kennt sagt dass du wie ein muttersprachler sprichst bedeutet das dass man wahrscheinlich etwas an deiner sprechweise bemerkt hat das erkennen lie dass du kein muttersprachler bist mit anderen worten du horst dich nicht wirklich wie ein muttersprachler an',
        'ccby france attribution tatoebaorg ck tamy'],
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed som

## Saving Clean data

In [11]:
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)
    

In [12]:
save_clean_data(clean_pairs, 'english-german.pkl')
  

Saved: english-german.pkl


## Printing Some Pair of Word

In [13]:
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))
    

[go] => [geh]
[hi] => [hallo]
[hi] => [gru gott]
[run] => [lauf]
[run] => [lauf]
[wow] => [potzdonner]
[wow] => [donnerwetter]
[fire] => [feuer]
[help] => [hilfe]
[help] => [zu hulf]
[stop] => [stopp]
[wait] => [warte]
[wait] => [warte]
[begin] => [fang an]
[go on] => [mach weiter]
[hello] => [hallo]
[hurry] => [beeil dich]
[hurry] => [schnell]
[i ran] => [ich rannte]
[i see] => [ich verstehe]
[i see] => [aha]
[i try] => [ich probiere es]
[i won] => [ich hab gewonnen]
[i won] => [ich habe gewonnen]
[relax] => [entspann dich]
[shoot] => [feuer]
[shoot] => [schie]
[smile] => [lacheln]
[attack] => [angriff]
[attack] => [attacke]
[cheers] => [zum wohl]
[eat it] => [iss es]
[eat up] => [iss auf]
[freeze] => [keine bewegung]
[freeze] => [stehenbleiben]
[got it] => [verstanden]
[got it] => [aha]
[got it] => [ich habs]
[got it] => [kapiert]
[got it] => [verstanden]
[got it] => [einverstanden]
[he ran] => [er rannte]
[he ran] => [er lief]
[hop in] => [mach mit]
[hug me] => [druck mich]
[hug me]

## Loading Dataset

In [14]:
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))


In [15]:
raw_dataset = load_clean_sentences('english-german.pkl')


In [16]:
raw_dataset

array([['go', 'geh', 'ccby france attribution tatoebaorg cm roujin'],
       ['hi', 'hallo', 'ccby france attribution tatoebaorg cm cburgmer'],
       ['hi', 'gru gott',
        'ccby france attribution tatoebaorg cm esperantostern'],
       ...,
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand der deine herkunft nicht kennt sagt dass du wie ein muttersprachler sprichst bedeutet das dass man wahrscheinlich etwas an deiner sprechweise bemerkt hat das erkennen lie dass du kein muttersprachler bist mit anderen worten du horst dich nicht wirklich wie ein muttersprachler an',
        'ccby france attribution tatoebaorg ck tamy'],
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed som

## Dividing Dataset

In [17]:
# reduce dataset size
n_sentences = 20000
dataset = raw_dataset[:n_sentences, :]
# random shuffle
shuffle(dataset)
# split into train/test
train, test = dataset[:18000], dataset[18000:]


## Saving Divided dataset

In [18]:
# save a list of clean sentences to file
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)
    

In [19]:
# save
save_clean_data(dataset, 'english-german-both.pkl')
save_clean_data(train, 'english-german-train.pkl')
save_clean_data(test, 'english-german-test.pkl')


Saved: english-german-both.pkl
Saved: english-german-train.pkl
Saved: english-german-test.pkl


## Loading Divided dataset

In [20]:
# load a clean dataset
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))


In [21]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')


## Tokenizer

In [22]:
# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


## Maximum sentence Length

In [23]:
# max sentence length
def max_length(lines):
    return max(len(line.split()) for line in lines)


## English and German Tokenizer¶

In [24]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))

# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))


English Vocabulary Size: 3640
English Max Length: 5
German Vocabulary Size: 5674
German Max Length: 10


## Encoding and pad Sequence

In [25]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    # integer encode sequences
    X = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
    

## One-hot-encoding

In [26]:
# one hot encode target sequence
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = np.array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y


In [27]:
# prepare training data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

# prepare validation data
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

## Model

In [28]:
# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    
    # compile model
    model.compile(optimizer='adam', 
                  loss='categorical_crossentropy')
    
    # summarize defined model
    model.summary()
    plot_model(model, to_file='model.png', show_shapes=True)
    return model


In [29]:
# define model
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 256)           1452544   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 5, 3640)           935480    
Total params: 3,438,648
Trainable params: 3,438,648
Non-trainable params: 0
_________________________________________________________________


In [30]:
#checkpoint
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, mode='min')


In [33]:
# fit model
model.fit(trainX, trainY, epochs=10, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)


Train on 18000 samples, validate on 2000 samples
Epoch 1/10
 - 22s - loss: 0.0903 - val_loss: 2.2193

Epoch 00001: val_loss did not improve from 1.74147
Epoch 2/10
 - 22s - loss: 0.0916 - val_loss: 2.1955

Epoch 00002: val_loss did not improve from 1.74147
Epoch 3/10
 - 21s - loss: 0.0911 - val_loss: 2.2047

Epoch 00003: val_loss did not improve from 1.74147
Epoch 4/10
 - 21s - loss: 0.0909 - val_loss: 2.2141

Epoch 00004: val_loss did not improve from 1.74147
Epoch 5/10
 - 22s - loss: 0.0901 - val_loss: 2.2226

Epoch 00005: val_loss did not improve from 1.74147
Epoch 6/10
 - 22s - loss: 0.0898 - val_loss: 2.2234

Epoch 00006: val_loss did not improve from 1.74147
Epoch 7/10
 - 22s - loss: 0.0903 - val_loss: 2.2322

Epoch 00007: val_loss did not improve from 1.74147
Epoch 8/10
 - 22s - loss: 0.0902 - val_loss: 2.2312

Epoch 00008: val_loss did not improve from 1.74147
Epoch 9/10
 - 21s - loss: 0.0902 - val_loss: 2.2326

Epoch 00009: val_loss did not improve from 1.74147
Epoch 10/10
 - 

<keras.callbacks.callbacks.History at 0x7fd2fa59bb00>

## Model Evaluation

##### Loading Data

In [34]:
# load a clean dataset
def load_clean_sentences(filename):
  return load(open(filename, 'rb'))


In [35]:
# load datasets
dataset = load_clean_sentences('english-german-both.pkl')
train = load_clean_sentences('english-german-train.pkl')
test = load_clean_sentences('english-german-test.pkl')


##### Tokenizer and Max Length

In [36]:
# fit a tokenizer
def create_tokenizer(lines):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(lines)
  return tokenizer


In [37]:
# max sentence length
def max_length(lines):
  return max(len(line.split()) for line in lines)


In [38]:
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])

# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])


##### Sequence Encoding

In [39]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
  # integer encode sequences
  X = tokenizer.texts_to_sequences(lines)
  # pad sequences with 0 values
  X = pad_sequences(X, maxlen=length, padding= 'post')
  return X


In [40]:
# prepare data
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])


##### Generate Sequence

In [41]:
# map an integer to a word
def word_for_id(integer, tokenizer):
  for word, index in tokenizer.word_index.items():
    if index==integer:
      return word
  return None


# generate target given source sequence
def predict_sequence(model, tokenizer, source):
  prediction = model.predict(source, verbose=0)[0]
  integers = [argmax(vector) for vector in prediction]
  target = list()
  for i in integers:
    word = word_for_id(i, tokenizer)
    if word is None:
      break
    target.append(word)
  return ' '.join(target)


# evaluate the skill of the model
def evaluate_model(model, sources, raw_dataset):
  actual, predicted = list(), list()
  for i, source in enumerate(sources):
    # translate encoded source text
    source = source.reshape((1, source.shape[0]))
    translation = predict_sequence(model, eng_tokenizer, source)
    raw_target, raw_src = raw_dataset[i,:2]
    if i < 10:
      print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
    actual.append(raw_target.split())
    predicted.append(translation.split())
  
  # calculate BLEU score
  print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
  print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))) 
  print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
  print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
  

In [42]:
# load model
model = load_model('model.h5')

# test on some training sequences
print('train')
evaluate_model(model, trainX, train)

# test on some test sequences
print('test')
evaluate_model(model, testX, test)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


train
src=[tom brachte mich zum weinen], target=[tom made me cry], predicted=[tom made me cry]
src=[konnt ihr das erledigen], target=[can you do this], predicted=[can you do that]
src=[was fur eine gute idee], target=[what a nice idea], predicted=[what a good idea]
src=[tom tanzt gerade], target=[tom is dancing], predicted=[tom is dancing]
src=[er ist senil geworden], target=[hes gone senile], predicted=[hes gone senile]
src=[tom hat sie nicht mehr alle], target=[tom is a psycho], predicted=[tom didnt us psycho]
src=[wir sind beschaftigt tom], target=[were busy tom], predicted=[were all here]
src=[er hat mich erpresst], target=[he blackmailed me], predicted=[he blackmailed me]
src=[ich war zeuge], target=[i was a witness], predicted=[i was a witness]
src=[ich hatte spa], target=[i had some fun], predicted=[i had fun fun]


Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU-1: 0.070148
BLEU-2: 0.259870
BLEU-3: 0.438786
BLEU-4: 0.500180
test
src=[mein anzug ist grau], target=[my suit is grey], predicted=[my suit is gray]
src=[ich bin auch beschaftigt], target=[im busy too], predicted=[im busy busy busy]
src=[tom wich zuruck], target=[tom backed away], predicted=[tom backed out]
src=[ich verstehe nicht wie], target=[i dont see how], predicted=[i dont not that]
src=[ich bin nicht leichtglaubig], target=[im not gullible], predicted=[im not biased]
src=[du bist sehr unhoflich], target=[youre very rude], predicted=[youre very very]
src=[tom ist ein rupel], target=[tom is rude], predicted=[tom is a dwarf]
src=[wo ist das auto], target=[wheres the car], predicted=[where the car]
src=[das ist in ordnung], target=[thats fine], predicted=[thats ok]
src=[unterrichten sie], target=[do you teach], predicted=[does you crying]
BLEU-1: 0.065050
BLEU-2: 0.249641
BLEU-3: 0.427507
BLEU-4: 0.489046


##### Note:  How to improve/to get better accuracy

1) Data cleaning

2) Vocabulary --> reduce the less appeared of data

3) Increase the number of total data

4) Increase the epochs number

5) Input Order. The order of input phrases could be reversed, which has been reported to
    lift skill, or a Bidirectional input layer could be used.

6) Increase the number of layers