In [1]:
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', 200)

Using TensorFlow backend.


In [2]:
def read_text(filename):
    """Return the entire contents of a UTF-8 text file as one string.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    str
        The full text of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the previous manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

def to_lines(text):
    """Split raw text into rows of tab-separated fields.

    Leading/trailing whitespace of the whole text is stripped first, then
    the text is split on newlines and every line is split on tab characters.

    Returns a list with one list of fields per line.
    """
    return [row.split('\t') for row in text.strip().split('\n')]

In [3]:
# Read the raw corpus, split it into rows of tab-separated fields and
# wrap the result in a NumPy array so it can be sliced by column.
data = read_text("deu.txt")
deu_eng = array(to_lines(data))

In [4]:
# Keep the first 100,000 rows, then only the first two columns of each row.
deu_eng = deu_eng[:100000][:, :2]
deu_eng

array([['Go.', 'Geh.'],
       ['Hi.', 'Hallo!'],
       ['Hi.', 'Grüß Gott!'],
       ...,
       ['Why are you always so angry?',
        'Warum sind Sie immer so ärgerlich?'],
       ['Why are you always so angry?',
        'Warum seid ihr immer so ärgerlich?'],
       ['Why are you always so happy?',
        'Warum seid ihr immer so glücklich?']], dtype='<U537')

In [5]:
# Build the punctuation-deletion table once; it is the same for every
# sentence, so there is no need to rebuild it inside the comprehensions.
# Note: string.punctuation only covers ASCII punctuation characters.
punct_table = str.maketrans('', '', string.punctuation)

# Remove punctuation and lower-case every sentence in both columns, in place.
deu_eng[:,0] = [s.translate(punct_table).lower() for s in deu_eng[:,0]]
deu_eng[:,1] = [s.translate(punct_table).lower() for s in deu_eng[:,1]]

deu_eng

array([['go', 'geh'],
       ['hi', 'hallo'],
       ['hi', 'grüß gott'],
       ...,
       ['why are you always so angry',
        'warum sind sie immer so ärgerlich'],
       ['why are you always so angry',
        'warum seid ihr immer so ärgerlich'],
       ['why are you always so happy',
        'warum seid ihr immer so glücklich']], dtype='<U537')

In [6]:
def tokenization(lines):
    """Create a Keras ``Tokenizer`` with default settings, fit it on
    ``lines`` and return the fitted tokenizer."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

In [7]:
# English side (column 0 of deu_eng).
# Every English sequence is padded/cut to this many tokens.
eng_length = 8

eng_tokenizer = tokenization(deu_eng[:, 0])
# One extra slot on top of the fitted vocabulary: index 0 is the value
# encode_sequences pads with.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)

In [8]:
# German side (column 1 of deu_eng).
# Every German sequence is padded/cut to this many tokens.
deu_length = 8

deu_tokenizer = tokenization(deu_eng[:, 1])
# One extra slot on top of the fitted vocabulary: index 0 is the value
# encode_sequences pads with.
deu_vocab_size = 1 + len(deu_tokenizer.word_index)

In [9]:
def encode_sequences(tokenizer, length, lines):
    """Turn sentences into fixed-length integer sequences.

    ``lines`` is converted to integer sequences with
    ``tokenizer.texts_to_sequences`` and the result is passed to
    ``pad_sequences`` with ``maxlen=length`` and ``padding='post'``, so
    shorter sequences are filled with trailing zeros.

    NOTE(review): sequences longer than ``length`` are cut using
    pad_sequences' default truncating mode ('pre' per the Keras docs, i.e.
    leading tokens are dropped) - confirm this is the intended behaviour.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )

In [10]:
# Encode each language with its OWN sequence length. The model is built with
# eng_length input timesteps and deu_length output timesteps, and translate()
# encodes its input with eng_length, so the English inputs must use
# eng_length and the German targets deu_length. (They were swapped before;
# that only worked because both lengths happen to be 8.)
trainX = encode_sequences(eng_tokenizer, eng_length, deu_eng[:, 0])
trainY = encode_sequences(deu_tokenizer, deu_length, deu_eng[:, 1])

In [11]:
# build NMT model
def define_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units):
    """Build (but do not compile) the sequence-to-sequence Keras model.

    Layer stack, in order:
      1. ``Embedding`` of ``in_vocab`` indices into ``units`` dimensions,
         for inputs of ``in_timesteps`` tokens, with index 0 masked.
      2. ``LSTM(units)`` returning only its final output.
      3. ``RepeatVector(out_timesteps)`` repeating that output once per
         output timestep.
      4. ``LSTM(units, return_sequences=True)`` producing one vector per
         output timestep.
      5. ``Dense(out_vocab)`` with softmax activation.

    Returns the ``Sequential`` model.
    """
    layers = [
        Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True),
        LSTM(units),
        RepeatVector(out_timesteps),
        LSTM(units, return_sequences=True),
        Dense(out_vocab, activation='softmax'),
    ]
    return Sequential(layers)

# model compilation
# English is the input language, German the output language; 512 is the
# size used for both the embedding and the two LSTM layers.
model = define_model(
    in_vocab=eng_vocab_size,
    out_vocab=deu_vocab_size,
    in_timesteps=eng_length,
    out_timesteps=deu_length,
    units=512,
)

# The sparse loss takes integer word indices as targets (no one-hot encoding).
rms = optimizers.RMSprop(lr=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)

In [12]:
# Checkpoint target; with save_best_only=True the file is only rewritten
# when val_loss reaches a new minimum.
filename = 'model.h1'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', mode='min',
                             save_best_only=True, verbose=1)

# train model
# The targets get a trailing axis of size 1 before being passed to fit().
# NOTE(review): per the Keras docs, validation_split holds out the LAST 20%
# of the samples before any shuffling - check whether deu.txt is ordered
# (e.g. by sentence length), which would make the validation set
# unrepresentative of the training set.
history = model.fit(
    trainX,
    trainY.reshape(trainY.shape + (1,)),
    epochs=25,
    batch_size=512,
    validation_split=0.2,
    callbacks=[checkpoint],
    verbose=1,
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 80000 samples, validate on 20000 samples
Epoch 1/25

Epoch 00001: val_loss improved from inf to 4.51552, saving model to model.h1
Epoch 2/25

Epoch 00002: val_loss improved from 4.51552 to 4.25689, saving model to model.h1
Epoch 3/25

Epoch 00003: val_loss improved from 4.25689 to 3.97701, saving model to model.h1
Epoch 4/25

Epoch 00004: val_loss improved from 3.97701 to 3.77022, saving model to model.h1
Epoch 5/25

Epoch 00005: val_loss improved from 3.77022 to 3.58600, saving model to model.h1
Epoch 6/25

Epoch 00006: val_loss improved from 3.58600 to 3.45094, saving model to model.h1
Epoch 7/25

Epoch 00007: val_loss improved from 3.45094 to 3.29350, saving model to model.h1
Epoch 8/25

Epoch 00008: val_loss improved from 3.29350 to 3.21047, saving model to model.h1
Epoch 9/25

Epoch 00009: val_loss improved from 3.21047 to 3.06689, saving model to model.h1
Epoch 10/25

Epoch 00010: val_loss improved from 3.06689 to 2.99562, saving model to model.h1
Epoch 11/25

Epoch 0001

In [14]:
# preprocessing function
def preprocess(sentence):
    """Return ``sentence`` with every character of ``string.punctuation``
    removed and the remaining text lower-cased."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    stripped = sentence.translate(punctuation_table)
    return stripped.lower()

# function to look up the word for a token index in a tokenizer's vocabulary
def get_word(n, tokenizer):
    """Reverse-lookup a word by its integer index.

    Scans ``tokenizer.word_index`` (a word -> index mapping) and returns the
    first word whose index equals ``n``, or ``None`` when no word has that
    index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

#function to form sentences in deutsch using tokens 
def get_sentence(deu_list):
    """Convert a sequence of German token indices into a sentence string.

    Indices equal to 0 are skipped. Every other index is looked up with
    ``get_word`` against the module-level ``deu_tokenizer`` and appended,
    followed by a single space, so a non-empty result ends with a trailing
    space. An index that ``get_word`` cannot resolve contributes the text
    ``'None'`` (the lookup result is passed through ``str``).
    """
    pieces = [
        str(get_word(token, deu_tokenizer)) + ' '
        for token in deu_list
        if token != 0
    ]
    return ''.join(pieces)

In [15]:
#function to translate english sentence to deutsch sentence using 
#the functions defined above
def translate(sentence):
    """Translate one English sentence to German with the trained model.

    Steps: clean the text with ``preprocess``, encode it with the English
    tokenizer to ``eng_length`` tokens, run the module-level ``model`` on
    that single-sentence batch, pick the most probable output word index
    at each output timestep, and join the corresponding German words with
    ``get_sentence``.

    Parameters
    ----------
    sentence : str
        English input text.

    Returns
    -------
    str
        The predicted German words separated (and followed) by spaces.
    """
    sentence = preprocess(sentence)
    a = encode_sequences(eng_tokenizer, eng_length, [sentence])
    # argmax over the last (vocabulary) axis is what Sequential.predict_classes
    # did for a softmax output; predict_classes itself is deprecated and
    # missing from newer Keras releases, so the explicit form is used.
    b = argmax(model.predict(a), axis=-1)[0]
    translation = get_sentence(b)
    return translation

In [16]:
# Sanity check: translate a short declarative sentence with the trained model.
translate('I am a man')

'ich bin ein mann '

In [17]:
# Question input; the trailing '?' is stripped by preprocess() before encoding.
translate("What are you doing?")

'was machst du '

In [18]:
# Short sentence with a direct object.
translate("I am drinking water")

'ich trinke wasser '

In [19]:
# NOTE(review): the recorded output ('er isst ein ') stops right after
# 'ein' - the object appears to be missing.
translate("He is eating egg")

'er isst ein '

In [20]:
# The input has no article before "book"; the recorded output nevertheless
# contains 'ein buch'.
translate("I am reading book")

'ich lese ein buch '

In [21]:
# Three-word subject-verb-object check.
translate("I love coffee")

'ich liebe kaffee '

In [22]:
# Question input; the trailing '?' is stripped by preprocess() before encoding.
translate("Where are you from?")

'wo kommt ihr her '

In [23]:
# NOTE(review): the recorded output ('wie geht es ') has only three words
# and looks cut short - confirm against a reference translation.
translate("How are you?")

'wie geht es '

In [24]:
# Mixed-case input (lower-cased by preprocess()).
# NOTE(review): the recorded output repeats 'kaffee'.
translate("Do you like Tea or Coffee")

'magst du tee kaffee kaffee '

In [25]:
# The trailing '.' is stripped by preprocess() before encoding.
# NOTE(review): the recorded output ('ich trinke gerade ') does not
# include 'tee'.
translate("I am drinking tea.")

'ich trinke gerade '

In [26]:
# Sentence with a definite article and a predicate adjective.
translate("The woman is pretty")

'die frau ist hübsch '

In [27]:
# NOTE(review): the recorded output repeats 'glücklich'.
translate("I am very happy")

'ich bin glücklich glücklich '