In [0]:
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
import numpy as np

# fn to load
def load_file(filename):
	file=open(filename,mode='rt',encoding='utf-8')
	text=file.read()
	file.close()
	return text

# fn to split into sentences (eng-ger)
def to_pairs(doc):
	lines = doc.strip().split('\n')
	pairs = [line.split('\t') for line in  lines]
	return pairs

# fn to clean
def clean_pairs(lines):
	cleaned=list() # list ...otherwise assumed to be a tuple
	# regex for char filtering
	re_print = re.compile('[^%s]' % re.escape(string.printable))
	# translation table for removing punctuation
	table = str.maketrans('', '', string.punctuation)
	for pair in lines:
		clean_pair=list()
		for line in pair:
			line = normalize('NFD', line).encode('ascii','ignore') # do canonical decomposition and encode into ascii ignoring errors
			line = line.decode('UTF-8')
			line = line.split()
			#to lowercase
			line = [word.lower() for word in line]
			#remove punctuation
			line = [word.translate(table) for word in line]
			#remove non-printable chars 
			line = [re_print.sub('', w) for w in line]
			#remove tokens with numbers in them
			line = [word for word in line if word.isalpha()]
			clean_pair.append(' '.join(line)) #append to a list
		cleaned.append(clean_pair)
	return np.array(cleaned) # numpy arrays

# fn to save cleaned data
def save(sentences, filename):
	dump(sentences, open(filename, 'wb'))
	print('Saved: %s' % filename)

In [2]:
#load
doc=load_file("deu.txt")
#split into english-german pairs
pairs=to_pairs(doc)
#clean
clean_pairs=clean_pairs(pairs)
#save
save(clean_pairs,'english-german.pkl')

Saved: english-german.pkl


In [3]:
from pickle import load, dump
from numpy.random import rand, shuffle

def load_clean_data(filename):
	return load(open(filename, 'rb'))

dataset=load_clean_data("english-german.pkl")

shuffle(dataset)
dataset=dataset[:10000,:] #taking only first 10k
train=dataset[:9000]
test=dataset[9000:]
save(dataset,'english-german(10k).pkl')
save(train,'english-german_train.pkl')
save(test,'english-german_test.pkl')

Saved: english-german(10k).pkl
Saved: english-german_train.pkl
Saved: english-german_test.pkl


In [4]:
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM,Dense,Embedding, RepeatVector

# fn to load a clean dataset
def load_clean_sentences(filename):
	return load(open(filename, 'rb'))

# fn to tokenize
def create_tokenizer(lines):
	t=Tokenizer()
	t.fit_on_texts(lines)
	return t

# fn to find max len
def max_length(lines):
	return max(len(line.split()) for line in lines)

Using TensorFlow backend.


In [5]:
dataset=load_clean_data('english-german(10k).pkl')
train=load_clean_data('english-german_train.pkl')
test=load_clean_data('english-german_test.pkl')
train

array([['you must do as i say', 'du musst tun was ich dir sage'],
       ['she advised him not to do that',
        'sie hat ihm geraten das nicht zu tun'],
       ['i have to walk to school', 'ich muss zur schule laufen'],
       ...,
       ['could you speak a little louder please',
        'bitte sprechen sie etwas lauter'],
       ['this laboratory is equipped with the latest computers',
        'dieses labor ist mit den aktuellsten rechnern ausgestattet'],
       ['its difficult for me to understand french when its spoken quickly',
        'es ist schwierig fur mich franzosisch zu verstehen wenn es schnell gesprochen wird']],
      dtype='<U291')

In [6]:
#english tokenizer
eng_tokenizer=create_tokenizer(dataset[:, 0])
eng_vocab_size=len(eng_tokenizer.word_index) + 1
eng_length=max_length(dataset[:, 0])
print(eng_vocab_size)
print(eng_length)
#german tokenizer
ger_tokenizer=create_tokenizer(dataset[:, 1])
ger_vocab_size=len(ger_tokenizer.word_index) + 1
ger_length=max_length(dataset[:, 1])
print(ger_vocab_size)
print(ger_length)


4957
31
7395
28


In [0]:
# fn for encoding to numbers 
def encode_to_nos(tokenizer,length,lines):
	X=tokenizer.texts_to_sequences(lines)
	# padding with 0 values at end
	X=pad_sequences(X, maxlen=length, padding='post')
	return X

In [0]:
#training data
trainX=encode_to_nos(ger_tokenizer,ger_length,train[:, 1])
trainY=encode_to_nos(eng_tokenizer,eng_length,train[:, 0])

from keras.utils import to_categorical
from keras import utils as np_utils
trainY=np_utils.to_categorical(trainY,eng_vocab_size)

In [9]:
model=Sequential()
model.add(Embedding(ger_vocab_size,256,input_length=ger_length,mask_zero=True)) # If mask_zero is set to True, as a consequence, index 0 cannot be used in the vocabulary (input_dim should equal size of vocabulary + 1).
model.add(LSTM(256))
model.add(RepeatVector(eng_length))
model.add(LSTM(256,return_sequences=True))
model.add(Dense(eng_vocab_size,activation='softmax'))

model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])
print(model.summary())

model.fit(trainX,trainY,epochs=40,batch_size=64,validation_split=0.2,verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 28, 256)           1893120   
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 31, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 31, 256)           525312    
_________________________________________________________________
dense_1 (Dense)              (None, 31, 4957)          1273949   
Total params: 4,217,693
Trainable params: 4,217,693
Non-trainable params: 0
_________________________________________________________________
None
Train on 7200 samples, validate on 1800 samples
Epoch 1/40
 - 236s - loss: 2.3450 - acc: 0.7852 - val_loss: 1.4877 - val_acc: 0

KeyboardInterrupt: ignored

In [0]:
testX=encode_to_nos(ger_tokenizer,ger_length,test[:, 1])
from numpy import argmax

#convert nos back to words
def word_to_id_back(integer,tokenizer):
	for word,index in tokenizer.word_index.items():
		if index == integer:
			return word
	return None

#predict
def predict(model,tokenizer,source):
	prediction=model.predict(source, verbose=0)
	integers=[argmax(vector) for vector in prediction]
	target=list()
	for i in integers:
		word=word_to_id_back(i,tokenizer)
		if word is None:
			break
		target.append(word)
	return (' '.join(target))

# evaluate
def evaluate_model(model,tokenizer,sources,raw_dataset):
	actual,predicted=list(),list()
	for i, source in enumerate(sources):
		#translate encoded source text
		source = source.reshape((1, source.shape[0]))
		translation = predict(model, eng_tokenizer, source)
		raw_target, raw_src = raw_dataset[i]
		if i < 10:
			print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
		actual.append(raw_target.split())
		predicted.append(translation.split())
#can further calculate BLEU SCORE (actual.predicted)


  

In [11]:
dataset = load_clean_sentences('english-german(10k).pkl')
train = load_clean_sentences('english-german_train.pkl')
test = load_clean_sentences('english-german_test.pkl')
# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
# prepare data
trainX = encode_to_nos(ger_tokenizer, ger_length, train[:, 1])
testX = encode_to_nos(ger_tokenizer, ger_length, test[:, 1])


evaluate_model(model,eng_tokenizer,trainX,train)
evaluate_model(model,eng_tokenizer,testX,test)

src=[du musst tun was ich dir sage], target=[you must do as i say], predicted=[]
src=[sie hat ihm geraten das nicht zu tun], target=[she advised him not to do that], predicted=[]
src=[ich muss zur schule laufen], target=[i have to walk to school], predicted=[]
src=[mach bitte den fernseher an], target=[please turn on the tv], predicted=[]
src=[ich sollte mir keine sorgen machen], target=[i shouldnt worry], predicted=[]
src=[ich dachte du willst vielleicht was trinken], target=[i thought you might want a drink], predicted=[]
src=[amerika ist fuhrend in der raumfahrttechnik], target=[america is ahead in space technology], predicted=[]
src=[lade ein wen immer du willst], target=[invite anyone you want], predicted=[]
src=[das buch ist hier], target=[the book is here], predicted=[]
src=[die antwort ist vollig falsch], target=[the answer is completely wrong], predicted=[]


KeyboardInterrupt: ignored