In [1]:
import re
import string
from unicodedata import normalize
from numpy import array
from numpy import argmax
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pickle
from pickle import dump
from numpy.random import rand
from numpy.random import shuffle
from pickle import load
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.models import load_model
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from nltk.translate.bleu_score import corpus_bleu


Using TensorFlow backend.


In [2]:
class LoadData():
    def __init__(self,file):
        self.file = file
        self.text = None
        
    def load_data(self):
        fp = open(self.file, mode='rt', encoding='utf-8')
        self.text = fp.read()
        

In [3]:
class PreProcessing():
    def __init__(self):
        self.pairs = None
        self.cleaned_pairs = list()
        
    def get_pairs(self,data):
        lines = data.strip().split('\n')
        self.pairs = [line.split('\t') for line in  lines]
        
    def clean_pairs(self):
        re_print = re.compile('[^%s]' % re.escape(string.printable))
        table = str.maketrans('', '', string.punctuation)
        for pair in self.pairs:
            clean_pair = list()
            for line in pair:
                line = normalize('NFD', line).encode('ascii', 'ignore')
                line = line.decode('UTF-8')
                line = line.split()
                line = [word.lower() for word in line]
                line = [word.translate(table) for word in line]
                line = [re_print.sub('', w) for w in line]
                line = [word for word in line if word.isalpha()]
                clean_pair.append(' '.join(line))
            self.cleaned_pairs.append(clean_pair)
        self.cleaned_pairs = array(self.cleaned_pairs)
        

In [4]:
class SaveData():
    
    def load_clean_sentences(self,filename):
        return load(open(filename, 'rb'))

    def save_clean_data(self,sentences, filename):
        dump(sentences, open(filename, 'wb'))
        print('Saved: %s' % filename)

In [5]:
class CreateDataset():
    def __init__(self):
        self.dataset = None
        self.training_data = None
        self.testing_data = None
        
    def create_tokenizer(self,lines):
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(lines)
        return tokenizer
        
    def max_length(self,lines):
        return max(len(line.split()) for line in lines)


    def encode_sequences(self,tokenizer, length, lines):
        X = tokenizer.texts_to_sequences(lines)
        X = pad_sequences(X, maxlen=length, padding='post')
        return X
 
    def encode_output(self,sequences, vocab_size):
        ylist = list()
        for sequence in sequences:
            encoded = to_categorical(sequence, num_classes=vocab_size)
            ylist.append(encoded)
        y = array(ylist)
        y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
        return y

In [6]:
class Model():
    def __init__(self, trainX, trainY, testX, testY, epochs, batch_size):
        self.model = None
        self.epochs = epochs
        self.batch_size = batch_size
        self.trainX = trainX
        self.trainY = trainY 
        self.testX = testX
        self.testY = testY
    
    def create_model(self,src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
        self.model = Sequential()
        self.model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
        self.model.add(LSTM(n_units))
        self.model.add(RepeatVector(tar_timesteps))
        self.model.add(LSTM(n_units, return_sequences=True))
        self.model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
        
    def compile_model(self):
        self.model.compile(optimizer="adam",loss="categorical_crossentropy")
    
    def plot_model(self):
        print(self.model.summary())
        plot_model(self.model, to_file='model.png', show_shapes=True)
    
    def fit_model(self):
        filename = 'model.h5'
        checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
        self.model.fit(self.trainX, self.trainY, epochs=self.epochs, batch_size=self.batch_size,\
                       validation_data=(self.testX, self.testY), callbacks=[checkpoint], verbose=2)

In [7]:
if __name__ == '__main__':
    
    # Loading Data
    ld = LoadData("data/deu-eng/deu.txt")
    ld.load_data()
    
    # Preprocessing Data
    pd = PreProcessing()
    pd.get_pairs(ld.text)
    pd.clean_pairs()
    
    #Save data
    sv = SaveData()
    sv.save_clean_data(pd.cleaned_pairs,"data/models/eng-german.pkl")
    raw_dataset = sv.load_clean_sentences("data/models/eng-german.pkl")
    print(raw_dataset)
    # reduce dataset size
    n_sentences = 10000
    dataset = raw_dataset[:n_sentences, :]
    # random shuffle
    shuffle(dataset)
    # split into train/test
    train, test = dataset[:9000], dataset[9000:]
    
    # save train and test data
    sv.save_clean_data(dataset, 'data/models/english-german-both.pkl')
    sv.save_clean_data(train, 'data/models/english-german-train.pkl')
    sv.save_clean_data(test, 'data/models/english-german-test.pkl')
    
    # Load dataset
    dataset = sv.load_clean_sentences('data/models/english-german-both.pkl')
    train = sv.load_clean_sentences('data/models/english-german-train.pkl')
    test = sv.load_clean_sentences('data/models/english-german-test.pkl')
    
    #Create dataset
    cd = CreateDataset()
    # prepare english tokenizer
    eng_tokenizer = cd.create_tokenizer(dataset[:, 0])
    eng_vocab_size = len(eng_tokenizer.word_index) + 1
    eng_length = cd.max_length(dataset[:, 0])
    print('English Vocabulary Size: %d' % eng_vocab_size)
    print('English Max Length: %d' % (eng_length))
    # prepare german tokenizer
    ger_tokenizer = cd.create_tokenizer(dataset[:, 1])
    ger_vocab_size = len(ger_tokenizer.word_index) + 1
    ger_length = cd.max_length(dataset[:, 1])
    print('German Vocabulary Size: %d' % ger_vocab_size)
    print('German Max Length: %d' % (ger_length))
    
    # prepare training data
    trainX = cd.encode_sequences(ger_tokenizer, ger_length, train[:, 1])
    trainY = cd.encode_sequences(eng_tokenizer, eng_length, train[:, 0])
    trainY = cd.encode_output(trainY, eng_vocab_size)
    # prepare validation data
    testX = cd.encode_sequences(ger_tokenizer, ger_length, test[:, 1])
    testY = cd.encode_sequences(eng_tokenizer, eng_length, test[:, 0])
    testY = cd.encode_output(testY, eng_vocab_size)
    
    #model
    epochs = 30
    batch_size = 64
    md = Model(trainX, trainY,testX, testY,epochs,batch_size)
    md.create_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256)
    md.compile_model()
    #md.plot_model()
    
    md.fit_model()
    

Saved: data/models/eng-german.pkl
[['hi' 'hallo' 'ccby france attribution tatoebaorg cm cburgmer']
 ['hi' 'gru gott' 'ccby france attribution tatoebaorg cm esperantostern']
 ['run' 'lauf' 'ccby france attribution tatoebaorg papabear fingerhut']
 ...
 ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker'
  'wenn jemand fremdes dir sagt dass du dich wie ein muttersprachler anhorst bedeutet das wahrscheinlich er hat etwas an deinem sprechen bemerkt dass dich als nichtmuttersprachler verraten hat mit anderen worten du horst dich nicht wirklich wie ein muttersprachler an'
  'ccby france attribution tatoebaorg ck tickler']
 ['it may be impossible to get a completely errorfree corpus due to the nature of this kind of collaborative effort however if we encourage members to contribute sentence

KeyboardInterrupt: 

In [None]:
class Prediction():
    def __init__(self):
        self.model = load_model('model.h5')
        
    def word_for_id(self,integer, tokenizer):
        for word, index in tokenizer.word_index.items():
            if index == integer:
                return word
        return None

    # generate target given source sequence
    def predict_sequence(self, model, tokenizer, source):
        prediction = model.predict(source, verbose=0)[0]
        integers = [argmax(vector) for vector in prediction]
        target = list()
        for i in integers:
            word = self.word_for_id(i, tokenizer)
            if word is None:
                break
            target.append(word)
        return ' '.join(target)

    # evaluate the skill of the model
    def evaluate_model(self,model, tokenizer, sources, raw_dataset):
        actual, predicted = list(), list()
        for i, source in enumerate(sources):
            # translate encoded source text
            source = source.reshape((1, source.shape[0]))
            translation = self.predict_sequence(model, eng_tokenizer, source)
            #print("raw_dataset[i]: ",raw_dataset[i])
            raw_target = raw_dataset[i][0]
            raw_src = raw_dataset[i][1]
            if i < 10:
                print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
            actual.append([raw_target.split()])
            predicted.append(translation.split())
        # calculate BLEU score
        print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
        print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
        print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
        print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [None]:
pd = Prediction()
pd.evaluate_model(model, eng_tokenizer, testX, test)