In [31]:
import keras
from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np
from numpy import array, argmax
from pickle import dump,load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

In [32]:
def load_clean_sentences(filename):
    """Load and return the object pickled in ``filename``.

    Args:
        filename: Path to a pickle file; it is opened in binary read mode.

    Returns:
        Whatever object was stored in the pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the handle even if unpickling fails.
    with open(filename, 'rb') as handle:
        return load(handle)

def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` and fit it on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

def max_length(lines):
    """Return the largest whitespace-separated word count found in ``lines``.

    Args:
        lines: Iterable of strings.

    Returns:
        The maximum of ``len(line.split())`` over all lines, or 0 when
        ``lines`` is empty (a bare ``max()`` would raise ``ValueError``).
    """
    return max((len(line.split()) for line in lines), default=0)

def encode_sequences(tokenizer,length,lines):
    """Turn ``lines`` into integer id sequences padded at the end to ``length``.

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: ``maxlen`` handed to ``pad_sequences``.
        lines: Texts to encode.

    Returns:
        The padded array produced by ``pad_sequences``. The first row is
        printed as a quick sanity check.
    """
    encoded = pad_sequences(
        tokenizer.texts_to_sequences(lines),
        maxlen=length,
        padding='post',
    )
    print(encoded[0])
    return encoded

def encode_sequences2(tokenizer, length, lines, start_id=2315):
    """Encode ``lines`` as id sequences that begin with a start-of-sequence id.

    Each tokenized line gets ``start_id`` prepended and is then padded at the
    end to ``length + 1`` (one extra slot for the start id).

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Maximum number of word ids per line, excluding the start id.
        lines: Texts to encode.
        start_id: Integer id to prepend. Defaults to 2315, the value that was
            previously hard-coded here.

    Returns:
        The padded array produced by ``pad_sequences``. The first row is
        printed as a quick sanity check.
    """
    # NOTE(review): pad_sequences truncates from the front by default, so a
    # line with more than ``length`` ids would lose its start id -- confirm
    # ``length`` really is the maximum tokenized length.
    sequences = [[start_id] + list(seq) for seq in tokenizer.texts_to_sequences(lines)]

    X = pad_sequences(sequences, maxlen=length + 1, padding='post')
    print(X[0])
    return X

def encode_sequences3(tokenizer, length, lines, end_id=2316):
    """Encode ``lines`` as id sequences that finish with an end-of-sequence id.

    Each tokenized line gets ``end_id`` appended and is then padded at the
    end to ``length + 1`` (one extra slot for the end id).

    Args:
        tokenizer: Object providing ``texts_to_sequences``.
        length: Maximum number of word ids per line, excluding the end id.
        lines: Texts to encode.
        end_id: Integer id to append. Defaults to 2316, the value that was
            previously hard-coded here.

    Returns:
        The padded array produced by ``pad_sequences``. The first row is
        printed as a quick sanity check.
    """
    sequences = [list(seq) + [end_id] for seq in tokenizer.texts_to_sequences(lines)]

    X = pad_sequences(sequences, maxlen=length + 1, padding='post')
    print(X[0])
    return X

def encode_output(sequences, vocab_size):
    """One-hot encode every id of a 2-D array of sequences.

    Args:
        sequences: Array indexed as ``[row, timestep]`` holding integer ids.
        vocab_size: Number of classes used for the one-hot vectors.

    Returns:
        Array of shape ``(rows, timesteps, vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes = vocab_size) for row in sequences]
    n_rows, n_steps = sequences.shape[0], sequences.shape[1]
    # Pin the layout explicitly to (rows, timesteps, vocab).
    return array(one_hot_rows).reshape(n_rows, n_steps, vocab_size)

# load datasets
# NOTE(review): the leading '\\' makes these paths relative to the root of the
# current drive on Windows -- adjust if the pickles live elsewhere.
dataset = load_clean_sentences('\\english-german-both.pkl')
train = load_clean_sentences('\\english-german-train.pkl')
test = load_clean_sentences('\\english-german-test.pkl')

# prepare english tokenizer
eng_tokenizer = create_tokenizer(dataset[:, 0])
# Ids reserved for the decoder's start / end-of-sentence markers. They must not
# collide with an id the tokenizer already handed to a real word, otherwise
# that word and the marker become indistinguishable.
START_ID = 2315
EOS_ID = 2316
used_ids = set(eng_tokenizer.word_index.values())
assert START_ID not in used_ids and EOS_ID not in used_ids, (
    'reserved ids %d/%d collide with existing word ids; the vocabulary has '
    '%d words' % (START_ID, EOS_ID, len(used_ids)))
eng_tokenizer.word_index['strt'] = START_ID
eng_tokenizer.word_index['eos'] = EOS_ID
# Size the vocabulary by the largest id in use (+1 for padding id 0) so the
# one-hot targets and the Embedding always cover the reserved ids, even when
# the word ids are not contiguous with them.
eng_vocab_size = max(eng_tokenizer.word_index.values()) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# prepare german tokenizer
ger_tokenizer = create_tokenizer(dataset[:, 1])
ger_vocab_size = len(ger_tokenizer.word_index) + 1
ger_length = max_length(dataset[:, 1])
print('German Vocabulary Size: %d' % ger_vocab_size)
print('German Max Length: %d' % (ger_length))

# prepare training data
# trainX: encoder input (German ids).
# trY:    decoder input (English ids prefixed with the start id).
# trainY: decoder target (English ids followed by the end id), one-hot encoded.
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1])
trY = encode_sequences2(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_sequences3(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)
print(trainY[0])
print(trainX[0])
print(trainX.shape)
print(trY.shape)

English Vocabulary Size: 2317
English Max Length: 5
German Vocabulary Size: 3686
German Max Length: 10
[567  20 257   0   0   0   0   0   0   0]
[2315   30  715    0    0    0]
[  30  715 2316    0    0    0]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
[567  20 257   0   0   0   0   0   0   0]
(9000, 10)
(9000, 6)


In [83]:
# Encoder-decoder model used for training (teacher forcing).
# Sizes are derived from the prepared data instead of being repeated as
# literals, so the model keeps matching trainX / trY if the dataset changes.
EMBEDDING_DIM = 100
# The encoder's final states initialise the decoder LSTM, so both LSTMs must
# share the same number of units.
LATENT_DIM = 100

# Encoder: German ids -> embedding -> LSTM; only the final states are used.
encoder_inputs = Input(shape=(ger_length,))
encoder_embedding = Embedding(ger_vocab_size, EMBEDDING_DIM)
encoder_inputs2 = encoder_embedding(encoder_inputs)
encoder_lstm = LSTM(LATENT_DIM, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs2)
encoder_states = [state_h, state_c]

# Decoder: start-prefixed English ids (eng_length words + 1 start id).
decoder_inputs = Input(shape=(eng_length + 1,))
decoder_embedding = Embedding(eng_vocab_size, EMBEDDING_DIM)
decoder_inputs2 = decoder_embedding(decoder_inputs)
decoder_lstm = LSTM(LATENT_DIM, return_sequences=True, return_state=True)
# The decoder's own states (c, v) are not needed while training.
decoder_outputs, c, v = decoder_lstm(decoder_inputs2, initial_state=encoder_states)
decoder_dense = Dense(eng_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='categorical_crossentropy')

model.summary()



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_62 (InputLayer)           (None, 10)           0                                            
__________________________________________________________________________________________________
input_63 (InputLayer)           (None, 6)            0                                            
__________________________________________________________________________________________________
embedding_41 (Embedding)        (None, 10, 100)      368600      input_62[0][0]                   
__________________________________________________________________________________________________
embedding_42 (Embedding)        (None, 6, 100)       231700      input_63[0][0]                   
__________________________________________________________________________________________________
lstm_42 (L

In [85]:
class prttt(keras.callbacks.Callback):
    """Print the predicted token ids for the first training pair after each epoch.

    Reads the notebook-level arrays ``trainX`` and ``trY``.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the first training example and print the argmax id per timestep.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict supplied by Keras (unused). Defaults to ``None``
                rather than a shared mutable ``{}``.
        """
        # Add a batch axis of size 1 to the first encoder / decoder input rows.
        temp = trainX[0].reshape((1,trainX[0].shape[0]))
        temp2 = trY[0].reshape((1,trY[0].shape[0]))
        # Use the model Keras attached to this callback, not a notebook global,
        # so the callback reports on whichever model is actually being fitted.
        prediction = self.model.predict([temp,temp2])[0]
        integer = [argmax(vector) for vector in prediction]
        print(integer)
# Per-epoch prediction printer.
prt = prttt()

# Save the full model (not only the weights) after every epoch.
filepath = "D:\\model2.h5"
checker = ModelCheckpoint(
    filepath,
    monitor='loss',
    verbose=0,
    save_best_only=False,
    save_weights_only=False,
    mode='auto',
    period=1,
)

# Inputs are [encoder ids, decoder ids]; the target is trainY.
model.fit(
    [trainX, trY],
    trainY,
    batch_size=128,
    epochs=50,
    callbacks=[prt, checker],
)

Epoch 1/50
[0, 0, 0, 0, 0, 0]


  '. They will not be included '


Epoch 2/50
[1, 2316, 2316, 0, 0, 0]
Epoch 3/50
[2, 2316, 2316, 0, 0, 0]
Epoch 4/50
[2, 2316, 2316, 0, 0, 0]
Epoch 5/50
[1, 4, 2316, 0, 0, 0]
Epoch 6/50
[2, 6, 2316, 0, 0, 0]
Epoch 7/50
[2, 6, 2316, 0, 0, 0]
Epoch 8/50
[1, 6, 2316, 0, 0, 0]
Epoch 9/50
[1, 6, 2316, 0, 0, 0]
Epoch 10/50
[10, 6, 2316, 0, 0, 0]
Epoch 11/50
[5, 6, 2316, 0, 0, 0]
Epoch 12/50
[5, 6, 2316, 0, 0, 0]
Epoch 13/50
[19, 6, 2316, 0, 0, 0]
Epoch 14/50
[19, 6, 2316, 0, 0, 0]
Epoch 15/50
[20, 6, 2316, 0, 0, 0]
Epoch 16/50
[20, 6, 2316, 0, 0, 0]
Epoch 17/50
[22, 6, 2316, 0, 0, 0]
Epoch 18/50
[20, 28, 2316, 0, 0, 0]
Epoch 19/50
[20, 6, 2316, 0, 0, 0]
Epoch 20/50
[20, 40, 2316, 0, 0, 0]
Epoch 21/50
[22, 28, 2316, 0, 0, 0]
Epoch 22/50
[22, 40, 2316, 0, 0, 0]
Epoch 23/50
[22, 40, 2316, 0, 0, 0]
Epoch 24/50
[22, 40, 2316, 0, 0, 0]
Epoch 25/50
[57, 40, 2316, 0, 0, 0]
Epoch 26/50
[22, 40, 2316, 0, 0, 0]
Epoch 27/50
[57, 84, 2316, 0, 0, 0]
Epoch 28/50
[57, 40, 2316, 0, 0, 0]
Epoch 29/50
[30, 84, 2316, 0, 0, 0]
Epoch 30/50
[57, 8

<keras.callbacks.History at 0x4663d438>

In [90]:
################## MODEL FOR TESTING #########################
# Inference-time models built from the layers trained above (encoder_embedding,
# encoder_lstm, decoder_embedding, decoder_lstm, decoder_dense), so they share
# the trained weights.

# Encoder model: source ids -> final LSTM states [h, c].
encoder_inputs = Input(shape=(ger_length,))
encoder_inputs2 = encoder_embedding(encoder_inputs)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs2)
encoder_states = [state_h, state_c]
encoder_model = Model(encoder_inputs,encoder_states)
encoder_model.summary()

# Decoder model: one token id + previous states -> next-token distribution and
# updated states. The state width is read from the trained layer instead of
# being repeated as a literal, so it cannot drift from the training model.
latent_dim = decoder_lstm.units
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_inputs = Input(shape=(1,))  # a single timestep per call
decoder_inputs2 = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs2, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
decoder_model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_68 (InputLayer)        (None, 10)                0         
_________________________________________________________________
embedding_41 (Embedding)     (None, 10, 100)           368600    
_________________________________________________________________
lstm_42 (LSTM)               [(None, 100), (None, 100) 80400     
Total params: 449,000
Trainable params: 449,000
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_71 (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
embe

In [94]:
# testing
def word_for_id(integer, tokenizer):
    """Reverse lookup: return the word whose index equals ``integer``.

    Args:
        integer: Id to look up.
        tokenizer: Object exposing a ``word_index`` mapping of word -> id.

    Returns:
        The first matching word in ``word_index`` iteration order, or ``None``
        when no word has that id.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == integer)
    return next(matches, None)
def decode_sequence(input_seq, start_id=2315, max_words=5):
    """Greedily decode one encoded source sentence into a target string.

    Uses the notebook-level ``encoder_model``, ``decoder_model``,
    ``eng_tokenizer`` and ``word_for_id``.

    Args:
        input_seq: Encoder input for a single sentence, with a leading batch
            axis of size 1.
        start_id: Id fed to the decoder as the first token. Defaults to 2315,
            the value that was previously hard-coded here.
        max_words: Decoding stops once more than this many words have been
            produced. Defaults to 5, the previous hard-coded limit.

    Returns:
        The decoded words joined by single spaces.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # The decoder consumes one token per step: an array of shape (1, 1).
    target_seq = np.array([[start_id]])
    decoded_sentence = []
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Most likely next token for the single timestep that was fed in.
        token_id = int(argmax(output_tokens[0, -1, :]))
        word = word_for_id(token_id, eng_tokenizer)

        # Stop on the end marker or on an id with no word (e.g. padding id 0).
        # Strings are compared with ==; `is` only tests object identity.
        if word is None or word == 'eos':
            break
        decoded_sentence.append(word)

        # Stop once the length cap is exceeded.
        if len(decoded_sentence) > max_words:
            break

        # Feed the prediction and the updated states into the next step.
        target_seq = np.array([[token_id]])
        states_value = [h, c]

    return ' '.join(decoded_sentence)

In [98]:
# Translate the first ten training sentences and print them next to the reference.
print('prediction on train')
for i in range(10):
    # decode_sequence is given a batch of one: shape (1, sequence_length).
    temp = np.expand_dims(trainX[i], axis=0)
    translation = decode_sequence(temp)
    raw_target, raw_src = train[i]
    summary_fields = (raw_src, raw_target, translation)
    print('src=[%s], target=[%s], predicted=[%s]' % summary_fields)

prediction on train
src=[fass dich kurz], target=[be brief], predicted=[wait up us]
src=[geh jetzt], target=[leave now], predicted=[lets go]
src=[mir gehts gut], target=[im fine], predicted=[im doing ok]
src=[ich erinnere mich an sie], target=[i remember them], predicted=[i dont see]
src=[tom amusiert sich], target=[toms amused], predicted=[toms engaged]
src=[du schaffst das], target=[youll make it], predicted=[youll do it]
src=[ich liebe den herbst], target=[i love autumn], predicted=[i love math]
src=[tom ist ein chaot], target=[tom is a slob], predicted=[tom is a lot]
src=[ich mag mathe], target=[i like math], predicted=[i like to dance]
src=[magst du mich], target=[do you like me], predicted=[do you love me]


In [97]:
# Encode the test-set source sentences (column 1) the same way as trainX.
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1])

# Translate the first ten test sentences and print them next to the reference.
print('prediction on test')
for i in range(10):
    # decode_sequence is given a batch of one: shape (1, sequence_length).
    temp = np.expand_dims(testX[i], axis=0)
    translation = decode_sequence(temp)
    raw_target, raw_src = test[i]
    summary_fields = (raw_src, raw_target, translation)
    print('src=[%s], target=[%s], predicted=[%s]' % summary_fields)

[  1 227   4  95   0   0   0   0   0   0]
prediction on test
src=[ich kenne sie alle], target=[i know them all], predicted=[i know her car]
src=[der mann erotete], target=[the man blushed], predicted=[the man is hot]
src=[ist dir nicht kalt], target=[arent you cold], predicted=[arent you hot]
src=[hor auf zu schreien], target=[stop yelling], predicted=[stop please]
src=[hier ist tom], target=[here comes tom], predicted=[is it here]
src=[dies ist mein schreibtisch], target=[this is my desk], predicted=[this is my plan]
src=[ist das eures], target=[is it yours], predicted=[is it yours]
src=[er ist mein vater], target=[he is my father], predicted=[hes a grouch]
src=[mir tut alles weh], target=[i ache all over], predicted=[im afraid]
src=[tom veranderte sich], target=[tom changed], predicted=[tom will work]
