In [1]:
from data_gen import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM, TimeDistributed
from tensorflow.keras.layers import Concatenate, Flatten
from tensorflow.keras.layers import GRU, Conv2D, MaxPooling2D
from tensorflow.keras.layers import Input, Reshape
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop
# from tensorflow.keras.utils.vis_utils import plot_model
import tensorflow.keras as keras
import numpy as np
# from models import *

In [2]:

def conv_multi_lstm(n_input, n_output, n_feature, n_units, feat_units=5, root_shape=(15, 28, 1)):
    """Build a CNN-encoder / LSTM-decoder network and its two inference models.

    The root-word matrix goes through one convolution + max-pooling stage and a
    dense layer, is concatenated with a dense projection of the feature vector,
    and is then mapped to the initial hidden and cell states of an LSTM decoder
    that emits a softmax over ``n_output`` classes at every time step.

    Args:
        n_input: Not used by this function; kept so existing positional calls
            keep working.
        n_output: Width of every decoder input step and of every output step.
        n_feature: Length of the word-feature input vector.
        n_units: Size of the LSTM hidden/cell state.
        feat_units: Units of the feature projection; the CNN branch is encoded
            into the remaining ``n_units - feat_units`` units.
        root_shape: Shape of a single root-word input, without the batch axis.
            Defaults to the previously hard-coded ``(15, 28, 1)``.

    Returns:
        A ``(model, encoder_model, decoder_model)`` tuple:
        ``model`` takes ``[root_word, decoder_input, features]`` and returns the
        per-step softmax (used for training); ``encoder_model`` takes
        ``[root_word, features]`` and returns ``[state_h, state_c]``;
        ``decoder_model`` takes ``[decoder_input, state_h, state_c]`` and returns
        ``[softmax, state_h, state_c]``. The decoder LSTM and output layer are
        shared between ``model`` and ``decoder_model``.
    """
    # NOTE: layer name strings (including "concatnate" and "decoder_gru") are
    # reproduced verbatim so that layer names stay stable.
    root_word_input = Input(shape=root_shape, name="root_word_input")
    feature_input = Input(shape=(n_feature,), name="word_feature_input")

    # Feature branch: small dense projection of the word features.
    feat_out = Dense(feat_units, activation="relu", name="feature_output")(feature_input)

    # Root-word branch: conv -> 3x3 max-pool (stride 3) -> flatten -> dense.
    x = Conv2D(20, (5, 5), padding='same', activation='relu', name="cnn")(root_word_input)
    x = MaxPooling2D(pool_size=3, strides=3, name="pooling")(x)
    x = Flatten(name="flatten")(x)
    x = Dense(n_units - feat_units, activation='relu', name="cnn_encoder")(x)

    # Merge both branches and derive the decoder's initial states from them.
    merged = Concatenate(name="concatnate")([x, feat_out])
    encoder_state_h = Dense(n_units, activation='relu', name='state_h')(merged)
    encoder_state_c = Dense(n_units, activation='relu', name='state_c')(merged)
    encoder_states = [encoder_state_h, encoder_state_c]

    # Training decoder: consumes the whole target sequence in one pass.
    decoder_inputs = Input(shape=(None, n_output), name="target_word_input")
    # The layer is an LSTM; only its name string says "gru".
    decoder_lstm = LSTM(n_units, return_sequences=True, return_state=True, name="decoder_gru")
    train_sequence, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

    decoder_dense = Dense(n_output, activation='softmax', name="train_output")
    train_outputs = decoder_dense(train_sequence)

    model = Model([root_word_input, decoder_inputs, feature_input], train_outputs)
    encoder_model = Model([root_word_input, feature_input], encoder_states)

    # Inference decoder: same LSTM and dense layers, but the states are fed in
    # explicitly so decoding can proceed one step at a time.
    decoder_state_input_h = Input(shape=(n_units,))
    decoder_state_input_c = Input(shape=(n_units,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    inf_sequence, inf_state_h, inf_state_c = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs
    )

    inf_outputs = decoder_dense(inf_sequence)
    decoder_model = Model(
        [decoder_inputs, decoder_state_input_h, decoder_state_input_c],
        [inf_outputs, inf_state_h, inf_state_c],
    )

    return model, encoder_model, decoder_model


In [3]:
# Training-set generator, built from the Wolaytta training file.
dg = DataGen(data="data/wolaytta-train.txt")

# Number of entries in char2int (used below as both model input and output width).
n_input_length = len(char2int)
# Step counts reported by the generator for root words and for output words.
n_steps_in, n_steps_out = dg.max_root_len, dg.max_output_len


6


In [4]:
# Scratch arithmetic: 70% of 14780 (the same .7 factor is applied to len(dg.words) below).
14780 * .7

10346.0

In [5]:
# Fraction of dg.words that is counted as training data when sizing an epoch.
train_fraction = .7
train_word_count = len(dg.words) * train_fraction
print("Total train data: ", train_word_count)

batch_size = 128
# Whole batches per epoch; int() truncates, so a partial last batch is dropped.
n_batches = int(train_word_count / batch_size)

# Python generator that produces one training batch per next(gen) call.
gen = dg.cnn_gen_data_multi_word(batch_size=batch_size, n_batches=n_batches)

Total train data:  561794.1


In [6]:
# Build the three models returned by conv_multi_lstm:
#   train  - training model that combines encoder and decoder
#   infenc - inference encoder model
#   infdec - inference decoder model
# Arguments:
#   n_input / n_output - both set to n_input_length
#   n_feature          - dg.word_feat_len + 3 (the reason for the extra 3 inputs
#                        is not visible in this notebook; TODO confirm)
#   n_units            - size of the hidden memory in the RNN
train, infenc, infdec = conv_multi_lstm(
    n_input=n_input_length,
    n_output=n_input_length,
    n_feature=dg.word_feat_len + 3,
    n_units=256,
)
train.compile(loss='categorical_crossentropy', optimizer='adam')


In [7]:
# Train from the batch generator: 3 epochs of n_batches steps each.
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated since TensorFlow 2.1 and no longer exists in newer Keras releases.
history = train.fit(gen, steps_per_epoch=n_batches, epochs=3)

W0721 21:26:17.121429  5592 deprecation.py:323] From C:\Users\amany\.conda\envs\tf2\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [8]:
# The held-out file gets its own generator and reuses the training batch size.
dg2 = DataGen(data="data/wolaytta-test.txt")
test_batch_size = batch_size
# Only whole batches are evaluated; words beyond the last full batch are skipped.
test_n_batches = len(dg2.words) // batch_size
# test_n_batches, test_batch_size = 30, 10

# data generator for test data
test_gen = dg2.cnn_gen_data_multi_word(
    batch_size=test_batch_size,
    n_batches=test_n_batches,
    trainset=False,
)

6


In [11]:
def predict(infenc, infdec, inputs, n_steps, cardinality):
    """Decode a sequence step by step with the inference encoder/decoder pair.

    Args:
        infenc: Object whose ``predict(inputs)`` returns the pair
            ``(state_h, state_c)`` used to seed the decoder.
        infdec: Object whose ``predict([step_input, state_h, state_c])`` returns
            ``(yhat, state_h, state_c)``; ``yhat`` is indexed as ``yhat[0, 0, :]``.
        inputs: Passed to ``infenc.predict`` unchanged.
        n_steps: Number of decoding steps to run.
        cardinality: Length of the all-zero start-of-sequence vector.

    Returns:
        ``np.array`` built from the ``n_steps`` collected ``yhat[0, 0, :]`` rows.
    """
    # Encoder output seeds the decoder's hidden and cell state.
    hidden, cell = infenc.predict(inputs)

    # An all-zero vector, shaped (1, 1, cardinality), marks the sequence start.
    step_input = np.zeros((1, 1, cardinality))

    collected = []
    for _ in range(n_steps):
        step_output, hidden, cell = infdec.predict([step_input, hidden, cell])
        collected.append(step_output[0, 0, :])
        # The decoder's raw output (not a one-hot of it) is fed back as the
        # next step's input.
        step_input = step_output
    return np.array(collected)

In [15]:
# Exact-match accuracy of the inference models over the test generator.
# Each test word is decoded with predict() and counted as correct only when the
# decoded string equals the decoded reference after stripping whitespace.

total, correct = 0, 0
in_word = 0   # only touched by the optional diagnostics below
sims = []     # only touched by the optional diagnostics below
for b in range(test_n_batches):
    # get data from test data generator
    [X1, X2, X3], y = next(test_gen)
    for j in range(test_batch_size):
        # Give the single sample a leading batch axis of 1 (and a trailing
        # axis of 1 for the root-word matrix) before calling the models.
        word_features = X3[j].reshape((1, X3.shape[1]))
        root_word_matrix = X1[j].reshape((1, X1.shape[1], X1.shape[2], 1))
        # predicts the target word given root word and features
        target = predict(infenc, infdec, [root_word_matrix, word_features], n_steps_out, n_input_length)
        root = ''.join(dg.one_hot_decode(X1[j]))
        word = ''.join(dg.one_hot_decode(y[j]))
        targetS = ''.join(dg.one_hot_decode(target))

        # checks if the predicted and the real words are equal; the strings
        # decoded just above are reused instead of decoding both a second time
        if word.strip() == targetS.strip():
            correct += 1
        # Optional diagnostics:
        # else:
        #     print(root, word.split('&')[0], '\t\t', targetS.split('&')[0])
        # if root.strip() in targetS.strip():
        #     in_word += 1
        # sims.append(dg.word_sim(word, targetS))
    # print(b, root, word, targetS)
    total += test_batch_size

# Guard the division: with zero test batches there is nothing to score.
if total:
    print('Exact Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
else:
    print('Exact Accuracy: n/a (no test batches were evaluated)')

Exact Accuracy: 64.57%


In [None]:
# lines = open("data/wol-multi.txt").readlines()
# Show the first ten (root, word, word index) entries held by the training generator.
for i in range(10):
    sample = (dg.roots[i], dg.words[i], dg.word_indexes[i])
    print(*sample)

In [None]:
# word2feat, word2root= {}, {}

In [None]:
# for line in lines:
#     line = line[:-1].split(' ')
#     word2feat[line[0]] = line[1]
#     word2root[line[0]] = line[2]

In [None]:
# check = {}
# for word in word2feat:
#     root = word2feat[word]
#     feat = word2root[word]
#     key = root + " " + feat
#     if key not in check:
#         check[key] = []
#     check[key].append(word)
    

In [None]:
# counter = 0
# file = open("data/wol-m.txt", "w")
# for key in check.keys():
#     words = check[key]
#     for i in range(len(words)):
#         word = words[i]
#         line = "{0} {1} {2}\n".format(word, key, i)
#         file.write(line)
# file.close()