In [None]:
from data_gen import *
from models import *

In [None]:
# Data generator over the cleaned Wolaytta word list
dg = DataGen(data="data/wolayitta_clean.txt")

# Model dimensions.
# n_input_length is the number of entries in char2int (presumably the
# character-to-index table, i.e. the one-hot width -- TODO confirm);
# the step counts are the longest root / output lengths reported by dg.
n_input_length = len(char2int)
n_steps_in, n_steps_out = dg.max_root_len, dg.max_output_len


In [None]:
# Batching configuration: 70% of the loaded words count as training data
batch_size = 128
train_word_count = len(dg.words) * .7
print("Total train data: ", train_word_count)

# number of whole batches to train on (int() drops a trailing partial batch)
n_batches = int(train_word_count / batch_size)

# Python generator that produces one training batch per request,
# e.g. [X1, X2, X3], y = next(gen)
gen = dg.cnn_gen_data(n_batches=n_batches, batch_size=batch_size)

In [None]:
# Build the three models returned by conv_model:
#   train  - training model that combines encoder and decoder
#   infenc - inference encoder model
#   infdec - inference decoder model
# Arguments:
#   n_input_length  - passed twice: the length of the input and of the output
#   dg.word_feat_len - the length of the word feature vector
#   256             - n_units, size of the hidden memory in the RNN
# The feature length is read from `dg`, the generator created earlier in the
# notebook. `dg2` is only created further down, so referencing it here raises
# NameError when the notebook is run top to bottom on a fresh kernel.
train, infenc, infdec = conv_model(n_input_length, n_input_length, dg.word_feat_len, 256)
train.compile(optimizer='adam', loss='categorical_crossentropy')


In [None]:
# Train on batches drawn from `gen`: n_batches steps per epoch, 2 epochs.
# NOTE(review): fit_generator is deprecated in newer TensorFlow/Keras releases
# in favour of fit(); confirm the installed version before upgrading.
history = train.fit_generator(
    gen,
    steps_per_epoch=n_batches,
    epochs=2,
)

In [None]:
# Test-run sizing: same batch size as training.
# NOTE(review): the batch count reuses the .7 training fraction -- confirm
# that this matches the size of the split served when trainset=False.
test_batch_size = batch_size
test_n_batches = int(len(dg.words) * .7 / test_batch_size)
# (for a quick smoke run: test_n_batches, test_batch_size = 3, 10)

# generator over the test portion of the data (trainset=False)
test_gen = dg.cnn_gen_data(n_batches=test_n_batches, batch_size=test_batch_size, trainset=False)

In [None]:
# Evaluate on the test generator: print every mismatching prediction and
# report the exact-match accuracy.

total, correct = 0, 0
in_word = 0   # predictions whose text contains the root word
sims = []     # dg.word_sim(expected, predicted) for every evaluated sample
for b in range(test_n_batches):
    # get data from test data generator (X2 is not used in this cell)
    [X1, X2, X3], y = next(test_gen)
    # Iterate over the samples actually present in the batch so that a short
    # batch cannot raise IndexError and `total` always equals the number of
    # predictions that were scored.
    n_samples = len(X1)
    for j in range(n_samples):
        # reshape one sample so it carries a leading axis of size 1
        # (the root matrix also gets a trailing axis of size 1)
        word_features = X3[j].reshape((1, X3.shape[1]))
        root_word_matrix = X1[j].reshape((1, X1.shape[1], X1.shape[2], 1))

        # predicts the target word given root word and features
        target = predict(infenc, infdec, root_word_matrix, word_features, n_steps_out, n_input_length)

        # decode each one-hot matrix once and reuse the result below
        expected_chars = dg.one_hot_decode(y[j])
        predicted_chars = dg.one_hot_decode(target)
        root = ''.join(dg.one_hot_decode(X1[j]))
        word = ''.join(expected_chars)
        targetS = ''.join(predicted_chars)
        sims.append(dg.word_sim(word, targetS))

        # checks if the predicted and the real words are equal
        if expected_chars == predicted_chars:
            correct += 1
        else:
            # show only the text before the first '&'
            # ('&' is presumably the padding marker -- TODO confirm)
            print(root, word.split('&')[0], '\t\t', targetS.split('&')[0])
        if root.strip() in targetS.strip():
            in_word += 1
    total += n_samples

# guard against ZeroDivisionError when no batch was evaluated
if total:
    print('Exact Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
else:
    print('Exact Accuracy: n/a (no test samples were evaluated)')

In [None]:

# Gofa data: generator for word, root word and word features
dg2 = DataGen(data="data/goffa.txt")

# Re-derive the model dimensions for the Gofa data; these assignments
# overwrite the values computed from `dg` earlier in the notebook.
n_input_length = len(char2int)
n_steps_in, n_steps_out = dg2.max_root_len, dg2.max_output_len


In [None]:
# Gofa training run: batch count derived from 40% of the Gofa words.
# Note that this rebinds n_batches, which previously held the Wolaytta count.
n_batches = int(len(dg2.words) * .4 / batch_size)
gen2 = dg2.cnn_gen_data(n_batches=n_batches, batch_size=batch_size)

In [None]:
# Continue training the model that was fitted on Wolaytta, now on Gofa
# batches from gen2, so the learned weights are carried over.
# NOTE(review): the encoder freeze is disabled below, so this call updates
# every trainable weight of `train`, not only the decoder. Re-enabling it
# would probably also need a re-compile of `train` -- confirm against Keras docs.
# infenc.trainable = False

history = train.fit_generator(
    gen2,
    steps_per_epoch=n_batches,
    epochs=5,
)

In [None]:
# Gofa test generator (trainset=False).
# The batch count is fixed at 60; the data-driven alternative was
# int(len(dg2.words) * .1 / batch_size)
g_test_batches = 60
gen3 = dg2.cnn_gen_data(n_batches=g_test_batches, batch_size=batch_size, trainset=False)

In [None]:

# Evaluate on the Gofa test generator: print every mismatching prediction and
# report the exact-match accuracy.

total, correct = 0, 0
in_word = 0   # predictions whose text contains the root word
sims = []     # stays empty: similarity scoring is not performed in this cell
for b in range(g_test_batches):
    # get data from test data generator (X2 is not used in this cell)
    [X1, X2, X3], y = next(gen3)
    # Score every sample in the batch and count exactly those samples.
    # Previously 100 samples were scored while `total` grew by batch_size,
    # which understated the reported accuracy.
    n_samples = len(X1)
    for j in range(n_samples):
        # reshape one sample so it carries a leading axis of size 1
        # (the root matrix also gets a trailing axis of size 1)
        word_features = X3[j].reshape((1, X3.shape[1]))
        root_word_matrix = X1[j].reshape((1, X1.shape[1], X1.shape[2], 1))

        # predicts the target word given root word and features
        target = predict(infenc, infdec, root_word_matrix, word_features, n_steps_out, n_input_length)

        # decode with dg2, the generator these batches come from;
        # each one-hot matrix is decoded once and reused below
        expected_chars = dg2.one_hot_decode(y[j])
        predicted_chars = dg2.one_hot_decode(target)
        root = ''.join(dg2.one_hot_decode(X1[j]))
        word = ''.join(expected_chars)
        targetS = ''.join(predicted_chars)

        # checks if the predicted and the real words are equal; the prediction
        # is truncated to the reference length (formerly a hard-coded 27)
        # before comparing
        if expected_chars == predicted_chars[:len(expected_chars)]:
            correct += 1
        else:
            # show only the text before the first '&'
            # ('&' is presumably the padding marker -- TODO confirm)
            print(root, word.split('&')[0], '\t\t', targetS.split('&')[0])
        if root.strip() in targetS.strip():
            in_word += 1
    total += n_samples

# guard against ZeroDivisionError when no batch was evaluated
if total:
    print('Exact Accuracy: %.2f%%' % (float(correct)/float(total)*100.0))
else:
    print('Exact Accuracy: n/a (no test samples were evaluated)')