In [1]:
"""
This file provides some helper functions required to read and prepare data
for the model
"""

import pandas as pd
import re
import numpy as np
from collections import Counter
from tensorflow.keras.preprocessing.sequence import pad_sequences
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS


SPLIT_PATTERN_WITH_DILIMITER = r'([`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\s])\s*'
SPLIT_PATTERN_NO_DILIMITER   = r'[`\-=~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?\n\s]\s*'


def read_data(data_path):
    """
    Reads data from an excel file
    """
    data_set = pd.read_excel(data_path)
    qids_raw       = data_set["QID"].values
    conditions_raw = data_set["CONDITION"].values
    outputs_raw    = data_set["OUTPUT"].values
    return qids_raw, conditions_raw, outputs_raw

def write_data(qids, conditions, outputs, data_path):
    """
    Writes data to excel file
    """
    data_set = pd.DataFrame(list(zip(qids, conditions, outputs)),
                            columns=["QID", "CONDITION", "OUTPUT"])
    data_set.to_excel(data_path)


def prepare_data(qids_raw, conditions_raw, outputs_raw):
    """
    Prepares data for the model by
    Args:
        qids_raw: Pyhon list of raw qid texts
        conditions_raw: Pyhon list of raw condition texts
        outputs_raw: Pyhon list of raw output texts
    Returns:
        qids: Pyhon list of preprocessed qid sequnces
        conditions: Pyhon list of preprocessed condition sequnces
        outputs: Pyhon list of preprocessed output sequnces
        dictionaries_standardization: Pyhton list of dictionaries used for standardizing samples
    """

    qids = []
    conditions = []
    outputs = []
    dictionaries_lemanization = []
    for qid_raw, condition_raw, output_raw in zip(qids_raw, conditions_raw, outputs_raw):
        qid, condition, output, dictionary = preprocess_sample(qid_raw, condition_raw, output_raw)
        qids.append(qid)
        conditions.append(condition)
        outputs.append(output)
        dictionaries_lemanization.append(dictionary)

    return qids, conditions, outputs, dictionaries_standardization

def preprocess_sample(qid_raw, condition_raw, output_raw):
    qid, condition, output = split_to_words(qid_raw, condition_raw, output_raw)
    
    qid, condition, output, dictionary_standardization = standardize_words(qid, condition, output)
    return qid, condition, output, dictionary_standardization

def split_to_words(qid_raw, condition_raw, output_raw):
    qid       = re.split(SPLIT_PATTERN_NO_DILIMITER, str(qid_raw))
    condition = re.split(SPLIT_PATTERN_NO_DILIMITER, str(condition_raw))
    condition = [cond for cond in condition if cond != " " and cond != ""]
    output    = re.split(SPLIT_PATTERN_WITH_DILIMITER, str(output_raw))
    
    qid       = [x.lower() for x in qid]
    condition = [x.lower() for x in condition]
    output    = [x.lower() for x in output]
    
    return qid, condition, output

def standardize_words(qid, condition, output):
    dictionary_standardization = {}
    for index, id in enumerate(qid):
        standard_qid = '<QID{}>'.format(index)
        dictionary_standardization[standard_qid] = qid[index]
        qid[index] = standard_qid
    
        for word_index in range(len(condition)):
            if condition[word_index] == id:
                condition[word_index] = standard_qid

        for word_index in range(len(output)):
            if output[word_index] == id:
                output[word_index] = standard_qid

    digit_num = 0
    for word in condition:
        if word.isdigit():
            standard_digit = '<DGT{}>'.format(digit_num)
            digit_num += 1
            dictionary_standardization[standard_digit] = word

            for word_index in range(len(condition)):
                if condition[word_index] == word:
                    condition[word_index] = standard_digit

            for word_index in range(len(output)):
                if output[word_index] == word:
                    output[word_index] = standard_digit

    for word in output:
        if word.isdigit():
            standard_digit = '<DGT{}>'.format(digit_num)
            digit_num += 1
            dictionary_standardization[standard_digit] = word
            for word_index in range(len(output)):
                if output[word_index] == word:
                    output[word_index] = standard_digit
    
    condition   = ['<BOS>']  + condition + ['<EOS>']
    output      = ['<BOS>']  + output  + ['<EOS>']

    return qid, condition, output, dictionary_standardization


def create_vocabulary(word_list, max_vocab_size):
    """ Create Vocabulary dictionary
    Args:
        text(str): inout word list
        max_vocab_size: maximum number of words in the vocabulary
    Returns:
        word2id(dict): word to id mapping
        id2word(dict): id to word mapping
    """
    words = [word for sample in word_list for word in sample]
    freq = Counter(words)
    word2id = {'<PAD>' : 0}
    id2word = {0 : '<PAD>'}

    for word, _ in freq.most_common():
        id = len(word2id)
        if word not in word2id:
            word2id[word] = id
            id2word[id] = word
            print(word)
        if id == max_vocab_size - 1 :
            break

    return word2id, id2word


def replace_using_dict(list, dictionary, drop_unknown=False):
    translated_list = []
    for line in list:
        if drop_unknown:
            translated_line = [dictionary[word] for word in line if word in dictionary]
        else:
            translated_line = [dictionary[word] if word in dictionary else word for word in line]
        translated_list.append(translated_line)
    
    return translated_list

def pad_with_zero(list, max_length, pad_type):
    padded_list = pad_sequences(list, maxlen=max_length, padding=pad_type, truncating='post')
    return padded_list


def log_to_shell(index, qid_raw, condition_raw, output_raw, decoded_seqeunce):
    """ Prints information to shell
    """
    print("Sample index",       index)
    print("QID: ",              qid_raw)
    print("CONDITION: ",        condition_raw)
    print("OUTPUT: ",           output_raw,'\n')
    print("Predicted OUTPUT: ", decoded_seqeunce, '\n\n')



def plot_word_cloud(word_list):
    words = ' '.join(word_list)
    wordcloud = WordCloud(width = 800, height = 800, 
                    background_color ='white',
                    stopwords = None,
                    collocations = False,
                    regexp=None,
                    min_word_length=0,
                    min_font_size = 10).generate(words) 
                         
    #plt.figure(figsize = (8, 8), facecolor = None) 
    plt.imshow(wordcloud) 
    plt.axis("off") 
    plt.tight_layout(pad = 0) 
    plt.show()


def plot_length_distribution(list, max_length=100, cdf=False):
    length_count = [0 for _ in range(max_length)]
    #item =[list]
    for item in list:
        item_length = len(item)
        if item_length < max_length:
            length_count[item_length] += 1

    length_count = np.array(length_count)
    length_freq = length_count/np.sum(length_count)
    if cdf:
        length_freq = np.cumsum(length_freq)
    plt.plot(length_freq)
    plt.show()


    """
    print(QIDs_process[:2],"\n")
    print(CONDITIONs_process[:2],"\n")
    print(CONDITIONs[:2],"\n")
    print(OUTPUTs_process[:2])
    
    all_word = [word for i in range(len(CONDITIONs_process)) for word in CONDITIONs_process[i]]
    plot_word_cloud(all_word)
    all_word = [word for i in range(len(OUTPUTs_process)) for word in OUTPUTs_process[i]]
    plot_word_cloud(all_word)
    """
    

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
""""
This file reads and preproces the train dataset and creates (and trains) a seq2seq model
using Recurrent Neurak Networks to predict a target sequnce from an input sequnce.
openpyxl
xlrd
numpy
tensorflow
pandas
sklearn
"""

### Import required packages
import numpy as np
import random
import argparse
import pickle
import os

import tensorflow as tf
from tensorflow.python import keras
from keras.models import Sequential, load_model, Model, Input
from keras.layers import Embedding, LSTM, Dense, Activation, Dropout, TimeDistributed, Bidirectional, Lambda
from keras.callbacks import ModelCheckpoint
from keras import backend as K

## Import helper functions
from sklearn.model_selection import train_test_split

from tools import read_data, prepare_data, create_vocabulary, replace_using_dict, pad_with_zero

## Define default training data path
MT_TRAINING_CORPUS_PATH  = "./data/MT_training_corpus.xlsx"

## Specify path to save model and metadata
MT_SEQ2SEQ_MODEL_PATH    = "./model/mt_seq2seq_model.h5"
MT_MODEL_CHECKPOINT_PATH = "./model/model.chpt"
MT_META_DATA_FILE_PATH   = "./model/metadata.pickle"

## Define model parameter
# Encoder and Decoder maximum vocabulary size
encoder_vocab_size = 150
decoder_vocab_size = 50

# Encoder and Decoder sequnces length
encoder_seq_length = 20
decoder_seq_length = 15

# Number of training epcohs
num_epochs = 30

# Training Batch size
batch_size = 20

# Number of LSTM latend dimention in both Encoder and Decoder
num_latent_dim = 40

# Fraction of data used for validation during training the model
validation_size = 0.1


def data_generator(X, y, batch_size):
    """ Creates a data genrator to feed encoder and decoder input sequnces and decoder
    target sequnce
    Args:
        X: input sequnces
        y: target sequnces
    Returns:
         yields a batch of encoder and decoder input sequnces and decoder target sequnce
    """
    while True:
        for j in range(random.randint(1,len(X)-batch_size)):
            encoder_input_sequnce  = np.zeros((batch_size, encoder_seq_length, encoder_vocab_size), dtype='float32')
            decoder_input_sequnce  = np.zeros((batch_size, decoder_seq_length, decoder_vocab_size), dtype='float32')
            decoder_target_sequnce = np.zeros((batch_size, decoder_seq_length, decoder_vocab_size), dtype='float32')

            for i, (input_seq, target_seq) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_seq):
                    encoder_input_sequnce[i, t, word] = 1  # encoder input seq
                for t, word in enumerate(target_seq):
                    if t < decoder_seq_length:
                        decoder_input_sequnce[i, t, word] = 1 # decoder input seq
                    if t>0:
                        decoder_target_sequnce[i, t-1, word] = 1 # decoder target seq

            yield([encoder_input_sequnce, decoder_input_sequnce], decoder_target_sequnce)            
 


def create_seq2seq_model(encoder_vocab_size, decoder_vocab_size, latent_dim):
    """ Creates a seq2seq model using Recurrent Neural Networks(RNN).
    The encoder consists of a left-to-right LSTM layer and outputs states to decoder.
    The decoder is also consists of a left-to-right LSTM layer and outputs a sequence that
    are fed to time distributed fully connected layers with softmax activation to predict 
    target sequence. 
    Args:
        encoder_vocab_size: number of encoder tokens (i.e., encoder vocab size)
        decoder_vocab_size: size of  decoder tokens (i.e., decoder vocab size)
        latent_dim: number of LSTM hidden dimenetions
    Returns:
        model: seq2seq model
    """
    
    ### Encoder
    ## Input layer
    encoder_inputs = Input(shape=(None, encoder_vocab_size), name='encoder_input')
    ## LSTM layer
    encoder = LSTM(latent_dim, return_state=True, name='encoder_lstm')
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    # We keep encoder states and discard encoder ouput.
    encoder_states = [state_h, state_c]

    ### Decoder
    ## Input layer
    decoder_inputs = Input(shape=(None, decoder_vocab_size), name='decoder_input')
    ## Left to right LSTM layer
    # We set up our decoder to return full output sequences,
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True, name='decoder_lstm')
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                        initial_state=encoder_states)
    ## Fully connected layer
    decoder_dense = Dense(decoder_vocab_size, activation='softmax', name='decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    ### Model to jointly train Encoder and Decoder 
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    return model

def create_seq2seq_inference_model(model, latent_dim):
    """ Creates a seq2seq inference model by extracting Encoder and Decoder models
     from the input seq2seq model.
    Args:
        model: a seq2seq model
        laten_dim: number of latent dimention of the seq2seq model
    Returns:
        encoder_model: encoder model of input seq2seq model
        decoder_model: decoder model of input seq2seq model
    """
    ### Inference Model
    # 1. Encode the input sequence using Encoder and return state for decoder input
    # 2. Run one step of decoder with this intial state and "start of sequnce" token
    #  as input. The output will be used as the next decoder input sequnce token
    # 3. This procedure is repteated to predict all output sequnce 
    
    ### Encoder Model
    encoder_inputs = model.input[0] 
    encoder_outputs, state_h_enc, state_c_enc = model.get_layer('encoder_lstm').output   # lstm_1
    encoder_states = [state_h_enc, state_c_enc]
    encoder_model = Model(encoder_inputs, encoder_states)
    ### Decoder Model
    ## Decoder State Input
    decoder_inputs = model.input[1]
    decoder_state_input_h = Input(shape=(latent_dim,), name='input_3')
    decoder_state_input_c = Input(shape=(latent_dim,), name='input_4')
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    ## Decoder LSTM layer
    decoder_lstm = model.get_layer('decoder_lstm')
    decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h_dec, state_c_dec]
    ## Decoder Fully connected layer
    decoder_dense = model.get_layer('decoder_dense')
    decoder_outputs = decoder_dense(decoder_outputs)

    decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                          [decoder_outputs] + decoder_states)

    return encoder_model, decoder_model

def train_seq2seq_model(model, X_train, X_valid, y_train, y_valid, epochs):
    """ Compiles and trains the seq2seq model. The train data is fed to model
    using a generator function
    Args:
        model: seq2seq model
        X_train: train data input sequnce (conditions)
        X_valid: train data input sequnce (conditions)
        y_train: validation target sequnce sequnce (ouputs)
        y_valid: validation target sequnce (ouputs)
        epochs: number of epochs to train model
    Returns:
        model: trained seq2seq model
    """

    # Model is trainined to minimize cross enthrop between true target sequnce
    # and predicted target sequnce
    # Optimizer is set to Nadam and accuracy is used as metric
    model.compile(loss='categorical_crossentropy',
                  optimizer='Nadam',
                  metrics=['acc'])
    
    # Creats data genrators to feed train and validation data
    train_data_generator = data_generator(X_train, y_train, batch_size)
    valid_data_generator = data_generator(X_valid, y_valid, batch_size)
    
    # Define callback fo model checkpoint
    callbacks = [ModelCheckpoint(MT_MODEL_CHECKPOINT_PATH, save_best_only=True, save_weights_only=False)]
    
    # Train the model
    model.fit_generator(train_data_generator,
                        validation_data=valid_data_generator,
                        epochs=epochs,
                        callbacks=callbacks,
                        verbose=2,
                        steps_per_epoch=len(X_train)/batch_size,
                        validation_steps=len(X_valid)/batch_size)

    return model
    

def main():
    """ The main steps to train a seq2seq model:
    1. Read dataset
    2. Preproces each sequnce (create standarized sequnces)
        a. Change QID, CONDITION and OUTPUT text to lowercas
        b. split QID, CONDITION and OUTPUT text into tokens (words)
        c. Replace QID tokens in each sample with standrized tokens (i.e., <QID0>, <QID1>, ...)
        d. Replace digit tokens in each sample with standarized tokens (i.e., <DGT0>, <DGT1>, ...)
        e. Create standardization dictionary for each sample
        f. Add special tokens <BOS> and <EOS> to the begining and end of each sequence
    3. Create dictinries to convert input and target sequnces to an integer id
    4. Replace input and outpu sequnce tokens with an integre id
    5. Pad sequnces with zero to create fixed size input and target sequnces
        a. Input sequnce is pre-padded with zero
        b. Target sequnce is post-padded 
    4. Create a seq2seq model
    4. Train the model
    5. Save the model and model metadata (inclding dictionaries to conver words to id)
    """
    
    train_data_path = MT_TRAINING_CORPUS_PATH


    # Read dataset from Excel file
    qids_raw, conditions_raw, output_raw = read_data(train_data_path)
    print("\nLoaded train data set from [{}]\n".format(train_data_path))

    # Preprocess the raw input text data
    _, conditions, outputs, dictionaries_lemanization = prepare_data(qids_raw, conditions_raw, output_raw)
    
    # Create dictionaries to convert between word and an integer id
    # for conditions (Human Longuage) and ouputs (Machine longuage)
    condition_word2id, condition_id2word = create_vocabulary(conditions, encoder_vocab_size)
    output_word2id, output_id2word = create_vocabulary(outputs, decoder_vocab_size)
    
    # Replace words of condition and ouput with corresponding id in dictonaries
    conditions = replace_using_dict(conditions, condition_word2id, drop_unknown=True)
    outputs    = replace_using_dict(outputs, output_word2id, drop_unknown=True)

    # Fix all sequnces length to a fixed size with padding
    conditions = pad_with_zero(conditions, encoder_seq_length,'pre')
    outputs    = pad_with_zero(outputs, decoder_seq_length+1,'post')

    # Split train data into train and validation sets
    conditions_train, conditions_valid, outputs_train, outputs_valid = train_test_split(conditions, outputs, test_size=validation_size, random_state=42)

    # Created a seq2seq Recurrent Neural Network model
    model = create_seq2seq_model(encoder_vocab_size, decoder_vocab_size, num_latent_dim)
    model.summary()
    
    # Train the seq2seq model
    model = train_seq2seq_model(model, conditions_train, conditions_valid, outputs_train, outputs_valid, num_epochs)

    
    print("\nTrained seq2seq model saved in [{}]\n".format(MT_SEQ2SEQ_MODEL_PATH))
    

main()





Using TensorFlow backend.



Loaded train data set from [./data/MT_training_corpus.xlsx]

<BOS>
<EOS>
if
<DGT0>
<QID1>
ask
<DGT1>
<QID0>
or
in
selected
at
only
to
routing
not
<DGT2>
<QID2>
respondent
show
a1
question
and
screenout
is
coded
of
those
a2
instructions
continue
title
terminate
the
screen
out
brand
for
us
respondents
<DGT3>
otherwise
then
canada
age
term
a
a3
end
–
below
a4
aware
who
skip
above
select
years
least
answered
than
brands
any
close
salad
punch
that
<QID3>
was
ordered
else
uk
product
ne
q1002–a1
q1003–a2
<DGT4>
old
qid
exclude
a5
code
yes
all
race
as
programmer
do
hcp
q1002–a2
with
must
isn’t
younger
answer
hispanic
does
pn
one
answers
client
page
did
hidden
new
asked
drop
list
this
these
label
thank
down
priority
ca
screener
had
q1003–a1
france
months
codes
table
use
sample
on
used
from
advertising
q42061
price
equal
test
women
online
china
year
prescribed
info
state
node
recommended
ranch
have
latino
satisfaction
first
based
greater
no
(
)

.
<BOS>
<EOS>
<DGT0>
any
<QID1>
<DGT1>
,
<QID0>
 

  '. They will not be included '


Epoch 2/30
 - 3s - loss: 0.8291 - acc: 0.8203 - val_loss: 0.6315 - val_acc: 0.8708
Epoch 3/30
 - 3s - loss: 0.5282 - acc: 0.8772 - val_loss: 0.6167 - val_acc: 0.8342
Epoch 4/30
 - 2s - loss: 0.3279 - acc: 0.9248 - val_loss: 0.5624 - val_acc: 0.8633
Epoch 5/30
 - 3s - loss: 0.2477 - acc: 0.9391 - val_loss: 0.3600 - val_acc: 0.9103
Epoch 6/30
 - 2s - loss: 0.2134 - acc: 0.9479 - val_loss: 0.3846 - val_acc: 0.9069
Epoch 7/30
 - 2s - loss: 0.1887 - acc: 0.9574 - val_loss: 0.3148 - val_acc: 0.9231
Epoch 8/30
 - 2s - loss: 0.1212 - acc: 0.9727 - val_loss: 0.2900 - val_acc: 0.9164
Epoch 9/30
 - 2s - loss: 0.1852 - acc: 0.9595 - val_loss: 0.2776 - val_acc: 0.9428
Epoch 10/30
 - 3s - loss: 0.1335 - acc: 0.9715 - val_loss: 0.1501 - val_acc: 0.9717
Epoch 11/30
 - 3s - loss: 0.1251 - acc: 0.9725 - val_loss: 0.2493 - val_acc: 0.9319
Epoch 12/30
 - 3s - loss: 0.0988 - acc: 0.9778 - val_loss: 0.3006 - val_acc: 0.9236
Epoch 13/30
 - 2s - loss: 0.1274 - acc: 0.9728 - val_loss: 0.2258 - val_acc: 0.9411


In [3]:
"""
This file reads and preproces the test dataset. Loades a trained seq2seq model
and predict the iput for each sample in test dataset. It writes prediction results
to a file and print to shell
"""

### Import required packages
import numpy as np
import argparse
import pickle
import os

from tensorflow.python import keras
from keras.models import load_model

MT_TEST_CORPUS_PATH                 = "./data/MT_test_submission.xlsx"
MT_TEST_CORPUS_PATH_WITH_PREDCITION = "./data/MT_test_submission_with_predcitions.xlsx"

## Import helper functions and constant

## Specify prediction paramets
# Beam serahc paramets to predict the most likely target sequence
beam_search_max_branch = 3 # Maximum number of branch at each time step for beam search
beam_search_max_depth = 4  # Maimum sequnce step to branch in beam search

def decode_sequence(input_seq, encoder_model, decoder_model, word2id, id2word):
    """
    """
    # Encode the input as state vectors.
    encoder_input = np.zeros((1, len(input_seq), encoder_vocab_size))
    for t, word_id in enumerate(input_seq):
        encoder_input[0, t, word_id] = 1

    states_value = encoder_model.predict([encoder_input])
    # Generate empty target sequence of length 1.
    decoder_input = np.zeros((1, 1, decoder_vocab_size))
    # Populate the first character of target sequence with the start character.
    decoder_input[0, 0, word2id['<BOS>'] ] = 1 
    seq_length = 0
    sampled_seq, sampled_seq_prob, sampled_seq_length = decode_sequence_beam(decoder_model, decoder_input, states_value, word2id, seq_length)
    
    return sampled_seq, sampled_seq_prob

def decode_sequence_beam(decoder_model, decoder_input, states_value, word2id, seq_length):
    
    output_tokens, h, c = decoder_model.predict([decoder_input] + states_value)
    states_value = [h, c]
    
    seq_length += 1
    # Sample a token
    if seq_length < beam_search_max_depth:
        number_search_branch = beam_search_max_branch
    else:
        number_search_branch = 1
    
    beam_top_token_indecies = np.argsort(output_tokens[0, -1, :])[-number_search_branch:]
    sampled_seq_list = []
    sampled_seq_prob_list = []
    sampled_seq_length_list = []
    for beam in range(number_search_branch):
        sampled_token_index = beam_top_token_indecies[beam]
        sampled_token_prob  = output_tokens[0, -1, sampled_token_index]
        if sampled_token_index == word2id['<EOS>'] or seq_length == decoder_seq_length:
            return [sampled_token_index,0], sampled_token_prob, 0.00000001
        else:
            # Update the target sequence (of length 1).
            decoder_input = np.zeros((1, 1, decoder_vocab_size))
            decoder_input[0, 0, sampled_token_index] = 1
            # Update states
            sampled_seq, sampled_seq_prob, sampled_seq_length = decode_sequence_beam(decoder_model, decoder_input, states_value, word2id, seq_length)
            sampled_seq.append(sampled_token_index)
            sampled_seq_prob *= sampled_token_prob
            
            sampled_seq_list.append(sampled_seq)
            sampled_seq_prob_list.append(sampled_seq_prob)
            sampled_seq_length_list.append(sampled_seq_length)
    
    weighted_prob = np.log(np.array(sampled_seq_prob_list))/np.array(sampled_seq_length_list)
    
    best_beam = np.argmax(weighted_prob)
    
    return sampled_seq_list[best_beam], sampled_seq_prob_list[best_beam], sampled_seq_length_list[best_beam]+1




def main():

    # construct the argument parser and parse the arguments
    test_data_path = MT_TEST_CORPUS_PATH
    test_data_output_path = MT_TEST_CORPUS_PATH_WITH_PREDCITION

    # Load model and metadata
    model = load_model(MT_SEQ2SEQ_MODEL_PATH)

    with open(MT_META_DATA_FILE_PATH,'rb') as f:
        [condition_word2id, condition_id2word, output_word2id, output_id2word] = pickle.load(f)

    print("\nLoaded a trained seq2seq model from [{}]\n".format(MT_SEQ2SEQ_MODEL_PATH))

    encoder_model, decoder_model = create_seq2seq_inference_model(model, num_latent_dim)
    
    #test_data_path = MT_TRAINING_CORPUS_PATH
    
    #Read dataset from Excel file
    qids_raw, conditions_raw, output_raw = read_data(test_data_path)
    print("\nLoaded test dataset from [{}]\n".format(test_data_path))

    # Preprocess the raw input text data
    _, conditions, _, dictionaries_lemanization = prepare_data(qids_raw, conditions_raw, output_raw)
    
    # Replace words of qid, condition and ouput with corresponding id in dictonaries
    conditions = replace_using_dict(conditions, condition_word2id, drop_unknown=True)

    # Fix all sequnces length to a fixed size with padding
    conditions = pad_with_zero(conditions, encoder_seq_length,'pre')

    outputs_predcited = [None for _ in conditions]
    for sample_index, condition in enumerate(conditions):

        input_seq = condition
        decoded_seqeunce, _ = decode_sequence(input_seq, encoder_model, decoder_model, output_word2id, output_id2word)
        
        decoded_seqeunce = replace_using_dict([decoded_seqeunce], output_id2word)
        decoded_seqeunce = replace_using_dict(decoded_seqeunce, dictionaries_lemanization[sample_index])

        decoded_seqeunce = [seq for seq in decoded_seqeunce[0] if seq != '<PAD>' and seq != '<EOS>']
        decoded_seqeunce = reversed(decoded_seqeunce)
        decoded_seqeunce = ''.join(decoded_seqeunce)

        outputs_predcited[sample_index] = decoded_seqeunce

        if sample_index % 10 == 0:
            log_to_shell(sample_index, qids_raw[sample_index],
                           conditions_raw[sample_index], output_raw[sample_index],
                           decoded_seqeunce )
        
    write_data(qids_raw, conditions_raw, outputs_predcited, test_data_output_path)
    print("\nSaved predictions to [{}]\n".format(test_data_output_path))


main()


Loaded a trained seq2seq model from [./model/mt_seq2seq_model.h5]


Loaded test dataset from [./data/MT_test_submission.xlsx]

Sample index 0
QID:  1010
CONDITION:  Terminate if respondent selected ‘A4’ for all 3 product types
OUTPUT:  nan 

Predicted OUTPUT:  1010.any(3) 


Sample index 10
QID:  Q16A,QD
CONDITION:  ASK ONLY IF QD = 1-4 (ORDERED ANY TEST SALAD)
OUTPUT:  nan 

Predicted OUTPUT:  qd.between(1:4) 


Sample index 20
QID:  QR8,Q30,QR7
CONDITION:  ASK IF Q30=1 AND QR7=1-4
OUTPUT:  nan 

Predicted OUTPUT:  qr7.any(1)&qr7.any(1) 


Sample index 30
QID:  8014
CONDITION:  END Younger than 18 years
OUTPUT:  nan 

Predicted OUTPUT:  8014<18 


Sample index 40
QID:  Q26B,QE2
CONDITION:  ASK ONLY IF QE2=1
OUTPUT:  nan 

Predicted OUTPUT:  qe2.any(1) 


Sample index 50
QID:  Q3,Q2
CONDITION:  ASK IF Q2=1
OUTPUT:  nan 

Predicted OUTPUT:  q2.any(1) 


Sample index 60
QID:  Q17b,QD
CONDITION:  ASK ONLY IF QD = 1 (ORDERED BIG TEX TACO SALAD)
OUTPUT:  nan 

Predicted OUTPUT:  qd.any(1) 