In [1]:
#files
TEST_FILE = "data/test.txt"
TRAIN_FILE = "data/train.txt"
F_VOCAB_FILE = "data/vocab.f.txt"
Q_VOCAB_FILE = "data/vocab.q.txt"

In [2]:
import dynet_config
# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manualy
dynet_config.set(mem=400, random_seed=123456789)
# Initialize dynet import using above configuration in the current scope
import dynet as dy

In [3]:
from mnnl import RNNSequencePredictor
import random
random.seed(33)

In [4]:
def read_data(fh):
    for line in fh:
        sentence, lf = line.strip().split("\t")
        sentence = sentence.split()
        lf = lf.split()
        yield sentence, lf

In [5]:
def read_vocab(filename):
    t2i = {"_UNK": 0, "<s>": 1, "</s>":2}
    with open(filename) as target:
        for line in target:
            token = line.strip().split()[0]
            if token not in t2i:
                t2i[token] = len(t2i)
    return t2i

In [11]:
class Seq2Seq:
    def __init__(self, w2i, lf2i, options):
        self.options = options
        self.w2i = w2i
        self.lf2i = lf2i
        self.i2lf = {lf2i[lf]:lf for lf in lf2i}
        self.wdims = options.wembedding_dims
        self.lfdims = options.lfembedding_dims
        self.ldims = options.lstm_dims
        self.ext_embeddings = None
        
        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model)
        self.__load_model()
        self.wlookup = self.model.add_lookup_parameters((len(w2i), self.wdims))
        self.lflookup = self.model.add_lookup_parameters((len(lf2i), self.lfdims))

        self.context_encoder = [dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)]
        self.logical_form_decoder = dy.VanillaLSTMBuilder(1, self.lfdims, self.ldims, self.model)
        
        self.W_s = self.model.add_parameters((len(self.lf2i), self.ldims))
        self.W_sb = self.model.add_parameters((len(self.lf2i)))
    
    def __load_model(self):
        if self.options.external_embedding is not None:
            if os.path.isfile(os.path.join(self.options.saved_parameters_dir,
                                           self.options.saved_prevectors)):
                self.__load_external_embeddings(os.path.join(self.options.saved_parameters_dir,
                                                             self.options.saved_prevectors),
                                                "pickle")
            else:
                self.__load_external_embeddings(self.options.external_embedding,
                                                self.options.external_embedding_type)
                self.__save_model()
    
    def __save_model(self):
        IOUtils.save_embeddings(os.path.join(self.options.saved_parameters_dir,
                                             self.options.saved_prevectors),
                                self.ext_embeddings)

    def __load_external_embeddings(self, embedding_file, embedding_file_type):
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            embedding_file,
            embedding_file_type,
            lower=True)
        assert ext_emb_dim == self.wdims
        self.ext_embeddings = {}
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word in self.w2i:
            if word in ext_embeddings:
                count += 1
                self.ext_embeddings[word] = ext_embeddings[word]
                self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
        print("Vocab size: %d; #words having pretrained vectors: %d" % (len(self.w2i), count))
    
    
    def predict(self, test_path, test_num):
        with open(test_path, 'r') as test:
            for _, (iPair, (sentence, lf)) in zip(range(test_num), enumerate(read_data(test))):
                dy.renew_cg() 
                print(iPair, sentence)
                state = self.context_encoder[0].initial_state()
            
                for entry in sentence:
                    if entry not in self.w2i:
                        print("Entry {} does not exist\n".format(entry))
                    state = state.add_input(self.wlookup[self.w2i[entry] if entry in self.w2i else self.w2i['_UNK']])                                          
                hidden_context = state.h()
                state = self.logical_form_decoder.initial_state()
                state.set_h(hidden_context)
                predicted_sequence = []
                next_input = self.lf2i["<s>"]
                counter = 0
                while True:
                    counter += 1
                    state = state.add_input(self.lflookup[next_input])
                    probs = dy.softmax(self.W_s * state.output() + self.W_sb)
                    next_input = probs.npvalue().argmax()
                    if next_input != self.lf2i["</s>"] and counter < 50:
                        predicted_sequence.append(next_input)
                    else:
                        break
                print("Index {}\nOriginal : {}\nPrediction {}\n\n\n".format(iPair, lf, " ".join(self.i2lf[c] for c in predicted_sequence)))
                
                    
                    
    def train(self, train_path):
        total_loss = 0
        with open(train_path, 'r') as train:
            shuffledData = list(read_data(train))
            random.shuffle(shuffledData)
            
            for iPair, (sentence, lf) in enumerate(shuffledData):
                #I-Context Encoding
                state = self.context_encoder[0].initial_state()
            
                for entry in sentence:
                    state = state.add_input(self.wlookup[self.w2i[entry] if entry in self.w2i else self.w2i['_UNK']])                                          
                hidden_context = state.h()
                
                state = self.logical_form_decoder.initial_state()
                state.set_h(hidden_context)
                decoder_in = [self.lf2i["<s>"]] + [self.lf2i[i] if i in self.lf2i else self.lf2i['_UNK'] for i in lf]
                decoder_out = [self.lf2i[i] if i in self.lf2i else self.lf2i['_UNK'] for i in lf] + [self.lf2i["</s>"]]
                probs = []
                for i in decoder_in:
                    state = state.add_input(self.lflookup[i])
                    p = dy.softmax(self.W_s * state.output() + self.W_sb)
                    probs.append(p)
                loss = [-dy.log(dy.pick(p, o)) for p, o in zip(probs, decoder_out)]
                loss = dy.esum(loss)
                cur_loss = loss.scalar_value()
                total_loss += cur_loss
                loss.backward()
                self.trainer.update()
                if iPair != 0 and iPair % 10 == 0:
                    print("Pair:" + str(iPair) + " Loss:" + str(total_loss / (iPair + 1)))
                
                dy.renew_cg()

In [7]:
class Options:
    def __init__(self):
        self.wembedding_dims = 300
        self.lfembedding_dims = 64
        self.lstm_dims = 128
        self.external_embedding = None

In [8]:
w2i = read_vocab(Q_VOCAB_FILE)
lf2i = read_vocab(F_VOCAB_FILE)

In [12]:
options = Options()
model = Seq2Seq(w2i, lf2i, options)
model.train(TRAIN_FILE)

Pair:10 Loss:68.7444014115767
Pair:20 Loss:65.63372811816987
Pair:30 Loss:58.14587789966214
Pair:40 Loss:57.88414987703649
Pair:50 Loss:54.927369342130774
Pair:60 Loss:54.656258786310914
Pair:70 Loss:52.05173723462602
Pair:80 Loss:52.03498049135561
Pair:90 Loss:49.68558443509615
Pair:100 Loss:48.604320866046564
Pair:110 Loss:47.90536575489216
Pair:120 Loss:47.7429783561013
Pair:130 Loss:47.51488952054322
Pair:140 Loss:46.770583632989975
Pair:150 Loss:45.58440130120081
Pair:160 Loss:44.25736822993119
Pair:170 Loss:43.01433369709037
Pair:180 Loss:42.18116620100664
Pair:190 Loss:41.62635757785817
Pair:200 Loss:40.54232901008568
Pair:210 Loss:41.046242754606276
Pair:220 Loss:40.91912695293513
Pair:230 Loss:40.36715181152542
Pair:240 Loss:40.26017418145144
Pair:250 Loss:39.67378628111456
Pair:260 Loss:39.07234614288213
Pair:270 Loss:38.29354243964727
Pair:280 Loss:37.73374922165243
Pair:290 Loss:37.422832788880335
Pair:300 Loss:37.09688743642002
Pair:310 Loss:36.582490873490116
Pair:320 Los

In [13]:
model.predict(TEST_FILE, 20)

0 ['where', 'is', 'c0']
Index 0
Original : ['(', 'lambda', '$0', 'e', '(', 'loc:t', 'c0', '$0', ')', ')']
Prediction ( lambda $0 e ( and ( state:t $0 ) ( loc:t $0 s0 ) ) )



1 ['river', 'in', 's0']
Index 1
Original : ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'river:t', '$0', ')', '(', 'loc:t', '$0', 's0', ')', ')', ')']
Prediction ( lambda $0 e ( and ( state:t $0 ) ( loc:t $0 s0 ) ) )



2 ['where', 'are', 'mountain']
Index 2
Original : ['(', 'lambda', '$0', 'e', '(', 'exists', '$1', '(', 'and', '(', 'mountain:t', '$1', ')', '(', 'loc:t', '$1', '$0', ')', ')', ')', ')']
Prediction ( lambda $0 e ( and ( state:t $0 ) ( loc:t $0 s0 ) ) )



3 ['where', 'is', 'c0']
Index 3
Original : ['(', 'lambda', '$0', 'e', '(', 'loc:t', 'c0', '$0', ')', ')']
Prediction ( lambda $0 e ( and ( state:t $0 ) ( loc:t $0 s0 ) ) )



4 ['what', 'state', 'border', 's0']
Index 4
Original : ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'state:t', '$0', ')', '(', 'next_to:t', '$0', 's0', ')', ')', ')']
Prediction