In [1]:
#files
TEST_FILE = "data/test.txt"
TRAIN_FILE = "data/train.txt"
F_VOCAB_FILE = "data/vocab.f.txt"
Q_VOCAB_FILE = "data/vocab.q.txt"

In [2]:
import dynet_config
# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manualy
dynet_config.set(mem=400, random_seed=123456789)
# Initialize dynet import using above configuration in the current scope
import dynet as dy

In [3]:
from mnnl import RNNSequencePredictor
import random
random.seed(33)

In [4]:
def read_data(fh):
    for line in fh:
        sentence, lf = line.strip().split("\t")
        sentence = sentence.split()
        lf = lf.split()
        yield sentence, lf

In [5]:
def read_vocab(filename):
    t2i = {"_UNK": 0, "<s>": 1, "</s>":2}
    with open(filename) as target:
        for line in target:
            token = line.strip().split()[0]
            if token not in t2i:
                t2i[token] = len(t2i)
    return t2i

In [29]:
class Seq2Seq:
    def __init__(self, w2i, lf2i, options):
        self.options = options
        self.w2i = w2i
        self.lf2i = lf2i
        self.wdims = options.wembedding_dims
        self.lfdims = options.lfembedding_dims
        self.ldims = options.lstm_dims
        self.ext_embeddings = None
        
        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model)
        self.__load_model()
        self.wlookup = self.model.add_lookup_parameters((len(w2i), self.wdims))
        self.lflookup = self.model.add_lookup_parameters((len(lf2i), self.lfdims))

        self.context_encoder = [dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)]
        self.logical_form_decoder = dy.VanillaLSTMBuilder(1, self.lfdims, self.ldims, self.model)
        
        self.W_s = self.model.add_parameters((len(self.lf2i), self.ldims))
        self.W_sb = self.model.add_parameters((len(self.lf2i)))
    
    def __load_model(self):
        if self.options.external_embedding is not None:
            if os.path.isfile(os.path.join(self.options.saved_parameters_dir,
                                           self.options.saved_prevectors)):
                self.__load_external_embeddings(os.path.join(self.options.saved_parameters_dir,
                                                             self.options.saved_prevectors),
                                                "pickle")
            else:
                self.__load_external_embeddings(self.options.external_embedding,
                                                self.options.external_embedding_type)
                self.__save_model()
    
    def __save_model(self):
        IOUtils.save_embeddings(os.path.join(self.options.saved_parameters_dir,
                                             self.options.saved_prevectors),
                                self.ext_embeddings)

    def __load_external_embeddings(self, embedding_file, embedding_file_type):
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            embedding_file,
            embedding_file_type,
            lower=True)
        assert ext_emb_dim == self.wdims
        self.ext_embeddings = {}
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word in self.w2i:
            if word in ext_embeddings:
                count += 1
                self.ext_embeddings[word] = ext_embeddings[word]
                self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
        print("Vocab size: %d; #words having pretrained vectors: %d" % (len(self.w2i), count))
    
    
    def predict(sels, test_path):
        
        
        
    def train(self, train_path):
        total_loss = 0
        with open(train_path, 'r') as train:
            shuffledData = list(read_data(train))
            random.shuffle(shuffledData)
            
            for iPair, (sentence, lf) in enumerate(shuffledData):
                print(iPair, sentence, lf)
                #I-Context Encoding
                state = self.context_encoder[0].initial_state()
            
                for entry in sentence:
                    state = state.add_input(self.wlookup[self.w2i[entry] if entry in self.w2i else self.w2i['_UNK']])                                          
                hidden_context = state.h()
                
                
                
                state = self.logical_form_decoder.initial_state()
                state.set_h(hidden_context)
                decoder_in = [self.lf2i["<s>"]] + [self.lf2i[i] if i in self.lf2i else self.lf2i['_UNK'] for i in lf]
                decoder_out = [self.lf2i[i] if i in self.lf2i else self.lf2i['_UNK'] for i in lf] + [self.lf2i["</s>"]]
                probs = []
                for i in decoder_in:
                    state = state.add_input(self.lflookup[i])
                    p = dy.softmax(self.W_s * state.output() + self.W_sb)
                    probs.append(p)
                loss = [-dy.log(dy.pick(p, o)) for p, o in zip(probs, decoder_out)]
                loss = dy.esum(loss)
                cur_loss = loss.scalar_value()
                total_loss += cur_loss
                loss.backward()
                self.trainer.update()
                if iPair != 0 and iPair % 10 == 0:
                    print("Pair:" + str(iPair) + " Loss:" + str(total_loss / (iPair + 1)))
                dy.renew_cg()
                

In [7]:
class Options:
    def __init__(self):
        self.wembedding_dims = 300
        self.lfembedding_dims = 64
        self.lstm_dims = 128
        self.external_embedding = None

In [8]:
w2i = read_vocab(Q_VOCAB_FILE)
lf2i = read_vocab(F_VOCAB_FILE)

In [30]:
options = Options()
model = Seq2Seq(w2i, lf2i, options)
model.train(TEST_FILE)

0 ['what', 'is', 'the', 'largest', 'capit', 'citi', 'in', 'the', 'co0'] ['(', 'argmax', '$0', '(', 'and', '(', 'capital:t', '$0', ')', '(', 'city:t', '$0', ')', '(', 'loc:t', '$0', 'co0', ')', ')', '(', 'size:i', '$0', ')', ')']
1 ['what', 'state', 'border', 'the', 'state', 'that', 'border', 'the', 'most', 'state'] ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'state:t', '$0', ')', '(', 'next_to:t', '$0', '(', 'argmax', '$1', '(', 'state:t', '$1', ')', '(', 'count', '$2', '(', 'and', '(', 'state:t', '$2', ')', '(', 'next_to:t', '$1', '$2', ')', ')', ')', ')', ')', ')', ')']
2 ['what', 'state', 'is', 'c0', 'in'] ['(', 'lambda', '$0', 'e', '(', 'and', '(', 'state:t', '$0', ')', '(', 'loc:t', 'c0', '$0', ')', ')', ')']
3 ['what', 'is', 'the', 'popul', 'of', 'c0'] ['(', 'population:i', 'c0', ')']
4 ['what', 'is', 'the', 'lowest', 'point', 'of', 'all', 'state', 'through', 'which', 'the', 'r0', 'run', 'through'] ['(', 'argmin', '$0', '(', 'and', '(', 'place:t', '$0', ')', '(', 'exists', '$1', 