In [49]:
# Input files (paths are relative to the notebook's working directory).
TEST_FILE = "data/test.txt"        # evaluation examples; passed to Seq2Seq.predict
TRAIN_FILE = "data/train.txt"      # not referenced by the cells shown in this notebook
WHOLE_FILE = "data/whole.txt"      # examples passed to Seq2Seq.train
F_VOCAB_FILE = "data/vocab.f.txt"  # vocabulary read into lf2i
Q_VOCAB_FILE = "data/vocab.q.txt"  # vocabulary read into w2i

In [50]:
import dynet_config
# Declare GPU as the default device type
dynet_config.set_gpu()
# Set some parameters manually: the `mem` setting and a fixed random seed
dynet_config.set(mem=400, random_seed=123456789)
# Import dynet only after the configuration above so that it is picked up
# when the library initialises in the current scope
import dynet as dy

In [51]:
from mnnl import RNNSequencePredictor
import random
import os
from utils.io_utils import IOUtils
# Seed Python's RNG so the random.shuffle of the training examples in
# Seq2Seq.train produces the same order on every fresh run.
random.seed(33)

In [4]:
def read_data(fh):
    """Yield ``(sentence_tokens, lf_tokens)`` pairs from a tab-separated source.

    Each non-blank line must hold exactly two tab-separated fields: the
    sentence and its target sequence. Both fields are split on whitespace.

    Args:
        fh: Any iterable of text lines (an open file, a list of strings, ...).

    Yields:
        tuple[list[str], list[str]]: tokens of the sentence and of the target.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields.
    """
    for line_no, line in enumerate(fh, start=1):
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no example.
            continue
        fields = line.split("\t")
        if len(fields) != 2:
            raise ValueError(
                "line %d: expected 2 tab-separated fields, got %d"
                % (line_no, len(fields)))
        sentence, lf = fields
        yield sentence.split(), lf.split()

In [5]:
def read_vocab(filename):
    """Build a token -> index mapping from a vocabulary file.

    The mapping always starts with the reserved entries ``_UNK`` (0),
    ``<s>`` (1) and ``</s>`` (2). The first whitespace-separated field of
    every non-blank line is then added in file order, receiving the next
    free index; tokens that are already present keep their existing index.

    Args:
        filename: Path of the vocabulary file.

    Returns:
        dict[str, int]: token to index mapping.
    """
    t2i = {"_UNK": 0, "<s>": 1, "</s>": 2}
    with open(filename) as target:
        for line in target:
            fields = line.split()
            if not fields:
                # Blank line: nothing to register (previously an IndexError).
                continue
            token = fields[0]
            if token not in t2i:
                t2i[token] = len(t2i)
    return t2i

In [68]:
def is_equal(gold, predictions):
    """Return True when both sequences have the same length and equal items.

    Args:
        gold: Reference sequence.
        predictions: Predicted sequence.

    Returns:
        bool: True only if the lengths match and every aligned pair compares
        equal; the comparison stops at the first mismatch.
    """
    if len(gold) != len(predictions):
        return False
    return all(g == p for g, p in zip(gold, predictions))

In [90]:
class Seq2Seq:
    """LSTM encoder-decoder mapping a tokenised sentence to a token sequence.

    Input and output tokens share one vocabulary (``w2i``) and one embedding
    table (``wlookup``). A single-layer LSTM encodes the sentence; its final
    output seeds the hidden state of a single-layer LSTM decoder, whose
    outputs are projected onto the vocabulary with ``W_s`` / ``W_sb``.

    Args:
        w2i: dict mapping token -> index; must contain ``_UNK``, ``<s>`` and
            ``</s>``.
        lf2i: Accepted for interface compatibility; not used.
        options: Object exposing ``wembedding_dims``, ``lstm_dims``,
            ``external_embedding``, ``external_embedding_type``,
            ``saved_parameters_dir`` and ``saved_prevectors``.
    """

    def __init__(self, w2i, lf2i, options):
        self.options = options
        self.w2i = w2i
        self.i2w = {w2i[w]: w for w in w2i}
        self.wdims = options.wembedding_dims
        self.ldims = options.lstm_dims
        self.ext_embeddings = None

        self.model = dy.ParameterCollection()
        self.trainer = dy.AdamTrainer(self.model)
        self.wlookup = self.model.add_lookup_parameters((len(w2i), self.wdims))
        # Overwrite rows of wlookup with pre-trained vectors where available.
        self.__load_model()

        self.context_encoder = [dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)]
        self.logical_form_decoder = dy.VanillaLSTMBuilder(1, self.wdims, self.ldims, self.model)

        # Output projection from the decoder state onto the vocabulary.
        self.W_s = self.model.add_parameters((len(self.w2i), self.ldims))
        self.W_sb = self.model.add_parameters((len(self.w2i)))

    def __load_model(self):
        """Initialise embeddings from the pickled cache if present, else from
        the external embedding file (and then write the cache)."""
        if self.options.external_embedding is not None:
            cache_path = os.path.join(self.options.saved_parameters_dir,
                                      self.options.saved_prevectors)
            if os.path.isfile(cache_path):
                self.__load_external_embeddings(cache_path, "pickle")
            else:
                self.__load_external_embeddings(self.options.external_embedding,
                                                self.options.external_embedding_type)
                self.__save_model()

    def __save_model(self):
        """Persist the in-vocabulary pre-trained vectors via IOUtils."""
        IOUtils.save_embeddings(os.path.join(self.options.saved_parameters_dir,
                                             self.options.saved_prevectors),
                                self.ext_embeddings)

    def __load_external_embeddings(self, embedding_file, embedding_file_type):
        """Load pre-trained vectors and copy those for known tokens into wlookup.

        Raises:
            AssertionError: if the loaded dimension differs from ``wdims``.
        """
        # NOTE(review): vectors are loaded with lower=True while vocabulary
        # tokens are looked up as-is below -- confirm that mixed-case tokens
        # missing a pre-trained vector is intended.
        ext_embeddings, ext_emb_dim = IOUtils.load_embeddings_file(
            embedding_file,
            embedding_file_type,
            lower=True)
        assert ext_emb_dim == self.wdims
        self.ext_embeddings = {}
        print("Initializing word embeddings by pre-trained vectors")
        count = 0
        for word in self.w2i:
            if word in ext_embeddings:
                count += 1
                self.ext_embeddings[word] = ext_embeddings[word]
                self.wlookup.init_row(self.w2i[word], ext_embeddings[word])
        print("Vocab size: %d; #words having pretrained vectors: %d" % (len(self.w2i), count))

    def _token_id(self, token):
        """Return the vocabulary index of ``token``, or that of ``_UNK``."""
        return self.w2i.get(token, self.w2i['_UNK'])

    def _encode(self, sentence):
        """Run the encoder over ``sentence`` and return its final output."""
        encoder_state = self.context_encoder[0].initial_state()
        for entry in sentence:
            encoder_state = encoder_state.add_input(self.wlookup[self._token_id(entry)])
        return encoder_state.output()

    def _init_decoder(self, hidden_context):
        """Return a decoder state whose hidden vector is ``hidden_context``."""
        # RNNState.set_h returns a NEW state instead of mutating the receiver,
        # so its result must be the state decoding continues from; otherwise
        # the decoder never sees the encoded sentence.
        return self.logical_form_decoder.initial_state().set_h([hidden_context])

    def _output_distribution(self, decoder_state):
        """Softmax over the vocabulary for the current decoder output."""
        return dy.softmax(self.W_s * decoder_state.output() + self.W_sb)

    def predict(self, test_path, test_num, max_len=50):
        """Greedy-decode up to ``test_num`` examples and report exact-match accuracy.

        Args:
            test_path: Path of a file readable by ``read_data``.
            test_num: Maximum number of examples to evaluate.
            max_len: Cap on decoder steps per example. The token produced on
                the final allowed step is not emitted, so at most
                ``max_len - 1`` tokens are predicted.

        Returns:
            float: fraction of evaluated examples whose prediction equals the
            gold sequence (0.0 when no example was read). The same value is
            printed.
        """
        total_correct = 0.0
        total_examples = 0
        start_id = self.w2i["<s>"]
        end_id = self.w2i["</s>"]
        with open(test_path, 'r') as test:
            for _, (sentence, lf) in zip(range(test_num), read_data(test)):
                total_examples += 1
                dy.renew_cg()
                decoder_state = self._init_decoder(self._encode(sentence))

                predicted_sequence = []
                next_input = start_id
                for step in range(1, max_len + 1):
                    # Feed back the previous prediction (initially <s>).
                    decoder_state = decoder_state.add_input(self.wlookup[next_input])
                    probs = self._output_distribution(decoder_state)
                    # Plain int: usable both as lookup index and as i2w key.
                    next_input = int(probs.npvalue().argmax())
                    if next_input == end_id or step == max_len:
                        break
                    predicted_sequence.append(next_input)

                predictions = [self.i2w[c] for c in predicted_sequence]
                if is_equal(lf, predictions):
                    total_correct += 1

        # Guard against an empty file or test_num == 0.
        accuracy = total_correct / total_examples if total_examples else 0.0
        print("Accuracy : {}".format(accuracy))
        return accuracy

    def train(self, train_path):
        """Run one pass over the shuffled examples in ``train_path``.

        Each example is trained with teacher forcing: the decoder receives
        ``<s>`` followed by the gold tokens and is scored against the gold
        tokens followed by ``</s>``. Parameters are updated after every
        example.

        Args:
            train_path: Path of a file readable by ``read_data``.

        Returns:
            float: sum of the per-example losses for this pass.
        """
        with open(train_path, 'r') as train:
            shuffledData = list(read_data(train))
        random.shuffle(shuffledData)

        total_loss = 0.0
        start_id = self.w2i["<s>"]
        end_id = self.w2i["</s>"]
        for sentence, lf in shuffledData:
            # Fresh graph per example, so nothing left over from a previous
            # predict()/train() call is carried into this one.
            dy.renew_cg()
            decoder_state = self._init_decoder(self._encode(sentence))

            target_ids = [self._token_id(token) for token in lf]
            decoder_in = [start_id] + target_ids
            decoder_out = target_ids + [end_id]

            losses = []
            for in_id, out_id in zip(decoder_in, decoder_out):
                decoder_state = decoder_state.add_input(self.wlookup[in_id])
                p = self._output_distribution(decoder_state)
                losses.append(-dy.log(dy.pick(p, out_id)))

            loss = dy.esum(losses)
            total_loss += loss.scalar_value()
            loss.backward()
            self.trainer.update()
        return total_loss

In [87]:
class Options:
    """Hyper-parameters and file locations consumed by ``Seq2Seq``.

    Every setting is a keyword argument whose default is the value this
    notebook has always used, so ``Options()`` is unchanged.

    Args:
        wembedding_dims: Size of the token embeddings; must match the
            dimension of the external embeddings when those are loaded.
        lfembedding_dims: Not read by ``Seq2Seq`` as currently written.
        lstm_dims: Hidden size of the encoder and decoder LSTMs.
        external_embedding: Path of the pre-trained embedding file, or None
            to skip pre-trained initialisation.
        saved_parameters_dir: Directory holding the cached embeddings.
        saved_prevectors: File name of the cached embeddings inside
            ``saved_parameters_dir``.
        external_embedding_type: Format tag handed to the embedding loader
            together with ``external_embedding``.
    """

    def __init__(self,
                 wembedding_dims=300,
                 lfembedding_dims=64,
                 lstm_dims=128,
                 external_embedding="data/GoogleNews-vectors-negative300-SLIM.bin",
                 saved_parameters_dir="data/saved-parameters",
                 saved_prevectors="GoogleNews-embedings.pickle",
                 external_embedding_type="word2vec"):
        self.wembedding_dims = wembedding_dims
        self.lfembedding_dims = lfembedding_dims
        self.lstm_dims = lstm_dims
        self.external_embedding = external_embedding
        self.saved_parameters_dir = saved_parameters_dir
        self.saved_prevectors = saved_prevectors
        self.external_embedding_type = external_embedding_type

In [88]:
# Load both vocabularies, then extend w2i with every lf2i token it lacks so
# that a single index space covers both; new tokens take the next free index.
w2i = read_vocab(Q_VOCAB_FILE)
lf2i = read_vocab(F_VOCAB_FILE)
for token in lf2i:
    w2i.setdefault(token, len(w2i))

In [91]:
# Build the model over the merged vocabulary.
options = Options()
model = Seq2Seq(w2i, lf2i, options)
# 20 passes: each pass trains on every example of WHOLE_FILE (reshuffled per
# pass) and then prints exact-match accuracy over at most the first 100
# examples of TEST_FILE.
# NOTE(review): training reads WHOLE_FILE rather than TRAIN_FILE -- if
# whole.txt also contains the test examples, the printed accuracy is not a
# held-out measure; confirm.
for i in range(20):
    model.train(WHOLE_FILE)
    model.predict(TEST_FILE, 100)

Initializing word embeddings by pre-trained vectors
Vocab size: 163; #words having pretrained vectors: 100
Accuracy : 0.0
Accuracy : 0.0
Accuracy : 0.0
Accuracy : 0.0
Accuracy : 0.0
Accuracy : 0.0
Accuracy : 0.0


KeyboardInterrupt: 