Yoon Kim committed Dec 23, 2016
1 parent f772c20 commit b89f9c7148195645086a00af3c6a2a0c2f6ebb35
Showing with 2,520 additions and 2 deletions.
  1. +1 −0 .gitignore
  2. +39 −2 README.md
  3. +57 −0 data-entail.lua
  4. +46 −0 get_pretrain_vecs.py
  5. +650 −0 models/EisnerCRF.lua
  6. +95 −0 models/model_utils.lua
  7. +443 −0 models/models-entail.lua
  8. +210 −0 models/test-eisnercrf.lua
  9. +313 −0 preprocess-entail.py
  10. +666 −0 train-entail.lua
.gitignore
@@ -4,6 +4,7 @@
*.dict
*.lua~
*.py~
*.txt
# Compiled Lua sources
luac.out
README.md
@@ -1,2 +1,39 @@
# struct-attn-net
Code for structured attention networks
# Structured Attention Networks
## Entailment
### Data preprocessing
First run:
```
python preprocess-entail.py --srcfile path-to-sent1-train --targetfile path-to-sent2-train \
--labelfile path-to-label-train --srcvalfile path-to-sent1-val --targetvalfile path-to-sent2-val \
--labelvalfile path-to-label-val --srctestfile path-to-sent1-test --targettestfile path-to-sent2-test \
--labeltestfile path-to-label-test --outputfile data/entail --glove path-to-glove
```
This will create the data hdf5 files. The vocabulary is based on the pretrained GloVe embeddings.
sent1 is the premise and sent2 is the hypothesis.
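To sanity-check the output, you can peek at one of the generated files with h5py. The file name below is an assumption (`preprocess-entail.py` writes files under the `data/entail` prefix; check the script for the exact names), but the dataset keys are the ones `data-entail.lua` reads:
```
import h5py

# Hypothetical file name -- see preprocess-entail.py for the actual output names.
with h5py.File("data/entail-train.hdf5", "r") as f:
    for key in ["source", "target", "source_l", "target_l", "label",
                "batch_l", "batch_idx", "source_size", "target_size", "label_size"]:
        if key in f:
            print(key, f[key].shape)
```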
Now run:
```
python get_pretrain_vecs.py --glove path-to-glove --outputfile data/glove.hdf5 \
--dictionary path-to-dict
```
`path-to-dict` is the `*.word.dict` file created by `preprocess-entail.py`.
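The dictionary is a plain text file with one `word index` pair per line, which is how `get_pretrain_vecs.py` parses it. A minimal sketch for loading it, using the script's default path:
```
# Minimal sketch: load the *.word.dict file into a word -> index map.
word2idx = {}
with open("data/entail.word.dict", "r") as f:
    for line in f:
        if line.strip():
            word, idx = line.split()
            word2idx[word] = int(idx)
print("vocab size is", len(word2idx))
```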
### Training
Baseline model
```
th train-entail.lua -parser 0
```
The baseline model essentially replicates the results of Parikh et al. (2016). The only
differences are that we use a hidden layer size of 300 (they use 200), a batch size of 32 (they use 4),
and vanilla SGD (they use asynchronous SGD).
Structured attention
```
th train-entail.lua -parser 1 -use_parent 1
```
See `train-entail.lua` (or the paper) for hyperparameters and more training options.
You can add `-gpuid 1` to use the GPU.
data-entail.lua
@@ -0,0 +1,57 @@
--
-- Manages encoder/decoder data matrices.
--
require 'hdf5'

local data = torch.class("data")

function data:__init(opt, data_file)
  local f = hdf5.open(data_file, 'r')
  self.source = f:read('source'):all()
  self.target = f:read('target'):all()
  self.target_l = f:read('target_l'):all() -- max target length for each batch
  self.source_l = f:read('source_l'):all() -- max source length for each batch
  self.label = f:read('label'):all()
  self.batch_l = f:read('batch_l'):all()
  self.batch_idx = f:read('batch_idx'):all()
  self.target_size = f:read('target_size'):all()[1]
  self.source_size = f:read('source_size'):all()[1]
  self.label_size = f:read('label_size'):all()[1]
  self.length = self.batch_l:size(1)
  self.seq_length = self.target:size(2)
  self.batches = {}
  -- Slice out each batch once so indexing is cheap at training time.
  for i = 1, self.length do
    local source_i = self.source:sub(self.batch_idx[i], self.batch_idx[i] + self.batch_l[i] - 1,
                                     1, self.source_l[i])
    local target_i = self.target:sub(self.batch_idx[i], self.batch_idx[i] + self.batch_l[i] - 1,
                                     1, self.target_l[i])
    local label_i = self.label:sub(self.batch_idx[i], self.batch_idx[i] + self.batch_l[i] - 1)
    table.insert(self.batches, {target_i, source_i, self.batch_l[i], self.target_l[i],
                                self.source_l[i], label_i})
  end
end

function data:size()
  return self.length
end

function data.__index(self, idx)
  if type(idx) == "string" then
    return data[idx]
  else
    local target = self.batches[idx][1]
    local source = self.batches[idx][2]
    local batch_l = self.batches[idx][3]
    local target_l = self.batches[idx][4]
    local source_l = self.batches[idx][5]
    local label = self.batches[idx][6]
    if opt.gpuid >= 0 then -- opt is a global set by the training script
      source = source:cuda()
      target = target:cuda()
      label = label:cuda()
    end
    return {target, source, batch_l, target_l, source_l, label}
  end
end

return data
get_pretrain_vecs.py
@@ -0,0 +1,46 @@
import numpy as np
import h5py
import argparse

def load_glove_vec(fname, vocab):
    # Read the GloVe text file and keep only vectors for words in our vocabulary.
    word_vecs = {}
    for line in open(fname, 'r'):
        d = line.split()
        word = d[0]
        if word in vocab:
            word_vecs[word] = np.array([float(x) for x in d[1:]])
    return word_vecs

def main():
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--dictionary', help="*.dict file", type=str,
                        default='data/entail.word.dict')
    parser.add_argument('--glove', help='pretrained word vectors', type=str, default='')
    parser.add_argument('--outputfile', help="output hdf5 file", type=str,
                        default='data/glove.hdf5')
    args = parser.parse_args()
    # The dictionary file has one "word index" pair per line (1-indexed).
    vocab = open(args.dictionary, "r").read().split("\n")[:-1]
    vocab = [(x.split()[0], int(x.split()[1])) for x in vocab]
    word2idx = {x[0]: x[1] for x in vocab}
    print("vocab size is " + str(len(vocab)))
    # Initialize randomly, then overwrite rows for words that have pretrained vectors.
    w2v_vecs = np.random.normal(size=(len(vocab), 300))
    w2v = load_glove_vec(args.glove, word2idx)
    print("num words in pretrained model is " + str(len(w2v)))
    for word, vec in w2v.items():
        w2v_vecs[word2idx[word] - 1] = vec
    # Normalize each row to unit length before saving.
    for i in range(len(w2v_vecs)):
        w2v_vecs[i] = w2v_vecs[i] / np.linalg.norm(w2v_vecs[i])
    with h5py.File(args.outputfile, "w") as f:
        f["word_vecs"] = np.array(w2v_vecs)

if __name__ == '__main__':
    main()
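A quick check of the saved embeddings (a minimal sketch; the path is the script's default `--outputfile`). Each row of `word_vecs` is a 300-dimensional, unit-normalized vector for dictionary index `i + 1`:
```
import h5py
import numpy as np

# Minimal sketch: verify the embedding matrix written by get_pretrain_vecs.py.
with h5py.File("data/glove.hdf5", "r") as f:
    vecs = f["word_vecs"][:]
print(vecs.shape)               # (vocab_size, 300)
print(np.linalg.norm(vecs[0]))  # rows are unit-normalized, so ~1.0
```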