from __future__ import print_function
import tensorflow as tf
import numpy as np
import sys, os, time, pickle
import argparse
import csv
from bidirsimilarity import BiSimilarity
from similarity import SimilarityModel
from attention import Attention
DATA_PATH = "../../raw_data/quora_datasets/tokenized/all.txt"
TRAIN_PATH = "../../raw_data/quora_datasets/tokenized/train.txt"
TEST_PATH = "../../raw_data/quora_datasets/tokenized/test.txt"
GLOVE_VECTORS_PATH = "../../raw_data/GloVe.6B/embeddings/glove_300.npy"
TOKENS_TO_GLOVEID_PATH = "../../raw_data/GloVe.6B/tok2id/glove_300.pickle"
JACCARD_SIMILARITY_THRESH = 0.1
class Config:
"""Holds model hyperparams and data information.
The Config class is used to store hyperparameters and dataset
information. Model objects are passed a Config() object at
instantiation.
"""
save_params = False
update_embeddings = True
# each word just indexes into glove vectors
dropout = 0.5
embed_size = 300 # word vector dimensions
output_size = 50
n_epochs = 3
max_grad_norm = 10.
lr = 0.001
n_classes = 2
# parameters that can be set from the command-line:
hidden_size = 250
batch_size = 1024
max_length = 27
distance_measure = "concat" # one of ["l2", "cosine", "custom_coef", "concat", "concat_steroids"]
cell = "gru" # one of ["rnn", "gru", "lstm"]
regularization_constant = 0.0001
# model improvement
augment_data = True
use_attention = False
bidir = False
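# Example invocation (hypothetical values; the flags are defined in the __main__ block below):
#   python run.py --cell lstm --distance_measure cosine -hs 300 -b 512 --augment_data
# Any option left unset falls back to the Config defaults above.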
def read_datafile(fstream):
"""
Reads an input stream @fstream (e.g. the output of `open(fname, 'r')`) where each
example spans three consecutive lines. The input file is formatted as follows:
QUESTION1
QUESTION2
LABEL
...
where QUESTION1 and QUESTION2 are space-delimited strings, and LABEL is an int.
@returns a list of examples [(sentence1, sentence2, label)].
@sentence1 and @sentence2 are lists of strings, @label is an int (0 or 1)
"""
examples = []
sentence1 = sentence2 = label = None
for line_num, line in enumerate(fstream):
line = line.strip()
if line_num % 3 == 0:
sentence1 = line.split()
elif line_num % 3 == 1:
sentence2 = line.split()
else:
label = int(line)
examples.append((sentence1, sentence2, label))
return examples
def load_and_preprocess_data(data_path, tokens_to_gloveID_path, max_length, augment_data=False):
"""
Reads the training and dev data sets from the given paths.
"""
print("Loading all data...")
with open(data_path, 'r') as data_file:
data = read_datafile(data_file)
print("Done. Read %d sentences" % len(data))
# now process all the input data: turn words into the glove indices
print("Converting words into glove vector indices...")
helper = ModelHelper.load(tokens_to_gloveID_path, max_length)
data_vectorized = helper.vectorize(data)
# split into train, dev, and test sets
train_indices = range(int(len(data)*0.7))
dev_indices = range(int(len(data)*0.7), int(len(data)*0.9))
test_indices = range(int(len(data)*0.9), len(data))
train_data = [data_vectorized[i] for i in train_indices]
dev_data = [data_vectorized[i] for i in dev_indices]
test_data = [data_vectorized[i] for i in test_indices]
if augment_data:
print("Augmenting data...")
helper.augment_data(train_data)
return helper, train_data, dev_data, test_data
def test_time_load_and_preprocess_data(train_path, test_path, tokens_to_gloveID_path, max_length, augment_data=False):
print("Loading all data...")
with open(train_path, 'r') as train_file:
train_data = read_datafile(train_file)
print("Training done. Read %d sentences" % len(train_data))
with open(test_path, 'r') as test_file:
test_data = read_datafile(test_file)
print("Testing done. Read %d sentences" % len(test_data))
# now process all the input data: turn words into the glove indices
print("Converting words into glove vector indices...")
helper = ModelHelper.load(tokens_to_gloveID_path, max_length)
train_data_vectorized = helper.vectorize(train_data)
test_data_vectorized = helper.vectorize(test_data)
if augment_data:
print("Augmenting data...")
helper.augment_data(train_data_vectorized)
return helper, train_data_vectorized, test_data_vectorized
class ModelHelper(object):
"""
This helper takes care of preprocessing data, constructing embeddings, etc.
"""
def __init__(self, tok2id, max_length):
self.tok2id = tok2id
self.UNKNOWN_WORD_INDEX = len(tok2id)
self.PADDING_WORD_INDEX = len(tok2id) + 1
# TODO: If we can have different amounts of padding for training vs. testing data,
# then we can just compute the max_length in the vectorize functions.
# Otherwise, we should load in max_length from some saved PKL file
self.max_length = max_length
# add additional embeddings for unknown word and padding word
def add_additional_embeddings(self, embeddings):
'''Creates additional embeddings for the unknown word and the padding word.
Stores a (2, embed_size) numpy array in self.additional_embeddings:
- 0th row is the word vector for the unknown word, the average of the first 100 known word vectors
- 1st row is the word vector for the padding word, all zeros
'''
unknown_word_vector = np.mean(embeddings[:100, :], axis=0, dtype=np.float32)  # vector for unknown word
padding_word_vector = np.zeros(embeddings.shape[1], dtype=np.float32)  # vector for padding word
self.additional_embeddings = np.stack([unknown_word_vector, padding_word_vector])
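# Presumably these two rows get stacked onto the GloVe matrix before lookup, e.g. (sketch only):
#   full_embeddings = np.concatenate([embeddings, self.additional_embeddings], axis=0)
# so that UNKNOWN_WORD_INDEX and PADDING_WORD_INDEX index the last two rows. How
# additional_embeddings is actually consumed is defined in SimilarityModel/BiSimilarity.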
def pad_or_truncate_sentence(self, sentence):
"""Ensures @sentence is of length self.max_length by padding it with
self.PADDING_WORD_INDEX at the beginning of the sentence or by truncating the
rest of the sentence.
Args:
sentence: a list of integers representing word indices
Returns:
a list of integers of length self.max_length representing the sentence
"""
initial_length = len(sentence)
if initial_length < self.max_length:
num_padding = self.max_length - initial_length
new_sentence = [self.PADDING_WORD_INDEX]*num_padding + sentence
else:
new_sentence = sentence[0:self.max_length]
return new_sentence
def vectorize_example(self, example):
s1, s2, label = example
s1_vectorized = [self.tok2id.get(word, self.UNKNOWN_WORD_INDEX) for word in s1]
s2_vectorized = [self.tok2id.get(word, self.UNKNOWN_WORD_INDEX) for word in s2]
s1_vectorized = self.pad_or_truncate_sentence(s1_vectorized)
s2_vectorized = self.pad_or_truncate_sentence(s2_vectorized)
return (s1_vectorized, s2_vectorized, label)
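# Sketch of the mapping (hypothetical ids): with tok2id = {"what": 12, "is": 7},
# the sentence ["what", "is", "foo"] becomes [12, 7, UNKNOWN_WORD_INDEX] before padding.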
def vectorize(self, data):
return [self.vectorize_example(example) for example in data]
def augment_data(self, data):
num_examples = len(data)
# augment with 50% more negative training examples
rand_rows = np.random.randint(0, high=num_examples, size=(num_examples,2))
rand_cols = np.random.randint(0, high=2, size=(num_examples,2))
neg_count = 0
order = np.arange(num_examples)
np.random.shuffle(order)
for i in order:
if rand_rows[i,0] == rand_rows[i,1]:
continue
q1 = data[rand_rows[i,0]][rand_cols[i,0]]
q2 = data[rand_rows[i,1]][rand_cols[i,1]]
if q1 == q2:
continue
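# Only keep randomly paired questions that share some vocabulary
# (Jaccard similarity >= JACCARD_SIMILARITY_THRESH), presumably so the
# added negatives are not trivially dissimilar pairs.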
if self.jaccard_similarity(q1,q2) < JACCARD_SIMILARITY_THRESH:
continue
data.append((q1,q2,0))
neg_count += 1
if neg_count >= num_examples // 2:
break
print("Added %d negative examples to the training set" % neg_count)
# augment with 25% more positive (flipped duplicate) training examples
flipped_count = 0
order = np.arange(num_examples)
np.random.shuffle(order)
for i in order:
if data[i][2] == 1:
q1 = data[i][0]
q2 = data[i][1]
data.append((q2,q1,1))
flipped_count += 1
if flipped_count >= num_examples // 4:
break
print("Added %d positive (flipped duplicate) examples to the training set" % flipped_count)
# augment with 25% more positive (exact duplicate) training examples
rand_rows = np.random.randint(0, high=num_examples, size=(num_examples,))
rand_cols = np.random.randint(0, high=2, size=(num_examples,))
exact_count = 0
order = np.arange(num_examples)
np.random.shuffle(order)
for i in order:
if data[rand_rows[i]][2] == 1:
continue
q = data[rand_rows[i]][rand_cols[i]]
data.append((q,q,1))
exact_count += 1
if exact_count >= num_examples // 4:
break
print("Added %d positive (exact duplicate) examples to the training set" % exact_count)
def jaccard_similarity(self,x,y):
intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
union_cardinality = len(set.union(*[set(x), set(y)]))
return intersection_cardinality/float(union_cardinality)
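# Worked example: jaccard_similarity(["what", "is", "ai"], ["what", "is", "ml"])
# = |{"what", "is"}| / |{"what", "is", "ai", "ml"}| = 2 / 4 = 0.5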
@classmethod
def load(cls, tokens_to_gloveID_path, max_length):
# Make sure the token-to-GloVe-id pickle file exists.
assert os.path.exists(tokens_to_gloveID_path)
with open(tokens_to_gloveID_path, 'rb') as f:
tok2id = pickle.load(f)
return cls(tok2id, max_length)
def print_options(args, config):
print("Running with options:")
for key, value in vars(args).items():
print("\t%s: %s" % (key, value if value is not None else getattr(config, str(key))))
if __name__ == "__main__":
description = "Run the similarity_model"
parser = argparse.ArgumentParser(description=description)
parser.add_argument("-a", "--augment_data", action="store_true", help="augment data with negative and positive samples")
parser.add_argument("-b", "--batch_size", type=int, required=False, help="number of examples for each minibatch")
parser.add_argument("-c", "--cell", required=False, choices=["rnn", "gru", "lstm", "birnn", "bigru", "bilstm"], help="model cell type")
parser.add_argument("-d", "--distance_measure", required=False, choices=["l2", "cosine", "custom_coef", "concat", "concat_steroids"], help="distance measure")
parser.add_argument("-r", "--regularization_constant", type=float, required=False, help="regularization constant")
parser.add_argument("-hs", "--hidden_size", type=int, required=False, help="neural net hidden size")
parser.add_argument("-ml", "--max_length", type=int, required=False, help="maximum length of sentences")
parser.add_argument("-s", "--save_params", action="store_true", help="save trained variables to a checkpoint file")
parser.add_argument("-bi", "--bidir", action="store_true", help="save usage of bidirectional networks")
parser.add_argument("-at", "--attention", action="store_true", help="save usage of attention layer")
args = parser.parse_args()
config = Config()
config.augment_data = args.augment_data
config.save_params = args.save_params
if args.batch_size is not None:
config.batch_size = args.batch_size
if args.cell is not None:
config.cell = args.cell
if args.distance_measure is not None:
config.distance_measure = args.distance_measure
if args.regularization_constant is not None:
config.regularization_constant = args.regularization_constant
if args.hidden_size is not None:
config.hidden_size = args.hidden_size
if args.max_length is not None:
config.max_length = args.max_length
config.bidir = args.bidir
config.use_attention = args.attention
accuracy_results = []
f1_results = []
print("Preparing data...")
helper, train, dev, test = load_and_preprocess_data(DATA_PATH, TOKENS_TO_GLOVEID_PATH, config.max_length, config.augment_data)
# helper.max_length = config.max_length
print("Load embeddings...")
embeddings = np.load(GLOVE_VECTORS_PATH, mmap_mode='r')
config.embed_size = embeddings.shape[1]
# append unknown word and padding word vectors
helper.add_additional_embeddings(embeddings)
with tf.Graph().as_default():
print("Building model...")
start = time.time()
with tf.device('/gpu:2'):
if not config.bidir:
model = SimilarityModel(helper, config, embeddings)
else:
model = BiSimilarity(helper, config, embeddings)
print("took %.2f seconds" % (time.time() - start))
init = tf.global_variables_initializer()
saver = None
if config.save_params:
saver = tf.train.Saver()
# sess_config = tf.ConfigProto(allow_soft_placement=True)
# sess_config.gpu_options.allow_growth = True
# sess_config.gpu_options.per_process_gpu_memory_fraction = 0.01
# session_config=tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.2))
with tf.Session() as session:
session.run(init)
best_dev_accuracy, dev_f1, test_accuracy, test_f1 = model.fit(session, saver, train, dev, test)
print("best dev accuracy: %f, dev f1: %f, test accuracy: %f, test f1: %f" % (best_dev_accuracy, dev_f1, test_accuracy, test_f1))
results_file_name = "model_results.csv"
if args.bidir:
results_file_name = "bidir_" + results_file_name
with open(results_file_name, 'a') as f:
fieldnames = ["cell", "distance_measure", "augment_data", "regularization_constant", "hidden_size", "max_length", "best_dev_accuracy", "dev_f1", "test_accuracy", "test_f1"]
writer = csv.DictWriter(f, fieldnames=fieldnames)
hyperparams_and_results_dict = {
"cell": config.cell,
"distance_measure": config.distance_measure,
"augment_data": config.augment_data,
"regularization_constant": config.regularization_constant,
"hidden_size": config.hidden_size,
"max_length": config.max_length,
"best_dev_accuracy": best_dev_accuracy,
"dev_f1": dev_f1,
"test_accuracy": test_accuracy,
"test_f1": test_f1
}
writer.writerow(hyperparams_and_results_dict)
accuracy_results.append((config.bidir, config.use_attention, best_dev_accuracy))
f1_results.append((config.bidir, config.use_attention, dev_f1))
print("bidir: %s, attention: %s, best dev accuracy: %f, dev f1: %f" % (config.bidir, config.use_attention, best_dev_accuracy, dev_f1))
print("accuracy results:")
print(accuracy_results)
print("f1 results:")
print(f1_results)
'''
print("Preparing data...")
# helper, train, dev, test = load_and_preprocess_data(DATA_PATH, DATA_SPLIT_INDICES_PATH, TOKENS_TO_GLOVEID_PATH, config.max_length, config.augment_data)
helper, train, test = test_time_load_and_preprocess_data(TRAIN_PATH, TEST_PATH, TOKENS_TO_GLOVEID_PATH, config.max_length, config.augment_data)
print("Load embeddings...")
embeddings = np.load(GLOVE_VECTORS_PATH, mmap_mode='r')
config.embed_size = embeddings.shape[1]
# append unknown word and padding word vectors
helper.add_additional_embeddings(embeddings)
with tf.Graph().as_default():
print("Building model...")
start = time.time()
if config.bidir == False:
model = SimilarityModel(helper, config, embeddings)
else:
model = BiSimilarity(helper, config, embeddings)
print("took %.2f seconds" % (time.time() - start))
print_options(args, config)
init = tf.global_variables_initializer()
# init = tf.initialize_all_variables()  # deprecated alias
saver = None
if config.save_params:
saver = tf.train.Saver()
sess_config = tf.ConfigProto(allow_soft_placement=True)
sess_config.gpu_options.allow_growth = True
# sess_config.gpu_options.per_process_gpu_memory_fraction = 0.01
# start a TensorFlow session, initialize all variables, then run model
with tf.Session(config=sess_config) as session:
session.run(init)
model.test_time_fit(session, saver, train)
model.test_time_predict(session, test)
'''