import pickle
import os.path
import operator
import cornell_loading
import numpy as np
from math import ceil
from config import Config as conf
from random import shuffle
from shutil import copyfile

START_TOKEN = "<bos>"
END_TOKEN = "<eos>"
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"
START_TOKEN_INDEX = 0
END_TOKEN_INDEX = 1
UNK_TOKEN_INDEX = 2
PAD_TOKEN_INDEX = 3

TRAINING_FILEPATH = 'data/Training_Shuffled_Dataset.txt'
TRAINING_TUPLES_FILEPATH = 'Training_Shuffled_Dataset_tuples.txt'
VALIDATION_FILEPATH = 'data/Validation_Shuffled_Dataset.txt'
VALIDATION_TUPLES_FILEPATH = 'Validation_Shuffled_Dataset_tuples.txt'
VOCABULARY_FILEPATH = 'pickled_vars/vocabulary.p'
W2I_FILEPATH = 'pickled_vars/word_2_index.p'
I2W_FILEPATH = 'pickled_vars/index_2_word.p'
ENCODER_INPUT_FILEPATH = 'pickled_vars/encoder_inputs.p'
DECODER_INPUT_FILEPATH = 'pickled_vars/decoder_inputs.p'

###
# Creates an output file by transforming the original triples file into a
# tuples file, preserving the order of the dialogs: for a dialog consisting of
# sent1 -- sent2 -- sent3, the generated tuple "sent2 -- sent3" directly
# follows "sent1 -- sent2".
###
def triples_to_tuples(input_filepath, output_filepath):
    with open(input_filepath, 'r') as f, open(output_filepath, 'w') as f1:
        for line in f:
            triples = line.strip().split('\t')
            f1.write("{}\t{}\n".format(triples[0], triples[1]))
            f1.write("{}\t{}\n".format(triples[1], triples[2]))
    if input_filepath == TRAINING_FILEPATH and conf.use_CORNELL_for_training:
        merge(output_filepath, conf.CORNELL_TUPLES_PATH, conf.both_datasets_tuples_filepath)
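
###
# Illustrative example (hypothetical sentences): the triple line
# "hi .\thello !\thow are you ?" produces the two tuple lines
# "hi .\thello !" and "hello !\thow are you ?".
###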

###
# Appends the Cornell movie-dialog tuples to the base dataset tuples, writing
# the merged corpus to output_filepath.
###
def merge(base_dataset_tuples_filepath, cornell_tuples_filepath, output_filepath):
    if conf.use_CORNELL_for_training:
        numlines = 0
        f = open(base_dataset_tuples_filepath, 'r')
        f1 = open(output_filepath, 'w')
        print("Merging base dataset with Cornell: loading base dataset..")
        for line in f:
            f1.write(line)
            numlines = numlines + 1
        f.close()
        print("\tNumber of tuples loaded from base dataset: {}".format(numlines))
        print("Merging base dataset with Cornell: loading Cornell dataset..")
        if not os.path.isfile(cornell_tuples_filepath):
            cornell_loading.create_Cornell_tuples(conf.CORNELL_lines_path, conf.CORNELL_conversations_path, conf.CORNELL_TUPLES_PATH)
        f2 = open(cornell_tuples_filepath, 'r')
        for line in f2:
            couples = line.strip().split('\t')
            if len(couples) > 2:
                # Skip empty fields after the first one; the bounds check comes
                # first so that couples[k] is never accessed out of range.
                k = 1
                while k < len(couples) - 1 and len(couples[k]) == 0:
                    k = k + 1
                f1.write("{}\t{}\n".format(couples[0], couples[k]))
                numlines = numlines + 1
            else:
                f1.write(line)
                numlines = numlines + 1
        f2.close()
        f1.close()
        print("\tTotal number of dumped lines: {}".format(numlines))

###
# Counts the number of unique whitespace-separated tokens in a file.
###
def count_unique_tokens(filename):
    s = set()
    with open(filename, 'r') as f:
        for line in f:
            for word in line.split():
                s.add(word)
    return len(s)

###
# Loads the pickled vocabulary dictionary and returns it.
# This fails if the dictionary does not exist yet.
###
def get_vocabulary():
    return pickle.load(open(VOCABULARY_FILEPATH, 'rb'))

###
# Loads the vocabulary if it was pickled before; otherwise builds it from the
# training tuples, keeps the conf.vocabulary_size most frequent words
# (including the four special tokens), and pickles it.
###
def get_or_create_vocabulary():
    print("Getting vocabulary..")
    try:
        vocabulary = get_vocabulary()
    except (FileNotFoundError, pickle.UnpicklingError):
        print("Building vocabulary..")
        vocabulary = {}
        if conf.use_CORNELL_for_training:
            train_file = open(conf.both_datasets_tuples_filepath)
        else:
            train_file = open(TRAINING_TUPLES_FILEPATH)
        for line in train_file:
            conversation = line.strip().split()
            for word in conversation:
                vocabulary[word] = vocabulary.get(word, 0) + 1
        sorted_vocab = sorted(vocabulary.items(), key=operator.itemgetter(1), reverse=True)
        print("Total length of vocabulary: {}".format(len(sorted_vocab)))
        sorted_vocab = sorted_vocab[:conf.vocabulary_size - 4]  # reserve 4 slots for the special tokens
        vocabulary = dict(sorted_vocab)
        vocabulary[START_TOKEN] = 1
        vocabulary[END_TOKEN] = 1
        vocabulary[UNK_TOKEN] = 1
        vocabulary[PAD_TOKEN] = 1
        pickle.dump(vocabulary, open(VOCABULARY_FILEPATH, 'wb'))
        train_file.close()
        print("Vocabulary pickled!")
    return vocabulary
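
###
# Illustrative shape of the result (hypothetical counts): the vocabulary maps
# each kept word to its corpus frequency, with the special tokens added at
# count 1, e.g. {"the": 51234, "you": 40021, "<bos>": 1, "<eos>": 1,
# "<unk>": 1, "<pad>": 1}.
###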

###
# Loads the pickled word_2_index and index_2_word dictionaries and returns them.
# This fails if the dictionaries do not exist yet.
###
def get_w2i_i2w_dicts():
    return pickle.load(open(W2I_FILEPATH, 'rb')), pickle.load(open(I2W_FILEPATH, 'rb'))

###
# Loads the word_2_index and index_2_word dictionaries if they exist;
# otherwise builds them from the training data, pickles them, and returns them.
###
def get_or_create_dicts_from_train_data():
    print("Getting word2index and index2word..")
    try:
        return get_w2i_i2w_dicts()
    except (FileNotFoundError, pickle.UnpicklingError):
        print("Building word2index and index2word")
        filename = TRAINING_TUPLES_FILEPATH
        if not os.path.isfile(filename):
            triples_to_tuples(TRAINING_FILEPATH, filename)
        if conf.use_CORNELL_for_training:
            f = open(conf.both_datasets_tuples_filepath, 'r')
        else:
            f = open(filename, 'r')
        word_2_index = {START_TOKEN: START_TOKEN_INDEX, END_TOKEN: END_TOKEN_INDEX, UNK_TOKEN: UNK_TOKEN_INDEX, PAD_TOKEN: PAD_TOKEN_INDEX}
        index_2_word = {START_TOKEN_INDEX: START_TOKEN, END_TOKEN_INDEX: END_TOKEN, UNK_TOKEN_INDEX: UNK_TOKEN, PAD_TOKEN_INDEX: PAD_TOKEN}
        vocabulary = get_or_create_vocabulary()
        index = 4  # the first 4 indices are already occupied by the special tokens
        for line in f:
            conversation = line.strip().split()
            for word in conversation:
                if word in vocabulary and word not in word_2_index:
                    word_2_index[word] = index
                    index_2_word[index] = word
                    index += 1
        f.close()
        pickle.dump(word_2_index, open(W2I_FILEPATH, 'wb'))
        pickle.dump(index_2_word, open(I2W_FILEPATH, 'wb'))
        print("word2index and index2word pickled!")
        return word_2_index, index_2_word

##
# Applies the word-2-index conversion to a corpus of tuples.
#
# interactionStringList: List of interaction strings. Each interaction consists
#                        of two sentences delimited by '\t'.
# vocabulary:            The vocabulary used to create the w2i dictionary.
# w2i_dict:              A dictionary having words as keys and the corresponding
#                        index as a value.
##
def apply_w2i_to_corpus_tuples(interactionStringList, vocabulary, w2i_dict):
    def apply_w2i_to_word(word):
        if word in vocabulary:
            return w2i_dict[word]
        else:
            return w2i_dict[UNK_TOKEN]

    def apply_w2i_to_sentence(sentence_string):
        return list(map(apply_w2i_to_word, sentence_string.split()))

    tuples = map(lambda line: line.strip().split('\t'), interactionStringList)
    input_sentences, answer_sentences = zip(*tuples)
    encoder_inputs = list(map(apply_w2i_to_sentence, input_sentences))
    decoder_inputs = list(map(apply_w2i_to_sentence, answer_sentences))
    return encoder_inputs, decoder_inputs
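
###
# Illustrative example (hypothetical indices): with w2i_dict containing
# {"hello": 7, "how": 12, "are": 13, "you": 14}, the interaction string
# "hello\thow are you" becomes encoder input [7] and decoder input
# [12, 13, 14]; any word missing from the vocabulary maps to the <unk> index.
###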

###
# Returns the data by type ('train' or 'eval'), together with the
# word_2_index and index_2_word dicts.
###
def get_data_by_type(t):
    if t == 'train':
        filename = TRAINING_TUPLES_FILEPATH
        if conf.use_CORNELL_for_training:
            filename = conf.both_datasets_tuples_filepath
    elif t == 'eval':
        filename = VALIDATION_TUPLES_FILEPATH
        print("Reading values from: {}".format(filename))
        if not os.path.isfile(filename):
            triples_to_tuples(VALIDATION_FILEPATH, filename)
    else:
        print('Type must be "train" or "eval".')
        return
    word_2_index, index_2_word = get_or_create_dicts_from_train_data()
    vocabulary = get_or_create_vocabulary()
    try:
        print("Getting encoder and decoder inputs..")
        # NOTE: the pickled inputs are not keyed by type, so these cache files
        # must be deleted when switching between 'train' and 'eval'.
        encoder_inputs = pickle.load(open(ENCODER_INPUT_FILEPATH, 'rb'))
        decoder_inputs = pickle.load(open(DECODER_INPUT_FILEPATH, 'rb'))
    except (FileNotFoundError, pickle.UnpicklingError):
        print("Building encoder and decoder inputs..")
        with open(filename, 'r') as tuples_input:
            lines = tuples_input.readlines()
        encoder_inputs, decoder_inputs = apply_w2i_to_corpus_tuples(lines, vocabulary, word_2_index)
        pickle.dump(encoder_inputs, open(ENCODER_INPUT_FILEPATH, 'wb'))
        pickle.dump(decoder_inputs, open(DECODER_INPUT_FILEPATH, 'wb'))
        print("encoder and decoder inputs pickled!")
    return encoder_inputs, decoder_inputs, word_2_index, index_2_word

###
# Groups (encoder, decoder) input pairs into padded batches of similar
# sequence length and yields them one by one. Encoder sentences are reversed
# and padded; decoder sentences get <bos>/<eos> added and are padded.
###
def bucket_by_sequence_length(enc_inputs, dec_inputs, batch_size, sort_data=True, shuffle_batches=True, filter_long_sent=True):
    assert len(enc_inputs) == len(dec_inputs)
    enc_dec = list(zip(enc_inputs, dec_inputs))
    if filter_long_sent:
        enc_dec = list(filter(lambda tup: len(tup[0]) < conf.input_sentence_max_length and len(tup[1]) < conf.input_sentence_max_length, enc_dec))
    if sort_data:
        enc_dec = sorted(enc_dec, key=lambda inputs: (len(inputs[0]), len(inputs[1])))
    # else we keep the data unsorted
    # Unpack after filtering/sorting so both options take effect on the batches.
    enc_inputs, dec_inputs = zip(*enc_dec)
    assert len(enc_inputs) == len(dec_inputs)
    num_batches = ceil(len(enc_inputs) / batch_size)
    all_batches = []
    for batch_num in range(num_batches):
        encoder_sequence_lengths = [len(sentence)
                                    for sentence
                                    in enc_inputs[batch_num*batch_size:(batch_num+1)*batch_size]]
        max_len_enc = max(encoder_sequence_lengths)
        # Encoder sentences are fed in reverse order and padded to the batch maximum.
        encoder_batch = [list(reversed(sentence)) + ([PAD_TOKEN_INDEX] * (max_len_enc - encoder_sequence_lengths[i]))
                         for i, sentence
                         in enumerate(enc_inputs[batch_num*batch_size:(batch_num+1)*batch_size])]
        encoder_batch = np.array(encoder_batch)
        # The +1 accounts for the <bos>/<eos> token added below.
        decoder_sequence_lengths = [len(sentence) + 1
                                    for sentence
                                    in dec_inputs[batch_num*batch_size:(batch_num+1)*batch_size]]
        max_len_dec = max(decoder_sequence_lengths)
        decoder_inputs_batch = [[START_TOKEN_INDEX] + sentence + ([PAD_TOKEN_INDEX] * (max_len_dec - decoder_sequence_lengths[i]))
                                for i, sentence
                                in enumerate(dec_inputs[batch_num*batch_size:(batch_num+1)*batch_size])]
        decoder_inputs_batch = np.array(decoder_inputs_batch)
        decoder_targets_batch = [sentence + [END_TOKEN_INDEX] + ([PAD_TOKEN_INDEX] * (max_len_dec - decoder_sequence_lengths[i]))
                                 for i, sentence
                                 in enumerate(dec_inputs[batch_num*batch_size:(batch_num+1)*batch_size])]
        decoder_targets_batch = np.array(decoder_targets_batch)
        all_batches.append((encoder_batch, encoder_sequence_lengths, decoder_inputs_batch, decoder_targets_batch, decoder_sequence_lengths))
    if shuffle_batches:
        shuffle(all_batches)
    for i in range(num_batches):
        yield all_batches[i]
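
###
# Illustrative batch layout (hypothetical ids, batch_size=2): for encoder
# inputs [[5, 6, 7], [8, 9]] and decoder inputs [[10, 11], [12]], the yielded
# batch holds
#     encoder_batch          [[7, 6, 5], [9, 8, 3]]     (reversed, <pad>=3)
#     decoder_inputs_batch   [[0, 10, 11], [0, 12, 3]]  (<bos>=0 prepended)
#     decoder_targets_batch  [[10, 11, 1], [12, 1, 3]]  (<eos>=1 appended)
###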

###
# Copies config.py into the given directory.
###
def copy_config(to):
    copyfile("./config.py", os.path.join(to, "config.py"))

###
# Truncates a generated sentence (an array of token indices) just after the
# first <eos> token; if there is none, truncates before the first <pad> token;
# otherwise returns the sentence unchanged.
###
def truncate_sentence(sent):
    endidxArr = np.where(sent == END_TOKEN_INDEX)[0]
    if endidxArr.size != 0:
        return sent[:endidxArr[0] + 1]
    else:
        padidxArr = np.where(sent == PAD_TOKEN_INDEX)[0]
        if padidxArr.size != 0:
            return sent[:padidxArr[0]]
        else:
            return sent


def truncate_after_eos(sentence_list):
    return list(map(truncate_sentence, sentence_list))

def undo_input_reversal(reversed_sentences_list):
    # reversed() would return one-shot iterators; slicing reverses both lists
    # and numpy arrays and yields a reusable sequence.
    return list(map(lambda sent: sent[::-1], reversed_sentences_list))
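
# A minimal smoke test, assuming the data files referenced above exist and that
# config.py defines the attributes used here. It illustrates the typical call
# sequence; it is not part of the training pipeline.
if __name__ == "__main__":
    enc, dec, w2i, i2w = get_data_by_type('train')
    print("Loaded {} training pairs".format(len(enc)))
    batches = bucket_by_sequence_length(enc, dec, batch_size=32)
    encoder_batch, enc_lens, dec_in, dec_targets, dec_lens = next(batches)
    print("First batch shapes: enc {}, dec_in {}, dec_targets {}".format(
        encoder_batch.shape, dec_in.shape, dec_targets.shape))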