# weakdb.py
import json
import os
import pickle

import numpy
# TODO:
# visualize extensions due to (data point, LF, data point+LF)
# sort by data points that added the largest number of extensions (to train or to val????)
# ask dan about other parameters. e.g., expected class balance?
# =====================
# sort by most similar to given data point?
# visualize clusters of data points / highlight a slice of data?
# =====================
# functionality for creating a slice manually? (location of inaccuracies? location of disagreements?)
# HACK(kayvonf): convert from fp32 to fp64 in order to serialize to JSON,
# since Python's json library cannot serialize numpy.float32 values.
def float32_to_float64(x):
    return [numpy.float64(item) for item in x]
# HACK(danfu): if the inputs are NumPy arrays, they need to be turned into flat lists
def is_numpy(arr):
    return isinstance(arr, numpy.ndarray)
def listify_numpy(arr):
if is_numpy(arr):
return arr.flatten().tolist()
else:
return arr
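# Illustrative behavior: listify_numpy flattens NumPy arrays row-major and passes
# other inputs through unchanged.
#   listify_numpy(numpy.arange(6).reshape(2, 3))  # -> [0, 1, 2, 3, 4, 5]
#   listify_numpy([1, 2, 3])                      # -> [1, 2, 3]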
class WeakDB:
DATAPOINT_TYPE_UNKNOWN = "unknown"
DATAPOINT_TYPE_TEXT = "text"
DATAPOINT_TYPE_IMAGE_URL = "image_url"
DATAPOINT_TYPE_IMAGE_URL_SEQ = "image_url_seq"
CLOSEST_LIST_SIZE = 20
SAMPLE_LIST_SIZE = 50
def __init__(self, num_train=0, num_val=0, num_lf=0):
self.dump_name = "" # name of the debug dump
self.description = "" # human readable description of the dump
self.num_lf = num_lf
self.lf_names = [] # human-readable names of labeling functions
self.num_train = num_train # number of datapoints in the training set
self.num_val = num_val # number of datapoints in the val set
self.lf_matrix = [] # (num_train+num_val) x num_lf matrix (LF results before extension)
self.prob_labels = [] # label model output (before extension)
self.extended_lf_matrix = [] # (num_train+num_val) x num_lf matrix (LF results after extension)
self.extended_prob_labels = [] # label model output (after extension)
# description of the datapoints (for introspection during debugging)
self.datapoint_type = WeakDB.DATAPOINT_TYPE_UNKNOWN
self.datapoints = [] # holds either a text string or an image URL (for preview in viz)
self.ground_truth_labels = [] # ground truth labels
self.sorted_dists = [] # list of lists of k-nn indices (in closest to farthest order)
# set default LF names
for i in range(self.num_lf):
            self.lf_names.append("LF%02d" % i)
def set_name(self, dump_name):
assert len(dump_name) > 0
self.dump_name = dump_name
def set_description(self, description):
self.description = description
def set_lf_names(self, lf_names):
assert len(lf_names) == self.num_lf
self.lf_names = lf_names
def set_lf_matrix(self, lf_matrix):
lf_matrix = listify_numpy(lf_matrix)
assert len(lf_matrix) == (self.num_train + self.num_val) * self.num_lf
self.lf_matrix = lf_matrix
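    # Example (hypothetical sizes): with num_train=3, num_val=1, and num_lf=2,
    # set_lf_matrix expects a flat row-major list of length (3 + 1) * 2 = 8,
    # where consecutive pairs hold the two LF votes for one datapoint:
    #   db.set_lf_matrix([1, 0,  0, 0,  1, 1,  0, 1])
    # NumPy inputs of any shape are flattened automatically by listify_numpy.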
def set_extended_lf_matrix(self, extended_lf_matrix):
extended_lf_matrix = listify_numpy(extended_lf_matrix)
assert len(extended_lf_matrix) == (self.num_train + self.num_val) * self.num_lf
self.extended_lf_matrix = extended_lf_matrix
def set_prob_labels(self, prob_labels):
prob_labels = listify_numpy(prob_labels)
assert len(prob_labels) == (self.num_train + self.num_val)
self.prob_labels = prob_labels
def set_extended_prob_labels(self, extended_prob_labels):
extended_prob_labels = listify_numpy(extended_prob_labels)
assert len(extended_prob_labels) == (self.num_train + self.num_val)
self.extended_prob_labels = extended_prob_labels
def set_ground_truth(self, ground_truth_labels):
ground_truth_labels = listify_numpy(ground_truth_labels)
assert len(ground_truth_labels) == (self.num_train + self.num_val)
self.ground_truth_labels = ground_truth_labels
def set_datapoints(self, datapoint_type, datapoints):
datapoints = listify_numpy(datapoints)
assert len(datapoints) == (self.num_train + self.num_val)
valid_types = [
WeakDB.DATAPOINT_TYPE_TEXT,
WeakDB.DATAPOINT_TYPE_IMAGE_URL
]
assert datapoint_type in valid_types
self.datapoint_type = datapoint_type
self.datapoints = datapoints
    # the similarity matrix should have (num_train + num_val) * num_train elements,
    # where a higher value means more similar
    # if set_closest is True, keep only the CLOSEST_LIST_SIZE nearest neighbors per row;
    # if set_sampled is True, keep a subsample of items; at most one of
    # (set_closest, set_sampled) may be True
def set_similarity_matrix(self, similarity_matrix, set_closest = False, set_sampled = False):
assert(not (set_closest and set_sampled))
if is_numpy(similarity_matrix):
# this is already in matrix form
if len(similarity_matrix.shape) == 2:
assert(similarity_matrix.shape == (self.num_train + self.num_val, self.num_train))
else:
assert(len(similarity_matrix) == ((self.num_train + self.num_val) * self.num_train))
sim_matrix_numpy = similarity_matrix.reshape(
(self.num_train + self.num_val, self.num_train))
        else:
            assert len(similarity_matrix) == ((self.num_train + self.num_val) * self.num_train)
            sim_matrix_numpy = numpy.array(similarity_matrix).reshape(
                (self.num_train + self.num_val, self.num_train))
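        # demote each train datapoint's self-similarity to the matrix minimum so a
        # datapoint never ranks as its own nearest neighbor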
numpy.fill_diagonal(sim_matrix_numpy, numpy.min(sim_matrix_numpy))
# get the ranking
ranking = numpy.flip(numpy.argsort(sim_matrix_numpy, axis = 1), axis = 1)
if set_closest:
self.sorted_dists = ranking[:, :WeakDB.CLOSEST_LIST_SIZE].tolist()
elif set_sampled:
            sample_skip = max(1, self.num_train // WeakDB.SAMPLE_LIST_SIZE)
self.sorted_dists = ranking[:, ::sample_skip].tolist()
else:
self.sorted_dists = ranking.tolist()
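    # Usage sketch (hypothetical variables): build one similarity row per datapoint
    # (train + val) against every train datapoint, then keep only the top
    # CLOSEST_LIST_SIZE neighbors per row:
    #   sims = embeddings_all @ embeddings_train.T   # shape (num_train+num_val, num_train)
    #   db.set_similarity_matrix(sims, set_closest=True)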
# path generation for output json files (these are the files lfviz expects to load)
def get_dump_info_json_filename(self, target_dir):
filename = "%s.json" % (self.dump_name)
return os.path.join(target_dir, filename)
def get_lf_matrix_json_filename(self, target_dir, ext_type):
filename = "%s_lfmatrix_%s.json" % (self.dump_name, ext_type)
return os.path.join(target_dir, filename)
def get_prob_labels_json_filename(self, target_dir, ext_type):
filename = "%s_prob_labels_%s.json" % (self.dump_name, ext_type)
return os.path.join(target_dir, filename)
def get_ground_truth_labels_json_filename(self, target_dir):
filename = "%s_ground_truth_labels.json" % self.dump_name
return os.path.join(target_dir, filename)
def get_datapoints_json_filename(self, target_dir):
filename = "%s_datapoints.json" % self.dump_name
return os.path.join(target_dir, filename)
def get_similarity_json_filename(self, target_dir):
filename = "%s_sorted_dists.json" % self.dump_name
return os.path.join(target_dir, filename)
# FIXME(kayvonf): remove these once we sync with Linden
# input pickle files (from Linden)
def get_linden_lf_matrix_pkl_filename(self, linden_src_dir, split):
filename = "%s_%s.pkl" % (self.dump_name, split)
return os.path.join(linden_src_dir, filename)
def get_linden_lf_threshold_pkl_filename(self, linden_src_dir):
filename = "%s_lf_thresholds.pkl" % self.dump_name
return os.path.join(linden_src_dir, filename)
def get_linden_similarity_matrix_pkl_filename(self, linden_src_dir, split):
filename = "%s_%s_dists.pkl" % (self.dump_name, split)
return os.path.join(linden_src_dir, filename)
def get_linden_image_paths_pkl_filename(self, linden_src_dir, split):
filename = "%s_%s_paths.pkl" % (self.dump_name, split)
return os.path.join(linden_src_dir, filename)
    # helper: this method does the heavy lifting of turning a row of an input
    # similarity matrix (higher value = more similar) into a row of datapoint
    # indices sorted from closest to farthest
def process_dists_row(self, dists_row, query_idx=-1):
pairs = [(x, dists_row[x]) for x in range(len(dists_row))]
pairs.sort(reverse=True, key=(lambda x : x[1]))
if query_idx >= 0 and query_idx != pairs[0][0]:
print("WARNING: closest datapoint to datapoint %d is datapoint %d (expected %d). Often a sign of duplicate data." % (query_idx, pairs[0][0], query_idx))
# Build three lists:
# The first is a list of the top N closest items to the query.
# The second is a sampling of SAMPLE_LIST_SIZE elements
# The third is a rank ordering of the datapoints (closest to farthest)
closest_n = []
sampled_n = []
ranking = []
sample_skip = max(1, int(len(dists_row) / WeakDB.SAMPLE_LIST_SIZE))
num_processed = 0
for (idx,dist) in pairs:
# throw out the nearest neighbor datapoint if it is the same as the query datapoint
if query_idx >= 0 and query_idx == idx:
continue
ranking.append(idx)
# store in closest-N list
if len(closest_n) < WeakDB.CLOSEST_LIST_SIZE:
closest_n.append((idx, dist))
# store in sorted samples list
if len(sampled_n) < WeakDB.SAMPLE_LIST_SIZE and num_processed % sample_skip == 0:
sampled_n.append((idx, dist))
num_processed = num_processed + 1
        # FIXME(kayvonf): right now just returning the rank ordering until I figure out what
# I'd do with the additional information in the visualizer
#return {"ranking" : ranking, "closest" : closest_n, "sampling" : sampled_n}
return ranking
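    # Worked example (illustrative): for query_idx=0 and the row [1.0, 0.2, 0.9, 0.5],
    # the pairs sort to [(0, 1.0), (2, 0.9), (3, 0.5), (1, 0.2)]; the self-match
    # (index 0) is dropped, so the returned ranking is [2, 3, 1].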
    # FIXME(kayvonf): This is a function that I expect to go away soon.
    # It loads Linden's pkl files for the tennis task. It should be deprecated, and Linden
    # should just call WeakDB methods directly to initialize a WeakDB structure.
def load_linden(self, linden_src_dir, dump_name):
LINDEN_TRAINING_SET = "train"
LINDEN_VAL_SET = "val"
self.dump_name = dump_name
        # HACK(kayvonf): hardcoded since this whole function is just a stopgap for Linden's data dumps
self.description = "Backhand slice detection (true=slice backhand, false=topspin backhand)"
self.lf_names = ["Ball velocity 1", "Ball velocity 2", "Wrist 1", "Wrist 2", "Human"]
        # Linden's pkl format is a table with 2*num_lf + 3 columns
# -- The first num_lf columns are LF output
# -- The next num_lf columns are extended LF output
# -- The next column is LM output on original LF output
# -- The next column is LM output on extended LF output
# -- The last column is the ground truth label
# The code below converts Linden's input into a format where there are two tables.
# One for the LF matrix prior to extension, and another for the LF matrix after the extension.
# These tables contain *both* training and val set data.
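        # For example, with num_lf = 5 (as above), a single row is laid out as:
        #   columns 0-4:  LF outputs
        #   columns 5-9:  extended LF outputs
        #   column 10:    LM output on original LFs
        #   column 11:    LM output on extended LFs
        #   column 12:    ground truth label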
lf_train_data = pickle.load( open(self.get_linden_lf_matrix_pkl_filename(linden_src_dir, LINDEN_TRAINING_SET), "rb") )
lf_val_data = pickle.load( open(self.get_linden_lf_matrix_pkl_filename(linden_src_dir, LINDEN_VAL_SET), "rb") )
self.num_train = len(lf_train_data)
self.num_val = len(lf_val_data)
self.num_lf = int((len(lf_train_data[0]) - 3) / 2)
print("Initializing WeakDB from Linden's files... %s" % self.dump_name)
print(" Source PKL dir: %s" % linden_src_dir)
print(" Num LF: %d" % self.num_lf)
print(" Num train: %d" % self.num_train)
print(" Num val: %d" % self.num_val)
# convert pre-extension results
self.lf_matrix = []
self.prob_labels = []
for row in lf_train_data:
for lf in range(self.num_lf):
self.lf_matrix.append(row[lf])
self.prob_labels.append(row[2*self.num_lf])
for row in lf_val_data:
for lf in range(self.num_lf):
self.lf_matrix.append(row[lf])
self.prob_labels.append(row[2*self.num_lf])
# convert post-extension results
self.extended_lf_matrix = []
self.extended_prob_labels = []
for row in lf_train_data:
for lf in range(self.num_lf):
self.extended_lf_matrix.append(row[self.num_lf + lf])
self.extended_prob_labels.append(row[2*self.num_lf+1])
for row in lf_val_data:
for lf in range(self.num_lf):
self.extended_lf_matrix.append(row[self.num_lf + lf])
self.extended_prob_labels.append(row[2*self.num_lf+1])
# process the ground truth labels
self.ground_truth_labels = []
for row in lf_train_data:
self.ground_truth_labels.append(row[2*self.num_lf+2])
for row in lf_val_data:
self.ground_truth_labels.append(row[2*self.num_lf+2])
# now process the datapoint descriptors
self.datapoint_type = WeakDB.DATAPOINT_TYPE_IMAGE_URL_SEQ
self.datapoints = []
datapoints_train = pickle.load(open(self.get_linden_image_paths_pkl_filename(linden_src_dir, LINDEN_TRAINING_SET), "rb"))
datapoints_val = pickle.load(open(self.get_linden_image_paths_pkl_filename(linden_src_dir, LINDEN_VAL_SET), "rb"))
for row in datapoints_train:
base_filename = row[0:-4]
urls = []
for i in range(3):
urls.append("%s-%d.jpg" % (base_filename, i))
self.datapoints.append(urls)
for row in datapoints_val:
base_filename = row[0:-4]
urls = []
for i in range(3):
urls.append("%s-%d.jpg" % (base_filename, i))
self.datapoints.append(urls)
# process the distance matrices
self.sorted_dists = []
train_similarity_matrix = pickle.load(open(self.get_linden_similarity_matrix_pkl_filename(linden_src_dir, LINDEN_TRAINING_SET), "rb"))
val_similarity_matrix = pickle.load(open(self.get_linden_similarity_matrix_pkl_filename(linden_src_dir, LINDEN_VAL_SET), "rb"))
cur_row_idx = 0
for row in train_similarity_matrix:
self.sorted_dists.append(self.process_dists_row(float32_to_float64(row), query_idx=cur_row_idx))
            cur_row_idx += 1
for row in val_similarity_matrix:
self.sorted_dists.append(self.process_dists_row(float32_to_float64(row)))
# writes the weakdb dump to a collection of json files that can be read by lfviz
def save_json(self, target_dir):
has_extended_data = (len(self.extended_lf_matrix) != 0)
has_similarity_data = (len(self.sorted_dists) != 0)
has_ground_truth = (len(self.ground_truth_labels) != 0)
dump_info = { "name" : self.dump_name,
"description" : self.description,
"num_lf" : self.num_lf,
"num_train" : self.num_train,
"num_val" : self.num_val,
"datatype" : self.datapoint_type,
"lf_names" : self.lf_names,
"has_extended_data" : has_extended_data,
"has_similarity_data" : has_similarity_data,
"has_ground_truth" : has_ground_truth
}
with open(self.get_dump_info_json_filename(target_dir), "wt") as f:
f.write(json.dumps(dump_info))
assert len(self.lf_matrix) > 0
with open(self.get_lf_matrix_json_filename(target_dir, "noext"), "wt") as f:
f.write(json.dumps(self.lf_matrix))
assert len(self.prob_labels) > 0
with open(self.get_prob_labels_json_filename(target_dir, "noext"), "wt") as f:
f.write(json.dumps(self.prob_labels))
assert len(self.datapoints) > 0
assert self.datapoint_type != WeakDB.DATAPOINT_TYPE_UNKNOWN
with open(self.get_datapoints_json_filename(target_dir), "wt") as f:
f.write(json.dumps(self.datapoints))
if has_extended_data:
assert len(self.extended_lf_matrix) > 0
with open(self.get_lf_matrix_json_filename(target_dir, "ext"), "wt") as f:
f.write(json.dumps(self.extended_lf_matrix))
assert len(self.extended_prob_labels) > 0
with open(self.get_prob_labels_json_filename(target_dir, "ext"), "wt") as f:
f.write(json.dumps(self.extended_prob_labels))
if has_ground_truth:
with open(self.get_ground_truth_labels_json_filename(target_dir), "wt") as f:
f.write(json.dumps(self.ground_truth_labels))
if has_similarity_data:
with open(self.get_similarity_json_filename(target_dir), "wt") as f:
f.write(json.dumps(self.sorted_dists))
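# Minimal end-to-end sketch (hypothetical values): populate a WeakDB by hand and
# write the JSON files lfviz expects. Shapes follow the asserts above: lf_matrix
# is a flat row-major list with (num_train + num_val) * num_lf entries.
if __name__ == "__main__":
    db = WeakDB(num_train=2, num_val=1, num_lf=2)
    db.set_name("demo_dump")
    db.set_description("Tiny hand-built example dump")
    db.set_lf_names(["LF A", "LF B"])
    db.set_lf_matrix([1, 0,  0, 1,  1, 1])   # 3 datapoints x 2 LFs, flattened
    db.set_prob_labels([0.9, 0.3, 0.8])      # one probabilistic label per datapoint
    db.set_datapoints(WeakDB.DATAPOINT_TYPE_TEXT,
                      ["first example", "second example", "third example"])
    db.save_json(".")                        # writes demo_dump*.json to the cwd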