-
Notifications
You must be signed in to change notification settings - Fork 11
/
dataset.py
427 lines (364 loc) · 15.7 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
# Copyright (c) 2017, All Contributors (see CONTRIBUTORS file)
# Authors: Salvatore Trani <salvatore.trani@isti.cnr.it>
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
"""
This module implements the generic class for loading/dumping a dataset from/to
file.
"""
import numpy as np
import numbers
import hashlib
from .svmlight_format import load_svmlight_file, dump_svmlight_file
class Dataset(object):
    """
    In-memory learning-to-rank dataset: a feature matrix, a label vector and
    the grouping of the instances (rows) by query.

    The per-instance query ids given to the constructor are stored in a
    compact form: the distinct query ids (in order of first appearance) and
    the offsets at which each query starts.

    Attributes
    ----------
    X : numpy 2d array of float
        Dense matrix of shape (n_instances, n_features)
    y : numpy 1d array of float
        Array of shape (n_instances,) with the gold labels
    query_ids : numpy 1d array of int
        Array of shape (n_queries,) with the distinct query ids
    query_offsets : numpy 1d array of int
        Array of shape (n_queries + 1,) with the start and end offsets of
        each query. The i-th query has row indices in
        [ query_offsets[i], query_offsets[i+1] ), with the latter excluded.
    name : str
        The name given to the dataset
    n_instances : int
        The number of instances in the dataset
    n_features : int
        The number of features in the dataset
    n_queries : int
        The number of queries in the dataset
    """

    def __init__(self, X, y, query_ids, name=None):
        """
        Build a dataset from a feature matrix, labels and per-instance
        query ids.

        The offsets are derived from the first occurrence of each query id,
        so the instances of a query are assumed to be stored contiguously.

        Parameters
        ----------
        X : numpy.ndarray
            The matrix with feature values, of shape
            (n_instances, n_features)
        y : numpy.array
            The vector with label values
        query_ids : numpy.array or list
            The vector with the query_id of each sample (one per row of X)
        name : str, optional
            The name of the dataset. Defaults to "Dataset <shape of X>".

        Raises
        ------
        Exception
            If query_ids does not have one entry per row of X.
        """
        query_ids = np.asarray(query_ids)
        if query_ids.size != X.shape[0]:
            raise Exception(
                "query_ids has wrong size. Expected %s but got %s" % (
                    X.shape[0], query_ids.size))

        # Convert from one query id per sample to (distinct ids, offsets).
        distinct_qids, first_rows = np.unique(query_ids, return_index=True)

        # np.unique returns the ids sorted by value: re-sort them by offset
        # so that files whose qids are not in increasing order keep the
        # ordering of the rows.
        order = np.argsort(first_rows)
        self.query_ids = distinct_qids[order]
        # The trailing sentinel makes query_offsets[i + 1] valid for the
        # last query too.
        self.query_offsets = np.append(first_rows[order], query_ids.size)

        self.X, self.y = X, y

        if name is not None:
            self.name = name
        else:
            self.name = "Dataset %s" % (self.X.shape,)

        self.n_instances = self.y.size
        self.n_features = self.X.shape[1]
        self.n_queries = self.query_ids.size

        self._hash_cached = None

    @staticmethod
    def load(f, name=None, format="svmlight"):
        """
        Load a dataset from file.

        Parameters
        ----------
        f : {str, file-like, int}
            (Path to) a file to load. If a path ends in ".gz" or ".bz2", it
            will be uncompressed on the fly. If an integer is passed, it is
            assumed to be a file descriptor. A file-like or file descriptor
            will not be closed by this function. A file-like object must be
            opened in binary mode.
        name : str
            The name to be given to the current dataset
        format : str
            The format of the dataset file to load (currently only the
            "svmlight" format is supported)

        Returns
        -------
        dataset : Dataset
            The dataset read from file

        Raises
        ------
        TypeError
            If the requested format is not supported.
        """
        if format != "svmlight":
            raise TypeError("Dataset format %s is not yet supported!" % format)
        X, y, query_ids = load_svmlight_file(f, query_id=True)
        return Dataset(X, y, query_ids, name)

    def subset_features(self, features):
        """
        Create a new Dataset with only the features identified by the given
        indices. It is useful for performing feature selection.

        The selected columns are copied, while the label vector is shared
        with the current dataset.

        Parameters
        ----------
        features : numpy array or list
            The indices of the features to select in the resulting dataset

        Returns
        -------
        dataset : rankeval.dataset.Dataset
            The resulting dataset with the given subset of features
        """
        return Dataset(self.X[:, features].copy(),
                       self.y,
                       self.get_qids_dataset(),
                       name=self.name)

    def dump(self, f, format="svmlight"):
        """
        Write the dataset on file according to the given format.

        Parameters
        ----------
        f : {str, file-like, int}
            (Path to) a file to dump. If a path ends in ".gz" or ".bz2", it
            will be compressed on the fly. If an integer is passed, it is
            assumed to be a file descriptor. A file-like or file descriptor
            will not be closed by this function. A file-like object must be
            opened in text mode.
        format : str
            The format to use for dumping the dataset on file (currently
            only the "svmlight" format is supported)

        Raises
        ------
        TypeError
            If the requested format is not supported.
        """
        if format != "svmlight":
            raise TypeError("Dataset format %s is not yet supported!" % format)
        # The query ids are stored compactly (distinct ids + offsets): the
        # writer needs them unrolled, one per instance.
        dump_svmlight_file(self.X, self.y, f, self.get_qids_dataset())

    def split(self, train_size, vali_size=0, random_state=None):
        """
        Split the dataset into train/validation/test partitions.

        The query ids are shuffled before partitioning, and all the
        instances of a query end up in the same partition. If vali_size=0
        no validation set is created and only (train, test) is returned,
        otherwise (train, vali, test) is returned.

        Parameters
        ----------
        train_size : float
            The ratio of query ids in the training set. It should be between
            0 and 1.
        vali_size : float
            The ratio of query ids in the validation set. It should be
            between 0 and 1. 0 means no validation set is created.
        random_state : None | int | instance of RandomState
            If int, random_state is the seed used by the random number
            generator. If RandomState instance, random_state is the random
            number generator. If None, the random number generator is the
            RandomState instance used by np.random.

        Returns
        -------
        (train, test) or (train, vali, test) : tuple of Dataset
            The resulting datasets with the given fraction of query ids in
            each partition. The test set receives the queries left over by
            the other partitions (possibly none).

        Raises
        ------
        Exception
            If the sizes are negative, greater than 1 or sum to more than 1.
        """
        if train_size < 0 or train_size > 1 or vali_size < 0 or \
                (train_size + vali_size) > 1:
            raise Exception("train and/or validation sizes are not correct!")

        train_qn = int(round(train_size * self.n_queries))
        vali_qn = int(round(vali_size * self.n_queries))
        # Both ratios may round up (e.g. 0.5/0.5 over 3 queries): never let
        # the validation set steal queries already given to the train set.
        vali_qn = min(vali_qn, self.n_queries - train_qn)

        qid_map = self.get_qids_dataset(dtype=np.uint32)

        # Shuffle the queries, then cut the permutation in three slices.
        rng = Dataset._check_random_state(random_state)
        qids_permutation = rng.permutation(self.query_ids)

        vali_end = train_qn + vali_qn
        train_qid = qids_permutation[:train_qn]
        vali_qid = qids_permutation[train_qn:vali_end]
        # Slice from the front: a negative index such as [-0:] would select
        # every query when the test partition is empty.
        test_qid = qids_permutation[vali_end:]

        train_mask = np.isin(qid_map, train_qid)
        test_mask = np.isin(qid_map, test_qid)

        train_dataset = Dataset(self.X[train_mask], self.y[train_mask],
                                qid_map[train_mask],
                                name=self.name + ' Train')
        test_dataset = Dataset(self.X[test_mask], self.y[test_mask],
                               qid_map[test_mask],
                               name=self.name + ' Test')

        if not vali_size:
            return train_dataset, test_dataset

        vali_mask = np.isin(qid_map, vali_qid)
        vali_dataset = Dataset(self.X[vali_mask], self.y[vali_mask],
                               qid_map[vali_mask],
                               name=self.name + ' Vali')
        return train_dataset, vali_dataset, test_dataset

    def subset(self, query_ids, name=None):
        """
        Return the subset of the dataset made of the given queries.

        The instances keep the order they have in the current dataset,
        whatever the order of the requested query ids.

        Parameters
        ----------
        query_ids : numpy 1d array of int
            The query_ids to select
        name : str
            The name to give to the dataset

        Returns
        -------
        datasets : rankeval.dataset.Dataset
            The resulting dataset with only the query_ids requested
        """
        qid_map = self.get_qids_dataset()
        mask = np.isin(qid_map, query_ids)
        return Dataset(self.X[mask], self.y[mask],
                       qid_map[mask], name=name)

    def query_iterator(self):
        """
        Iterate over the queries of the dataset.

        Yields
        ------
        offsets : tuple of (int, int, int)
            The query_id and the (start, end) row offsets of the instances
            belonging to the query, with end excluded.
        """
        for i in range(self.n_queries):
            yield self.query_ids[i], \
                self.query_offsets[i], self.query_offsets[i + 1]

    def get_query_sizes(self):
        """
        Return the number of instances of each query.

        Returns
        -------
        sizes : numpy 1d array of int
            Array of shape (n_queries,)
        """
        return np.ediff1d(self.query_offsets)

    def get_qids_dataset(self, dtype=np.int32):
        """
        Return the query ids in linear representation, i.e., with the qid
        of each instance. Useful for creating a new dataset starting from a
        different one.

        Parameters
        ----------
        dtype : numpy dtype
            The dtype of the returned array

        Returns
        -------
        query_ids : numpy 1d array
            Array of shape (n_instances,)
        """
        query_ids = np.empty(shape=self.n_instances, dtype=dtype)
        for qid, start_offset, end_offset in self.query_iterator():
            query_ids[start_offset:end_offset] = qid
        return query_ids

    def get_query_offsets(self, query_id):
        """
        Return the offsets (start, end) of a given query_id in the dataset.
        Useful for debugging/analyzing in detail the behaviour of a given
        model on a specific set of queries.

        Parameters
        ----------
        query_id: int
            The query to search in the dataset

        Returns
        -------
        offsets : tuple of (int, int)
            The (start, end) row offsets of the instances belonging to the
            query, with end excluded.

        Raises
        ------
        LookupError
            If the query_id is not present in the dataset.
        """
        idx = np.where(self.query_ids == query_id)[0]
        if idx.size == 0:
            # Format the message itself: calling .format() on the exception
            # object would raise an AttributeError instead.
            raise LookupError(
                "query_id {} is missing from the dataset".format(query_id))
        # Take first element
        idx = idx[0]
        return self.query_offsets[idx], self.query_offsets[idx + 1]

    def kfold(self, n_folds=5, qids_only=False, shuffle=True,
              random_state=None):
        """
        Generate a k-fold splitting of the dataset, i.e., split the queries
        in n_folds and provide a train/vali/test splitting of the data for
        each iteration (fold). Folds are rotated avoiding overlapping.
        Queries are shuffled by default before splitting.

        At each iteration one fold is used for validation, one for test and
        the remaining n_folds - 2 for training: with n_folds=2 the training
        partition is therefore empty. When n_queries is not a multiple of
        n_folds, the last fold also takes the remaining queries.

        Parameters
        ----------
        n_folds: int
            The number of folds. At least 3 are needed to get a non-empty
            training partition.
        qids_only : bool
            Whether to yield only the query ids of each split in place of
            the Dataset
        shuffle : bool
            Whether to shuffle the queries before splitting the folds
        random_state : None | int | instance of RandomState
            The random number generator (or its seed) used for shuffling.
            If None, the RandomState instance used by np.random is used.

        Yields
        ------
        (train, vali, test) datasets : tuple of rankeval.dataset.Dataset
            The datasets for that split. If qids_only=True, the method
            yields only the query ids of each fold, without creating the
            datasets.
        """
        fold_size = self.n_queries // n_folds

        qids = np.copy(self.query_ids)
        if shuffle:
            Dataset._check_random_state(random_state).shuffle(qids)

        n_qids = qids.size
        split_points = [fold_size * i for i in range(n_folds)]

        def fold_qids(start, stop):
            # Query ids in the circular range [start, stop): a stop lying
            # before the start means the range wraps around the end.
            if stop < start:
                stop += n_qids
            return qids[np.arange(start, stop) % n_qids]

        for cur_fold in range(n_folds):
            idx_train = split_points[cur_fold]
            idx_vali = split_points[(cur_fold - 2) % n_folds]
            idx_test = split_points[(cur_fold - 1) % n_folds]

            qids_train = fold_qids(idx_train, idx_vali)
            qids_vali = fold_qids(idx_vali, idx_test)
            qids_test = fold_qids(idx_test, idx_train)

            if qids_only:
                yield qids_train, qids_vali, qids_test
            else:
                yield self.subset(qids_train), \
                    self.subset(qids_vali), \
                    self.subset(qids_test)

    @staticmethod
    def _check_random_state(seed):
        """
        Turn seed into a np.random.RandomState instance (taken from sklearn)

        Parameters
        ----------
        seed : None | int | instance of RandomState
            If seed is None, return the RandomState singleton used by
            np.random.
            If seed is an int, return a new RandomState instance seeded with
            it.
            If seed is already a RandomState instance, return it.
            Otherwise raise ValueError.

        Returns
        -------
        rng : numpy.random.RandomState
            The random number generator to use

        Raises
        ------
        ValueError
            If seed is none of the supported types.
        """
        if seed is None or seed is np.random:
            return np.random.mtrand._rand
        if isinstance(seed, (numbers.Integral, np.integer)):
            return np.random.RandomState(seed)
        if isinstance(seed, np.random.RandomState):
            return seed
        raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                         ' instance' % seed)

    def __str__(self):
        """Return the name of the dataset."""
        return self.name

    def __hash__(self):
        """
        Return a hash of the content (X, y and query_ids) of the dataset.
        """
        # Cache the hash given the computational cost to compute it
        # ASSUMPTION: the object is unmodifiable!
        if self._hash_cached is None:
            h = hashlib.md5()
            for arr in [self.X, self.y, self.query_ids]:
                # hashlib needs a C-contiguous buffer: views and
                # Fortran-ordered arrays are copied, others pass through.
                h.update(np.ascontiguousarray(arr))
            self._hash_cached = int(h.hexdigest(), 16)
        return self._hash_cached

    def __eq__(self, other):
        """
        Two datasets are equal when X, y and query_ids hold the same values.
        """
        if not isinstance(other, Dataset):
            return NotImplemented
        # Arrays of different shapes cannot be compared element-wise.
        if self.X.shape != other.X.shape or \
                self.y.shape != other.y.shape or \
                self.query_ids.shape != other.query_ids.shape:
            return False
        # use != instead of == because it is more efficient for sparse
        # matrices:
        x_eq = not (self.X != other.X).any()
        return bool(x_eq and (self.y == other.y).all() and
                    (self.query_ids == other.query_ids).all())

    def __ne__(self, other):
        """Negation of __eq__."""
        # Not strictly necessary, but to avoid having both x==y and x!=y
        # True at the same time
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result