from collections import OrderedDict

try:
    # Python 2 only: substitute the lazy izip for the eager builtin zip
    from itertools import izip as zip
except ImportError:
    pass  # Python 3: the builtin zip is already lazy

import numpy as np

from .interface import Interface, DataError
from ..dimension import Dimension
from ..element import Element
from ..dimension import OrderedDict as cyODict
from ..ndmapping import NdMapping, item_check
from .. import util


class DictInterface(Interface):
    """
    Interface for simple dictionary-based dataset format. The dictionary
    keys correspond to the column (i.e. dimension) names and the values
    are collections representing the values in that column.
    """

    types = (dict, OrderedDict, cyODict)

    datatype = 'dictionary'
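
    # A minimal sketch of the column layout this interface stores, assuming a
    # hypothetical Dataset with kdims=['x'] and vdims=['y']:
    #
    #     data = OrderedDict([('x', np.array([0, 1, 2])),
    #                         ('y', np.array([1.0, 4.0, 9.0]))])
    #
    # Each key is a dimension name; each value is a flat collection (or a
    # scalar) holding that column's values.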

    @classmethod
    def dimension_type(cls, dataset, dim):
        name = dataset.get_dimension(dim, strict=True).name
        values = dataset.data[name]
        return type(values) if np.isscalar(values) else values.dtype.type

    @classmethod
    def init(cls, eltype, data, kdims, vdims):
        odict_types = (OrderedDict, cyODict)
        if kdims is None:
            kdims = eltype.kdims
        if vdims is None:
            vdims = eltype.vdims

        dimensions = [d.name if isinstance(d, Dimension) else
                      d for d in kdims + vdims]
        if isinstance(data, tuple):
            data = {d: v for d, v in zip(dimensions, data)}
        elif util.is_dataframe(data) and all(d in data for d in dimensions):
            data = {d: data[d] for d in dimensions}
        elif isinstance(data, np.ndarray):
            if data.ndim == 1:
                if eltype._auto_indexable_1d and len(kdims)+len(vdims) > 1:
                    data = np.column_stack([np.arange(len(data)), data])
                else:
                    data = np.atleast_2d(data).T
            data = {k: data[:, i] for i, k in enumerate(dimensions)}
        elif isinstance(data, list) and data == []:
            data = OrderedDict([(d, []) for d in dimensions])
        elif isinstance(data, list) and np.isscalar(data[0]):
            data = {dimensions[0]: np.arange(len(data)), dimensions[1]: data}
        elif (isinstance(data, list) and isinstance(data[0], tuple) and len(data[0]) == 2
              and any(isinstance(v, tuple) for v in data[0])):
            dict_data = zip(*((util.wrap_tuple(k)+util.wrap_tuple(v))
                              for k, v in data))
            data = {k: np.array(v) for k, v in zip(dimensions, dict_data)}
        # Ensure that interface does not consume data of other types
        # with an iterator interface
        elif not any(isinstance(data, tuple(t for t in interface.types if t is not None))
                     for interface in cls.interfaces.values()):
            data = {k: v for k, v in zip(dimensions, zip(*data))}
        elif (isinstance(data, dict) and not any(isinstance(v, np.ndarray) for v in data.values()) and not
              any(d in data or any(d in k for k in data if isinstance(k, tuple)) for d in dimensions)):
            # For data where both keys and values are dimension values
            # e.g. {('A', 'B'): (1, 2)} (should consider deprecating)
            dict_data = sorted(data.items())
            k, v = dict_data[0]
            if len(util.wrap_tuple(k)) != len(kdims) or len(util.wrap_tuple(v)) != len(vdims):
                raise ValueError("Dictionary data not understood, should contain a column "
                                 "per dimension or a mapping between key and value dimension "
                                 "values.")
            dict_data = zip(*((util.wrap_tuple(k)+util.wrap_tuple(v))
                              for k, v in dict_data))
            data = {k: np.array(v) for k, v in zip(dimensions, dict_data)}

        if not isinstance(data, cls.types):
            raise ValueError("DictInterface could not convert the supplied data.")
        elif isinstance(data, dict):
            unpacked = []
            for d, vals in data.items():
                if isinstance(d, tuple):
                    vals = np.asarray(vals)
                    if vals.shape == (0,):
                        for sd in d:
                            unpacked.append((sd, np.array([], dtype=vals.dtype)))
                    elif vals.ndim != 2 or vals.shape[1] != len(d):
                        raise ValueError("Values for %s dimensions did not have "
                                         "the expected shape." % (d,))
                    else:
                        for i, sd in enumerate(d):
                            unpacked.append((sd, vals[:, i]))
                else:
                    vals = vals if np.isscalar(vals) else np.asarray(vals)
                    if not np.isscalar(vals) and not vals.ndim == 1:
                        raise ValueError('DictInterface expects data for each column to be flat.')
                    unpacked.append((d, vals))
            if not cls.expanded([d[1] for d in unpacked if not np.isscalar(d[1])]):
                raise ValueError('DictInterface expects data to be of uniform shape.')
            if isinstance(data, odict_types):
                data.update(unpacked)
            else:
                data = OrderedDict(unpacked)

        return data, {'kdims': kdims, 'vdims': vdims}, {}
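
    # Illustrative summary of the conversions init performs (hypothetical
    # dimension names 'x' and 'y'; every accepted form is normalized to a
    # dictionary of columns):
    #
    #     (xs, ys)                      -> {'x': xs, 'y': ys}
    #     1D ndarray (auto-indexable)   -> {'x': arange(N), 'y': array}
    #     [1, 2, 3]                     -> {'x': arange(3), 'y': [1, 2, 3]}
    #     [((0, 1), 2), ...]            -> one column per kdim and vdim
    #     {('A', 'B'): (1, 2)}          -> key/value mapping unpacked per dim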

    @classmethod
    def validate(cls, dataset, vdims=True):
        dim_types = 'all' if vdims else 'key'
        dimensions = dataset.dimensions(dim_types, label='name')
        not_found = [d for d in dimensions if d not in dataset.data]
        if not_found:
            raise DataError('Following columns specified as dimensions '
                            'but not found in data: %s' % not_found, cls)
        lengths = [(dim, 1 if np.isscalar(dataset.data[dim]) else len(dataset.data[dim]))
                   for dim in dimensions]
        if len({l for d, l in lengths if l > 1}) > 1:
            lengths = ', '.join(['%s: %d' % l for l in sorted(lengths)])
            raise DataError('Length of columns must be equal or scalar, '
                            'columns have lengths: %s' % lengths, cls)
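
    # For instance (a hypothetical failure case): a Dataset with columns
    # {'x': np.arange(3), 'y': np.arange(4)} would raise a DataError here,
    # since all non-scalar columns must share a single length.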

    @classmethod
    def unpack_scalar(cls, dataset, data):
        """
        Given a dataset object and data in the appropriate format for
        the interface, return a simple scalar.
        """
        if len(data) != 1:
            return data
        key = list(data.keys())[0]
        if len(data[key]) == 1 and key in dataset.vdims:
            return data[key][0]
        return data
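
    # Sketch: an aggregation that reduces to a single value-dimension entry,
    # e.g. {'y': np.array([3.5])}, unpacks to the bare scalar 3.5; anything
    # else is returned unchanged.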

    @classmethod
    def isscalar(cls, dataset, dim):
        name = dataset.get_dimension(dim, strict=True).name
        values = dataset.data[name]
        return np.isscalar(values) or len(np.unique(values)) == 1

    @classmethod
    def shape(cls, dataset):
        return (cls.length(dataset), len(dataset.data))

    @classmethod
    def length(cls, dataset):
        lengths = [len(vals) for vals in dataset.data.values() if not np.isscalar(vals)]
        return max(lengths) if lengths else 1

    @classmethod
    def array(cls, dataset, dimensions):
        if not dimensions:
            dimensions = dataset.dimensions(label='name')
        else:
            dimensions = [dataset.get_dimension(d).name for d in dimensions]
        arrays = [dataset.data[dim] for dim in dimensions]
        return np.column_stack([np.full(len(dataset), arr) if np.isscalar(arr) else arr
                                for arr in arrays])
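
    # Sketch of the scalar broadcast above: with the hypothetical columns
    # {'x': np.array([0, 1, 2]), 'y': 5}, array() yields
    # np.column_stack([[0, 1, 2], np.full(3, 5)]), i.e. a (3, 2) array.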

    @classmethod
    def add_dimension(cls, dataset, dimension, dim_pos, values, vdim):
        dim = dimension.name if isinstance(dimension, Dimension) else dimension
        data = list(dataset.data.items())
        data.insert(dim_pos, (dim, values))
        return OrderedDict(data)

    @classmethod
    def redim(cls, dataset, dimensions):
        all_dims = dataset.dimensions()
        renamed = []
        for k, v in dataset.data.items():
            if k in dimensions:
                k = dimensions[k].name
            elif k in all_dims:
                k = dataset.get_dimension(k).name
            renamed.append((k, v))
        return OrderedDict(renamed)

    @classmethod
    def concat(cls, dataset_objs):
        cast_objs = cls.cast(dataset_objs)
        cols = set(tuple(c.data.keys()) for c in cast_objs)
        if len(cols) != 1:
            raise Exception("In order to concatenate, all Dataset objects "
                            "should have matching set of columns.")
        concatenated = OrderedDict()
        for column in cols.pop():
            concatenated[column] = np.concatenate([obj[column] for obj in cast_objs])
        return concatenated

    @classmethod
    def sort(cls, dataset, by=[], reverse=False):
        by = [dataset.get_dimension(d).name for d in by]
        if len(by) == 1:
            sorting = cls.values(dataset, by[0]).argsort()
        else:
            arrays = [dataset.dimension_values(d) for d in by]
            sorting = util.arglexsort(arrays)
        return OrderedDict([(d, v if np.isscalar(v) else (v[sorting][::-1] if reverse else v[sorting]))
                            for d, v in dataset.data.items()])
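
    # Sketch: sorting {'x': np.array([2, 0, 1]), 'y': np.array([9, 7, 8])}
    # by 'x' computes sorting = argsort([2, 0, 1]) = [1, 2, 0] and applies
    # it to every column, giving {'x': [0, 1, 2], 'y': [7, 8, 9]}.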

    @classmethod
    def values(cls, dataset, dim, expanded=True, flat=True):
        dim = dataset.get_dimension(dim).name
        values = dataset.data.get(dim)
        if np.isscalar(values):
            if not expanded:
                return np.array([values])
            values = np.full(len(dataset), values, dtype=np.array(values).dtype)
        else:
            if not expanded:
                return util.unique_array(values)
            values = np.asarray(values)
        return values
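
    # Sketch: for a scalar column 5 on a length-3 dataset, values() returns
    # np.full(3, 5) when expanded and np.array([5]) otherwise; array columns
    # are instead deduplicated with util.unique_array when expanded=False.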

    @classmethod
    def reindex(cls, dataset, kdims, vdims):
        dimensions = [dataset.get_dimension(d).name for d in kdims+vdims]
        return OrderedDict([(d, dataset.dimension_values(d))
                            for d in dimensions])

    @classmethod
    def groupby(cls, dataset, dimensions, container_type, group_type, **kwargs):
        # Get dimensions information
        dimensions = [dataset.get_dimension(d) for d in dimensions]
        kdims = [kdim for kdim in dataset.kdims if kdim not in dimensions]
        vdims = dataset.vdims

        # Update the kwargs appropriately for Element group types
        group_kwargs = {}
        group_type = dict if group_type == 'raw' else group_type
        if issubclass(group_type, Element):
            group_kwargs.update(util.get_param_values(dataset))
            group_kwargs['kdims'] = kdims
        group_kwargs.update(kwargs)

        # Find all the keys along supplied dimensions
        keys = (tuple(dataset.data[d.name] if np.isscalar(dataset.data[d.name])
                      else dataset.data[d.name][i] for d in dimensions)
                for i in range(len(dataset)))

        # Iterate over the unique entries applying selection masks
        grouped_data = []
        for unique_key in util.unique_iterator(keys):
            mask = cls.select_mask(dataset, dict(zip(dimensions, unique_key)))
            group_data = OrderedDict(((d.name, dataset.data[d.name] if np.isscalar(dataset.data[d.name])
                                       else dataset.data[d.name][mask])
                                      for d in kdims+vdims))
            group_data = group_type(group_data, **group_kwargs)
            grouped_data.append((unique_key, group_data))

        if issubclass(container_type, NdMapping):
            with item_check(False):
                return container_type(grouped_data, kdims=dimensions)
        else:
            return container_type(grouped_data)
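
    # Sketch: grouping {'g': array(['a', 'a', 'b']), 'y': array([1, 2, 3])}
    # by 'g' yields the unique keys ('a',) and ('b',); each key produces a
    # boolean mask selecting its rows, giving the groups {'y': [1, 2]} and
    # {'y': [3]} wrapped in group_type.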

    @classmethod
    def select(cls, dataset, selection_mask=None, **selection):
        if selection_mask is None:
            selection_mask = cls.select_mask(dataset, selection)
        indexed = cls.indexed(dataset, selection)
        data = OrderedDict((k, v if np.isscalar(v) else v[selection_mask])
                           for k, v in dataset.data.items())
        if indexed and len(list(data.values())[0]) == 1 and len(dataset.vdims) == 1:
            value = data[dataset.vdims[0].name]
            return value if np.isscalar(value) else value[0]
        return data
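
    # Sketch: selecting x=1 on {'x': array([0, 1, 2]), 'y': array([7, 8, 9])}
    # builds the mask [False, True, False]; because the selection fully
    # indexes the key dimensions and leaves one row with a single vdim,
    # the bare value 8 is returned instead of a columnar dict.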

    @classmethod
    def sample(cls, dataset, samples=[]):
        mask = False
        for sample in samples:
            sample_mask = True
            if np.isscalar(sample):
                sample = [sample]
            for i, v in enumerate(sample):
                name = dataset.get_dimension(i).name
                sample_mask &= (dataset.data[name] == v)
            mask |= sample_mask
        return {k: col if np.isscalar(col) else np.array(col)[mask]
                for k, col in dataset.data.items()}

    @classmethod
    def aggregate(cls, dataset, kdims, function, **kwargs):
        kdims = [dataset.get_dimension(d, strict=True).name for d in kdims]
        vdims = dataset.dimensions('value', label='name')
        groups = cls.groupby(dataset, kdims, list, OrderedDict)
        aggregated = OrderedDict([(k, []) for k in kdims+vdims])
        for key, group in groups:
            key = key if isinstance(key, tuple) else (key,)
            for kdim, val in zip(kdims, key):
                aggregated[kdim].append(val)
            for vdim, arr in group.items():
                if vdim in dataset.vdims:
                    if np.isscalar(arr):
                        reduced = arr
                    elif isinstance(function, np.ufunc):
                        reduced = function.reduce(arr, **kwargs)
                    else:
                        reduced = function(arr, **kwargs)
                    aggregated[vdim].append(reduced)
        return aggregated
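
    # Sketch: aggregating {'g': array(['a', 'a', 'b']), 'y': array([1, 2, 3])}
    # over kdims=['g'] with function=np.mean groups on 'g' and reduces each
    # group's 'y' column, yielding {'g': ['a', 'b'], 'y': [1.5, 3.0]}.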

    @classmethod
    def iloc(cls, dataset, index):
        rows, cols = index
        scalar = False
        if np.isscalar(cols):
            scalar = np.isscalar(rows)
            cols = [dataset.get_dimension(cols, strict=True)]
        elif isinstance(cols, slice):
            cols = dataset.dimensions()[cols]
        else:
            cols = [dataset.get_dimension(d, strict=True) for d in cols]

        if np.isscalar(rows):
            rows = [rows]

        new_data = OrderedDict()
        for d, values in dataset.data.items():
            if d in cols:
                if np.isscalar(values):
                    new_data[d] = values
                else:
                    new_data[d] = values[rows]

        if scalar:
            arr = new_data[cols[0].name]
            return arr if np.isscalar(arr) else arr[0]
        return new_data
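
    # Sketch: dataset.iloc[0, 'y'] on {'x': array([0, 1]), 'y': array([7, 8])}
    # resolves cols to the 'y' dimension, slices out row 0, and (since both
    # the row and column indices are scalars) unpacks to the bare value 7.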


Interface.register(DictInterface)