-
-
Notifications
You must be signed in to change notification settings - Fork 363
/
datatypes.py
803 lines (655 loc) · 25.3 KB
/
datatypes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
from __future__ import absolute_import
import re
from functools import total_ordering
import numpy as np
from numba import jit
from pandas.api.extensions import (
ExtensionDtype, ExtensionArray, register_extension_dtype)
from numbers import Integral
from pandas.api.types import pandas_dtype
from pandas.core.dtypes.common import is_extension_array_dtype
try:
# See if we can register extension type with dask >= 1.1.0
from dask.dataframe.extensions import make_array_nonempty
except ImportError:
make_array_nonempty = None
def _validate_ragged_properties(start_indices, flat_array):
"""
Validate that start_indices are flat_array arrays that may be used to
represent a valid RaggedArray.
Parameters
----------
flat_array: numpy array containing concatenation
of all nested arrays to be represented
by this ragged array
start_indices: unsiged integer numpy array the same
length as the ragged array where values
represent the index into flat_array where
the corresponding ragged array element
begins
Raises
------
ValueError:
if input arguments are invalid or incompatible properties
"""
# Validate start_indices
if (not isinstance(start_indices, np.ndarray) or
start_indices.dtype.kind != 'u' or
start_indices.ndim != 1):
raise ValueError("""
The start_indices property of a RaggedArray must be a 1D numpy array of
unsigned integers (start_indices.dtype.kind == 'u')
Received value of type {typ}: {v}""".format(
typ=type(start_indices), v=repr(start_indices)))
# Validate flat_array
if (not isinstance(flat_array, np.ndarray) or
flat_array.ndim != 1):
raise ValueError("""
The flat_array property of a RaggedArray must be a 1D numpy array
Received value of type {typ}: {v}""".format(
typ=type(flat_array), v=repr(flat_array)))
# Validate start_indices values
# We don't need to check start_indices < 0 because we already know that it
# has an unsigned integer datatype
#
# Note that start_indices[i] == len(flat_array) is valid as it represents
# and empty array element at the end of the ragged array.
invalid_inds = start_indices > len(flat_array)
if invalid_inds.any():
some_invalid_vals = start_indices[invalid_inds[:10]]
raise ValueError("""
Elements of start_indices must be less than the length of flat_array ({m})
Invalid values include: {vals}""".format(
m=len(flat_array), vals=repr(some_invalid_vals)))
# Internal ragged element array wrapper that provides
# equality, ordering, and hashing.
@total_ordering
class _RaggedElement(object):
@staticmethod
def ragged_or_nan(a):
if np.isscalar(a) and np.isnan(a):
return a
else:
return _RaggedElement(a)
@staticmethod
def array_or_nan(a):
if np.isscalar(a) and np.isnan(a):
return a
else:
return a.array
def __init__(self, array):
self.array = array
def __hash__(self):
return hash(self.array.tobytes())
def __eq__(self, other):
if not isinstance(other, _RaggedElement):
return False
return np.array_equal(self.array, other.array)
def __lt__(self, other):
if not isinstance(other, _RaggedElement):
return NotImplemented
return _lexograph_lt(self.array, other.array)
def __repr__(self):
array_repr = repr(self.array)
return array_repr.replace('array', 'ragged_element')
@register_extension_dtype
class RaggedDtype(ExtensionDtype):
"""
Pandas ExtensionDtype to represent a ragged array datatype
Methods not otherwise documented here are inherited from ExtensionDtype;
please see the corresponding method on that class for the docstring
"""
type = np.ndarray
base = np.dtype('O')
_subtype_re = re.compile(r"^ragged\[(?P<subtype>\w+)\]$")
_metadata = ('_dtype',)
@property
def name(self):
return 'Ragged[{subtype}]'.format(subtype=self.subtype)
def __repr__(self):
return self.name
@classmethod
def construct_array_type(cls):
return RaggedArray
@classmethod
def construct_from_string(cls, string):
# lowercase string
string = string.lower()
msg = "Cannot construct a 'RaggedDtype' from '{}'"
if string.startswith('ragged'):
# Extract subtype
try:
subtype_string = cls._parse_subtype(string)
return RaggedDtype(dtype=subtype_string)
except Exception:
raise TypeError(msg.format(string))
else:
raise TypeError(msg.format(string))
def __init__(self, dtype=np.float64):
if isinstance(dtype, RaggedDtype):
self._dtype = dtype.subtype
else:
self._dtype = np.dtype(dtype)
@property
def subtype(self):
return self._dtype
@classmethod
def _parse_subtype(cls, dtype_string):
"""
Parse a datatype string to get the subtype
Parameters
----------
dtype_string: str
A string like Ragged[subtype]
Returns
-------
subtype: str
Raises
------
ValueError
When the subtype cannot be extracted
"""
# Be case insensitive
dtype_string = dtype_string.lower()
match = cls._subtype_re.match(dtype_string)
if match:
subtype_string = match.groupdict()['subtype']
elif dtype_string == 'ragged':
subtype_string = 'float64'
else:
raise ValueError("Cannot parse {dtype_string}".format(
dtype_string=dtype_string))
return subtype_string
def missing(v):
return v is None or (np.isscalar(v) and np.isnan(v))
class RaggedArray(ExtensionArray):
"""
Pandas ExtensionArray to represent ragged arrays
Methods not otherwise documented here are inherited from ExtensionArray;
please see the corresponding method on that class for the docstring
"""
def __init__(self, data, dtype=None, copy=False):
"""
Construct a RaggedArray
Parameters
----------
data: list or array or dict or RaggedArray
* list or 1D-array: A List or 1D array of lists or 1D arrays that
should be represented by the RaggedArray
* dict: A dict containing 'start_indices' and 'flat_array' keys
with numpy array values where:
- flat_array: numpy array containing concatenation
of all nested arrays to be represented
by this ragged array
- start_indices: unsiged integer numpy array the same
length as the ragged array where values
represent the index into flat_array where
the corresponding ragged array element
begins
* RaggedArray: A RaggedArray instance to copy
dtype: RaggedDtype or np.dtype or str or None (default None)
Datatype to use to store underlying values from data.
If none (the default) then dtype will be determined using the
numpy.result_type function.
copy : bool (default False)
Whether to deep copy the input arrays. Only relevant when `data`
has type `dict` or `RaggedArray`. When data is a `list` or
`array`, input arrays are always copied.
"""
if (isinstance(data, dict) and
all(k in data for k in
['start_indices', 'flat_array'])):
_validate_ragged_properties(
start_indices=data['start_indices'],
flat_array=data['flat_array'])
self._start_indices = data['start_indices']
self._flat_array = data['flat_array']
dtype = self._flat_array.dtype
if copy:
self._start_indices = self._start_indices.copy()
self._flat_array = self._flat_array.copy()
elif isinstance(data, RaggedArray):
self._flat_array = data.flat_array
self._start_indices = data.start_indices
dtype = self._flat_array.dtype
if copy:
self._start_indices = self._start_indices.copy()
self._flat_array = self._flat_array.copy()
else:
# Compute lengths
index_len = len(data)
buffer_len = sum(len(datum)
if not missing(datum)
else 0 for datum in data)
# Compute necessary precision of start_indices array
for nbits in [8, 16, 32, 64]:
start_indices_dtype = 'uint' + str(nbits)
max_supported = np.iinfo(start_indices_dtype).max
if buffer_len <= max_supported:
break
# infer dtype if not provided
if dtype is None:
non_missing = [np.atleast_1d(v)
for v in data if not missing(v)]
if non_missing:
dtype = np.result_type(*non_missing)
else:
dtype = 'float64'
elif isinstance(dtype, RaggedDtype):
dtype = dtype.subtype
# Initialize representation arrays
self._start_indices = np.zeros(index_len, dtype=start_indices_dtype)
self._flat_array = np.zeros(buffer_len, dtype=dtype)
# Populate arrays
next_start_ind = 0
for i, array_el in enumerate(data):
# Compute element length
n = len(array_el) if not missing(array_el) else 0
# Update start indices
self._start_indices[i] = next_start_ind
# Update flat array
self._flat_array[next_start_ind:next_start_ind+n] = array_el
# increment next start index
next_start_ind += n
self._dtype = RaggedDtype(dtype=dtype)
def __eq__(self, other):
if isinstance(other, RaggedArray):
if len(other) != len(self):
raise ValueError("""
Cannot check equality of RaggedArray values of unequal length
len(ra1) == {len_ra1}
len(ra2) == {len_ra2}""".format(
len_ra1=len(self),
len_ra2=len(other)))
result = _eq_ragged_ragged(
self.start_indices, self.flat_array,
other.start_indices, other.flat_array)
else:
# Convert other to numpy arrauy
if not isinstance(other, np.ndarray):
other_array = np.asarray(other)
else:
other_array = other
if other_array.ndim == 1 and other_array.dtype.kind != 'O':
# Treat as ragged scalar
result = _eq_ragged_scalar(
self.start_indices, self.flat_array, other_array)
elif (other_array.ndim == 1 and
other_array.dtype.kind == 'O' and
len(other_array) == len(self)):
# Treat as vector
result = _eq_ragged_ndarray1d(
self.start_indices, self.flat_array, other_array)
elif (other_array.ndim == 2 and
other_array.dtype.kind != 'O' and
other_array.shape[0] == len(self)):
# Treat rows as ragged elements
result = _eq_ragged_ndarray2d(
self.start_indices, self.flat_array, other_array)
else:
raise ValueError("""
Cannot check equality of RaggedArray of length {ra_len} with:
{other}""".format(ra_len=len(self), other=repr(other)))
return result
def __ne__(self, other):
return np.logical_not(self == other)
@property
def flat_array(self):
"""
numpy array containing concatenation of all nested arrays
Returns
-------
np.ndarray
"""
return self._flat_array
@property
def start_indices(self):
"""
unsiged integer numpy array the same length as the ragged array where
values represent the index into flat_array where the corresponding
ragged array element begins
Returns
-------
np.ndarray
"""
return self._start_indices
def __len__(self):
return len(self._start_indices)
def __getitem__(self, item):
if isinstance(item, Integral):
if item < -len(self) or item >= len(self):
raise IndexError("{item} is out of bounds".format(item=item))
else:
# Convert negative item index
if item < 0:
item += len(self)
slice_start = self.start_indices[item]
slice_end = (self.start_indices[item+1]
if item + 1 <= len(self) - 1
else len(self.flat_array))
return (self.flat_array[slice_start:slice_end]
if slice_end!=slice_start
else np.nan)
elif type(item) == slice:
data = []
selected_indices = np.arange(len(self))[item]
for selected_index in selected_indices:
data.append(self[selected_index])
return RaggedArray(data, dtype=self.flat_array.dtype)
elif isinstance(item, np.ndarray) and item.dtype == 'bool':
data = []
for i, m in enumerate(item):
if m:
data.append(self[i])
return RaggedArray(data, dtype=self.flat_array.dtype)
elif isinstance(item, (list, np.ndarray)):
return self.take(item, allow_fill=False)
else:
raise IndexError(item)
@classmethod
def _from_sequence(cls, scalars, dtype=None, copy=False):
return RaggedArray(scalars, dtype=dtype)
@classmethod
def _from_factorized(cls, values, original):
return RaggedArray(
[_RaggedElement.array_or_nan(v) for v in values],
dtype=original.flat_array.dtype)
def _as_ragged_element_array(self):
return np.array([_RaggedElement.ragged_or_nan(self[i])
for i in range(len(self))])
def _values_for_factorize(self):
return self._as_ragged_element_array(), np.nan
def _values_for_argsort(self):
return self._as_ragged_element_array()
def unique(self):
from pandas import unique
uniques = unique(self._as_ragged_element_array())
return self._from_sequence(
[_RaggedElement.array_or_nan(v) for v in uniques],
dtype=self.dtype)
def fillna(self, value=None, method=None, limit=None):
# Override in RaggedArray to handle ndarray fill values
from pandas.util._validators import validate_fillna_kwargs
from pandas.core.missing import pad_1d, backfill_1d
value, method = validate_fillna_kwargs(value, method)
mask = self.isna()
if isinstance(value, RaggedArray):
if len(value) != len(self):
raise ValueError("Length of 'value' does not match. Got ({}) "
" expected {}".format(len(value), len(self)))
value = value[mask]
if mask.any():
if method is not None:
func = pad_1d if method == 'pad' else backfill_1d
new_values = func(self.astype(object), limit=limit,
mask=mask)
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
# fill with value
new_values = list(self)
mask_indices, = np.where(mask)
for ind in mask_indices:
new_values[ind] = value
new_values = self._from_sequence(new_values, dtype=self.dtype)
else:
new_values = self.copy()
return new_values
def shift(self, periods=1, fill_value=None):
# Override in RaggedArray to handle ndarray fill values
# Note: this implementation assumes that `self.dtype.na_value` can be
# stored in an instance of your ExtensionArray with `self.dtype`.
if not len(self) or periods == 0:
return self.copy()
if fill_value is None:
fill_value = np.nan
empty = self._from_sequence(
[fill_value] * min(abs(periods), len(self)),
dtype=self.dtype
)
if periods > 0:
a = empty
b = self[:-periods]
else:
a = self[abs(periods):]
b = empty
return self._concat_same_type([a, b])
def searchsorted(self, value, side="left", sorter=None):
arr = self._as_ragged_element_array()
if isinstance(value, RaggedArray):
search_value = value._as_ragged_element_array()
else:
search_value = _RaggedElement(value)
return arr.searchsorted(search_value, side=side, sorter=sorter)
def isna(self):
stop_indices = np.hstack([self.start_indices[1:],
[len(self.flat_array)]])
element_lengths = stop_indices - self.start_indices
return element_lengths == 0
def take(self, indices, allow_fill=False, fill_value=None):
if allow_fill:
invalid_inds = [i for i in indices if i < -1]
if invalid_inds:
raise ValueError("""
Invalid indices for take with allow_fill True: {inds}""".format(
inds=invalid_inds[:9]))
sequence = [self[i] if i >= 0 else fill_value
for i in indices]
else:
if len(self) == 0 and len(indices) > 0:
raise IndexError("cannot do a non-empty take")
sequence = [self[i] for i in indices]
return RaggedArray(sequence, dtype=self.flat_array.dtype)
def copy(self, deep=False):
data = dict(
flat_array=self.flat_array,
start_indices=self.start_indices)
return RaggedArray(data, copy=deep)
@classmethod
def _concat_same_type(cls, to_concat):
# concat flat_arrays
flat_array = np.hstack([ra.flat_array for ra in to_concat])
# offset and concat start_indices
offsets = np.hstack([
[0],
np.cumsum([len(ra.flat_array) for ra in to_concat[:-1]])])
start_indices = np.hstack([ra.start_indices + offset
for offset, ra in zip(offsets, to_concat)])
return RaggedArray(dict(
flat_array=flat_array, start_indices=start_indices),
copy=False)
@property
def dtype(self):
return self._dtype
@property
def nbytes(self):
return (self._flat_array.nbytes +
self._start_indices.nbytes)
def astype(self, dtype, copy=True):
dtype = pandas_dtype(dtype)
if isinstance(dtype, RaggedDtype):
if copy:
return self.copy()
return self
elif is_extension_array_dtype(dtype):
return dtype.construct_array_type()._from_sequence(
np.asarray(self))
return np.array([v for v in self], dtype=dtype, copy=copy)
@jit(nopython=True, nogil=True)
def _eq_ragged_ragged(start_indices1,
flat_array1,
start_indices2,
flat_array2):
"""
Compare elements of two ragged arrays of the same length
Parameters
----------
start_indices1: ndarray
start indices of a RaggedArray 1
flat_array1: ndarray
flat_array property of a RaggedArray 1
start_indices2: ndarray
start indices of a RaggedArray 2
flat_array2: ndarray
flat_array property of a RaggedArray 2
Returns
-------
mask: ndarray
1D bool array of same length as inputs with elements True when
corresponding elements are equal, False otherwise
"""
n = len(start_indices1)
m1 = len(flat_array1)
m2 = len(flat_array2)
result = np.zeros(n, dtype=np.bool_)
for i in range(n):
# Extract inds for ra1
start_index1 = start_indices1[i]
stop_index1 = start_indices1[i + 1] if i < n - 1 else m1
len_1 = stop_index1 - start_index1
# Extract inds for ra2
start_index2 = start_indices2[i]
stop_index2 = start_indices2[i + 1] if i < n - 1 else m2
len_2 = stop_index2 - start_index2
if len_1 != len_2:
el_equal = False
else:
el_equal = True
for flat_index1, flat_index2 in \
zip(range(start_index1, stop_index1),
range(start_index2, stop_index2)):
el_1 = flat_array1[flat_index1]
el_2 = flat_array2[flat_index2]
el_equal &= el_1 == el_2
result[i] = el_equal
return result
@jit(nopython=True, nogil=True)
def _eq_ragged_scalar(start_indices, flat_array, val):
"""
Compare elements of a RaggedArray with a scalar array
Parameters
----------
start_indices: ndarray
start indices of a RaggedArray
flat_array: ndarray
flat_array property of a RaggedArray
val: ndarray
Returns
-------
mask: ndarray
1D bool array of same length as inputs with elements True when
ragged element equals scalar val, False otherwise.
"""
n = len(start_indices)
m = len(flat_array)
cols = len(val)
result = np.zeros(n, dtype=np.bool_)
for i in range(n):
start_index = start_indices[i]
stop_index = start_indices[i+1] if i < n - 1 else m
if stop_index - start_index != cols:
el_equal = False
else:
el_equal = True
for val_index, flat_index in \
enumerate(range(start_index, stop_index)):
el_equal &= flat_array[flat_index] == val[val_index]
result[i] = el_equal
return result
def _eq_ragged_ndarray1d(start_indices, flat_array, a):
"""
Compare a RaggedArray with a 1D numpy object array of the same length
Parameters
----------
start_indices: ndarray
start indices of a RaggedArray
flat_array: ndarray
flat_array property of a RaggedArray
a: ndarray
1D numpy array of same length as ra
Returns
-------
mask: ndarray
1D bool array of same length as input with elements True when
corresponding elements are equal, False otherwise
Notes
-----
This function is not numba accelerated because it, by design, inputs
a numpy object array
"""
n = len(start_indices)
m = len(flat_array)
result = np.zeros(n, dtype=np.bool_)
for i in range(n):
start_index = start_indices[i]
stop_index = start_indices[i + 1] if i < n - 1 else m
a_val = a[i]
if (a_val is None or
(np.isscalar(a_val) and np.isnan(a_val)) or
len(a_val) == 0):
result[i] = start_index == stop_index
else:
result[i] = np.array_equal(flat_array[start_index:stop_index],
a_val)
return result
@jit(nopython=True, nogil=True)
def _eq_ragged_ndarray2d(start_indices, flat_array, a):
"""
Compare a RaggedArray with rows of a 2D numpy object array
Parameters
----------
start_indices: ndarray
start indices of a RaggedArray
flat_array: ndarray
flat_array property of a RaggedArray
a: ndarray
A 2D numpy array where the length of the first dimension matches the
length of the RaggedArray
Returns
-------
mask: ndarray
1D bool array of same length as input RaggedArray with elements True
when corresponding elements of ra equal corresponding row of `a`
"""
n = len(start_indices)
m = len(flat_array)
cols = a.shape[1]
# np.bool is an alias for Python's built-in bool type, np.bool_ is the
# numpy type that numba recognizes
result = np.zeros(n, dtype=np.bool_)
for row in range(n):
start_index = start_indices[row]
stop_index = start_indices[row + 1] if row < n - 1 else m
# Check equality
if stop_index - start_index != cols:
el_equal = False
else:
el_equal = True
for col, flat_index in enumerate(range(start_index, stop_index)):
el_equal &= flat_array[flat_index] == a[row, col]
result[row] = el_equal
return result
@jit(nopython=True, nogil=True)
def _lexograph_lt(a1, a2):
"""
Compare two 1D numpy arrays lexographically
Parameters
----------
a1: ndarray
1D numpy array
a2: ndarray
1D numpy array
Returns
-------
comparison:
True if a1 < a2, False otherwise
"""
for e1, e2 in zip(a1, a2):
if e1 < e2:
return True
elif e1 > e2:
return False
return len(a1) < len(a2)
def ragged_array_non_empty(dtype):
return RaggedArray([[1], [1, 2]], dtype=dtype)
if make_array_nonempty:
make_array_nonempty.register(RaggedDtype)(ragged_array_non_empty)