# encoding: utf-8
from __future__ import absolute_import, division, print_function

import numpy as np
import larray as la

from liam2.context import context_length
from liam2.expr import expr_eval, collect_variables, not_hashable
from liam2.exprbases import TableExpression
from liam2.utils import expand, prod
from liam2.aggregates import Count
from liam2.partition import partition_nd


class GroupBy(TableExpression):
    funcname = 'groupby'
    no_eval = ('expressions', 'expr')
    kwonlyargs = {'expr': None, 'filter': None, 'percent': False,
                  'pvalues': None, 'totals': True}
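
    # 'expressions' and 'expr' are declared no_eval: compute() receives them
    # unevaluated because it must evaluate them itself, the grouping
    # expressions on the full context and expr once per (sub)group.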
    # noinspection PyNoneFunctionAssignment
    def compute(self, context, *expressions, **kwargs):
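        # Returns an la.LArray with one labelled axis per grouping expression.
        # Illustrative model-file usage (field names are hypothetical):
        #     groupby(gender, workstate)                # cross-tab of counts
        #     groupby(agegroup, expr=count(), percent=True)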
        if not expressions:
            raise TypeError("groupby() takes at least 1 argument")
        # TODO: allow lists/tuples of arguments to group by the combinations
        # of keys
        for expr in expressions:
            if isinstance(expr, (bool, int, float)):
                raise TypeError("groupby() does not work with constant "
                                "arguments")
            if isinstance(expr, (tuple, list)):
                raise TypeError("groupby() takes expressions as arguments, "
                                "not a list of expressions")

        # On Python 3, we could clean up this code (keyword-only arguments).
        expr = kwargs.pop('expr', None)
        if expr is None:
            expr = Count()
        # by = kwargs.pop('by', None)
        filter_value = kwargs.pop('filter', None)
        percent = kwargs.pop('percent', False)
        possible_values = kwargs.pop('pvalues', None)
        totals = kwargs.pop('totals', True)

        expr_vars = [v.name for v in collect_variables(expr)]
        labels = [str(e) for e in expressions]
        columns = [expr_eval(e, context) for e in expressions]
        columns = [expand(c, context_length(context)) for c in columns]
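        # expand() broadcasts scalar columns to one value per row: e.g. an
        # expression evaluating to the constant True becomes an array of
        # context_length(context) True values, so every column can be
        # partitioned the same way.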

        if filter_value is not None:
            filtered_columns = [col[filter_value] for col in columns]
            # FIXME: use the actual filter_expr instead of not_hashable
            filtered_context = context.subset(filter_value, expr_vars,
                                              not_hashable)
        else:
            filtered_columns = columns
            filtered_context = context

        if possible_values is None:
            possible_values = [np.unique(col) for col in filtered_columns]

        # We pre-filtered columns instead of passing the filter to
        # partition_nd because it is a bit faster this way. The indices are
        # still correct because we use them on a filtered_context.
        groups = partition_nd(filtered_columns, True, possible_values)
        if not groups:
            # return la.LArray([], labels, possible_values)
            return la.LArray([])
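        # partition_nd returns one array of row indices per combination of
        # possible values, in row-major order: e.g. for columns with possible
        # values [False, True] and [1, 2, 3], groups has 6 entries ordered
        # (False, 1), (False, 2), (False, 3), (True, 1), (True, 2), (True, 3).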

        # evaluate the expression on each group
        # we use not_hashable to avoid storing the subset in the cache
        contexts = [filtered_context.subset(indices, expr_vars, not_hashable)
                    for indices in groups]
        data = [expr_eval(expr, c) for c in contexts]
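        # data holds one result per group: e.g. with the default expr
        # (count()), it is a flat list of group sizes.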

        # TODO: use group_indices_nd directly to avoid using np.unique
        # this is twice as fast (unique is very slow) but breaks because
        # the rest of the code assumes all combinations are present
        # if self.filter is not None:
        #     filter_value = expr_eval(self.filter, context)
        # else:
        #     filter_value = True
        #
        # d = group_indices_nd(columns, filter_value)
        # pvalues = sorted(d.keys())
        # ndim = len(columns)
        # possible_values = [[pv[i] for pv in pvalues]
        #                    for i in range(ndim)]
        # groups = [d[k] for k in pvalues]

        # groups is a (flat) list of lists:
        # the first variable is the outermost "loop",
        # the last one the innermost.

        # add totals for each row and column
        len_pvalues = [len(vals) for vals in possible_values]

        if percent:
            totals = True

        if totals:
            width = len_pvalues[-1]
            height = prod(len_pvalues[:-1])

            rows_indices = [np.concatenate([groups[y * width + x]
                                            for x in range(width)])
                            for y in range(height)]
            cols_indices = [np.concatenate([groups[y * width + x]
                                            for y in range(height)])
                            for x in range(width)]
            cols_indices.append(np.concatenate(cols_indices))
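            # e.g. with len_pvalues == [2, 3]: width == 3, height == 2, and
            # groups[y * width + x] is the cell at row y, column x. Each row
            # total thus concatenates 3 cells, each column total 2 cells, and
            # the entry appended to cols_indices is the grand total (all rows
            # and columns combined).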

            # evaluate the expression on each "combined" group (i.e. compute
            # the totals)
            row_ctxs = [filtered_context.subset(indices, expr_vars,
                                                not_hashable)
                        for indices in rows_indices]
            row_totals = [expr_eval(expr, ctx) for ctx in row_ctxs]
            col_ctxs = [filtered_context.subset(indices, expr_vars,
                                                not_hashable)
                        for indices in cols_indices]
            col_totals = [expr_eval(expr, ctx) for ctx in col_ctxs]
        else:
            row_totals = None
            col_totals = None

        if percent:
            # Convert to np.float64 so that dividing by a total of int(0)
            # yields +/-inf (numpy behaviour) instead of raising
            # ZeroDivisionError (Python's built-in behaviour). This can
            # happen at least when using the default expr (count()) and the
            # filter yields empty groups.
            total_value = np.float64(col_totals[-1])
            data = [100.0 * value / total_value for value in data]
            row_totals = [100.0 * value / total_value for value in row_totals]
            col_totals = [100.0 * value / total_value for value in col_totals]
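            # col_totals[-1] is the grand total appended above, so for
            # additive expressions such as count(), the table now sums
            # to ~100.0.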

        # if self.by or self.percent:
        #     if self.percent:
        #         total_value = data[-1]
        #         divisors = [total_value for _ in data]
        #     else:
        #         num_by = len(self.by)
        #         inc = prod(len_pvalues[-num_by:])
        #         num_groups = len(groups)
        #         num_categories = prod(len_pvalues[:-num_by])
        #
        #         categories_groups_idx = [range(cat_idx, num_groups, inc)
        #                                  for cat_idx in range(num_categories)]
        #
        #         divisors = ...
        #
        #     data = [100.0 * value / divisor
        #             for value, divisor in zip(data, divisors)]

        # convert to a 1d array. We don't simply use data = np.array(data)
        # because if data is a list of ndarrays (for example if we use
        # groupby(a, expr=id)) *and* all the ndarrays have the same length,
        # the result would be a 2d array instead of the array of ndarrays we
        # need at this point.
        arr = np.empty(len(data), dtype=type(data[0]))
        arr[:] = data
        data = arr
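        # e.g. np.array([a1, a2]) on two same-length 1d ndarrays stacks them
        # into a 2d array, whereas arr here stays 1d with object (or scalar)
        # cells, which is what the reshape below relies on.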

        # and reshape it
        data = data.reshape(len_pvalues)
        axes = [la.Axis(axis_labels, axis_name)
                for axis_name, axis_labels in zip(labels, possible_values)]
        # FIXME: also handle totals
        return la.LArray(data, axes)
        # return la.LArray(data, labels, possible_values,
        #                  row_totals, col_totals)


functions = {
    'groupby': GroupBy
}
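

# A minimal, self-contained sketch of the partition/reshape mechanics used in
# compute() above, on made-up data (illustrative only; not part of LIAM2):
if __name__ == '__main__':
    gender = np.array([False, True, True, False, True])
    agegroup = np.array([0, 0, 1, 1, 1])
    pvalues = [np.unique(gender), np.unique(agegroup)]
    # one array of row indices per (gender, agegroup) combination
    cells = partition_nd([gender, agegroup], True, pvalues)
    # with the default expr (count()), each cell result is the group size
    counts = np.array([len(indices) for indices in cells])
    counts = counts.reshape([len(pv) for pv in pvalues])
    demo_axes = [la.Axis(pv, name)
                 for name, pv in zip(['gender', 'agegroup'], pvalues)]
    print(la.LArray(counts, demo_axes))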