From da877dff21bd6b56732c0edbaf65b6a2decc2619 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 29 May 2018 07:02:31 -0400 Subject: [PATCH] CLN: clean up groupby / categorical xref #21151 --- pandas/core/arrays/categorical.py | 67 ------------------- pandas/core/groupby/categorical.py | 100 +++++++++++++++++++++++++++++ pandas/core/groupby/groupby.py | 20 ++---- 3 files changed, 106 insertions(+), 81 deletions(-) create mode 100644 pandas/core/groupby/categorical.py diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0252b5b52ae94..375db28a4ee5a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False): self._dtype = new_dtype - def _codes_for_groupby(self, sort, observed): - """ - Code the categories to ensure we can groupby for categoricals. - - If observed=True, we return a new Categorical with the observed - categories only. - - If sort=False, return a copy of self, coded with categories as - returned by .unique(), followed by any categories not appearing in - the data. If sort=True, return self. - - This method is needed solely to ensure the categorical index of the - GroupBy result has categories in the order of appearance in the data - (GH-8868). - - Parameters - ---------- - sort : boolean - The value of the sort parameter groupby was called with. - observed : boolean - Account only for the observed values - - Returns - ------- - Categorical - If sort=False, the new categories are set to the order of - appearance in codes (unless ordered=True, in which case the - original order is preserved), followed by any unrepresented - categories in the original order. 
- """ - - # we only care about observed values - if observed: - unique_codes = unique1d(self.codes) - cat = self.copy() - - take_codes = unique_codes[unique_codes != -1] - if self.ordered: - take_codes = np.sort(take_codes) - - # we recode according to the uniques - categories = self.categories.take(take_codes) - codes = _recode_for_categories(self.codes, - self.categories, - categories) - - # return a new categorical that maps our new codes - # and categories - dtype = CategoricalDtype(categories, ordered=self.ordered) - return type(self)(codes, dtype=dtype, fastpath=True) - - # Already sorted according to self.categories; all is fine - if sort: - return self - - # sort=False should order groups in as-encountered order (GH-8868) - cat = self.unique() - - # But for groupby to work, all categories should be present, - # including those missing from the data (GH-13179), which .unique() - # above dropped - cat.add_categories( - self.categories[~self.categories.isin(cat.categories)], - inplace=True) - - return self.reorder_categories(cat.categories) - def _set_dtype(self, dtype): """Internal method for directly updating the CategoricalDtype diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py new file mode 100644 index 0000000000000..71db97a211190 --- /dev/null +++ b/pandas/core/groupby/categorical.py @@ -0,0 +1,100 @@ +import numpy as np +from pandas.core.algorithms import unique1d +from pandas.core.arrays.categorical import ( + _recode_for_categories, CategoricalDtype, Categorical) +from pandas import CategoricalIndex, Series + + +def recode_for_groupby(c, sort, observed): + """ + Code the categories to ensure we can groupby for categoricals. + + If observed=True, we return a new Categorical with the observed + categories only. + + If sort=False, return a copy of c, coded with categories as + returned by .unique(), followed by any categories not appearing in + the data. If sort=True, return c. 
+ + This method is needed solely to ensure the categorical index of the + GroupBy result has categories in the order of appearance in the data + (GH-8868). + + Parameters + ---------- + c : Categorical + sort : boolean + The value of the sort parameter groupby was called with. + observed : boolean + Account only for the observed values + + Returns + ------- + New Categorical + If sort=False, the new categories are set to the order of + appearance in codes (unless ordered=True, in which case the + original order is preserved), followed by any unrepresented + categories in the original order. + Categorical or None + If observed=True, the original categorical; otherwise None + """ + + # we only care about observed values + if observed: + unique_codes = unique1d(c.codes) + + take_codes = unique_codes[unique_codes != -1] + if c.ordered: + take_codes = np.sort(take_codes) + + # we recode according to the uniques + categories = c.categories.take(take_codes) + codes = _recode_for_categories(c.codes, + c.categories, + categories) + + # return a new categorical that maps our new codes + # and categories + dtype = CategoricalDtype(categories, ordered=c.ordered) + return Categorical(codes, dtype=dtype, fastpath=True), c + + # Already sorted according to c.categories; all is fine + if sort: + return c, None + + # sort=False should order groups in as-encountered order (GH-8868) + cat = c.unique() + + # But for groupby to work, all categories should be present, + # including those missing from the data (GH-13179), which .unique() + # above dropped + cat = cat.add_categories( + c.categories[~c.categories.isin(cat.categories)]) + + return c.reorder_categories(cat.categories), None + + +def recode_from_groupby(c, sort, ci): + """ + Reverse recode_for_groupby to account for sort / observed. + + Parameters + ---------- + c : Categorical + sort : boolean + The value of the sort parameter groupby was called with. 
+ ci : CategoricalIndex + The codes / categories to recode + + Returns + ------- + CategoricalIndex + """ + + # we re-order to the original category orderings + if sort: + return ci.set_categories(c.categories) + + # we are not sorting, so add unobserved to the end + return ci.add_categories( + c.categories[~c.categories.isin(ci.categories)]) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c69d7f43de8ea..08acdcf94094a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # a passed Categorical elif is_categorical_dtype(self.grouper): - self.all_grouper = self.grouper - self.grouper = self.grouper._codes_for_groupby( - self.sort, observed) + from pandas.core.groupby.categorical import recode_for_groupby + self.grouper, self.all_grouper = recode_for_groupby( + self.grouper, self.sort, observed) categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper @@ -3073,17 +3073,9 @@ def labels(self): @cache_readonly def result_index(self): if self.all_grouper is not None: - all_categories = self.all_grouper.categories - - # we re-order to the original category orderings - if self.sort: - return self.group_index.set_categories(all_categories) - - # we are not sorting, so add unobserved to the end - categories = self.group_index.categories - return self.group_index.add_categories( - all_categories[~all_categories.isin(categories)]) - + from pandas.core.groupby.categorical import recode_from_groupby + return recode_from_groupby(self.all_grouper, + self.sort, self.group_index) return self.group_index @property