Skip to content

Commit

Permalink
CLN: clean up groupby / categorical
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Jul 6, 2018
1 parent d24a950 commit da877df
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 81 deletions.
67 changes: 0 additions & 67 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -698,73 +698,6 @@ def _set_categories(self, categories, fastpath=False):

self._dtype = new_dtype

def _codes_for_groupby(self, sort, observed):
"""
Code the categories to ensure we can groupby for categoricals.
If observed=True, we return a new Categorical with the observed
categories only.
If sort=False, return a copy of self, coded with categories as
returned by .unique(), followed by any categories not appearing in
the data. If sort=True, return self.
This method is needed solely to ensure the categorical index of the
GroupBy result has categories in the order of appearance in the data
(GH-8868).
Parameters
----------
sort : boolean
The value of the sort parameter groupby was called with.
observed : boolean
Account only for the observed values
Returns
-------
Categorical
If sort=False, the new categories are set to the order of
appearance in codes (unless ordered=True, in which case the
original order is preserved), followed by any unrepresented
categories in the original order.
"""

# we only care about observed values
if observed:
unique_codes = unique1d(self.codes)
cat = self.copy()

take_codes = unique_codes[unique_codes != -1]
if self.ordered:
take_codes = np.sort(take_codes)

# we recode according to the uniques
categories = self.categories.take(take_codes)
codes = _recode_for_categories(self.codes,
self.categories,
categories)

# return a new categorical that maps our new codes
# and categories
dtype = CategoricalDtype(categories, ordered=self.ordered)
return type(self)(codes, dtype=dtype, fastpath=True)

# Already sorted according to self.categories; all is fine
if sort:
return self

# sort=False should order groups in as-encountered order (GH-8868)
cat = self.unique()

# But for groupby to work, all categories should be present,
# including those missing from the data (GH-13179), which .unique()
# above dropped
cat.add_categories(
self.categories[~self.categories.isin(cat.categories)],
inplace=True)

return self.reorder_categories(cat.categories)

def _set_dtype(self, dtype):
"""Internal method for directly updating the CategoricalDtype
Expand Down
100 changes: 100 additions & 0 deletions pandas/core/groupby/categorical.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
import numpy as np
from pandas.core.algorithms import unique1d
from pandas.core.arrays.categorical import (
_recode_for_categories, CategoricalDtype, Categorical)
from pandas import CategoricalIndex, Series


def recode_for_groupby(c, sort, observed):
    """
    Recode a Categorical so that it can be used as a groupby key.

    Three cases are handled:

    * observed=True: build a new Categorical whose categories are only
      those that occur in ``c``.
    * sort=True (and not observed): ``c`` is returned unchanged.
    * sort=False (and not observed): ``c`` is reordered so that its
      categories follow ``c.unique()``, with the categories missing from
      that result appended afterwards.

    This exists so that the categorical index of the GroupBy result has
    its categories in order of appearance in the data (GH-8868).

    Parameters
    ----------
    c : Categorical
    sort : boolean
        The value of the sort parameter groupby was called with.
    observed : boolean
        Account only for the observed values

    Returns
    -------
    tuple of (Categorical, Categorical or None)
        The first element is the recoded Categorical. The second element
        is the original ``c`` when observed=True, otherwise None.
    """
    if observed:
        # codes present in the data, with the -1 code filtered out
        seen_codes = unique1d(c.codes)
        seen_codes = seen_codes[seen_codes != -1]
        if c.ordered:
            seen_codes = np.sort(seen_codes)

        # translate the existing codes onto the reduced category set
        observed_categories = c.categories.take(seen_codes)
        new_codes = _recode_for_categories(
            c.codes, c.categories, observed_categories)

        new_dtype = CategoricalDtype(observed_categories, ordered=c.ordered)
        recoded = Categorical(new_codes, dtype=new_dtype, fastpath=True)
        return recoded, c

    if sort:
        # nothing to recode when sorting is requested
        return c, None

    # sort=False: groups follow the order given by .unique() (GH-8868)
    encountered = c.unique()

    # every category must still be present for groupby to work
    # (GH-13179), so append the ones .unique() left out
    missing = c.categories[~c.categories.isin(encountered.categories)]
    full_order = encountered.add_categories(missing).categories

    return c.reorder_categories(full_order), None


def recode_from_groupby(c, sort, ci):
    """
    Map a groupby result index back onto the categories of ``c``.

    Parameters
    ----------
    c : Categorical
        Supplies the full set of categories.
    sort : boolean
        The value of the sort parameter groupby was called with.
    ci : CategoricalIndex
        The codes / categories to recode

    Returns
    -------
    CategoricalIndex
        With sort=True, ``ci`` with its categories set to
        ``c.categories``. Otherwise ``ci`` with the categories of ``c``
        that it lacks appended after its own.
    """
    all_categories = c.categories

    if not sort:
        # unsorted: keep ci's categories and add the absent ones at the end
        absent = all_categories[~all_categories.isin(ci.categories)]
        return ci.add_categories(absent)

    # sorted: go back to the category ordering of c
    return ci.set_categories(all_categories)
20 changes: 6 additions & 14 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2994,9 +2994,9 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
# a passed Categorical
elif is_categorical_dtype(self.grouper):

self.all_grouper = self.grouper
self.grouper = self.grouper._codes_for_groupby(
self.sort, observed)
from pandas.core.groupby.categorical import recode_for_groupby
self.grouper, self.all_grouper = recode_for_groupby(
self.grouper, self.sort, observed)
categories = self.grouper.categories

# we make a CategoricalIndex out of the cat grouper
Expand Down Expand Up @@ -3073,17 +3073,9 @@ def labels(self):
@cache_readonly
def result_index(self):
if self.all_grouper is not None:
all_categories = self.all_grouper.categories

# we re-order to the original category orderings
if self.sort:
return self.group_index.set_categories(all_categories)

# we are not sorting, so add unobserved to the end
categories = self.group_index.categories
return self.group_index.add_categories(
all_categories[~all_categories.isin(categories)])

from pandas.core.groupby.categorical import recode_from_groupby
return recode_from_groupby(self.all_grouper,
self.sort, self.group_index)
return self.group_index

@property
Expand Down

0 comments on commit da877df

Please sign in to comment.