ENH: Implement DataFrame.astype('category') (pandas-dev#18099)

harisbal · Mar 1, 2018 · 96b8bb1 · 96b8bb1
1 parent 6ef4be3
commit 96b8bb1
Show file tree

Hide file tree

Showing 4 changed files with 139 additions and 24 deletions.
diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -46,9 +46,14 @@ The categorical data type is useful in the following cases:
 
 See also the :ref:`API docs on categoricals<api.categorical>`.
 
+.. _categorical.objectcreation:
+
 Object Creation
 ---------------
 
+Series Creation
+~~~~~~~~~~~~~~~
+
 Categorical ``Series`` or columns in a ``DataFrame`` can be created in several ways:
 
 By specifying ``dtype="category"`` when constructing a ``Series``:
@@ -77,7 +82,7 @@ discrete bins. See the :ref:`example on tiling <reshaping.tile.cut>` in the docs
     df['group'] = pd.cut(df.value, range(0, 105, 10), right=False, labels=labels)
     df.head(10)
 
-By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to a `DataFrame`.
+By passing a :class:`pandas.Categorical` object to a ``Series`` or assigning it to a ``DataFrame``.
 
 .. ipython:: python
 
@@ -89,6 +94,55 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to
     df["B"] = raw_cat
     df
 
+Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
+
+.. ipython:: python
+
+    df.dtypes
+
+DataFrame Creation
+~~~~~~~~~~~~~~~~~~
+
+Similar to the previous section where a single column was converted to categorical, all columns in a
+``DataFrame`` can be batch converted to categorical either during or after construction.
+
+This can be done during construction by specifying ``dtype="category"`` in the ``DataFrame`` constructor:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')}, dtype="category")
+    df.dtypes
+
+Note that the categories present in each column differ; the conversion is done column by column, so
+only labels present in a given column are categories:
+
+.. ipython:: python
+
+    df['A']
+    df['B']
+
+
+.. versionadded:: 0.23.0
+
+Analogously, all columns in an existing ``DataFrame`` can be batch converted using :meth:`DataFrame.astype`:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
+    df_cat = df.astype('category')
+    df_cat.dtypes
+
+This conversion is likewise done column by column:
+
+.. ipython:: python
+
+    df_cat['A']
+    df_cat['B']
+
+
+Controlling Behavior
+~~~~~~~~~~~~~~~~~~~~
+
 In the examples above where we passed ``dtype='category'``, we used the default 
 behavior:
 
@@ -108,21 +162,36 @@ of :class:`~pandas.api.types.CategoricalDtype`.
     s_cat = s.astype(cat_type)
     s_cat
 
-Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`:
+Similarly, a ``CategoricalDtype`` can be used with a ``DataFrame`` to ensure that categories
+are consistent among all columns.
 
 .. ipython:: python
 
-    df.dtypes
+    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
+    cat_type = CategoricalDtype(categories=list('abcd'),
+                                ordered=True)
+    df_cat = df.astype(cat_type)
+    df_cat['A']
+    df_cat['B']
 
 .. note::
 
-    In contrast to R's `factor` function, categorical data is not converting input values to
-    strings and categories will end up the same data type as the original values.
+    To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as
+    categories for each column, the ``categories`` parameter can be determined programatically by
+    ``categories = pd.unique(df.values.ravel())``.
 
-.. note::
+If you already have ``codes`` and ``categories``, you can use the 
+:func:`~pandas.Categorical.from_codes` constructor to save the factorize step 
+during normal constructor mode:
 
-    In contrast to R's `factor` function, there is currently no way to assign/change labels at
-    creation time. Use `categories` to change the categories after creation time.
+.. ipython:: python
+
+    splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
+    s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
+
+
+Regaining Original Data
+~~~~~~~~~~~~~~~~~~~~~~~
 
 To get back to the original ``Series`` or NumPy array, use 
 ``Series.astype(original_dtype)`` or ``np.asarray(categorical)``:
@@ -136,14 +205,15 @@ To get back to the original ``Series`` or NumPy array, use
     s2.astype(str)
     np.asarray(s2)
 
-If you already have `codes` and `categories`, you can use the 
-:func:`~pandas.Categorical.from_codes` constructor to save the factorize step 
-during normal constructor mode:
+.. note::
 
-.. ipython:: python
+    In contrast to R's `factor` function, categorical data is not converting input values to
+    strings; categories will end up the same data type as the original values.
 
-    splitter = np.random.choice([0,1], 5, p=[0.5,0.5])
-    s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"]))
+.. note::
+
+    In contrast to R's `factor` function, there is currently no way to assign/change labels at
+    creation time. Use `categories` to change the categories after creation time.
 
 .. _categorical.categoricaldtype:
 

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -268,6 +268,37 @@ The :func:`DataFrame.assign` now accepts dependent keyword arguments for python
 
       df.assign(A=df.A+1, C= lambda df: df.A* -1)
 
+
+.. _whatsnew_0230.enhancements.astype_category:
+
+``DataFrame.astype`` performs column-wise conversion to ``Categorical``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+:meth:`DataFrame.astype` can now perform column-wise conversion to ``Categorical`` by supplying the string ``'category'`` or
+a :class:`~pandas.api.types.CategoricalDtype`. Previously, attempting this would raise a ``NotImplementedError``. See the
+:ref:`categorical.objectcreation` section of the documentation for more details and examples. (:issue:`12860`, :issue:`18099`)
+
+Supplying the string ``'category'`` performs column-wise conversion, with only labels appearing in a given column set as categories:
+
+.. ipython:: python
+
+    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
+    df = df.astype('category')
+    df['A'].dtype
+    df['B'].dtype
+
+
+Supplying a ``CategoricalDtype`` will make the categories in each column consistent with the supplied dtype:
+
+.. ipython:: python
+
+    from pandas.api.types import CategoricalDtype
+    df = pd.DataFrame({'A': list('abca'), 'B': list('bccd')})
+    cdt = CategoricalDtype(categories=list('abcd'), ordered=True)
+    df = df.astype(cdt)
+    df['A'].dtype
+    df['B'].dtype
+
 .. _whatsnew_0230.enhancements.other:
 
 Other Enhancements

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -18,6 +18,7 @@
     is_number,
     is_integer, is_bool,
     is_bool_dtype,
+    is_categorical_dtype,
     is_numeric_dtype,
     is_datetime64_dtype,
     is_timedelta64_dtype,
@@ -4429,14 +4430,18 @@ def astype(self, dtype, copy=True, errors='raise', **kwargs):
                 if col_name not in self:
                     raise KeyError('Only a column name can be used for the '
                                    'key in a dtype mappings argument.')
-            from pandas import concat
             results = []
             for col_name, col in self.iteritems():
                 if col_name in dtype:
                     results.append(col.astype(dtype[col_name], copy=copy))
                 else:
                     results.append(results.append(col.copy() if copy else col))
-            return concat(results, axis=1, copy=False)
+            return pd.concat(results, axis=1, copy=False)
+
+        elif is_categorical_dtype(dtype) and self.ndim > 1:
+            # GH 18099: columnwise conversion to categorical
+            results = (self[col].astype(dtype, copy=copy) for col in self)
+            return pd.concat(results, axis=1, copy=False)
 
         # else, only a single dtype is given
         new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,

diff --git a/pandas/tests/frame/test_dtypes.py b/pandas/tests/frame/test_dtypes.py
@@ -8,11 +8,11 @@
 
 import numpy as np
 from pandas import (DataFrame, Series, date_range, Timedelta, Timestamp,
-                    compat, concat, option_context)
+                    Categorical, compat, concat, option_context)
 from pandas.compat import u
 from pandas import _np_version_under1p14
 
-from pandas.core.dtypes.dtypes import DatetimeTZDtype
+from pandas.core.dtypes.dtypes import DatetimeTZDtype, CategoricalDtype
 from pandas.tests.frame.common import TestData
 from pandas.util.testing import (assert_series_equal,
                                  assert_frame_equal,
@@ -619,12 +619,21 @@ def test_astype_duplicate_col(self):
         expected = concat([a1_str, b, a2_str], axis=1)
         assert_frame_equal(result, expected)
 
-    @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
-    def test_categorical_astype_ndim_raises(self, columns):
-        # GH 18004
-        msg = '> 1 ndim Categorical are not supported at this time'
-        with tm.assert_raises_regex(NotImplementedError, msg):
-            DataFrame(columns=columns).astype('category')
+    @pytest.mark.parametrize('dtype', [
+        'category',
+        CategoricalDtype(),
+        CategoricalDtype(ordered=True),
+        CategoricalDtype(ordered=False),
+        CategoricalDtype(categories=list('abcdef')),
+        CategoricalDtype(categories=list('edba'), ordered=False),
+        CategoricalDtype(categories=list('edcb'), ordered=True)], ids=repr)
+    def test_astype_categorical(self, dtype):
+        # GH 18099
+        d = {'A': list('abbc'), 'B': list('bccd'), 'C': list('cdde')}
+        df = DataFrame(d)
+        result = df.astype(dtype)
+        expected = DataFrame({k: Categorical(d[k], dtype=dtype) for k in d})
+        tm.assert_frame_equal(result, expected)
 
     @pytest.mark.parametrize("cls", [
         pd.api.types.CategoricalDtype,