From e02f73787bfbd00ce2ad899542538dfad854eb01 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Mar 2018 09:38:32 -0600 Subject: [PATCH] DOC: add doc on ExtensionArray and extending pandas (#19936) --- doc/source/developer.rst | 43 ----- doc/source/ecosystem.rst | 35 +++++ doc/source/extending.rst | 269 ++++++++++++++++++++++++++++++++ doc/source/index.rst.template | 1 + doc/source/internals.rst | 152 +----------------- doc/source/whatsnew/v0.16.1.txt | 2 +- pandas/core/accessor.py | 6 +- 7 files changed, 312 insertions(+), 196 deletions(-) create mode 100644 doc/source/extending.rst diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 0ef097da090f2..b8bb2b2fcbe2f 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -140,46 +140,3 @@ As an example of fully-formed metadata: 'metadata': None} ], 'pandas_version': '0.20.0'} - -.. _developer.register-accessors: - -Registering Custom Accessors ----------------------------- - -Libraries can use the decorators -:func:`pandas.api.extensions.register_dataframe_accessor`, -:func:`pandas.api.extensions.register_series_accessor`, and -:func:`pandas.api.extensions.register_index_accessor`, to add additional "namespaces" to -pandas objects. All of these follow a similar convention: you decorate a class, providing the name of attribute to add. The -class's `__init__` method gets the object being decorated. For example: - -.. code-block:: python - - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor(object): - def __init__(self, pandas_obj): - self._obj = pandas_obj - - @property - def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) - - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass - -Now users can access your methods using the `geo` namespace: - - >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) - >>> ds.geo.center - (5.0, 10.0) - >>> ds.geo.plot() - # plots data on a map - -This can be a convenient way to extend pandas objects without subclassing them. -If you write a custom accessor, make a pull request adding it to our -:ref:`ecosystem` page. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index c770bf2851643..30cdb06b28487 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -262,3 +262,38 @@ Data validation Engarde is a lightweight library used to explicitly state your assumptions abour your datasets and check that they're *actually* true. + +.. _ecosystem.extensions: + +Extension Data Types +-------------------- + +Pandas provides an interface for defining +:ref:`extension types ` to extend NumPy's type +system. The following libraries implement that interface to provide types not +found in NumPy or pandas, which work well with pandas' data containers. + +`cyberpandas`_ +~~~~~~~~~~~~~~ + +Cyberpandas provides an extension type for storing arrays of IP Addresses. These +arrays can be stored inside pandas' Series and DataFrame. + +.. _ecosystem.accessors: + +Accessors +--------- + +A directory of projects providing +:ref:`extension accessors `. This is for users to +discover new accessors and for library authors to coordinate on the namespace. 
+
+============== ========== =========================
+Library        Accessor   Classes
+============== ========== =========================
+`cyberpandas`_ ``ip``     ``Series``
+`pdvega`_      ``vgplot`` ``Series``, ``DataFrame``
+============== ========== =========================
+
+.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
+.. _pdvega: https://jakevdp.github.io/pdvega/
diff --git a/doc/source/extending.rst b/doc/source/extending.rst
new file mode 100644
index 0000000000000..25c4ba4a4a2a3
--- /dev/null
+++ b/doc/source/extending.rst
@@ -0,0 +1,269 @@
+.. _extending:
+
+****************
+Extending Pandas
+****************
+
+While pandas provides a rich set of methods, containers, and data types, your
+needs may not be fully satisfied. Pandas offers a few options for extending its
+functionality.
+
+.. _extending.register-accessors:
+
+Registering Custom Accessors
+----------------------------
+
+Libraries can use the decorators
+:func:`pandas.api.extensions.register_dataframe_accessor`,
+:func:`pandas.api.extensions.register_series_accessor`, and
+:func:`pandas.api.extensions.register_index_accessor` to add additional
+"namespaces" to pandas objects. All of these follow a similar convention: you
+decorate a class, providing the name of the attribute to add. The class's
+``__init__`` method gets the object being decorated. For example:
+
+.. code-block:: python
+
+    @pd.api.extensions.register_dataframe_accessor("geo")
+    class GeoAccessor(object):
+        def __init__(self, pandas_obj):
+            self._obj = pandas_obj
+
+        @property
+        def center(self):
+            # return the geographic center point of this DataFrame
+            lat = self._obj.latitude
+            lon = self._obj.longitude
+            return (float(lon.mean()), float(lat.mean()))
+
+        def plot(self):
+            # plot this array's data on a map, e.g., using Cartopy
+            pass
+
+Now users can access your methods using the ``geo`` namespace:
+
+    >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
+    ...                    'latitude': np.linspace(0, 20)})
+    >>> ds.geo.center
+    (5.0, 10.0)
+    >>> ds.geo.plot()
+    # plots data on a map
+
+This can be a convenient way to extend pandas objects without subclassing them.
+If you write a custom accessor, make a pull request adding it to our
+:ref:`ecosystem` page.
+
+.. _extending.extension-types:
+
+Extension Types
+---------------
+
+Pandas defines an interface for implementing data types and arrays that *extend*
+NumPy's type system. Pandas itself uses the extension system for some types
+that aren't built into NumPy (categorical, period, interval, datetime with
+timezone).
+
+Libraries can define a custom array and data type. When pandas encounters these
+objects, they will be handled properly (i.e. not converted to an ndarray of
+objects). Many methods like :func:`pandas.isna` will dispatch to the extension
+type's implementation.
+
+If you're building a library that implements the interface, please publicize it
+on :ref:`ecosystem.extensions`.
+
+The interface consists of two classes.
+
+``ExtensionDtype``
+^^^^^^^^^^^^^^^^^^
+
+An ``ExtensionDtype`` is similar to a ``numpy.dtype`` object. It describes the
+data type. Implementors are responsible for a few unique items like the name.
+
+One particularly important item is the ``type`` property. This should be the
+class that is the scalar type for your data. For example, if you were writing an
+extension array for IP address data, this might be ``ipaddress.IPv4Address``.
+
+See the `extension dtype source`_ for the interface definition.
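+
+As a concrete (and purely hypothetical) illustration, a minimal dtype for IPv4
+data might look like the sketch below. The class name, the ``'ipv4'`` string,
+and the import path are assumptions for illustration only; consult the
+`extension dtype source`_ for the authoritative list of required attributes and
+methods.
+
+.. code-block:: python
+
+    import ipaddress
+
+    # Assumption: ExtensionDtype is importable from the public API; on some
+    # versions it may need to be imported from ``pandas.core.dtypes.base``.
+    from pandas.api.extensions import ExtensionDtype
+
+    class IPv4Dtype(ExtensionDtype):
+        """Hypothetical dtype describing arrays of IPv4 addresses."""
+
+        # the scalar type of the individual values (the ``type`` property above)
+        type = ipaddress.IPv4Address
+        # 'O' because each value is stored as a Python object
+        kind = 'O'
+        # the string that identifies this dtype, e.g. in ``Series.dtype``
+        name = 'ipv4'
+
+        @classmethod
+        def construct_from_string(cls, string):
+            # allow ``dtype='ipv4'`` in pandas constructors
+            if string == cls.name:
+                return cls()
+            raise TypeError("Cannot construct an 'IPv4Dtype' from "
+                            "'{}'".format(string))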
+
+``ExtensionArray``
+^^^^^^^^^^^^^^^^^^
+
+This class provides all the array-like functionality. ExtensionArrays are
+limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the
+``dtype`` attribute.
+
+Pandas places no restrictions on how an extension array is created via its
+``__new__`` or ``__init__``, nor on how you store your data. We do require that
+your array be convertible to a NumPy array, even if this is relatively
+expensive (as it is for ``Categorical``).
+
+An ExtensionArray may be backed by zero, one, or many NumPy arrays. For example,
+``pandas.Categorical`` is an extension array backed by two arrays,
+one for codes and one for categories. An array of IPv6 addresses may
+be backed by a NumPy structured array with two fields, one for the
+lower 64 bits and one for the upper 64 bits. Or it may be backed
+by some other storage type, like Python lists.
+
+See the `extension array source`_ for the interface definition. The docstrings
+and comments contain guidance for properly implementing the interface.
+
+.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
+.. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
+
+.. _extending.subclassing-pandas:
+
+Subclassing pandas Data Structures
+----------------------------------
+
+.. warning:: There are some easier alternatives to consider before subclassing ``pandas`` data structures.
+
+   1. Extensible method chains with :ref:`pipe <basics.pipe>`
+
+   2. Use *composition*. See `here `_.
+
+   3. Extending by :ref:`registering an accessor <extending.register-accessors>`
+
+   4. Extending by :ref:`extension type <extending.extension-types>`
+
+This section describes how to subclass ``pandas`` data structures to meet more
+specific needs. There are two points that need attention:
+
+1. Override constructor properties.
+2. Define original properties.
+
+.. note::
+
+   You can find a nice example in the `geopandas `_ project.
+
+Override Constructor Properties
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Each data structure has several *constructor properties* for returning a new
+data structure as the result of an operation. By overriding these properties,
+you can retain subclasses through ``pandas`` data manipulations.
+
+There are 3 constructor properties to be defined:
+
+- ``_constructor``: Used when a manipulation result has the same dimensions as the original.
+- ``_constructor_sliced``: Used when a manipulation result has one dimension lower than the original, such as slicing a single column of a ``DataFrame``.
+- ``_constructor_expanddim``: Used when a manipulation result has one dimension higher than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
+
+The following table shows how ``pandas`` data structures define constructor properties by default.
+
+=========================== ======================= =============
+Property Attributes         ``Series``              ``DataFrame``
+=========================== ======================= =============
+``_constructor``            ``Series``              ``DataFrame``
+``_constructor_sliced``     ``NotImplementedError`` ``Series``
+``_constructor_expanddim``  ``DataFrame``           ``Panel``
+=========================== ======================= =============
+
+The example below shows how to define ``SubclassedSeries`` and
+``SubclassedDataFrame`` overriding constructor properties.
+
+.. code-block:: python
+
+    class SubclassedSeries(Series):
+
+        @property
+        def _constructor(self):
+            return SubclassedSeries
+
+        @property
+        def _constructor_expanddim(self):
+            return SubclassedDataFrame
+
+    class SubclassedDataFrame(DataFrame):
+
+        @property
+        def _constructor(self):
+            return SubclassedDataFrame
+
+        @property
+        def _constructor_sliced(self):
+            return SubclassedSeries
+
+.. code-block:: python
+
+    >>> s = SubclassedSeries([1, 2, 3])
+    >>> type(s)
+    <class '__main__.SubclassedSeries'>
+
+    >>> to_framed = s.to_frame()
+    >>> type(to_framed)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df
+       A  B  C
+    0  1  4  7
+    1  2  5  8
+    2  3  6  9
+
+    >>> type(df)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> sliced1 = df[['A', 'B']]
+    >>> sliced1
+       A  B
+    0  1  4
+    1  2  5
+    2  3  6
+    >>> type(sliced1)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> sliced2 = df['A']
+    >>> sliced2
+    0    1
+    1    2
+    2    3
+    Name: A, dtype: int64
+    >>> type(sliced2)
+    <class '__main__.SubclassedSeries'>
+
+Define Original Properties
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To let original data structures have additional properties, you should let
+``pandas`` know what properties are added. ``pandas`` maps unknown properties
+to data names by overriding ``__getattribute__``. Defining original properties
+can be done in one of 2 ways:
+
+1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
+2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
+
+Below is an example defining two original properties, "internal_cache" as a
+temporary property and "added_property" as a normal property.
+
+.. code-block:: python
+
+    class SubclassedDataFrame2(DataFrame):
+
+        # temporary properties
+        _internal_names = pd.DataFrame._internal_names + ['internal_cache']
+        _internal_names_set = set(_internal_names)
+
+        # normal properties
+        _metadata = ['added_property']
+
+        @property
+        def _constructor(self):
+            return SubclassedDataFrame2
+
+.. code-block:: python
+
+    >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df
+       A  B  C
+    0  1  4  7
+    1  2  5  8
+    2  3  6  9
+
+    >>> df.internal_cache = 'cached'
+    >>> df.added_property = 'property'
+
+    >>> df.internal_cache
+    cached
+    >>> df.added_property
+    property
+
+    # properties defined in _internal_names are reset after manipulation
+    >>> df[['A', 'B']].internal_cache
+    AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
+
+    # properties defined in _metadata are retained
+    >>> df[['A', 'B']].added_property
+    property
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index cb6cce5edaf79..1ef88a524732f 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -157,5 +157,6 @@ See the package overview for more detail about what's in the library.
 {% if not single_doc -%}
    developer
    internals
+   extending
    release
 {% endif -%}
diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 957f82fd9eba7..b120e3a98db7f 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -15,7 +15,8 @@ Internals
 *********
 
-This section will provide a look into some of pandas internals.
+This section will provide a look into some of pandas internals. It's primarily
+intended for developers of pandas itself.
 
 Indexing
 --------
@@ -107,156 +108,9 @@ containers (``Index`` classes and ``Series``) we have the following convention:
 So, for example, ``Series[category]._values`` is a ``Categorical``, while
 ``Series[category]._ndarray_values`` is the underlying codes.
 
-
 ..
_ref-subclassing-pandas: Subclassing pandas Data Structures ---------------------------------- -.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. - - 1. Extensible method chains with :ref:`pipe ` - - 2. Use *composition*. See `here `_. - - 3. Extending by :ref:`registering an accessor ` - -This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention: - -1. Override constructor properties. -2. Define original properties - -.. note:: You can find a nice example in `geopandas `_ project. - -Override Constructor Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Each data structure has constructor properties to specifying data constructors. By overriding these properties, you can retain defined-classes through ``pandas`` data manipulations. - -There are 3 constructors to be defined: - -- ``_constructor``: Used when a manipulation result has the same dimesions as the original. -- ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. - -Following table shows how ``pandas`` data structures define constructor properties by default. - -=========================== ======================= =================== ======================= -Property Attributes ``Series`` ``DataFrame`` ``Panel`` -=========================== ======================= =================== ======================= -``_constructor`` ``Series`` ``DataFrame`` ``Panel`` -``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` -``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` -=========================== ======================= =================== ======================= - -Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. - -.. code-block:: python - - class SubclassedSeries(Series): - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - class SubclassedDataFrame(DataFrame): - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - -.. code-block:: python - - >>> s = SubclassedSeries([1, 2, 3]) - >>> type(s) - - - >>> to_framed = s.to_frame() - >>> type(to_framed) - - - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> type(df) - - - >>> sliced1 = df[['A', 'B']] - >>> sliced1 - A B - 0 1 4 - 1 2 5 - 2 3 6 - >>> type(sliced1) - - - >>> sliced2 = df['A'] - >>> sliced2 - 0 1 - 1 2 - 2 3 - Name: A, dtype: int64 - >>> type(sliced2) - - -Define Original Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: - -1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. -2. Define ``_metadata`` for normal properties which will be passed to manipulation results. 
- -Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property - -.. code-block:: python - - class SubclassedDataFrame2(DataFrame): - - # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] - _internal_names_set = set(_internal_names) - - # normal properties - _metadata = ['added_property'] - - @property - def _constructor(self): - return SubclassedDataFrame2 - -.. code-block:: python - - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' - - >>> df.internal_cache - cached - >>> df.added_property - property - - # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache - AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' - - # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property - property +This section has been moved to :ref:`extending.subclassing-pandas`. diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index b1e8aa10457f8..9e1dc391d7ace 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -313,7 +313,7 @@ Other Enhancements - Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) - Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) -- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` - ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 96bf628c8d7ff..06c4068f86bfe 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -191,9 +191,9 @@ def __init__(self, pandas_obj): @property def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude return (float(lon.mean()), float(lat.mean())) def plot(self):