From e02f73787bfbd00ce2ad899542538dfad854eb01 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Mar 2018 09:38:32 -0600 Subject: [PATCH] DOC: add doc on ExtensionArray and extending pandas (#19936) --- doc/source/developer.rst | 43 ----- doc/source/ecosystem.rst | 35 +++++ doc/source/extending.rst | 269 ++++++++++++++++++++++++++++++++ doc/source/index.rst.template | 1 + doc/source/internals.rst | 152 +----------------- doc/source/whatsnew/v0.16.1.txt | 2 +- pandas/core/accessor.py | 6 +- 7 files changed, 312 insertions(+), 196 deletions(-) create mode 100644 doc/source/extending.rst diff --git a/doc/source/developer.rst b/doc/source/developer.rst index 0ef097da090f2..b8bb2b2fcbe2f 100644 --- a/doc/source/developer.rst +++ b/doc/source/developer.rst @@ -140,46 +140,3 @@ As an example of fully-formed metadata: 'metadata': None} ], 'pandas_version': '0.20.0'} - -.. _developer.register-accessors: - -Registering Custom Accessors ----------------------------- - -Libraries can use the decorators -:func:`pandas.api.extensions.register_dataframe_accessor`, -:func:`pandas.api.extensions.register_series_accessor`, and -:func:`pandas.api.extensions.register_index_accessor`, to add additional "namespaces" to -pandas objects. All of these follow a similar convention: you decorate a class, providing the name of attribute to add. The -class's `__init__` method gets the object being decorated. For example: - -.. code-block:: python - - @pd.api.extensions.register_dataframe_accessor("geo") - class GeoAccessor(object): - def __init__(self, pandas_obj): - self._obj = pandas_obj - - @property - def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude - return (float(lon.mean()), float(lat.mean())) - - def plot(self): - # plot this array's data on a map, e.g., using Cartopy - pass - -Now users can access your methods using the `geo` namespace: - - >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10), - ... 'latitude': np.linspace(0, 20)}) - >>> ds.geo.center - (5.0, 10.0) - >>> ds.geo.plot() - # plots data on a map - -This can be a convenient way to extend pandas objects without subclassing them. -If you write a custom accessor, make a pull request adding it to our -:ref:`ecosystem` page. diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index c770bf2851643..30cdb06b28487 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -262,3 +262,38 @@ Data validation Engarde is a lightweight library used to explicitly state your assumptions abour your datasets and check that they're *actually* true. + +.. _ecosystem.extensions: + +Extension Data Types +-------------------- + +Pandas provides an interface for defining +:ref:`extension types ` to extend NumPy's type +system. The following libraries implement that interface to provide types not +found in NumPy or pandas, which work well with pandas' data containers. + +`cyberpandas`_ +~~~~~~~~~~~~~~ + +Cyberpandas provides an extension type for storing arrays of IP Addresses. These +arrays can be stored inside pandas' Series and DataFrame. + +.. _ecosystem.accessors: + +Accessors +--------- + +A directory of projects providing +:ref:`extension accessors `. This is for users to +discover new accessors and for library authors to coordinate on the namespace. 
+
+============== ========== =========================
+Library        Accessor   Classes
+============== ========== =========================
+`cyberpandas`_ ``ip``     ``Series``
+`pdvega`_      ``vgplot`` ``Series``, ``DataFrame``
+============== ========== =========================
+
+.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
+.. _pdvega: https://jakevdp.github.io/pdvega/
diff --git a/doc/source/extending.rst b/doc/source/extending.rst
new file mode 100644
index 0000000000000..25c4ba4a4a2a3
--- /dev/null
+++ b/doc/source/extending.rst
@@ -0,0 +1,269 @@
+.. _extending:
+
+****************
+Extending Pandas
+****************
+
+While pandas provides a rich set of methods, containers, and data types, your
+needs may not be fully satisfied. Pandas offers a few options for extending its
+functionality.
+
+.. _extending.register-accessors:
+
+Registering Custom Accessors
+----------------------------
+
+Libraries can use the decorators
+:func:`pandas.api.extensions.register_dataframe_accessor`,
+:func:`pandas.api.extensions.register_series_accessor`, and
+:func:`pandas.api.extensions.register_index_accessor` to add additional
+"namespaces" to pandas objects. All of these follow a similar convention: you
+decorate a class, providing the name of the attribute to add. The class's
+``__init__`` method gets the object being decorated. For example:
+
+.. code-block:: python
+
+    @pd.api.extensions.register_dataframe_accessor("geo")
+    class GeoAccessor(object):
+        def __init__(self, pandas_obj):
+            self._obj = pandas_obj
+
+        @property
+        def center(self):
+            # return the geographic center point of this DataFrame
+            lat = self._obj.latitude
+            lon = self._obj.longitude
+            return (float(lon.mean()), float(lat.mean()))
+
+        def plot(self):
+            # plot this array's data on a map, e.g., using Cartopy
+            pass
+
+Now users can access your methods using the ``geo`` namespace:
+
+    >>> ds = pd.DataFrame({'longitude': np.linspace(0, 10),
+    ...                    'latitude': np.linspace(0, 20)})
+    >>> ds.geo.center
+    (5.0, 10.0)
+    >>> ds.geo.plot()
+    # plots data on a map
+
+This can be a convenient way to extend pandas objects without subclassing them.
+If you write a custom accessor, make a pull request adding it to our
+:ref:`ecosystem` page.
+
+.. _extending.extension-types:
+
+Extension Types
+---------------
+
+Pandas defines an interface for implementing data types and arrays that *extend*
+NumPy's type system. Pandas itself uses the extension system for some types
+that aren't built into NumPy (categorical, period, interval, datetime with
+timezone).
+
+Libraries can define a custom array and data type. When pandas encounters these
+objects, they will be handled properly (i.e. not converted to an ndarray of
+objects). Many methods like :func:`pandas.isna` will dispatch to the extension
+type's implementation.
+
+If you're building a library that implements the interface, please publicize it
+on :ref:`ecosystem.extensions`.
+
+The interface consists of two classes.
+
+``ExtensionDtype``
+^^^^^^^^^^^^^^^^^^
+
+An ``ExtensionDtype`` is similar to a ``numpy.dtype`` object. It describes the
+data type. Implementors are responsible for a few unique items like the name.
+
+One particularly important item is the ``type`` property. This should be the
+class that is the scalar type for your data. For example, if you were writing an
+extension array for IP address data, this might be ``ipaddress.IPv4Address``.
+
+See the `extension dtype source`_ for the interface definition.
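+
+As a concrete (and purely hypothetical) illustration, a minimal dtype for IPv4
+data might look like the sketch below. The class name, the ``'ipv4'`` string,
+and the import path are assumptions for illustration only; consult the
+`extension dtype source`_ for the authoritative list of required attributes and
+methods.
+
+.. code-block:: python
+
+    import ipaddress
+
+    # Assumption: ExtensionDtype is importable from the public API; on some
+    # versions it may need to be imported from ``pandas.core.dtypes.base``.
+    from pandas.api.extensions import ExtensionDtype
+
+    class IPv4Dtype(ExtensionDtype):
+        """Hypothetical dtype describing arrays of IPv4 addresses."""
+
+        # the scalar type of the individual values (the ``type`` property above)
+        type = ipaddress.IPv4Address
+        # 'O' because each value is stored as a Python object
+        kind = 'O'
+        # the string that identifies this dtype, e.g. in ``Series.dtype``
+        name = 'ipv4'
+
+        @classmethod
+        def construct_from_string(cls, string):
+            # allow ``dtype='ipv4'`` in pandas constructors
+            if string == cls.name:
+                return cls()
+            raise TypeError("Cannot construct an 'IPv4Dtype' from "
+                            "'{}'".format(string))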
+
+``ExtensionArray``
+^^^^^^^^^^^^^^^^^^
+
+This class provides all the array-like functionality. ExtensionArrays are
+limited to 1 dimension. An ExtensionArray is linked to an ExtensionDtype via the
+``dtype`` attribute.
+
+Pandas places no restrictions on how an extension array is created via its
+``__new__`` or ``__init__``, nor on how you store your data. We do require that
+your array be convertible to a NumPy array, even if this is relatively
+expensive (as it is for ``Categorical``).
+
+An ExtensionArray may be backed by zero, one, or many NumPy arrays. For example,
+``pandas.Categorical`` is an extension array backed by two arrays,
+one for codes and one for categories. An array of IPv6 addresses may
+be backed by a NumPy structured array with two fields, one for the
+lower 64 bits and one for the upper 64 bits. Or it may be backed
+by some other storage type, like Python lists.
+
+See the `extension array source`_ for the interface definition. The docstrings
+and comments contain guidance for properly implementing the interface.
+
+.. _extension dtype source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/dtypes/base.py
+.. _extension array source: https://github.com/pandas-dev/pandas/blob/master/pandas/core/arrays/base.py
+
+.. _extending.subclassing-pandas:
+
+Subclassing pandas Data Structures
+----------------------------------
+
+.. warning:: There are some easier alternatives to consider before subclassing ``pandas`` data structures.
+
+   1. Extensible method chains with :ref:`pipe <basics.pipe>`
+
+   2. Use *composition*. See `here `_.
+
+   3. Extending by :ref:`registering an accessor <extending.register-accessors>`
+
+   4. Extending by :ref:`extension type <extending.extension-types>`
+
+This section describes how to subclass ``pandas`` data structures to meet more
+specific needs. There are two points that need attention:
+
+1. Override constructor properties.
+2. Define original properties.
+
+.. note::
+
+   You can find a nice example in the `geopandas `_ project.
+
+Override Constructor Properties
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Each data structure has several *constructor properties* for returning a new
+data structure as the result of an operation. By overriding these properties,
+you can retain subclasses through ``pandas`` data manipulations.
+
+There are 3 constructor properties to be defined:
+
+- ``_constructor``: Used when a manipulation result has the same dimensions as the original.
+- ``_constructor_sliced``: Used when a manipulation result has one dimension lower than the original, such as slicing a single column of a ``DataFrame``.
+- ``_constructor_expanddim``: Used when a manipulation result has one dimension higher than the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``.
+
+The following table shows how ``pandas`` data structures define constructor properties by default.
+
+=========================== ======================= =============
+Property Attributes         ``Series``              ``DataFrame``
+=========================== ======================= =============
+``_constructor``            ``Series``              ``DataFrame``
+``_constructor_sliced``     ``NotImplementedError`` ``Series``
+``_constructor_expanddim``  ``DataFrame``           ``Panel``
+=========================== ======================= =============
+
+The example below shows how to define ``SubclassedSeries`` and
+``SubclassedDataFrame`` overriding constructor properties.
+
+.. code-block:: python
+
+    class SubclassedSeries(Series):
+
+        @property
+        def _constructor(self):
+            return SubclassedSeries
+
+        @property
+        def _constructor_expanddim(self):
+            return SubclassedDataFrame
+
+    class SubclassedDataFrame(DataFrame):
+
+        @property
+        def _constructor(self):
+            return SubclassedDataFrame
+
+        @property
+        def _constructor_sliced(self):
+            return SubclassedSeries
+
+.. code-block:: python
+
+    >>> s = SubclassedSeries([1, 2, 3])
+    >>> type(s)
+    <class '__main__.SubclassedSeries'>
+
+    >>> to_framed = s.to_frame()
+    >>> type(to_framed)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> df = SubclassedDataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df
+       A  B  C
+    0  1  4  7
+    1  2  5  8
+    2  3  6  9
+
+    >>> type(df)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> sliced1 = df[['A', 'B']]
+    >>> sliced1
+       A  B
+    0  1  4
+    1  2  5
+    2  3  6
+    >>> type(sliced1)
+    <class '__main__.SubclassedDataFrame'>
+
+    >>> sliced2 = df['A']
+    >>> sliced2
+    0    1
+    1    2
+    2    3
+    Name: A, dtype: int64
+    >>> type(sliced2)
+    <class '__main__.SubclassedSeries'>
+
+Define Original Properties
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+To let original data structures have additional properties, you should let
+``pandas`` know what properties are added. ``pandas`` maps unknown properties
+to data names by overriding ``__getattribute__``. Defining original properties
+can be done in one of 2 ways:
+
+1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results.
+2. Define ``_metadata`` for normal properties which will be passed to manipulation results.
+
+Below is an example defining two original properties, "internal_cache" as a
+temporary property and "added_property" as a normal property.
+
+.. code-block:: python
+
+    class SubclassedDataFrame2(DataFrame):
+
+        # temporary properties
+        _internal_names = pd.DataFrame._internal_names + ['internal_cache']
+        _internal_names_set = set(_internal_names)
+
+        # normal properties
+        _metadata = ['added_property']
+
+        @property
+        def _constructor(self):
+            return SubclassedDataFrame2
+
+.. code-block:: python
+
+    >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
+    >>> df
+       A  B  C
+    0  1  4  7
+    1  2  5  8
+    2  3  6  9
+
+    >>> df.internal_cache = 'cached'
+    >>> df.added_property = 'property'
+
+    >>> df.internal_cache
+    cached
+    >>> df.added_property
+    property
+
+    # properties defined in _internal_names are reset after manipulation
+    >>> df[['A', 'B']].internal_cache
+    AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache'
+
+    # properties defined in _metadata are retained
+    >>> df[['A', 'B']].added_property
+    property
diff --git a/doc/source/index.rst.template b/doc/source/index.rst.template
index cb6cce5edaf79..1ef88a524732f 100644
--- a/doc/source/index.rst.template
+++ b/doc/source/index.rst.template
@@ -157,5 +157,6 @@ See the package overview for more detail about what's in the library.
 {% if not single_doc -%}
    developer
    internals
+   extending
    release
 {% endif -%}
diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 957f82fd9eba7..b120e3a98db7f 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -15,7 +15,8 @@ Internals
 *********
 
-This section will provide a look into some of pandas internals.
+This section will provide a look into some of pandas internals. It's primarily
+intended for developers of pandas itself.
 
 Indexing
 --------
@@ -107,156 +108,9 @@ containers (``Index`` classes and ``Series``) we have the following convention:
 So, for example, ``Series[category]._values`` is a ``Categorical``, while
 ``Series[category]._ndarray_values`` is the underlying codes.
 
-
 ..
_ref-subclassing-pandas: Subclassing pandas Data Structures ---------------------------------- -.. warning:: There are some easier alternatives before considering subclassing ``pandas`` data structures. - - 1. Extensible method chains with :ref:`pipe ` - - 2. Use *composition*. See `here `_. - - 3. Extending by :ref:`registering an accessor ` - -This section describes how to subclass ``pandas`` data structures to meet more specific needs. There are 2 points which need attention: - -1. Override constructor properties. -2. Define original properties - -.. note:: You can find a nice example in `geopandas `_ project. - -Override Constructor Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Each data structure has constructor properties to specifying data constructors. By overriding these properties, you can retain defined-classes through ``pandas`` data manipulations. - -There are 3 constructors to be defined: - -- ``_constructor``: Used when a manipulation result has the same dimesions as the original. -- ``_constructor_sliced``: Used when a manipulation result has one lower dimension(s) as the original, such as ``DataFrame`` single columns slicing. -- ``_constructor_expanddim``: Used when a manipulation result has one higher dimension as the original, such as ``Series.to_frame()`` and ``DataFrame.to_panel()``. - -Following table shows how ``pandas`` data structures define constructor properties by default. - -=========================== ======================= =================== ======================= -Property Attributes ``Series`` ``DataFrame`` ``Panel`` -=========================== ======================= =================== ======================= -``_constructor`` ``Series`` ``DataFrame`` ``Panel`` -``_constructor_sliced`` ``NotImplementedError`` ``Series`` ``DataFrame`` -``_constructor_expanddim`` ``DataFrame`` ``Panel`` ``NotImplementedError`` -=========================== ======================= =================== ======================= - -Below example shows how to define ``SubclassedSeries`` and ``SubclassedDataFrame`` overriding constructor properties. - -.. code-block:: python - - class SubclassedSeries(Series): - - @property - def _constructor(self): - return SubclassedSeries - - @property - def _constructor_expanddim(self): - return SubclassedDataFrame - - class SubclassedDataFrame(DataFrame): - - @property - def _constructor(self): - return SubclassedDataFrame - - @property - def _constructor_sliced(self): - return SubclassedSeries - -.. code-block:: python - - >>> s = SubclassedSeries([1, 2, 3]) - >>> type(s) - - - >>> to_framed = s.to_frame() - >>> type(to_framed) - - - >>> df = SubclassedDataFrame({'A', [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> type(df) - - - >>> sliced1 = df[['A', 'B']] - >>> sliced1 - A B - 0 1 4 - 1 2 5 - 2 3 6 - >>> type(sliced1) - - - >>> sliced2 = df['A'] - >>> sliced2 - 0 1 - 1 2 - 2 3 - Name: A, dtype: int64 - >>> type(sliced2) - - -Define Original Properties -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -To let original data structures have additional properties, you should let ``pandas`` know what properties are added. ``pandas`` maps unknown properties to data names overriding ``__getattribute__``. Defining original properties can be done in one of 2 ways: - -1. Define ``_internal_names`` and ``_internal_names_set`` for temporary properties which WILL NOT be passed to manipulation results. -2. Define ``_metadata`` for normal properties which will be passed to manipulation results. 
- -Below is an example to define 2 original properties, "internal_cache" as a temporary property and "added_property" as a normal property - -.. code-block:: python - - class SubclassedDataFrame2(DataFrame): - - # temporary properties - _internal_names = pd.DataFrame._internal_names + ['internal_cache'] - _internal_names_set = set(_internal_names) - - # normal properties - _metadata = ['added_property'] - - @property - def _constructor(self): - return SubclassedDataFrame2 - -.. code-block:: python - - >>> df = SubclassedDataFrame2({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}) - >>> df - A B C - 0 1 4 7 - 1 2 5 8 - 2 3 6 9 - - >>> df.internal_cache = 'cached' - >>> df.added_property = 'property' - - >>> df.internal_cache - cached - >>> df.added_property - property - - # properties defined in _internal_names is reset after manipulation - >>> df[['A', 'B']].internal_cache - AttributeError: 'SubclassedDataFrame2' object has no attribute 'internal_cache' - - # properties defined in _metadata are retained - >>> df[['A', 'B']].added_property - property +This section has been moved to :ref:`extending.subclassing-pandas`. diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index b1e8aa10457f8..9e1dc391d7ace 100644 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -313,7 +313,7 @@ Other Enhancements - Add/delete ``str/dt/cat`` accessors dynamically from ``__dir__``. (:issue:`9910`) - Add ``normalize`` as a ``dt`` accessor method. (:issue:`10047`) -- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` +- ``DataFrame`` and ``Series`` now have ``_constructor_expanddim`` property as overridable constructor for one higher dimensionality data. This should be used only when it is really needed, see :ref:`here ` - ``pd.lib.infer_dtype`` now returns ``'bytes'`` in Python 3 where appropriate. (:issue:`10032`) diff --git a/pandas/core/accessor.py b/pandas/core/accessor.py index 96bf628c8d7ff..06c4068f86bfe 100644 --- a/pandas/core/accessor.py +++ b/pandas/core/accessor.py @@ -191,9 +191,9 @@ def __init__(self, pandas_obj): @property def center(self): - # return the geographic center point of this DataFarme - lon = self._obj.latitude - lat = self._obj.longitude + # return the geographic center point of this DataFrame + lat = self._obj.latitude + lon = self._obj.longitude return (float(lon.mean()), float(lat.mean())) def plot(self):