# Extending pandas types: Pandas Extension Array support

In [1]:
import numpy as np
import pandas as pd

## Packages that already use the ExtensionArray interface

* [cyberpandas](https://github.com/ContinuumIO/cyberpandas): IP Address dtype and array for pandas 
* [fletcher](https://github.com/xhochy/fletcher): Pandas ExtensionDtype/Array backed by Apache Arrow
* [pint-pandas](https://github.com/hgrecco/pint-pandas): Pandas support for pint (physical quantities in Python)
* [GeoPandas](https://github.com/geopandas/geopandas): Pandas support for geographic vector data (ExtensionArray support is currently in a PR, not in master or release)
* ... ?

Some examples

In [2]:
import fletcher
import cyberpandas

In [3]:
# Build a demo frame with one column per extension type:
#   - 'integer':     pandas' nullable 'Int64' dtype (the np.nan entry becomes a missing value)
#   - 'ipaddresses': a cyberpandas IPArray built from IPv4 address strings
#   - 'list_column': a fletcher array of integer lists, with None as a missing entry
#   - 'str_column':  a fletcher array of strings, with None as a missing entry
df = pd.DataFrame({
    'integer': pd.array([1, 2, np.nan, 4], dtype='Int64'),
    'ipaddresses': cyberpandas.IPArray(['192.168.1.1', '192.168.1.10', '192.168.1.10', '0.0.0.0']),
    'list_column': fletcher.FletcherArray([[1, 2], [3, 4], [3, 4, 5], None]),
    'str_column': fletcher.FletcherArray(['Test', None, 'Strings', 'Native']),
})

In [4]:
df

Unnamed: 0,integer,ipaddresses,list_column,str_column
0,1.0,192.168.1.1,[1 2],Test
1,2.0,192.168.1.10,[3 4],
2,,192.168.1.10,[3 4 5],Strings
3,4.0,0.0.0.0,,Native


In [5]:
df.dtypes

integer                              Int64
ipaddresses                             ip
list_column    fletcher[list<item: int64>]
str_column                fletcher[string]
dtype: object

In [6]:
# Missing-value mask for every column; note in the output that the IP column
# reports the '0.0.0.0' entry (row 3) as missing.
df.isna()

Unnamed: 0,integer,ipaddresses,list_column,str_column
0,False,False,False,False
1,False,False,False,True
2,True,False,False,False
3,False,True,True,False


In [7]:
# `.ip` is a Series accessor for IP-specific operations
# (presumably registered by cyberpandas on import -- confirm in its docs).
df.ipaddresses.ip.is_ipv6

0    False
1    False
2    False
3    False
Name: ipaddresses, dtype: bool

In [8]:
ExtensionDtype??

Object `ExtensionDtype` not found.


In [9]:
# Group on the extension-typed IP column. The '0.0.0.0' row does not appear in
# the result, consistent with it being flagged as missing by `df.isna()`.
df.groupby('ipaddresses').size()

ipaddresses
192.168.1.1     1
192.168.1.10    2
dtype: int64

## What does it look like? A short demo

In [10]:
from pandas.api.extensions import ExtensionDtype, ExtensionArray

An `ExtensionDtype` subclass to describe the data type (the name, the scalar type):

In [11]:
from shapely.geometry.base import BaseGeometry

class GeometryDtype(ExtensionDtype):
    """Extension dtype describing a column of shapely geometries.

    Provides the pieces pandas needs to identify the type: its string
    ``name``, the scalar ``type`` of the elements, the array class that
    stores the data, and a way to build the dtype from its string name.
    """

    # Plain class attribute rather than a property: `construct_from_string`
    # compares against `cls.name`, and a property looked up on the class is the
    # property object itself, which never equals a string.
    name = "my-geometry-type"

    @property
    def type(self):
        """The scalar type"""
        return BaseGeometry

    @classmethod
    def construct_array_type(cls):
        """Return the array class associated with this dtype.

        The name is resolved when the method is called, so ``GeometryArray``
        may be defined after this class.
        """
        return GeometryArray

    @classmethod
    def construct_from_string(cls, string):
        """Build the dtype from its string name.

        Parameters
        ----------
        string : str
            Candidate dtype name.

        Returns
        -------
        GeometryDtype
            A new instance when ``string`` equals ``cls.name``.

        Raises
        ------
        TypeError
            If ``string`` is anything other than this dtype's name.
        """
        if string == cls.name:
            return cls()
        else:
            raise TypeError("Cannot construct a '{}' from "
                            "'{}'".format(cls, string))

An `ExtensionArray` subclass that stores the data and defines a required set of methods:

In [12]:
class GeometryArray(ExtensionArray):
    """Minimal ExtensionArray storing geometries in ``self._data``.

    Missing values are represented by ``None``.

    NOTE(review): pandas' ExtensionArray documentation also lists ``take``,
    ``copy`` and ``nbytes`` as required; they are not implemented in this
    short demo.
    """

    def __init__(self, geoms):
        """Wrap ``geoms`` (stored as-is, without copying or validation)."""
        self._data = geoms

    @property
    def dtype(self):
        """The dtype instance describing this array."""
        return GeometryDtype()

    def __len__(self):
        """Number of elements in the array."""
        return len(self._data)

    def __getitem__(self, key):
        """Return a scalar for an integer key, otherwise a new array.

        NumPy integer scalars are treated like Python ints so that positional
        access with e.g. ``np.int64`` also yields a scalar.
        """
        if isinstance(key, (int, np.integer)):
            return self._data[key]
        return type(self)(self._data[key])

    def __array__(self, dtype=None, copy=None):
        """Expose the data to NumPy.

        ``dtype`` and ``copy`` are accepted because NumPy may pass them when
        converting (``np.asarray(arr, dtype=...)``).
        """
        if copy:
            return np.array(self._data, dtype=dtype, copy=True)
        return np.asarray(self._data, dtype=dtype)

    def isna(self):
        """Boolean mask that is True where the element is ``None``."""
        return np.array([val is None for val in self._data], dtype=bool)

    @classmethod
    def _from_sequence(cls, scalars, dtype=None, copy=False):
        """Construct a new array from a sequence of scalars.

        Parameters
        ----------
        scalars : sequence
            Geometries (or ``None`` for missing values).
        dtype : optional
            Accepted for interface compatibility; not used.
        copy : bool, default False
            Whether to force a copy of ``scalars``.

        Returns
        -------
        GeometryArray
        """
        if copy:
            data = np.array(scalars, dtype=object, copy=True)
        else:
            data = np.asarray(scalars, dtype=object)
        return cls(data)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """Concatenate several arrays of this type into one."""
        data = np.concatenate([v._data for v in to_concat])
        return cls(data)

This can now be stored in a Series / DataFrame, and normal operations work:

In [13]:
from shapely.geometry import Point

# Wrap an object-dtype NumPy array of shapely Points (None marks a missing
# geometry) in the custom GeometryArray and use it as a regular column.
df = pd.DataFrame({
    'a': [1, 2, 3],
    'geoms': GeometryArray(np.array([Point(0, 0), None, Point(2, 2)], dtype=object))})

In [14]:
type(df)

pandas.core.frame.DataFrame

In [15]:
df

Unnamed: 0,a,geoms
0,1,POINT (0 0)
1,2,
2,3,POINT (2 2)


In [16]:
df.dtypes

a                   int64
geoms    my-geometry-type
dtype: object

In [17]:
# The None entry is reported as missing, matching GeometryArray.isna().
df['geoms'].isna()

0    False
1     True
2    False
Name: geoms, dtype: bool

We could now also define a geometry accessor:

In [18]:
@pd.api.extensions.register_series_accessor("geo")
class GeoAccessor(object):
    """Series accessor, available as ``.geo``, for geometry-typed Series."""

    def __init__(self, pandas_obj):
        # Reject Series of any other dtype before keeping a reference.
        self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        """Raise AttributeError unless ``obj`` has the geometry dtype."""
        expected_dtype = GeometryDtype()
        if obj.dtype != expected_dtype:
            raise AttributeError("Must have a geometry dtype.")

    @property
    def x(self):
        """x-coordinate of every geometry as a Series; NaN where it is None."""
        series = self._obj
        coords = []
        for geom in series.array._data:
            if geom is None:
                coords.append(np.nan)
            else:
                coords.append(geom.x)
        return pd.Series(np.array(coords), index=series.index)

In [19]:
# x-coordinate of each point via the custom `geo` accessor; the missing
# geometry yields NaN.
df['geoms'].geo.x

0    0.0
1    NaN
2    2.0
dtype: float64