# Pandas 2.x can use PyArrow as a backend to overcome some of NumPy's limitations

In [19]:
# Required imports
import pandas as pd
import pyarrow as pa
import datetime as dt

## Comparing pandas 1.x (relying on NumPy) and pandas 2.x (pyarrow)

### Integers

In [2]:
# Three integer lists: small values, progressively larger values
# (the last two exceed the int64 range) and values with a missing entry
small_values = [1, 99, 127]
large_values = [2**exponent for exponent in (31, 63, 100)]
missing_values = [None, 1, -45]

In [3]:
# With no dtype given, pandas infers int64 for a list of plain Python ints
small_ser = pd.Series(data=small_values)
small_ser

0      1
1     99
2    127
dtype: int64

In [5]:
# Every value fits within int8 (maximum 127), so the series can be downcast
small_ser.astype(dtype="int8")

0      1
1     99
2    127
dtype: int8

In [6]:
# The dtype can equally be requested when the series is constructed
small_ser = pd.Series(data=small_values, dtype="int8")
small_ser

0      1
1     99
2    127
dtype: int8

In [7]:
# Appending "[pyarrow]" to the dtype name selects the PyArrow-backed type
small_ser_pa = pd.Series(data=small_values, dtype="int8[pyarrow]")
small_ser_pa

0      1
1     99
2    127
dtype: int8[pyarrow]

In [12]:
# Oversized integers do not raise here: the NumPy-backed series falls back to object dtype
large_ser = pd.Series(data=large_values)
large_ser

0                         2147483648
1                9223372036854775808
2    1267650600228229401496703205376
dtype: object

In [10]:
# PyArrow does not fall back to object dtype, so values outside the int64 range raise.
# (The same would happen if a plain int64 dtype were requested.)
# The expected error is caught and printed so that Restart & Run All can
# continue past this demonstration instead of halting here.
try:
    large_ser_pa = pd.Series(large_values, dtype="int64[pyarrow]")
    print(large_ser_pa)
except OverflowError as err:
    print(f"{type(err).__name__}: {err}")

OverflowError: Python int too large to convert to C long

In [13]:
# NumPy integers cannot represent missing values, so the None
# forces the whole series to float64 (the gap becomes NaN)
missing_ser = pd.Series(data=missing_values)
missing_ser

0     NaN
1     1.0
2   -45.0
dtype: float64

The point above means that whenever we are dealing with a NumPy int series (int64, for example),
we know for sure that it contains no missing values, while we cannot say the same for
floating-point series. Note, however, that this does not guarantee an int series is completely
correct: missing values might be hidden behind a sentinel value (-1, 999 or similar).

In [14]:
# PyArrow integer types can hold missing values (displayed as <NA>)
missing_ser_pa = pd.Series(data=missing_values, dtype="int8[pyarrow]")
missing_ser_pa

0    <NA>
1       1
2     -45
dtype: int8[pyarrow]

In [15]:
# Changing dtype deserves care, especially when casting
# from a wider type down to a narrower one
medium_values = [2**15, 2**31 - 8, 2**63]
medium_ser = pd.Series(data=medium_values)
medium_ser

0                  32768
1             2147483640
2    9223372036854775808
dtype: uint64

In [16]:
# NumPy silently wraps the out-of-range values around instead of raising
medium_ser.astype(dtype="int8")

0    0
1   -8
2    0
dtype: int8

In [17]:
# The same narrowing cast overflows, but unlike NumPy, PyArrow raises an error.
# The expected error is caught and printed so that Restart & Run All can
# continue past this demonstration instead of halting here.
try:
    print(medium_ser.astype("int8[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Integer value 32768 not in range: 0 to 127

### Floating point

In [18]:
# Three float lists this time: fully populated, one with a
# missing entry, and one that mixes a text marker in with the numbers
float_vals = [1.5, 2.7, 127.0]
float_missing = [None, 1.5, -45.0]
float_rain = [
    1.5, 2.7, 0.0,
    "T",
    1.5, 0,
]

In [19]:
# pandas 1.x behaviour: a NumPy-backed float64 series
pd.Series(data=float_vals)

0      1.5
1      2.7
2    127.0
dtype: float64

In [20]:
# The missing entry is stored as NaN in the NumPy-backed float64 series
pd.Series(data=float_missing)

0     NaN
1     1.5
2   -45.0
dtype: float64

In [22]:
# pandas 2.x behaviour: a PyArrow-backed double-precision (64-bit) float.
# "double[pyarrow]" is an alternative spelling of the same dtype.
pd.Series(data=float_vals, dtype="float64[pyarrow]")

0      1.5
1      2.7
2    127.0
dtype: double[pyarrow]

In [23]:
# The missing entry is displayed as <NA> rather than NaN.
# NOTE: "float[pyarrow]" is Arrow's 32-bit float, not the 64-bit double used
# in the previous cell; use "float64[pyarrow]" / "double[pyarrow]" for 64 bits.
pd.Series(data=float_missing, dtype="float[pyarrow]")

0    <NA>
1     1.5
2   -45.0
dtype: float[pyarrow]

In [24]:
# Now the list that mixes numbers with a text marker
pd.Series(data=float_rain)

0    1.5
1    2.7
2    0.0
3      T
4    1.5
5      0
dtype: object

NumPy changes the type to object to account for the string. In this case, we have to convert it to a number manually.
Note that the replacement can itself be a numeric string: NumPy will parse it when casting to float.

In [28]:
# Once the text marker is replaced by a numeric string, the NumPy cast to float64 succeeds
pd.Series(data=float_rain).replace(to_replace="T", value="0").astype(dtype="float64")

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: float64

In [26]:
# PyArrow tries to convert the "T" string into a double and fails.
# The expected error is caught and printed so that Restart & Run All can
# continue past this demonstration instead of halting here.
try:
    print(pd.Series(float_rain).astype("float64[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Could not convert 'T' with type str: tried to convert to double

In [30]:
# This also fails: PyArrow does not parse strings into numbers, even numeric ones.
# The expected error is caught and printed so that Restart & Run All can
# continue past this demonstration instead of halting here.
try:
    print(pd.Series(float_rain).replace("T", "0").astype("float64[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Could not convert '0.0' with type str: tried to convert to double

In [31]:
# Two ways around this: replace the marker with an actual number (not a string),
# or - as done here - cast to a NumPy float first and only then to PyArrow
pd.Series(data=float_rain).replace(
    to_replace="T", value="0.0"
).astype(dtype="float").astype(dtype="float64[pyarrow]")

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: double[pyarrow]

### Strings

In [16]:
# The "string[pyarrow]" alias was introduced in pandas 1.5 and is now considered legacy,
# so the Arrow string dtype is built explicitly through the pyarrow library instead
string_pa = pd.ArrowDtype(pyarrow_dtype=pa.string())

In [3]:
# Free-form text samples: one complete list and one with a missing entry
text_freeform = [
    "My name is Jeff",
    "I like pandas",
    "I like programming",
]
text_with_missing = [
    "My name is Jeff",
    None,
    "I like programming",
]

In [4]:
# pandas 1.x behaviour: text ends up in an object-dtype series
pd.Series(data=text_freeform)

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: object

In [5]:
# The missing entry stays as a plain None inside the object-dtype series
pd.Series(data=text_with_missing)

0       My name is Jeff
1                  None
2    I like programming
dtype: object

Note that NumPy stores strings as object, as it doesn't support strings directly

In [6]:
# The same text stored with the Arrow string dtype built through pyarrow
tf1 = pd.Series(data=text_freeform, dtype=string_pa)
tf1

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string[pyarrow]

In [7]:
# For comparison, the same text using the legacy "string[pyarrow]" alias
tf2 = pd.Series(data=text_freeform, dtype="string[pyarrow]")
tf2

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string

In [8]:
# Check whether the dtypes of the two previous series compare equal
tf1.dtype == tf2.dtype

False

In [9]:
# With the Arrow string dtype the missing entry is displayed as <NA>
pd.Series(data=text_with_missing, dtype=string_pa)

0       My name is Jeff
1                  <NA>
2    I like programming
dtype: string[pyarrow]

The string[pyarrow] type is faster and uses less memory than the object dtype that pandas 1.x uses for strings.

## Categorical Data

In [10]:
# Low-cardinality labels used in the categorical examples
states = ["CA", "NY", "TX"]
months = [
    "Jan", "Feb", "Mar", "Apr",
    "May", "Jun", "Jul", "Aug",
    "Sep", "Oct", "Nov", "Dec",
]

NumPy doesn't support categorical data natively, but because it is so common, pandas 1.x provides its own categorical type.

In [12]:
# Store the state codes using the pandas categorical dtype
pd.Series(data=states, dtype="category")

0    CA
1    NY
2    TX
dtype: category
Categories (3, object): ['CA', 'NY', 'TX']

In [13]:
# Same for the month labels; the categories are listed alphabetically
pd.Series(data=months, dtype="category")

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, object): ['Apr', 'Aug', 'Dec', 'Feb', ..., 'May', 'Nov', 'Oct', 'Sep']

The categories of the series above are not ordered by month; if we sort it, we get alphabetical order:

In [14]:
# Sorting an unordered categorical yields alphabetical, not calendar, order
pd.Series(data=months, dtype="category").sort_values()

3     Apr
7     Aug
11    Dec
1     Feb
0     Jan
6     Jul
5     Jun
2     Mar
4     May
10    Nov
9     Oct
8     Sep
dtype: category
Categories (12, object): ['Apr', 'Aug', 'Dec', 'Feb', ..., 'May', 'Nov', 'Oct', 'Sep']

In order to correctly sort data, we need to create an ordered categorical type and then pass the type as the dtype parameter:

In [15]:
# An ordered categorical dtype whose category order follows the `months` list,
# so that sorting respects calendar order
month_cat = pd.CategoricalDtype(ordered=True, categories=months)
pd.Series(data=months, dtype=month_cat).sort_values()

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, object): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']

Pyarrow doesn't have a categorical type, but it does include a dictionary type. However, the author suggests using the Pandas 1.x categorical type,
as the dictionary type isn't directly exposed in pandas 2.x

In [17]:
# Start from an Arrow string series and cast it to the ordered pandas categorical dtype
pd.Series(data=months, dtype=string_pa).astype(dtype=month_cat)

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: category
Categories (12, object): ['Jan' < 'Feb' < 'Mar' < 'Apr' ... 'Sep' < 'Oct' < 'Nov' < 'Dec']

In [18]:
# The PyArrow alternative: a dictionary type with int64 indices and string values
pd.Series(
    data=months,
    dtype=pd.ArrowDtype(pa.dictionary(index_type=pa.int64(), value_type=pa.string())),
)

0     Jan
1     Feb
2     Mar
3     Apr
4     May
5     Jun
6     Jul
7     Aug
8     Sep
9     Oct
10    Nov
11    Dec
dtype: dictionary<values=string, indices=int64, ordered=0>[pyarrow]

Again, as mentioned above, the author suggests using pandas 1.x

## Dates and times

In [20]:
# Dates from the first days of January 2020 in four representations:
# datetime objects, uniformly formatted strings, loosely formatted strings
# with a missing entry, and integer Unix epoch seconds
dt_list = [
    dt.datetime(year=2020, month=1, day=1, hour=4, minute=30),
    dt.datetime(year=2020, month=1, day=2),
    dt.datetime(year=2020, month=1, day=3),
]
string_dates = [
    "2020-01-01 04:30:00",
    "2020-01-02 00:00:00",
    "2020-01-03 00:00:00",
]
string_dates_missing = ["2020-01-01 4:30", None, "2020-01-03"]
epoch_dates = [1577836800, 1577923200, 1578009600]

Converting using Pandas 1.x ...

In [21]:
# datetime objects are inferred as datetime64[ns]
pd.Series(data=dt_list)

0   2020-01-01 04:30:00
1   2020-01-02 00:00:00
2   2020-01-03 00:00:00
dtype: datetime64[ns]

In [22]:
# Date strings are parsed when the datetime64[ns] dtype is requested
pd.Series(data=string_dates, dtype="datetime64[ns]")

0   2020-01-01 04:30:00
1   2020-01-02 00:00:00
2   2020-01-03 00:00:00
dtype: datetime64[ns]

In [23]:
# The loosely formatted strings parse too; the missing entry becomes NaT
pd.Series(data=string_dates_missing, dtype="datetime64[ns]")

0   2020-01-01 04:30:00
1                   NaT
2   2020-01-03 00:00:00
dtype: datetime64[ns]

In [24]:
# Interpreting the epoch integers as nanoseconds gives wrong dates
# (all within the first seconds of 1970)
pd.Series(data=epoch_dates, dtype="datetime64[ns]")

0   1970-01-01 00:00:01.577836800
1   1970-01-01 00:00:01.577923200
2   1970-01-01 00:00:01.578009600
dtype: datetime64[ns]

In [25]:
# The integers are epoch seconds, so the unit has to be seconds
pd.Series(data=epoch_dates, dtype="datetime64[s]")

0   2020-01-01
1   2020-01-02
2   2020-01-03
dtype: datetime64[s]

Now let's convert using Pandas 2.x ...

In [26]:
# datetime objects stored as a PyArrow nanosecond timestamp
pd.Series(data=dt_list, dtype="timestamp[ns][pyarrow]")

0    2020-01-01 04:30:00
1    2020-01-02 00:00:00
2    2020-01-03 00:00:00
dtype: timestamp[ns][pyarrow]

In [27]:
# Uniformly formatted date strings convert to the PyArrow timestamp as well
pd.Series(data=string_dates, dtype="timestamp[ns][pyarrow]")

0    2020-01-01 04:30:00
1    2020-01-02 00:00:00
2    2020-01-03 00:00:00
dtype: timestamp[ns][pyarrow]

In [30]:
# This fails: the strings use inconsistent layouts ("2020-01-01 4:30" vs. a bare
# "2020-01-03"), and the PyArrow timestamp conversion does not handle them as
# gracefully as the NumPy-backed datetime64 conversion did.
# The expected error is caught and printed so that Restart & Run All can
# continue past this demonstration instead of halting here.
try:
    print(pd.Series(string_dates_missing, dtype="timestamp[ns][pyarrow]"))
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: time data "2020-01-03" doesn't match format "%Y-%m-%d %H:%M", at position 2. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [31]:
# Same dates with a missing entry, but every string follows one consistent layout
string_dates_missing_formatted = [
    "2020-01-01 04:30:00",
    None,
    "2020-01-03 00:00:00",
]

In [32]:
# With a consistent layout the conversion succeeds; the missing entry shows as <NA>
pd.Series(data=string_dates_missing_formatted, dtype="timestamp[ns][pyarrow]")

0    2020-01-01 04:30:00
1                   <NA>
2    2020-01-03 00:00:00
dtype: timestamp[ns][pyarrow]

In [33]:
# As with pandas 1.x, epoch integers must be read as seconds, not nanoseconds
pd.Series(data=epoch_dates, dtype="timestamp[s][pyarrow]")

0    2020-01-01 00:00:00
1    2020-01-02 00:00:00
2    2020-01-03 00:00:00
dtype: timestamp[s][pyarrow]