# Pandas 2.x can use pyarrow as a backend to overcome some of NumPy's limitations

In [32]:
# Required imports
import pandas as pd
import pyarrow as pa

## Comparing pandas 1.x (relying on NumPy) and pandas 2.x (pyarrow)

### Integers

In [2]:
# Three integer lists to experiment with: small values, large values,
# and values that include a missing entry
small_values = [1, 99, 2**7 - 1]  # 127 is the largest value int8 can hold
large_values = [2**exponent for exponent in (31, 63, 100)]
missing_values = [None, 1, -45]

In [3]:
# With no dtype given, pandas infers int64 for these small integers
small_ser = pd.Series(data=small_values)
small_ser

0      1
1     99
2    127
dtype: int64

In [5]:
# All of these values are small enough to fit in int8 (which tops out at 127),
# so the series can be cast down to it
small_ser.astype(dtype="int8")

0      1
1     99
2    127
dtype: int8

In [6]:
# The dtype can equally be requested when the series is built
small_ser = pd.Series(data=small_values, dtype="int8")
small_ser

0      1
1     99
2    127
dtype: int8

In [7]:
# Appending [pyarrow] to the dtype name selects the pyarrow-backed type
small_ser_pa = pd.Series(data=small_values, dtype="int8[pyarrow]")
small_ser_pa

0      1
1     99
2    127
dtype: int8[pyarrow]

In [12]:
# Some of these values do not fit in a NumPy integer type; rather than
# raising an error, the series falls back to the object dtype
large_ser = pd.Series(data=large_values)
large_ser

0                         2147483648
1                9223372036854775808
2    1267650600228229401496703205376
dtype: object

In [10]:
# pyarrow does not fall back to the object dtype, so building this series raises an error.
# The same would happen if NumPy's int64 were requested explicitly.
# The error is the point of this cell, so it is caught and printed;
# left uncaught, it would stop a "Restart & Run All" of the notebook.
try:
    large_ser_pa = pd.Series(large_values, dtype="int64[pyarrow]")
except (OverflowError, pa.ArrowInvalid) as err:
    print(f"{type(err).__name__}: {err}")
else:
    # Only reached if the conversion unexpectedly succeeds
    display(large_ser_pa)

OverflowError: Python int too large to convert to C long

In [13]:
# NumPy integers cannot hold missing values, so the series
# is stored as float64 and the missing entry becomes NaN
missing_ser = pd.Series(data=missing_values)
missing_ser

0     NaN
1     1.0
2   -45.0
dtype: float64

The above point means that, whenever we are dealing with a NumPy int series (int64, for example),
we know for sure that it contains no missing values, while we cannot say the same for
floating point series. It is worth mentioning that this doesn't guarantee that an int series is
completely correct: missing values might be hidden behind a specific placeholder value (-1, 999 or other).

In [14]:
# pyarrow integer types can represent missing values (displayed as <NA>)
missing_ser_pa = pd.Series(data=missing_values, dtype="int8[pyarrow]")
missing_ser_pa

0    <NA>
1       1
2     -45
dtype: int8[pyarrow]

In [15]:
# Changing dtypes needs care, especially when casting
# from a wider type down to a narrower one
medium_values = [2**15, 2**31 - 8, 2**63]
medium_ser = pd.Series(data=medium_values)
medium_ser

0                  32768
1             2147483640
2    9223372036854775808
dtype: uint64

In [16]:
# The values do not fit in int8: NumPy wraps them around silently, without raising an error
medium_ser.astype(dtype="int8")

0    0
1   -8
2    0
dtype: int8

In [17]:
# The values do not fit in the pyarrow int8 either but, unlike NumPy, pyarrow raises an error.
# The error is the point of this cell, so it is caught and printed;
# left uncaught, it would stop a "Restart & Run All" of the notebook.
try:
    display(medium_ser.astype("int8[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Integer value 32768 not in range: 0 to 127

### Floating point

In [18]:
# Three lists again: floats without missing values, floats with a
# missing value, and numbers mixed with a text value
float_vals = [1.5, 2.7, 127.0]
float_missing = [None, 1.5, -45.0]
float_rain = [
    1.5,
    2.7,
    0.0,
    "T",  # text entry mixed in with the numbers
    1.5,
    0,
]

In [19]:
# Pandas 1.x (NumPy backend): the floats are stored as float64
pd.Series(data=float_vals)

0      1.5
1      2.7
2    127.0
dtype: float64

In [20]:
# With the NumPy backend, the missing value is represented as NaN
pd.Series(data=float_missing)

0     NaN
1     1.5
2   -45.0
dtype: float64

In [22]:
# Pandas 2.x (pyarrow backend)
# float64[pyarrow] is a double-precision (64-bit) floating point type;
# it can equally be spelled double[pyarrow]
pd.Series(data=float_vals, dtype="float64[pyarrow]")

0      1.5
1      2.7
2    127.0
dtype: double[pyarrow]

In [23]:
# With a pyarrow float type, the missing value is shown as <NA> instead of NaN
pd.Series(data=float_missing, dtype="float[pyarrow]")

0    <NA>
1     1.5
2   -45.0
dtype: float[pyarrow]

In [24]:
# Now the list that mixes numbers with a text value
pd.Series(data=float_rain)

0    1.5
1    2.7
2    0.0
3      T
4    1.5
5      0
dtype: object

NumPy changes the type to object to account for the string. In this case, we have to convert it to a number manually.
Note that the replacement number can also be given in string format; NumPy will parse it when casting.

In [28]:
# Numeric strings are parsed when casting to a NumPy float, so swapping "T"
# for the string "0" is enough for the conversion to float64 to succeed
(
    pd.Series(float_rain)
    .replace("T", "0")
    .astype("float64")
)

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: float64

In [26]:
# pyarrow tries to convert the string "T" into a double and fails.
# The error is the point of this cell, so it is caught and printed;
# left uncaught, it would stop a "Restart & Run All" of the notebook.
try:
    display(pd.Series(float_rain).astype("float64[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Could not convert 'T' with type str: tried to convert to double

In [30]:
# This errors as well: pyarrow does not parse numeric strings into doubles.
# "T" is replaced with "0.0" so that the cell matches both its recorded error
# message and the replacement value used in the next cell.
# The error is the point of this cell, so it is caught and printed;
# left uncaught, it would stop a "Restart & Run All" of the notebook.
try:
    display(pd.Series(float_rain).replace("T", "0.0").astype("float64[pyarrow]"))
except pa.ArrowInvalid as err:
    print(f"{type(err).__name__}: {err}")

ArrowInvalid: Could not convert '0.0' with type str: tried to convert to double

In [31]:
# There are two ways around this: replace "T" with an actual number (not a string),
# or cast to a NumPy float first and only then to pyarrow. The second one is shown here:
(
    pd.Series(data=float_rain)
    .replace(to_replace="T", value="0.0")
    .astype(dtype="float")
    .astype(dtype="float64[pyarrow]")
)

0    1.5
1    2.7
2    0.0
3    0.0
4    1.5
5    0.0
dtype: double[pyarrow]

### Strings

In [33]:
# The "string[pyarrow]" alias refers to the older pandas string extension type
# (pd.StringDtype with pyarrow storage), not to an ArrowDtype.
# To get the ArrowDtype string type, we build it from the pyarrow library instead.
string_pa = pd.ArrowDtype(pyarrow_dtype=pa.string())

In [34]:
# Free-form text samples: one complete list and one with a missing entry
text_freeform = [
    "My name is Jeff",
    "I like pandas",
    "I like programming",
]
text_with_missing = [
    "My name is Jeff",
    None,
    "I like programming",
]

In [35]:
# Pandas 1.x (NumPy backend): the strings are stored with the object dtype
pd.Series(data=text_freeform)

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: object

In [36]:
# With the object dtype, the missing entry stays as a plain None
pd.Series(data=text_with_missing)

0       My name is Jeff
1                  None
2    I like programming
dtype: object

Note that, with the NumPy backend, strings are stored with the generic object dtype, as there is no dedicated string type in use.

In [37]:
# Same text, this time stored with the pyarrow-backed string_pa dtype
tf1 = pd.Series(data=text_freeform, dtype=string_pa)
tf1

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string[pyarrow]

In [38]:
# For comparison, the same series built with the "string[pyarrow]" alias
tf2 = pd.Series(data=text_freeform, dtype="string[pyarrow]")
tf2

0       My name is Jeff
1         I like pandas
2    I like programming
dtype: string

In [39]:
# Compare the dtypes of the two previous series: although both hold
# pyarrow-backed strings, pandas does not consider the two dtypes equal
tf1.dtype == tf2.dtype

False

In [40]:
# With the pyarrow string dtype, the missing entry is shown as <NA>
pd.Series(data=text_with_missing, dtype=string_pa)

0       My name is Jeff
1                  <NA>
2    I like programming
dtype: string[pyarrow]

The string[pyarrow] type is faster and uses less memory than the object dtype used for strings in pandas 1.x