In [1]:
import pandas as pd

In [2]:
# The "float32[pyarrow]" alias asks for PyArrow-backed storage; the None entry
# is displayed as <NA>.
float_values = [-1.5, 0.2, None]
ser = pd.Series(float_values, dtype="float32[pyarrow]")
ser

0    -1.5
1     0.2
2    <NA>
dtype: float[pyarrow]

In [3]:
# The same "<type>[pyarrow]" alias is accepted by pd.Index.
bool_values = [True, None]
idx = pd.Index(bool_values, dtype="bool[pyarrow]")
idx

Index([True, <NA>], dtype='bool[pyarrow]')

In [4]:
# A single "<type>[pyarrow]" dtype applied to every column of a DataFrame.
rows = [[1, 2], [3, 4]]
df = pd.DataFrame(rows, dtype="uint64[pyarrow]")
df

Unnamed: 0,0,1
0,1,2
1,3,4


In [5]:
import pyarrow as pa

# Build the same three strings twice: once with the "string[pyarrow]" alias and
# once with an explicit pd.ArrowDtype wrapping pa.string(). The two resulting
# dtypes do not compare equal.
data = ["a", "b", "c"]
arrow_string_dtype = pd.ArrowDtype(pa.string())
ser_sd = pd.Series(data, dtype="string[pyarrow]")
ser_ad = pd.Series(data, dtype=arrow_string_dtype)
ser_ad.dtype == ser_sd.dtype

False

In [6]:
# For the Series created with the "string[pyarrow]" alias, the result dtype
# shown below is "boolean".
ser_sd.str.contains("a")

0     True
1    False
2    False
dtype: boolean

In [7]:
# For the Series created with pd.ArrowDtype(pa.string()), the result dtype
# shown below is "bool[pyarrow]".
ser_ad.str.contains("a")

0     True
1    False
2    False
dtype: bool[pyarrow]

In [8]:
# Display the alias-created Series; its dtype is reported as "string".
ser_sd

0    a
1    b
2    c
dtype: string

In [9]:
# Display the ArrowDtype-created Series; its dtype is reported as "string[pyarrow]".
ser_ad

0    a
1    b
2    c
dtype: string[pyarrow]

In [10]:
import pyarrow as pa

# Wrap a pyarrow type object (here a list of strings) in pd.ArrowDtype so it
# can be passed as a pandas dtype.
list_str_type = pa.list_(pa.string())
list_str_dtype = pd.ArrowDtype(list_str_type)
ser = pd.Series([["hello"], ["there"]], dtype=list_str_dtype)
ser

0    ['hello']
1    ['there']
dtype: list<item: string>[pyarrow]

In [11]:
from datetime import time

# An Index of microsecond-resolution times; the None entry is shown as <NA>.
time_dtype = pd.ArrowDtype(pa.time64("us"))
idx = pd.Index([time(12, 30), None], dtype=time_dtype)
idx

Index([12:30:00, <NA>], dtype='time64[us][pyarrow]')

In [12]:
from decimal import Decimal

# A DataFrame of decimal128 values with missing entries in both rows.
decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2))
first_row = [Decimal("3.19"), None]
second_row = [None, Decimal("-1.23")]
data = [first_row, second_row]
df = pd.DataFrame(data, dtype=decimal_type)
df

Unnamed: 0,0,1
0,3.19,
1,,-1.23


In [13]:
# Start from an existing pyarrow array (string -> string maps plus one null)
# and hand it to pandas through ArrowExtensionArray.
map_type = pa.map_(pa.string(), pa.string())
pa_array = pa.array([{"1": "2"}, {"10": "20"}, None], type=map_type)
arrow_ext_array = pd.arrays.ArrowExtensionArray(pa_array)
ser = pd.Series(arrow_ext_array)
ser

0      [('1', '2')]
1    [('10', '20')]
2              <NA>
dtype: map<string, string>[pyarrow]

In [14]:
# pa.array() accepts the PyArrow-backed Series directly; the missing entry
# appears as null in the resulting UInt8Array.
uint_values = [1, 2, None]
ser = pd.Series(uint_values, dtype="uint8[pyarrow]")
pa.array(ser)

<pyarrow.lib.UInt8Array object at 0x000001FF8530B9A0>
[
  1,
  2,
  null
]

In [15]:
# An Index built from the PyArrow-backed Series can be passed to pa.array()
# too; the output below is again a UInt8Array.
idx = pd.Index(ser)

pa.array(idx)

<pyarrow.lib.UInt8Array object at 0x000001FF8530A260>
[
  1,
  2,
  null
]

In [16]:
# Convert a pyarrow Table to a DataFrame, passing pd.ArrowDtype as the
# types_mapper.
int_column = pa.array([1, 2, 3], type=pa.int64())
table = pa.table([int_column], names=["a"])
df = table.to_pandas(types_mapper=pd.ArrowDtype)
df

Unnamed: 0,a
0,1
1,2
2,3


In [17]:
# Check the column dtype after the conversion; column "a" is int64[pyarrow].
df.dtypes

a    int64[pyarrow]
dtype: object

In [18]:
import pyarrow as pa

# The mean shown below is the average of the two non-missing numbers.
float_values = [-1.545, 0.211, None]
ser = pd.Series(float_values, dtype="float32[pyarrow]")
ser.mean()

-0.6669999808073044

In [19]:
# Element-wise addition; the missing entry stays <NA> in the result.
ser + ser

0    -3.09
1    0.422
2     <NA>
dtype: float[pyarrow]

In [20]:
# Element-wise comparison; the result is bool[pyarrow] and the missing entry
# stays <NA>.
ser > (ser + 1)

0    False
1    False
2     <NA>
dtype: bool[pyarrow]

In [21]:
# Drop the missing entry; two values remain.
ser.dropna()

0   -1.545
1    0.211
dtype: float[pyarrow]

In [22]:
# Missing-value mask. Note the result dtype shown below is plain "bool",
# not "bool[pyarrow]".
ser.isna()

0    False
1    False
2     True
dtype: bool

In [23]:
# Replace the missing entry with 0; the dtype stays float[pyarrow].
ser.fillna(0)

0   -1.545
1    0.211
2      0.0
dtype: float[pyarrow]

In [24]:
# String methods on an ArrowDtype string Series; the missing entry yields <NA>
# in the bool[pyarrow] result.
string_dtype = pd.ArrowDtype(pa.string())
ser_str = pd.Series(["a", "b", None], dtype=string_dtype)
ser_str.str.startswith("a")

0     True
1    False
2     <NA>
dtype: bool[pyarrow]

In [28]:
import os
from pathlib import Path

import tzdata

# Point ARROW_TIMEZONE_DATABASE at the directory of the installed tzdata
# package instead of a hard-coded, machine-specific absolute path, so the cell
# runs unchanged on any machine and does not depend on a value copied by hand
# from another cell's output. as_posix() keeps the forward-slash form used by
# the original value.
# NOTE(review): confirm that the Arrow build in use reads this variable and
# accepts this directory layout; pyarrow also offers pa.set_timezone_db_path().
os.environ["ARROW_TIMEZONE_DATABASE"] = Path(tzdata.__file__).parent.as_posix()

In [27]:
# Print the file path of the installed tzdata package (its __init__.py).
import tzdata
print(tzdata.__file__)

C:\anaconda3\lib\site-packages\tzdata\__init__.py


In [31]:
import io

# Parse a small in-memory CSV with the PyArrow reader (engine="pyarrow").
# The literal is written flush-left with no blank lines so the parser sees
# exactly one header row and two data rows. The previous literal carried
# copy/paste artifacts (blank lines and indented rows) and only produced the
# same table because those were tolerated during parsing.
data = io.StringIO("""a,b,c
1,2.5,True
3,4.5,False
""")

df = pd.read_csv(data, engine="pyarrow")
df

Unnamed: 0,a,b,c
0,1,2.5,True
1,3,4.5,False


In [32]:
import io

# Read a CSV with dtype_backend="pyarrow" so every column comes back with an
# ArrowDtype; the trailing column "i" is empty in both rows.
# The literal is written flush-left with no blank lines so the parser sees
# exactly one header row and two data rows. The previous literal carried
# copy/paste artifacts (blank lines and indented rows) and only produced the
# same dtypes because those were tolerated during parsing.
data = io.StringIO("""a,b,c,d,e,f,g,h,i
1,2.5,True,a,,,,,
3,4.5,False,b,6,7.5,True,a,
""")

df_pyarrow = pd.read_csv(data, dtype_backend="pyarrow")
df_pyarrow.dtypes

a     int64[pyarrow]
b    double[pyarrow]
c      bool[pyarrow]
d    string[pyarrow]
e     int64[pyarrow]
f    double[pyarrow]
g      bool[pyarrow]
h    string[pyarrow]
i      null[pyarrow]
dtype: object