In [23]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Just use pyarrow to write the parquet

Pandas is not involved in writing this file. The table has a `fixed_size_list` column (`b`).

Pandas can read the file back, but it loads the fixed-size list column as a plain `object` column.

In [24]:
# Two columns: plain integers, and a fixed-size (length 3) list of booleans per row.
int_values = pa.array([1, 2, 3])
bool_triples = pa.array(
    [[True, False, True] for _ in range(3)],
    type=pa.list_(pa.bool_(), 3),
)
table = pa.Table.from_arrays([int_values, bool_triples], names=["a", "b"])

# Write the table with pyarrow alone; pandas is not involved in producing this file.
pq.write_table(table, "fixed_list.parquet")

In [25]:
# Read the pyarrow-written file back through pandas and display the resulting frame.
data_frame = pd.read_parquet("fixed_list.parquet")
data_frame

Unnamed: 0,a,b
0,1,"[True, False, True]"
1,2,"[True, False, True]"
2,3,"[True, False, True]"


In [26]:
# Inspect the pandas dtype assigned to each column after the read.
data_frame.dtypes

a     int64
b    object
dtype: object

In [27]:
# Look at the Arrow schema recorded in the file itself, independent of how pandas reads it.
fixed_list_metadata = pq.read_metadata("fixed_list.parquet")
fixed_list_metadata.schema.to_arrow_schema()

a: int64
b: fixed_size_list<element: bool>[3]
  child 0, element: bool

# Write table with pandas

Convert the table to pandas. Use pandas to write to parquet.

The `fixed_size_list` is converted into a plain `list` on disk, so the fixed length is lost. Not so great, but pandas can still interpret the parquet file.

In [28]:
# Round trip: Arrow table -> pandas DataFrame -> parquet (written by pandas) -> pandas DataFrame.
through_pandas_path = "through_pandas.parquet"
pandas_frame = table.to_pandas()
pandas_frame.to_parquet(through_pandas_path)
data_frame = pd.read_parquet(through_pandas_path)
data_frame

Unnamed: 0,a,b
0,1,"[True, False, True]"
1,2,"[True, False, True]"
2,3,"[True, False, True]"


In [29]:
# Check the column dtypes after the round trip through pandas.
data_frame.dtypes

a     int64
b    object
dtype: object

In [30]:
# Inspect the Arrow schema of the file that pandas wrote.
through_pandas_metadata = pq.read_metadata("through_pandas.parquet")
through_pandas_metadata.schema.to_arrow_schema()

a: int64
b: list<element: bool>
  child 0, element: bool
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 463

# Convert to pandas with arrow types AND THEN write to parquet with pandas

This time the `fixed_size_list` is NOT converted to a plain `list`: the on-disk schema still reports a `fixed_size_list`.

But when reading, pandas relies on its own metadata to interpret the file and cannot rebuild the `fixed_size_list` dtype from it, so the read fails with the `NotImplementedError` shown below.

In [31]:
# Convert to a DataFrame whose columns use pd.ArrowDtype, then let pandas write the parquet file.
arrow_typed_frame = table.to_pandas(types_mapper=pd.ArrowDtype)
arrow_typed_frame.to_parquet("using_types_mapper.parquet")

In [32]:
# Inspect the Arrow schema of the file written from the ArrowDtype-backed frame.
types_mapper_metadata = pq.read_metadata("using_types_mapper.parquet")
types_mapper_metadata.schema.to_arrow_schema()

a: int64
b: fixed_size_list<element: bool>[3]
  child 0, element: bool
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 501

In [33]:
def arrow_schema_of(parquet_path):
    """Read a parquet file's metadata and convert its schema to an Arrow schema."""
    return pq.read_metadata(parquet_path).schema.to_arrow_schema()


just_pyarrow_schema = arrow_schema_of("fixed_list.parquet")
too_much_pandas_schema = arrow_schema_of("using_types_mapper.parquet")

# Ignore schema metadata: the pandas-written file carries a `pandas` metadata entry
# that the pyarrow-only file lacks, so only the fields themselves are compared.
too_much_pandas_schema.equals(just_pyarrow_schema, check_metadata=False)

True

In [34]:
# Reading this file back with pandas fails, and that failure is the point of the
# demonstration. Catch it and print it so the cell documents the error without
# aborting a "Restart Kernel -> Run All" pass.
try:
    data_frame = pd.read_parquet("using_types_mapper.parquet")
except NotImplementedError as error:
    # Same "ExceptionType: message" form a traceback's final line would show.
    print(f"{type(error).__name__}: {error}")
else:
    # Only reached if the installed pandas/pyarrow can read the file after all.
    # `display` is the builtin that IPython/Jupyter injects into notebook kernels.
    display(data_frame)

NotImplementedError: Passing pyarrow type specific parameters ([3]) in the string is not supported. Please construct an ArrowDtype object with a pyarrow_dtype instance with specific parameters.