60 changes: 56 additions & 4 deletions src/nested_pandas/nestedframe/io.py
@@ -8,6 +8,7 @@
import pyarrow as pa
import pyarrow.fs
import pyarrow.parquet as pq
from pyarrow.lib import ArrowInvalid
from upath import UPath

from ..series.dtype import NestedDtype
@@ -95,6 +96,12 @@ def read_parquet(
like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])```
from working, as this implies both full and partial load of "nested".

Additionally, be aware that partial loading is only supported by nested-pandas
(and pyarrow) for struct-of-list columns. Your data may instead be stored as a
list of structs, which nested-pandas can read, but without support for partial
loading. We try to raise a helpful error message in these cases.
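As a minimal sketch of the two layouts described above, built with pyarrow directly (the "hmjd"/"mag" field names are placeholders, not taken from the package):

```python
import pyarrow as pa

# Struct of lists: one struct column whose fields are list-valued.
# Partial loading via dotted names (e.g. "lightcurve.hmjd") works here.
struct_of_lists = pa.struct(
    [("hmjd", pa.list_(pa.float64())), ("mag", pa.list_(pa.float64()))]
)

# List of structs: a list column whose items are structs. Readable by
# nested-pandas, but dotted sub-column selection is not supported.
list_of_structs = pa.list_(
    pa.struct([("hmjd", pa.float64()), ("mag", pa.float64())])
)

# The distinction the loader checks for reduces to this type predicate:
pa.types.is_struct(struct_of_lists)  # True  -> partial loading possible
pa.types.is_struct(list_of_structs)  # False -> fully load, then select
```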

Furthermore, there are some cases where subcolumns will have the same name
as a top-level column. For example, if you have a column "nested" with
subcolumns "nested.a" and "nested.b", and also a top-level column "a". In
@@ -214,25 +221,70 @@ def _read_parquet_into_table(
return _read_remote_parquet_directory(
path_to_data, filesystem, storage_options, columns, **kwargs
)

with fsspec.parquet.open_parquet_file(
path_to_data.path,
columns=columns,
storage_options=storage_options,
fs=filesystem,
engine="pyarrow",
) as parquet_file:
return pq.read_table(parquet_file, columns=columns, **kwargs)
return _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs)

# All other cases, including file-like objects, directories, and
# even lists of the foregoing.

# If `filesystem` is specified - use it, passing it as part of **kwargs
if kwargs.get("filesystem") is not None:
return pq.read_table(data, columns=columns, **kwargs)
return _read_table_with_partial_load_check(data, columns=columns, **kwargs)

# Otherwise convert with a special function
data, filesystem = _transform_read_parquet_data_arg(data)
return pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs)
return _read_table_with_partial_load_check(data, columns=columns, filesystem=filesystem, **kwargs)


def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **kwargs):
"""Read a pyarrow table with partial load check for nested structures"""
try:
        return pq.read_table(data, columns=columns, filesystem=filesystem, **kwargs)
except ArrowInvalid as e:
# if it's not related to partial loading of nested structures, re-raise
if "No match for" not in str(e):
raise e
if columns is not None:
check_schema = any("." in col for col in columns) # Check for potential partial loads
if check_schema:
try:
_validate_structs_from_schema(data, columns=columns, filesystem=filesystem)
except ValueError as validation_error:
raise validation_error from e # Chain the exceptions for better context
raise e
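
From the caller's perspective, the intended behavior is roughly the following (a sketch assuming `read_parquet` is importable from the package top level, as in the tests, and a file whose "lightcurve" column is stored as a list of structs):

```python
from nested_pandas import read_parquet  # assumed top-level import

try:
    read_parquet("list_struct.parquet", columns=["lightcurve.hmjd"])
except ValueError as err:
    print(err)            # the descriptive "only supported for struct of list columns" message
    print(err.__cause__)  # the original pyarrow ArrowInvalid ("No match for ...")
```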


def _validate_structs_from_schema(data, columns=None, filesystem=None):
"""Validate that nested columns are structs"""
if columns is not None:
schema = pq.read_schema(data, filesystem=filesystem)
for col in columns:
# check if column is a partial load of a nested structure
if "." in col:
# first check if column exists as a top-level column
if col in schema.names:
continue
# if not, inspect the base column name type
else:
if col.split(".")[0] in schema.names:
# check if the column is a list-struct
col_type = schema.field(col.split(".")[0]).type
if not pa.types.is_struct(col_type):
base_col = col.split(".")[0]
raise ValueError(
f"The provided column '{col}' signals to partially load a nested structure, "
f"but the nested structure '{base_col}' is not a struct. "
"Partial loading of nested structures is only supported for struct of list "
f"columns. To resolve this, fully load the column '{base_col}' "
f"instead of partially loading it and perform column selection afterwards."
)
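
The resolution the error message suggests looks roughly like this (a sketch assuming the fully loaded column behaves as a nested column that supports the usual `"column.field"` selection afterwards):

```python
from nested_pandas import read_parquet  # assumed top-level import

# Fully load the nested column, then select the sub-column afterwards.
nf = read_parquet("list_struct.parquet", columns=["lightcurve"])
hmjd = nf["lightcurve.hmjd"]
```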


def _is_local_dir(upath: UPath) -> bool:
@@ -273,7 +325,7 @@ def _read_remote_parquet_directory(
fs=filesystem,
engine="pyarrow",
) as parquet_file:
table = pq.read_table(parquet_file, columns=columns, **kwargs)
table = _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs)
tables.append(table)
return pa.concat_tables(tables)

Binary file added tests/list_struct_data/list_struct.parquet
Binary file not shown.
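The fixture itself is binary and not shown; a file with this layout could hypothetically be produced along these lines (the "lightcurve" and "hmjd" names come from the test below; the "mag" field and the values are made up):

```python
import pyarrow as pa
import pyarrow.parquet as pq

lightcurve = pa.array(
    [
        [{"hmjd": 60000.1, "mag": 18.1}, {"hmjd": 60000.2, "mag": 18.3}],
        [{"hmjd": 60001.0, "mag": 17.9}],
    ],
    type=pa.list_(pa.struct([("hmjd", pa.float64()), ("mag", pa.float64())])),
)
table = pa.table({"id": [0, 1], "lightcurve": lightcurve})
pq.write_table(table, "tests/list_struct_data/list_struct.parquet")
```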
14 changes: 14 additions & 0 deletions tests/nested_pandas/nestedframe/test_io.py
@@ -459,3 +459,17 @@ def test__get_storage_options():
assert storage_opts is not None
# S3 should NOT have the block_size override (only HTTP/HTTPS)
assert storage_opts.get("block_size") != FSSPEC_BLOCK_SIZE


def test_list_struct_partial_loading_error():
"""Test that attempting to partially load a list-struct raises an error."""
    # Attempt a partial load of a list-of-structs column
with pytest.raises(ValueError):
read_parquet("tests/list_struct_data/list_struct.parquet", columns=["lightcurve.hmjd"])


def test_normal_loading_error():
    """Test that an ordinary column-name mistake raises the usual pyarrow error."""
    with pytest.raises(ValueError, match="No match for"):
        read_parquet("tests/test_data/nested.parquet", columns=["not_a_column"])