diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py index d3ca5b49..a37f16e8 100644 --- a/src/nested_pandas/nestedframe/io.py +++ b/src/nested_pandas/nestedframe/io.py @@ -8,6 +8,7 @@ import pyarrow as pa import pyarrow.fs import pyarrow.parquet as pq +from pyarrow.lib import ArrowInvalid from upath import UPath from ..series.dtype import NestedDtype @@ -95,6 +96,12 @@ def read_parquet( like ```pd.read_parquet("data.parquet", columns=["nested.a", "nested"])``` from working, as this implies both full and partial load of "nested". + Additionally with partial loading, be aware that nested-pandas (and pyarrow) + only supports partial loading of struct of list columns. Your data may be + stored as a list of structs, which can be read by nested-pandas, but without + support for partial loading. We try to throw a helpful error message in these + cases. + Furthermore, there are some cases where subcolumns will have the same name as a top-level column. For example, if you have a column "nested" with subcolumns "nested.a" and "nested.b", and also a top-level column "a". In @@ -214,6 +221,7 @@ def _read_parquet_into_table( return _read_remote_parquet_directory( path_to_data, filesystem, storage_options, columns, **kwargs ) + with fsspec.parquet.open_parquet_file( path_to_data.path, columns=columns, @@ -221,18 +229,62 @@ def _read_parquet_into_table( fs=filesystem, engine="pyarrow", ) as parquet_file: - return pq.read_table(parquet_file, columns=columns, **kwargs) + return _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) # All other cases, including file-like objects, directories, and # even lists of the foregoing. # If `filesystem` is specified - use it, passing it as part of **kwargs if kwargs.get("filesystem") is not None: - return pq.read_table(data, columns=columns, **kwargs) + return _read_table_with_partial_load_check(data, columns=columns, **kwargs) # Otherwise convert with a special function data, filesystem = _transform_read_parquet_data_arg(data) - return pq.read_table(data, filesystem=filesystem, columns=columns, **kwargs) + return _read_table_with_partial_load_check(data, columns=columns, filesystem=filesystem, **kwargs) + + +def _read_table_with_partial_load_check(data, columns=None, filesystem=None, **kwargs): + """Read a pyarrow table with partial load check for nested structures""" + try: + return pq.read_table(data, columns=columns, **kwargs) + except ArrowInvalid as e: + # if it's not related to partial loading of nested structures, re-raise + if "No match for" not in str(e): + raise e + if columns is not None: + check_schema = any("." in col for col in columns) # Check for potential partial loads + if check_schema: + try: + _validate_structs_from_schema(data, columns=columns, filesystem=filesystem) + except ValueError as validation_error: + raise validation_error from e # Chain the exceptions for better context + raise e + + +def _validate_structs_from_schema(data, columns=None, filesystem=None): + """Validate that nested columns are structs""" + if columns is not None: + schema = pq.read_schema(data, filesystem=filesystem) + for col in columns: + # check if column is a partial load of a nested structure + if "." in col: + # first check if column exists as a top-level column + if col in schema.names: + continue + # if not, inspect the base column name type + else: + if col.split(".")[0] in schema.names: + # check if the column is a list-struct + col_type = schema.field(col.split(".")[0]).type + if not pa.types.is_struct(col_type): + base_col = col.split(".")[0] + raise ValueError( + f"The provided column '{col}' signals to partially load a nested structure, " + f"but the nested structure '{base_col}' is not a struct. " + "Partial loading of nested structures is only supported for struct of list " + f"columns. To resolve this, fully load the column '{base_col}' " + f"instead of partially loading it and perform column selection afterwards." + ) def _is_local_dir(upath: UPath) -> bool: @@ -273,7 +325,7 @@ def _read_remote_parquet_directory( fs=filesystem, engine="pyarrow", ) as parquet_file: - table = pq.read_table(parquet_file, columns=columns, **kwargs) + table = _read_table_with_partial_load_check(parquet_file, columns=columns, **kwargs) tables.append(table) return pa.concat_tables(tables) diff --git a/tests/list_struct_data/list_struct.parquet b/tests/list_struct_data/list_struct.parquet new file mode 100644 index 00000000..85ab45c9 Binary files /dev/null and b/tests/list_struct_data/list_struct.parquet differ diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py index f11064aa..b5bcae25 100644 --- a/tests/nested_pandas/nestedframe/test_io.py +++ b/tests/nested_pandas/nestedframe/test_io.py @@ -459,3 +459,17 @@ def test__get_storage_options(): assert storage_opts is not None # S3 should NOT have the block_size override (only HTTP/HTTPS) assert storage_opts.get("block_size") != FSSPEC_BLOCK_SIZE + + +def test_list_struct_partial_loading_error(): + """Test that attempting to partially load a list-struct raises an error.""" + # Load in the example file + with pytest.raises(ValueError): + read_parquet("tests/list_struct_data/list_struct.parquet", columns=["lightcurve.hmjd"]) + + +def test_normal_loading_error(): + """Test that making a normal naming mistake raises the normal pyarrow error.""" + # Load in the example file + with pytest.raises(ValueError, match="No match for*"): + read_parquet("tests/test_data/nested.parquet", columns=["not_a_column"])