Skip to content

Commit

Permalink
Fix ZeroDivisionError on 0-row Parquets with boolean cols, upgrade to Python 3.11 (#50)
Browse files Browse the repository at this point in the history

* fix compression ratio when uncompressed_size == 0

* bump to python 3.11, update GHAs

* GHA: cache poetry

See also: actions/setup-python#765

* test_parquet util

* add empty test0.parquet + `inspect` test

* test name typo fix

* loosen dep version pins
  • Loading branch information
ryan-williams committed Jan 2, 2024
1 parent 2ba9539 commit e4faa21
Show file tree
Hide file tree
Showing 13 changed files with 1,054 additions and 864 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/cli_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ on:
pull_request:
types: [opened, synchronize]

env:
PYTHON_VERSION: 3.11

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.8
uses: actions/setup-python@v1
- uses: actions/checkout@v3
- name: Install poetry
run: pipx install poetry
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: ${{ env.PYTHON_VERSION }}
cache: poetry
- name: Install pipenv and dependencies
run: |
python -m pip install --upgrade pip
Expand Down
14 changes: 10 additions & 4 deletions .github/workflows/unittest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,22 @@ on:
pull_request:
types: [opened, synchronize]

env:
PYTHON_VERSION: 3.11

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v1
- name: Set up Python 3.8
uses: actions/setup-python@v1
- uses: actions/checkout@v3
- name: Install poetry
run: pipx install poetry
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v4
with:
python-version: 3.8
python-version: ${{ env.PYTHON_VERSION }}
cache: poetry
- name: Install poetry and dependencies
run: |
python -m pip install --upgrade pip
Expand Down
9 changes: 7 additions & 2 deletions parquet_tools/commands/inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,13 @@ def _simple_schema_expression(file_meta, schema) -> str:
for i, column in enumerate(columns):
col = schema.column(i)
col_meta = file_meta.row_group(0).column(i)
col_compression_space_saving_ratio = 1 - (col_meta.total_compressed_size / col_meta.total_uncompressed_size)
col_compression = f"{col_meta.compression} (space_saved: {col_compression_space_saving_ratio*100:.0f}%)"
if col_meta.total_uncompressed_size:
col_compression_space_saving_ratio = 1 - (col_meta.total_compressed_size / col_meta.total_uncompressed_size)
col_compression_space_saving_pct = col_compression_space_saving_ratio * 100
col_compression_space_saving_pct_str = f"{col_compression_space_saving_pct:.0f}%"
else:
col_compression_space_saving_pct_str = 'N/A'
col_compression = f"{col_meta.compression} (space_saved: {col_compression_space_saving_pct_str})"
exp += dedent(f'''
############ Column({column}) ############
name: {col.name}
Expand Down
1,701 changes: 890 additions & 811 deletions poetry.lock

Large diffs are not rendered by default.

33 changes: 17 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,26 @@ keywords = ["parquet-tools", "parquet"]
exclude = ["parquet_tools/parquet.thrift", "parquet_tools/README.md"]

[tool.poetry.dependencies]
python = "^3.8"
halo = "^0.0.29"
python = ">=3.8"
halo = "^0.0.31"
pyarrow = "*"
pandas = ">=1"
tabulate = "^0.8.7"
boto3 = "^1.13"
thrift = "^0.13.0"
colorama = "^0.4.3"
pandas = "^2.1.4"
tabulate = "^0.9.0"
boto3 = "^1.34.11"
thrift = "^0.16.0"
colorama = "^0.4.6"

[tool.poetry.dev-dependencies]
autopep8 = "^1.5.3"
flake8 = "^3.8.3"
mypy = "^0.780"
pylint = "^2.5.3"
pytest = "^5.4.3"
pytest-mock = "^3.1.1"
moto = "*"
wheel = "^0.38.1"
twine = "^3.1.1"
autopep8 = "^2.0.4"
flake8 = "^6.1.0"
mypy = "^1.8.0"
pyarrow = "14.0.2" # various test outputs are sensitive to this
pylint = "^3.0.3"
pytest = "^7.4.3"
pytest-mock = "^3.12.0"
moto = "^4.2.12"
wheel = "^0.42.0"
twine = "^4.0.2"

[tool.poetry.scripts]
parquet-tools = "parquet_tools.cli:main"
Expand Down
Binary file added tests/test0.parquet
Binary file not shown.
56 changes: 56 additions & 0 deletions tests/test0_inspect.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@

############ file meta data ############
created_by: parquet-cpp-arrow version 14.0.2
num_columns: 4
num_rows: 0
num_row_groups: 1
format_version: 2.6
serialized_size: 2398


############ Columns ############
a
b
c
d

############ Column(a) ############
name: a
path: a
max_definition_level: 1
max_repetition_level: 0
physical_type: DOUBLE
logical_type: None
converted_type (legacy): NONE
compression: SNAPPY (space_saved: -7%)

############ Column(b) ############
name: b
path: b
max_definition_level: 1
max_repetition_level: 0
physical_type: INT32
logical_type: Null
converted_type (legacy): NONE
compression: SNAPPY (space_saved: -7%)

############ Column(c) ############
name: c
path: c
max_definition_level: 1
max_repetition_level: 0
physical_type: INT64
logical_type: None
converted_type (legacy): NONE
compression: SNAPPY (space_saved: -7%)

############ Column(d) ############
name: d
path: d
max_definition_level: 1
max_repetition_level: 0
physical_type: BOOLEAN
logical_type: None
converted_type (legacy): NONE
compression: SNAPPY (space_saved: N/A)

Binary file modified tests/test1.parquet
Binary file not shown.
Binary file modified tests/test2.parquet
Binary file not shown.
20 changes: 8 additions & 12 deletions tests/test_inspect.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,14 @@
import pytest
from parquet_tools.commands.inspect import _execute_detail, _execute_simple
import pyarrow as pa
import pandas as pd
import pytest
from tempfile import TemporaryDirectory
import numpy as np

from parquet_tools.commands.inspect import _execute_detail, _execute_simple
from tests.test_parquets import get_test_dataframe


@pytest.fixture
def parquet_file():
df = pd.DataFrame(
{'one': [-1, np.nan, 2.5],
'two': ['foo', 'bar', 'baz'],
'three': [True, False, True]}
)
df = get_test_dataframe()
table = pa.Table.from_pandas(df)
with TemporaryDirectory() as tmp_path:
pq_path = f'{tmp_path}/test.pq'
Expand All @@ -36,12 +32,12 @@ def test_excute_simple(capfd, parquet_file):
assert err == ''
assert out == '''
############ file meta data ############
created_by: parquet-cpp-arrow version 5.0.0
created_by: parquet-cpp-arrow version 14.0.2
num_columns: 3
num_rows: 3
num_row_groups: 1
format_version: 1.0
serialized_size: 2222
format_version: 2.6
serialized_size: 2223
############ Columns ############
Expand Down
41 changes: 27 additions & 14 deletions tests/test_parquet.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
from os import path
from os.path import dirname

from subprocess import check_output

from parquet_tools.parquet.reader import get_filemetadata
from parquet_tools.gen_py.parquet.ttypes import (FileMetaData, SchemaElement, LogicalType, StringType, RowGroup, ColumnMetaData, ColumnChunk,
Statistics, PageEncodingStats, KeyValue)
Expand All @@ -11,10 +16,9 @@ def fmd(self) -> FileMetaData:
return fmd

def test_version(self, fmd):
assert fmd.version == 2

assert fmd.version == 1

def test_schma(self, fmd):
def test_schema(self, fmd):
assert fmd.schema == [
SchemaElement(
type=None,
Expand Down Expand Up @@ -88,7 +92,7 @@ def test_row_groups(self, fmd):
file_offset=108,
meta_data=ColumnMetaData(
type=5,
encodings=[2, 0, 3],
encodings=[0, 3, 8],
path_in_schema=['one'],
codec=1,
num_values=3,
Expand All @@ -109,11 +113,11 @@ def test_row_groups(self, fmd):
encoding_stats=[
PageEncodingStats(
page_type=2,
encoding=2,
encoding=0,
count=1),
PageEncodingStats(
page_type=0,
encoding=2,
encoding=8,
count=1)],
bloom_filter_offset=None),
offset_index_offset=None,
Expand All @@ -127,7 +131,7 @@ def test_row_groups(self, fmd):
file_offset=281,
meta_data=ColumnMetaData(
type=6,
encodings=[2, 0, 3],
encodings=[0, 3, 8],
path_in_schema=['two'],
codec=1,
num_values=3,
Expand All @@ -147,11 +151,11 @@ def test_row_groups(self, fmd):
encoding_stats=[
PageEncodingStats(
page_type=2,
encoding=2,
encoding=0,
count=1),
PageEncodingStats(
page_type=0,
encoding=2,
encoding=8,
count=1)
],
bloom_filter_offset=None
Expand All @@ -168,7 +172,7 @@ def test_row_groups(self, fmd):
file_offset=388,
meta_data=ColumnMetaData(
type=0,
encodings=[0, 3],
encodings=[3, 0],
path_in_schema=['three'],
codec=1,
num_values=3,
Expand Down Expand Up @@ -200,10 +204,10 @@ def test_row_groups(self, fmd):
crypto_metadata=None,
encrypted_column_metadata=None
)],
total_byte_size=226,
total_byte_size=216,
num_rows=3,
sorting_columns=None,
file_offset=108,
file_offset=4,
total_compressed_size=226,
ordinal=0)
]
Expand All @@ -212,8 +216,17 @@ def test_key_value_metadata(self, fmd):
assert fmd.key_value_metadata == [
KeyValue(
key='pandas',
value='{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 3, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "one", "field_name": "one", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "two", "field_name": "two", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "three", "field_name": "three", "pandas_type": "bool", "numpy_type": "bool", "metadata": null}], "creator": {"library": "pyarrow", "version": "0.17.0"}, "pandas_version": "1.0.3"}'),
value='{"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 3, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "one", "field_name": "one", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "two", "field_name": "two", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "three", "field_name": "three", "pandas_type": "bool", "numpy_type": "bool", "metadata": null}], "creator": {"library": "pyarrow", "version": "14.0.2"}, "pandas_version": "2.1.4"}'),
KeyValue(
key='ARROW:schema',
value='/////4gDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABAwAQAAAAAAAKAAwAAAAEAAgACgAAALgCAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAACQAgAABAAAAIECAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDMsICJzdGVwIjogMX1dLCAiY29sdW1uX2luZGV4ZXMiOiBbeyJuYW1lIjogbnVsbCwgImZpZWxkX25hbWUiOiBudWxsLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IHsiZW5jb2RpbmciOiAiVVRGLTgifX1dLCAiY29sdW1ucyI6IFt7Im5hbWUiOiAib25lIiwgImZpZWxkX25hbWUiOiAib25lIiwgInBhbmRhc190eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ0d28iLCAiZmllbGRfbmFtZSI6ICJ0d28iLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAidGhyZWUiLCAiZmllbGRfbmFtZSI6ICJ0aHJlZSIsICJwYW5kYXNfdHlwZSI6ICJib29sIiwgIm51bXB5X3R5cGUiOiAiYm9vbCIsICJtZXRhZGF0YSI6IG51bGx9XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJvdyIsICJ2ZXJzaW9uIjogIjAuMTcuMCJ9LCAicGFuZGFzX3ZlcnNpb24iOiAiMS4wLjMifQAAAAYAAABwYW5kYXMAAAMAAABsAAAAMAAAAAQAAACw////AAABBhQAAAAMAAAABAAAAAAAAADY////BQAAAHRocmVlAAAA2P///wAAAQUYAAAAEAAAAAQAAAAAAAAABAAEAAQAAAADAAAAdHdvABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAyAAAAAUAAAABAAAAAAAAAAAAAYACAAGAAYAAAAAAAIAAwAAAG9uZQAAAAAA')
value='/////4gDAAAQAAAAAAAKAA4ABgAFAAgACgAAAAABBAAQAAAAAAAKAAwAAAAEAAgACgAAALgCAAAEAAAAAQAAAAwAAAAIAAwABAAIAAgAAACQAgAABAAAAIECAAB7ImluZGV4X2NvbHVtbnMiOiBbeyJraW5kIjogInJhbmdlIiwgIm5hbWUiOiBudWxsLCAic3RhcnQiOiAwLCAic3RvcCI6IDMsICJzdGVwIjogMX1dLCAiY29sdW1uX2luZGV4ZXMiOiBbeyJuYW1lIjogbnVsbCwgImZpZWxkX25hbWUiOiBudWxsLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IHsiZW5jb2RpbmciOiAiVVRGLTgifX1dLCAiY29sdW1ucyI6IFt7Im5hbWUiOiAib25lIiwgImZpZWxkX25hbWUiOiAib25lIiwgInBhbmRhc190eXBlIjogImZsb2F0NjQiLCAibnVtcHlfdHlwZSI6ICJmbG9hdDY0IiwgIm1ldGFkYXRhIjogbnVsbH0sIHsibmFtZSI6ICJ0d28iLCAiZmllbGRfbmFtZSI6ICJ0d28iLCAicGFuZGFzX3R5cGUiOiAidW5pY29kZSIsICJudW1weV90eXBlIjogIm9iamVjdCIsICJtZXRhZGF0YSI6IG51bGx9LCB7Im5hbWUiOiAidGhyZWUiLCAiZmllbGRfbmFtZSI6ICJ0aHJlZSIsICJwYW5kYXNfdHlwZSI6ICJib29sIiwgIm51bXB5X3R5cGUiOiAiYm9vbCIsICJtZXRhZGF0YSI6IG51bGx9XSwgImNyZWF0b3IiOiB7ImxpYnJhcnkiOiAicHlhcnJvdyIsICJ2ZXJzaW9uIjogIjE0LjAuMiJ9LCAicGFuZGFzX3ZlcnNpb24iOiAiMi4xLjQifQAAAAYAAABwYW5kYXMAAAMAAABsAAAAMAAAAAQAAACw////AAABBhAAAAAYAAAABAAAAAAAAAAFAAAAdGhyZWUAAADc////2P///wAAAQUQAAAAGAAAAAQAAAAAAAAAAwAAAHR3bwAEAAQABAAAABAAFAAIAAYABwAMAAAAEAAQAAAAAAABAxAAAAAcAAAABAAAAAAAAAADAAAAb25lAAAABgAIAAYABgAAAAAAAgAAAAAA')
]

def test_inspect(self):
    """Check ``parquet-tools inspect`` output for the 0-row fixture.

    Runs the ``parquet-tools`` CLI against ``test0.parquet`` (located next to
    this test module) and compares the decoded stdout with the stored text in
    ``test0_inspect.txt``.
    """
    fixtures_dir = dirname(__file__)
    command = ['parquet-tools', 'inspect', path.join(fixtures_dir, 'test0.parquet')]
    # check_output raises CalledProcessError if the CLI exits non-zero.
    cli_output = check_output(command).decode()
    with open(path.join(fixtures_dir, 'test0_inspect.txt'), 'r') as expected_file:
        assert cli_output == expected_file.read()
25 changes: 25 additions & 0 deletions tests/test_parquets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
from os.path import dirname

import numpy as np
import pandas as pd


def get_test_dataframe():
    """Return the small three-row frame shared by the Parquet tests.

    Columns: ``one`` (numeric with one NaN), ``two`` (strings) and
    ``three`` (booleans).
    """
    columns = {
        'one': [-1, np.nan, 2.5],
        'two': ['foo', 'bar', 'baz'],
        'three': [True, False, True],
    }
    return pd.DataFrame(columns)


def write_test_dataframes(tests_dir=None):
    """Regenerate the Parquet fixture files used by the tests.

    Writes ``test1.parquet`` and ``test2.parquet`` (both holding the frame
    returned by ``get_test_dataframe()``) and ``test0.parquet`` (a 0-row frame
    with columns ``a``-``d`` cast to float, str, int and bool).

    Args:
        tests_dir: Directory to write the files into. Defaults to the
            directory containing this module.
    """
    from os.path import join

    if tests_dir is None:
        tests_dir = dirname(__file__)

    # Use join() instead of f'{tests_dir}/...': dirname() returns '' when
    # __file__ is a bare relative filename, and '' + '/test1.parquet' would
    # point at the filesystem root rather than the current directory.
    df = get_test_dataframe()
    df.to_parquet(join(tests_dir, 'test1.parquet'))
    df.to_parquet(join(tests_dir, 'test2.parquet'))

    df0 = pd.DataFrame({'a': [], 'b': [], 'c': [], 'd': []}).astype(
        {'a': float, 'b': str, 'c': int, 'd': bool}
    )
    df0.to_parquet(join(tests_dir, 'test0.parquet'))


if __name__ == '__main__':
write_test_dataframes()
5 changes: 4 additions & 1 deletion tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ def test_is_wildcard(self, pf, expected):
@pytest.mark.parametrize('pf, expected', [
(
LocalParquetFile(path='./tests/*.parquet'), [
LocalParquetFile('./tests/test0.parquet'),
LocalParquetFile('./tests/test1.parquet'),
LocalParquetFile('./tests/test2.parquet')
]
Expand Down Expand Up @@ -82,11 +83,13 @@ def test_multiple_localfile(self):
LocalParquetFile(path='./tests/*.parquet'),
)

assert len(actual) == 2
assert len(actual) == 3
assert isinstance(actual[0], LocalParquetFile)
assert isinstance(actual[1], LocalParquetFile)
assert isinstance(actual[1], LocalParquetFile)

assert {a.path for a in actual} == {
'./tests/test0.parquet',
'./tests/test1.parquet',
'./tests/test2.parquet',
}
Expand Down

0 comments on commit e4faa21

Please sign in to comment.