In [1]:
import pyarrow as pa

# Day-of-month values for five birthdays, held in an Arrow array of
# 8-bit signed integers instead of a plain Python list.
days = pa.array(
    [1, 12, 17, 23, 28],
    type=pa.int8(),
)

# Bare trailing expression so the notebook renders the array.
days

<pyarrow.lib.Int8Array object at 0x132b2cac0>
[
  1,
  12,
  17,
  23,
  28
]

In [2]:
# Month and year components, one entry per birthday (same length as `days`).
month = pa.array([1, 3, 5, 7, 1], type=pa.int8())
years = pa.array([1900, 2000, 1995, 2000, 1995], type=pa.int16())

# Assemble the three arrays into a single table. Each mapping key becomes
# the column name, and insertion order fixes the column order.
birthdays_table = pa.table({"days": days, "months": month, "years": years})

birthdays_table

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1900,2000,1995,2000,1995]]

In [3]:
import pyarrow.parquet as pq

# Round-trip through Parquet: write the table to a file in the working
# directory, then read that same file back into a new table.
pq.write_table(birthdays_table, where='birthdays.parquet')
reloaded_birthdays = pq.read_table(source='birthdays.parquet')

# Display the reloaded table so it can be compared with the original.
reloaded_birthdays

pyarrow.Table
days: int8
months: int8
years: int16
----
days: [[1,12,17,23,28]]
months: [[1,3,5,7,1]]
years: [[1900,2000,1995,2000,1995]]

In [4]:
import pyarrow.compute as pc

# Tally how often each distinct birth year occurs. The result pairs every
# distinct year with its number of occurrences.
pc.value_counts(birthdays_table.column('years'))

<pyarrow.lib.StructArray object at 0x133031820>
-- is_valid: all not null
-- child 0 type: int16
  [
    1900,
    2000,
    1995
  ]
-- child 1 type: int64
  [
    1,
    2,
    2
  ]

In [5]:
import pyarrow.dataset as ds

# Save the table as a Parquet dataset under "savedir", split into one
# sub-directory per distinct value of the `years` column.
ds.write_dataset(
    birthdays_table,
    "savedir",
    format='parquet',
    # Partition on `years`, reusing the field (name and type) from the
    # table's own schema.
    partitioning=ds.partitioning(
        pa.schema([birthdays_table.schema.field('years')])
    ),
    # The default behaviour ("error") refuses to write into a directory that
    # already contains data, so re-running this cell would raise. Overwrite
    # files with the same name instead; unrelated files are left alone.
    existing_data_behavior="overwrite_or_ignore",
)

In [6]:
# Open the partitioned directory as a single logical dataset. The
# directory level below 'savedir' is interpreted as the `years` field.
birthdays_dataset = ds.dataset(
    'savedir',
    format='parquet',
    partitioning=['years'],
)

# List the Parquet files that make up the dataset.
birthdays_dataset.files

['savedir/1900/part-0.parquet',
 'savedir/1995/part-0.parquet',
 'savedir/2000/part-0.parquet']

In [7]:
import datetime

# Current year in UTC. `datetime.utcnow()` is deprecated since Python 3.12,
# so request a timezone-aware "now" instead; the `.year` value is the same.
current_year = datetime.datetime.now(datetime.timezone.utc).year

# Walk the dataset one record batch at a time rather than loading it whole.
for table_chunk in birthdays_dataset.to_batches():
    # Difference between calendar years only: month and day are ignored.
    print("AGES", pc.subtract(current_year, table_chunk['years']))


AGES [
  122
]
AGES [
  27,
  27
]
AGES [
  22,
  22
]
