### Arrow Experiments
https://arrow.apache.org/docs/python/index.html

Arrow Memory Pools

In [1]:
import pyarrow as pa
data = b'abcdefghijklmnopqrstuvwxyz'
buf = pa.py_buffer(data)

In [2]:
buf

<pyarrow.lib.Buffer at 0x7ff4dd0bb930>

In [3]:
buf.size

26

In [4]:
memoryview(buf)

<memory at 0x7ff4c451fdc0>

In [5]:
buf.to_pybytes()

b'abcdefghijklmnopqrstuvwxyz'

Memory Pools

In [13]:
pa.total_allocated_bytes()

0

In [14]:
buf = pa.allocate_buffer(1024, resizable=True)
pa.total_allocated_bytes()

1024

In [15]:
buf.resize(2048)
pa.total_allocated_bytes()

2048

Input and outputs - Files and Streams

Input Stream

In [22]:
buf = memoryview(b"some data")
stream = pa.input_stream(buf)
stream.read(4)

b'some'

In [23]:
import gzip
with gzip.open('example.gz', 'wb') as f:
    f.write(b'some data\n' * 3)
stream = pa.input_stream('example.gz')
stream.read()

b'some data\nsome data\nsome data\n'

Output Streams

In [24]:
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')

f = open('example1.dat', 'rb')

f.read()

b'some data'

CSV Files

In [26]:
import pandas
import pyarrow as pa
from pyarrow import csv

pa.csv.ParseOptions(delimiter=',')
fn = 'orders.csv'
table = csv.read_csv(fn)
table
pa.Table
len(table)
df = table.to_pandas()
df.head()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
0,1,37,O,131251.81,1996-01-02,5-LOW,Clerk#000000951,0,nstructions sleep furiously among
1,2,79,O,40183.29,1996-12-01,1-URGENT,Clerk#000000880,0,foxes. pending accounts at the pending silent...
2,3,124,F,160882.76,1993-10-14,5-LOW,Clerk#000000955,0,sly final accounts boost. carefully regular id...
3,4,137,O,31084.79,1995-10-11,5-LOW,Clerk#000000124,0,sits. slyly regular warthogs cajole. regular r...
4,5,46,F,86615.25,1994-07-30,5-LOW,Clerk#000000925,0,quickly. bold deposits sleep slyly. packages u...


In [27]:
pa.cpu_count()

8

Data Types and In-Memory Data Model
pyarrow types - DataType (metadata), Schema, Array, RecordBatch (Array objects with Schema), Tables (columns with one or more Array)

Metadata for type information

Fixed length primitive - int, float, bool, date
Variable length primitive - string, binary
Nested - List, struct, union
Dictionary - Encoded Categorical type

In [33]:
import pyarrow as pa
t1 = pa.int32()
t2 = pa.string()
t3 = pa.binary()
t4 = pa.binary(10)
t5 = pa.timestamp('ms')

In [34]:
t1

DataType(int32)

In [35]:
print(t1)
print(t2)
print(t3)
print(t4)
print(t5)

int32
string
binary
fixed_size_binary[10]
timestamp[ms]


Field has data type and some information

In [36]:
f0 = pa.field('int32_field', t1)

In [39]:
f0

pyarrow.Field<int32_field: int32>

In [40]:
f0.name
f0.type

DataType(int32)

In [41]:
t6 = pa.list_(t1)
t6

ListType(list<item: int32>)

Struct is a collection of names fields

In [43]:
fields = [
    pa.field('s0', t1),
    pa.field('s1', t2),
    pa.field('s2', t4),
    pa.field('s3', t6),
]


t7 = pa.struct(fields)

print(t7)

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


In [44]:
t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
print(t8)
t8 == t7

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


True

Schemas are Struct defining name and type of the columns.

In [45]:
my_schema = pa.schema([('field0', t1),
                       ('field1', t2),
                       ('field2', t4),
                       ('field3', t6)])
my_schema

field0: int32
field1: string
field2: fixed_size_binary[10]
field3: list<item: int32>
  child 0, item: int32

Arrays - One block of data

In [46]:
arr = pa.array([1, 2, None, 3])
arr

<pyarrow.lib.Int64Array object at 0x7f7d34986e80>
[
  1,
  2,
  null,
  3
]

In [47]:
pa.array([1, 2], type=pa.uint16())

<pyarrow.lib.UInt16Array object at 0x7f7d34986880>
[
  1,
  2
]

In [48]:
arr.type

DataType(int64)

In [49]:
len(arr)

4

In [50]:
arr.null_count

1

List Arrays

In [51]:
nested_arr = pa.array([[], None, [1, 2], [None, 1]])
nested_arr

<pyarrow.lib.ListArray object at 0x7f7d34986b20>
[
  [],
  null,
  [
    1,
    2
  ],
  [
    null,
    1
  ]
]

Struct Arrays - Pass type explicitly

In [54]:
ty = pa.struct([('x', pa.int8()),
                ('y', pa.bool_())])
pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)

<pyarrow.lib.StructArray object at 0x7f7d349ec6a0>
-- is_valid: all not null
-- child 0 type: int8
  [
    1,
    2
  ]
-- child 1 type: bool
  [
    true,
    false
  ]

In [55]:
pa.array([(3, True), (4, False)], type=ty)

<pyarrow.lib.StructArray object at 0x7f7d34992100>
-- is_valid: all not null
-- child 0 type: int8
  [
    3,
    4
  ]
-- child 1 type: bool
  [
    true,
    false
  ]

In [56]:
xs = pa.array([5, 6, 7], type=pa.int16())
ys = pa.array([False, True, True])
arr = pa.StructArray.from_arrays((xs, ys), names=('x', 'y'))

In [57]:
arr.type

StructType(struct<x: int16, y: bool>)

In [58]:
arr

<pyarrow.lib.StructArray object at 0x7f7d349923a0>
-- is_valid: all not null
-- child 0 type: int16
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    true,
    true
  ]

Union Arrays

In [59]:
xs = pa.array([5, 6, 7])
ys = pa.array([False, False, True])
types = pa.array([0, 1, 1], type=pa.int8())
union_arr = pa.UnionArray.from_sparse(types, [xs, ys])

In [60]:
union_arr.type

UnionType(sparse_union<0: int64=0, 1: bool=1>)

In [61]:
union_arr

<pyarrow.lib.UnionArray object at 0x7f7d34992160>
-- is_valid: all not null
-- type_ids:   [
    0,
    1,
    1
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    false,
    true
  ]

In [62]:
xs = pa.array([5, 6, 7])
ys = pa.array([False, True])
types = pa.array([0, 1, 1, 0, 0], type=pa.int8())
offsets = pa.array([0, 0, 1, 1, 2], type=pa.int32())
union_arr = pa.UnionArray.from_dense(types, offsets, [xs, ys])

In [63]:
union_arr.type

UnionType(dense_union<0: int64=0, 1: bool=1>)

In [64]:
union_arr

<pyarrow.lib.UnionArray object at 0x7f7d34992520>
-- is_valid: all not null
-- type_ids:   [
    0,
    1,
    1,
    0,
    0
  ]
-- value_offsets:   [
    0,
    0,
    1,
    1,
    2
  ]
-- child 0 type: int64
  [
    5,
    6,
    7
  ]
-- child 1 type: bool
  [
    false,
    true
  ]

Dictionary Arrays

In [65]:
indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)

dict_array

In [67]:
print(dict_array.type)
dict_array.indices
dict_array.dictionary

dictionary<values=string, indices=int64, ordered=0>


<pyarrow.lib.StringArray object at 0x7f7d3499f220>
[
  "foo",
  "bar",
  "baz"
]

In [68]:
dict_array.to_pandas()

0    foo
1    bar
2    foo
3    bar
4    baz
5    foo
6    NaN
7    baz
dtype: category
Categories (3, object): ['foo', 'bar', 'baz']

Record Batches - Collection of Equal length array instances

In [69]:
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True])
]

In [70]:
data

[<pyarrow.lib.Int64Array object at 0x7f7d34992d00>
 [
   1,
   2,
   3,
   4
 ],
 <pyarrow.lib.StringArray object at 0x7f7d349929a0>
 [
   "foo",
   "bar",
   "baz",
   null
 ],
 <pyarrow.lib.BooleanArray object at 0x7f7d349928e0>
 [
   true,
   null,
   false,
   true
 ]]

In [71]:
batch = pa.RecordBatch.from_arrays(data, ['f0', 'f1', 'f2'])

<pyarrow.lib.StringArray object at 0x7f7d34992760>
[
  "foo",
  "bar",
  "baz",
  null
]

In [72]:
batch.num_columns

3

In [73]:
batch.num_rows

4

In [74]:
batch.schema

f0: int64
f1: string
f2: bool

In [75]:
batch[1]

<pyarrow.lib.StringArray object at 0x7f7d34992820>
[
  "foo",
  "bar",
  "baz",
  null
]

In [76]:
batch2 = batch.slice(1, 3)

In [77]:
batch2[1]

<pyarrow.lib.StringArray object at 0x7f7d349ecdc0>
[
  "bar",
  "baz",
  null
]

Tables - Single Logical data set with multiple batches and array pieces

In [78]:
batches = [batch] * 5
table = pa.Table.from_batches(batches)

In [79]:
table

pyarrow.Table
f0: int64
f1: string
f2: bool

In [81]:
pa.Table

pyarrow.lib.Table

In [82]:
table.num_rows

20

In [83]:
c = table[0]

Can be converted to pandas for processing

In [85]:
c.to_pandas()

0     1
1     2
2     3
3     4
4     1
5     2
6     3
7     4
8     1
9     2
10    3
11    4
12    1
13    2
14    3
15    4
16    1
17    2
18    3
19    4
Name: f0, dtype: int64

In [86]:
tables = [table] * 2
table_all = pa.concat_tables(tables)

In [87]:
table_all.num_rows

40

In [88]:
c = table_all[0]
c.num_chunks

10

Streaming, Serialization and IPC

In [89]:
import pyarrow as pa
data = [
    pa.array([1, 2, 3, 4]),
    pa.array(['foo', 'bar', 'baz', None]),
    pa.array([True, None, False, True])
]
batch = pa.record_batch(data, names=['f0', 'f1', 'f2'])
batch.num_rows
batch.num_columns

3

In [90]:
sink = pa.BufferOutputStream()
writer = pa.ipc.new_stream(sink, batch.schema)

In [91]:
for i in range(5):
   writer.write_batch(batch)
writer.close()
buf = sink.getvalue()
buf.size

1984

In [92]:
reader = pa.ipc.open_stream(buf)
reader.schema
batches = [b for b in reader]
len(batches)

5

In [93]:
batches[0].equals(batch)

True

In [94]:
sink = pa.BufferOutputStream()
writer = pa.ipc.new_file(sink, batch.schema)
for i in range(10):
   writer.write_batch(batch)
writer.close()
buf = sink.getvalue()
buf.size

4226

In [95]:
reader = pa.ipc.open_file(buf)

In [96]:
reader.num_record_batches
b = reader.get_batch(3)

In [97]:
b.equals(batch)

True

In [98]:
df = pa.ipc.open_file(buf).read_pandas()
df[:5]

Unnamed: 0,f0,f1,f2
0,1,foo,True
1,2,bar,
2,3,baz,False
3,4,,True
4,1,foo,True


In [99]:
import numpy as np
data = {
    i: np.random.randn(500, 500)
    for i in range(100)
}

In [100]:
buf = pa.serialize(data).to_buffer()

In [101]:
type(buf)

pyarrow.lib.Buffer

In [102]:
buf.size

200028928

In [103]:
restored_data = pa.deserialize(buf)
restored_data[0]

array([[-2.57709743, -0.65188211,  0.62138459, ..., -1.30677344,
        -0.16321181, -1.34221709],
       [ 0.15277745,  2.23685843, -1.47097744, ...,  1.44389604,
        -0.76526336, -0.03399684],
       [-0.76425333,  0.09543512,  1.88301336, ...,  0.03670551,
        -0.22610357,  0.5129054 ],
       ...,
       [-0.54037216, -0.68647922, -0.58087724, ..., -0.86372509,
        -0.26299099, -1.01148445],
       [-2.24586222,  0.76992033,  0.32833126, ..., -0.8088799 ,
        -0.57458875,  1.1653306 ],
       [-0.16203357,  0.54109069,  0.10904997, ...,  1.28284833,
        -0.9897756 , -1.31204098]])

In [104]:
import pandas as pd
df = pd.DataFrame({'a': [1, 2, 3, 4, 5]})
context = pa.default_serialization_context()
serialized_df = context.serialize(df)
df_components = serialized_df.to_components()
original_df = context.deserialize_components(df_components)
original_df

  context = pa.default_serialization_context()


Unnamed: 0,a
0,1
1,2
2,3
3,4
4,5


Filesystem Interfaces - Local FileSystem, S3, HDFS

In [105]:
from pyarrow import fs
local = fs.LocalFileSystem()

In [106]:
s3, path = fs.FileSystem.from_uri("s3://my-bucket")

In [107]:
s3

<pyarrow._s3fs.S3FileSystem at 0x7f7d34988470>

In [108]:
path

'my-bucket'

In [110]:
pq.read_table("s3://my-bucket/data.parquet")
s3 = fs.S3FileSystem(".")
pq.read_table("my-bucket/data.parquet", filesystem=s3)

NameError: name 'pq' is not defined

In [111]:
local = fs.LocalFileSystem()

with local.open_output_stream("test.arrow") as file:
   with pa.RecordBatchFileWriter(file, table.schema) as writer:
      writer.write_table(table)

In [112]:
local.get_file_info(fs.FileSelector("dataset/", recursive=True))

FileNotFoundError: [Errno 2] Cannot list directory 'dataset/'. Detail: [errno 2] No such file or directory

In [113]:
local.get_file_info('test.arrow')
local.get_file_info('non_existent')

<FileInfo for 'non_existent': type=FileType.NotFound>

In [114]:
from pyarrow import fs
s3 = fs.S3FileSystem(region='eu-west-3')

In [115]:
f = s3.open_input_stream('my-test-bucket/Dir1/File2')
f.readall()

OSError: When reading information for key 'Dir1/File2' in bucket 'my-test-bucket': AWS Error [code 100]: No response body.

Set classpath to include hadoop libraries
export CLASSPATH=`$HADOOP_HOME/bin/hdfs classpath --glob`

In [117]:
import gcsfs
fs = gcsfs.GCSFileSystem(project='my-google-project')

# using this to read a partitioned dataset
import pyarrow.dataset as ds
ds.dataset("data/", filesystem=fs)

ModuleNotFoundError: No module named 'gcsfs'

Plasma In-Memory Object Storage

In [119]:
plasma_store  -s /tmp/plasma

NameError: name 'plasma_store' is not defined

NumPy to Arrow

In [120]:
import numpy as np
import pyarrow as pa
data = np.arange(10, dtype='int16')
arr = pa.array(data)
arr

<pyarrow.lib.Int16Array object at 0x7f7ccc449e20>
[
  0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9
]

Arrow to Numpy

In [121]:
import numpy as np
import pyarrow as pa
arr = pa.array([4, 5, 6], type=pa.int32())
view = arr.to_numpy()
view

array([4, 5, 6], dtype=int32)

Pandas Integration

In [122]:
import pandas as pd
import pyarrow as pa

In [123]:
import pyarrow as pa
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# Convert from pandas to Arrow
table = pa.Table.from_pandas(df)
# Convert back to pandas
df_new = table.to_pandas()

# Infer Arrow schema from pandas
schema = pa.Schema.from_pandas(df)

In [124]:
from datetime import date
s = pd.Series([date(2018, 12, 31), None, date(2000, 1, 1)])
s

0    2018-12-31
1          None
2    2000-01-01
dtype: object

In [126]:
arr = pa.array(s)
arr.type

DataType(date32[day])

In [127]:
arr[0]

<pyarrow.Date32Scalar: datetime.date(2018, 12, 31)>

In [128]:
arr = pa.array(s, type='date64')
arr.type

DataType(date64[ms])

In [129]:
arr.to_pandas()

0    2018-12-31
1          None
2    2000-01-01
dtype: object

In [130]:
s2 = pd.Series(arr.to_pandas(date_as_object=False))
s2.dtype

dtype('<M8[ns]')

Reading CSV files

In [131]:
from pyarrow import csv
fn = 'orders.csv.gz'
table = csv.read_csv(fn)
table

pyarrow.Table
O_ORDERKEY: int64
O_CUSTKEY: int64
O_ORDERSTATUS: string
O_TOTALPRICE: double
O_ORDERDATE: timestamp[s]
O_ORDERPRIORITY: string
O_CLERK: string
O_SHIPPRIORITY: int64
O_COMMENT: string

In [133]:
pa.Table

pyarrow.lib.Table

In [134]:
len(table)

1500

In [135]:
df = table.to_pandas()
df.head()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
0,1,37,O,131251.81,1996-01-02,5-LOW,Clerk#000000951,0,nstructions sleep furiously among
1,2,79,O,40183.29,1996-12-01,1-URGENT,Clerk#000000880,0,foxes. pending accounts at the pending silent...
2,3,124,F,160882.76,1993-10-14,5-LOW,Clerk#000000955,0,sly final accounts boost. carefully regular id...
3,4,137,O,31084.79,1995-10-11,5-LOW,Clerk#000000124,0,sits. slyly regular warthogs cajole. regular r...
4,5,46,F,86615.25,1994-07-30,5-LOW,Clerk#000000925,0,quickly. bold deposits sleep slyly. packages u...


JSon Reader

In [136]:
from pyarrow import json
fn = 'my_data.json'
table = json.read_json(fn)
table

FileNotFoundError: [Errno 2] Failed to open local file 'my_data.json'. Detail: [errno 2] No such file or directory

Parquet Reader

In [137]:
import numpy as np
import pandas as pd
import pyarrow as pa
df = pd.DataFrame({'one': [-1, np.nan, 2.5],
                   'two': ['foo', 'bar', 'baz'],
                   'three': [True, False, True]},
                   index=list('abc'))
table = pa.Table.from_pandas(df)

In [138]:
import pyarrow.parquet as pq
pq.write_table(table, 'example.parquet')

In [140]:
table2 = pq.read_table('example.parquet')
table2.to_pandas()

Unnamed: 0,one,two,three
a,-1.0,foo,True
b,,bar,False
c,2.5,baz,True


In [141]:
pq.read_table('example.parquet', columns=['one', 'three'])

pyarrow.Table
one: double
three: bool

In [142]:
pq.read_pandas('example.parquet', columns=['two']).to_pandas()

Unnamed: 0,two
a,foo
b,bar
c,baz


In [143]:
df = pd.DataFrame({'one': [-1, np.nan, 2.5],
                   'two': ['foo', 'bar', 'baz'],
                   'three': [True, False, True]},
                   index=list('abc'))

In [144]:
df

Unnamed: 0,one,two,three
a,-1.0,foo,True
b,,bar,False
c,2.5,baz,True


In [145]:
table = pa.Table.from_pandas(df, preserve_index=False)

In [146]:
pq.write_table(table, 'example_noindex.parquet')
t = pq.read_table('example_noindex.parquet')
t.to_pandas()

Unnamed: 0,one,two,three
0,-1.0,foo,True
1,,bar,False
2,2.5,baz,True


In [147]:
parquet_file = pq.ParquetFile('example.parquet')
parquet_file.metadata
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x7f7cc6e26ec0>
required group field_id=0 schema {
  optional double field_id=1 one;
  optional binary field_id=2 two (String);
  optional boolean field_id=3 three;
  optional binary field_id=4 __index_level_0__ (String);
}

In [148]:
parquet_file.num_row_groups
parquet_file.read_row_group(0)

pyarrow.Table
one: double
two: string
three: bool
__index_level_0__: string

In [149]:
writer = pq.ParquetWriter('example2.parquet', table.schema)
for i in range(3):
    writer.write_table(table)
writer.close()
pf2 = pq.ParquetFile('example2.parquet')
pf2.num_row_groups

3

In [150]:
with pq.ParquetWriter('example3.parquet', table.schema) as writer:
    for i in range(3):
        writer.write_table(table)

In [151]:
parquet_file = pq.ParquetFile('example.parquet')
metadata = parquet_file.metadata

In [152]:
metadata = pq.read_metadata('example.parquet')
metadata

<pyarrow._parquet.FileMetaData object at 0x7f7cc6e2c6d0>
  created_by: parquet-cpp version 1.5.1-SNAPSHOT
  num_columns: 4
  num_rows: 3
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2578

In [153]:
metadata.row_group(0)

<pyarrow._parquet.RowGroupMetaData object at 0x7f7cc6e280e0>
  num_columns: 4
  num_rows: 3
  total_byte_size: 296

In [154]:
metadata.row_group(0).column(0)

<pyarrow._parquet.ColumnChunkMetaData object at 0x7f7cc6e2c8b0>
  file_offset: 108
  file_path: 
  physical_type: DOUBLE
  num_values: 3
  path_in_schema: one
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x7f7cc6e2cc20>
      has_min_max: True
      min: -1.0
      max: 2.5
      null_count: 1
      distinct_count: 0
      num_values: 2
      physical_type: DOUBLE
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('PLAIN_DICTIONARY', 'PLAIN', 'RLE')
  has_dictionary_page: True
  dictionary_page_offset: 4
  data_page_offset: 36
  total_compressed_size: 104
  total_uncompressed_size: 100

In [None]:
pq.read_table(table, ".", read_dictionary=['binary_c0', 'stringb_c2'])

Arrow can read data from HDFS, Azure blob storage, S3 storage etc. and create an Arrow Array or Table.

Tabular Data Set - pyarrow.dataset allows connection to database also. 
It also can be used with Cuda with Numba package. Numba implements pyarrow code on LLVM or parallel compiler or GPU for parallel implementations.

In [157]:
import tempfile
import pathlib
import pyarrow as pa
import pyarrow.parquet as pq
base = pathlib.Path(tempfile.gettempdir())
(base / "parquet_dataset").mkdir(exist_ok=True)
# creating an Arrow Table
table = pa.table({'a': range(10), 'b': np.random.randn(10), 'c': [1, 2] * 5})
# writing it into two parquet files
pq.write_table(table.slice(0, 5), base / "parquet_dataset/data1.parquet")
pq.write_table(table.slice(5, 10), base / "parquet_dataset/data2.parquet")

In [158]:
import pyarrow.dataset as ds
dataset = ds.dataset(base / "parquet_dataset", format="parquet")
dataset

<pyarrow._dataset.FileSystemDataset at 0x7f7cc6e2a570>

In [159]:
dataset.files

['/tmp/parquet_dataset/data1.parquet', '/tmp/parquet_dataset/data2.parquet']

In [160]:
print(dataset.schema.to_string(show_field_metadata=False))

a: int64
b: double
c: int64


In [161]:
dataset.to_table()

pyarrow.Table
a: int64
b: double
c: int64

In [162]:
dataset.to_table().to_pandas()

Unnamed: 0,a,b,c
0,0,-1.119416,1
1,1,1.512939,2
2,2,-0.16344,1
3,3,-0.530309,2
4,4,-0.479388,1
5,5,0.875306,2
6,6,-0.207172,1
7,7,-0.17125,2
8,8,0.087928,1
9,9,-0.090089,2


In [163]:
dataset = ds.dataset(base / "parquet_dataset", format="parquet")
dataset.to_table(columns=['a', 'b']).to_pandas()

Unnamed: 0,a,b
0,0,-1.119416
1,1,1.512939
2,2,-0.16344
3,3,-0.530309
4,4,-0.479388
5,5,0.875306
6,6,-0.207172
7,7,-0.17125
8,8,0.087928
9,9,-0.090089


In [164]:
dataset.to_table(filter=ds.field('a') >= 7).to_pandas()

Unnamed: 0,a,b,c
0,7,-0.17125,2
1,8,0.087928,1
2,9,-0.090089,2


In [165]:
dataset.to_table(filter=ds.field('c') == 2).to_pandas()

Unnamed: 0,a,b,c
0,1,1.512939,2
1,3,-0.530309,2
2,5,0.875306,2
3,7,-0.17125,2
4,9,-0.090089,2


In [166]:
ds.field('a') != 3

<pyarrow.dataset.Expression (a != 3:int64)>

In [167]:
ds.field('a').isin([1, 2, 3])

<pyarrow.dataset.Expression (a is in [
  1,
  2,
  3
])>

In [168]:
(ds.field('a') > ds.field('b')) & (ds.field('b') > 1)

<pyarrow.dataset.Expression ((a > b) and (b > 1:int64))>