### Arrow Experiments
https://arrow.apache.org/docs/python/index.html

Arrow Memory Pools

In [1]:
import pyarrow as pa

# Wrap an existing Python bytes object in an Arrow buffer.
data = b'abcdefghijklmnopqrstuvwxyz'
buf = pa.py_buffer(data)
# Only the final expression of a cell is echoed, so this bare `buf` displays nothing.
buf
# Size of the buffer in bytes (26: one byte per letter).
buf.size

26

In [3]:
# An Arrow buffer can be handed to memoryview(), i.e. it exposes its bytes
# through the Python buffer protocol.
memoryview(buf)

<memory at 0x7f40541567c0>

In [4]:
# Return the buffer's contents as a Python bytes object.
buf.to_pybytes()

b'abcdefghijklmnopqrstuvwxyz'

In [5]:
# Bytes currently allocated from Arrow's default memory pool.
# NOTE(review): presumably 0 here because py_buffer() only wrapped memory owned
# by the Python bytes object rather than allocating from the pool.
pa.total_allocated_bytes()

0

In [7]:
# Allocate 1024 bytes from the default memory pool and ask for a resizable buffer.
# NOTE: this rebinds `buf`, replacing the py_buffer created earlier.
buf = pa.allocate_buffer(1024, resizable=True)
# The pool's counter now reflects the new allocation.
pa.total_allocated_bytes()

1024

In [8]:
# Grow the buffer to 2048 bytes; the pool's allocated-bytes counter follows.
buf.resize(2048)
pa.total_allocated_bytes()

2048

Input Streams

In [9]:
# pa.input_stream() accepts an in-memory, buffer-like object such as a memoryview.
# NOTE: `buf` is rebound here from an Arrow buffer to a Python memoryview.
buf = memoryview(b"some data")
stream = pa.input_stream(buf)
# Read only the first four bytes of the stream.
stream.read(4)

b'some'

In [10]:
import gzip

# Write a small gzip-compressed fixture file to read back through Arrow.
with gzip.open('example.gz', 'wb') as f:
    f.write(b'some data\n' * 3)

# pa.input_stream() defaults to compression='detect', so the '.gz' suffix makes
# the stream decompress transparently. Reading inside a `with` block closes the
# underlying file handle instead of leaving it open for the life of the kernel.
with pa.input_stream('example.gz') as stream:
    decompressed = stream.read()

# Last expression of the cell: the decompressed payload.
decompressed

b'some data\nsome data\nsome data\n'

Output Streams

In [11]:
# Write raw bytes through an Arrow output stream; the `with` block flushes and
# closes the stream on exit.
with pa.output_stream('example1.dat') as stream:
    stream.write(b'some data')

# Read the file back with plain Python I/O to confirm what was written. The
# handle is opened in a `with` block so it is closed deterministically rather
# than leaked.
with open('example1.dat', 'rb') as f:
    written = f.read()

# Last expression of the cell: the bytes found on disk.
written

b'some data'

CSV Files

In [13]:
import pandas  # not referenced by name; Table.to_pandas() below needs pandas installed
import pyarrow as pa
from pyarrow import csv

# Parse options only take effect when passed to read_csv(); constructing a
# ParseOptions object and discarding it does nothing.
parse_options = csv.ParseOptions(delimiter=',')
fn = 'orders.csv'
table = csv.read_csv(fn, parse_options=parse_options)

# read_csv() returns a pyarrow.Table. Check it through the `pa` alias: the bare
# name `pyarrow` is never bound in this notebook and would raise NameError.
assert isinstance(table, pa.Table)

# Convert to a pandas DataFrame and preview the first rows.
df = table.to_pandas()
df.head()

Unnamed: 0,O_ORDERKEY,O_CUSTKEY,O_ORDERSTATUS,O_TOTALPRICE,O_ORDERDATE,O_ORDERPRIORITY,O_CLERK,O_SHIPPRIORITY,O_COMMENT
0,1,37,O,131251.81,1996-01-02,5-LOW,Clerk#000000951,0,nstructions sleep furiously among
1,2,79,O,40183.29,1996-12-01,1-URGENT,Clerk#000000880,0,foxes. pending accounts at the pending silent...
2,3,124,F,160882.76,1993-10-14,5-LOW,Clerk#000000955,0,sly final accounts boost. carefully regular id...
3,4,137,O,31084.79,1995-10-11,5-LOW,Clerk#000000124,0,sits. slyly regular warthogs cajole. regular r...
4,5,46,F,86615.25,1994-07-30,5-LOW,Clerk#000000925,0,quickly. bold deposits sleep slyly. packages u...


In [15]:
# Number of threads Arrow uses for parallel operations (its CPU thread pool size).
pa.cpu_count()

8

In [17]:
import pyarrow as pa

# Data type factories. t1..t5 are reused by later cells.
t1, t2, t3 = pa.int32(), pa.string(), pa.binary()
t4 = pa.binary(10)       # a length argument yields fixed_size_binary[10]
t5 = pa.timestamp('ms')  # timestamp with millisecond resolution

# Print each type's string form on its own line.
print(t1, t2, t3, t4, t5, sep='\n')

int32
string
binary
fixed_size_binary[10]
timestamp[ms]


In [18]:
# A field pairs a name with a data type.
f0 = pa.field('int32_field', t1)
# Bare expressions before the last line are evaluated but not displayed.
f0
f0.name
# Last expression: this is the value the cell outputs.
f0.type

DataType(int32)

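A struct type is defined, much like a schema, by a collection of named fields. A list type is a variable-length sequence of values that all share one item type.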

In [28]:
# List type whose items are int32 (t1).
t6 = pa.list_(t1)
t6

ListType(list<item: int32>)

In [29]:
# Build the struct's fields from (name, type) pairs, then wrap them in a struct type.
fields = [
    pa.field(field_name, field_type)
    for field_name, field_type in (
        ('s0', t1),
        ('s1', t2),
        ('s2', t4),
        ('s3', t6),
    )
]

t7 = pa.struct(fields)

print(t7)

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


In [30]:
# Same struct as t7, built from (name, type) tuples instead of pa.field objects.
t8 = pa.struct([('s0', t1), ('s1', t2), ('s2', t4), ('s3', t6)])
print(t8)
# The two struct types compare equal even though they were constructed differently.
t8 == t7

struct<s0: int32, s1: string, s2: fixed_size_binary[10], s3: list<item: int32>>


True

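A schema is similar to a struct type: both are ordered collections of named, typed fields. A schema describes the columns of a table or record batch.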

In [31]:
# Build a schema from (name, type) tuples, reusing the types defined earlier.
my_schema = pa.schema([('field0', t1),
                       ('field1', t2),
                       ('field2', t4),
                       ('field3', t6)])
my_schema

field0: int32
field1: string
field2: fixed_size_binary[10]
field3: list<item: int32>
  child 0, item: int32

Arrays

In [32]:
# The element type is inferred from the Python values (int64 here); None becomes a null.
arr = pa.array([1, 2, None, 3])
arr

<pyarrow.lib.Int64Array object at 0x7efe537d06a0>
[
  1,
  2,
  null,
  3
]

In [33]:
# Pass type= to choose the element type explicitly instead of relying on inference.
pa.array([1, 2], type=pa.uint16())

<pyarrow.lib.UInt16Array object at 0x7efe537d0220>
[
  1,
  2
]

In [35]:
# Data type that was inferred for `arr`.
arr.type

DataType(int64)

In [36]:
# Number of elements; the null entry is counted.
len(arr)

4

In [37]:
# Number of null entries (the single None passed to pa.array).
arr.null_count

1

In [38]:
# Nested Python lists are inferred as a list array. None can stand for a whole
# missing list or for a missing item inside a list.
nested_arr = pa.array([[], None, [1, 2], [None, 1]])
nested_arr

<pyarrow.lib.ListArray object at 0x7efe537d0760>
[
  [],
  null,
  [
    1,
    2
  ],
  [
    null,
    1
  ]
]

In [39]:
# Struct type with an int8 field 'x' and a boolean field 'y'.
ty = pa.struct([('x', pa.int8()),
                ('y', pa.bool_())])
# Build a struct array from dicts keyed by field name.
pa.array([{'x': 1, 'y': True}, {'x': 2, 'y': False}], type=ty)

<pyarrow.lib.StructArray object at 0x7efe537d0ee0>
-- is_valid: all not null
-- child 0 type: int8
  [
    1,
    2
  ]
-- child 1 type: bool
  [
    true,
    false
  ]

In [40]:
# Tuples also work: values are matched to the struct's fields by position.
pa.array([(3, True), (4, False)], type=ty)

<pyarrow.lib.StructArray object at 0x7efe537d0dc0>
-- is_valid: all not null
-- child 0 type: int8
  [
    3,
    4
  ]
-- child 1 type: bool
  [
    true,
    false
  ]