### Reading the first Parquet file

In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import os

In [2]:
def get_first_parquet_from_path(path):
    """Return the path of the first ``.parquet`` file found under ``path``.

    The tree is walked top-down with directory and file names visited in
    sorted order, so the result does not depend on the order in which the
    operating system happens to list directory entries.

    Args:
        path: Root directory to search (searched recursively).

    Returns:
        The joined path of the first matching file, or ``None`` when no
        file ending in ``.parquet`` exists under ``path``.
    """
    for dir_path, dir_names, files in os.walk(path):
        # os.walk yields entries in arbitrary (filesystem) order; sorting the
        # directory list in place also fixes the order of the descent.
        dir_names.sort()
        for f in sorted(files):
            if f.endswith(".parquet"):
                return os.path.join(dir_path, f)
    return None

In [3]:
path = './train_parquet'
first_pq = get_first_parquet_from_path(path)
# The helper returns None when nothing matches; stop here with a clear
# message rather than letting the later read calls fail on a None path.
if first_pq is None:
    raise FileNotFoundError(f"No .parquet file found under {path!r}")
first_pq

'./train_parquet/000000000_000100000.parquet'

In [4]:
# Load the whole file into an Arrow table, then show its row count,
# column count and schema as a tuple.
first_ds = pq.read_table(first_pq)
first_ds.num_rows, first_ds.num_columns, first_ds.schema

(5227653,
 4,
 session: int64
 aid: int64
 ts: int64
 type: string
 -- schema metadata --
 pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 688)

In [5]:
# Open the file for metadata access and print the column-chunk metadata
# of every column in the first row group.
parquet_file = pq.ParquetFile(first_pq)
# NOTE(review): `ts` holds the metadata of row group 0, not the `ts` column
# of the data — consider a more descriptive name.
ts=parquet_file.metadata.row_group(0)
for nm in range(ts.num_columns):  # `nm` is a column index
    print(ts.column(nm))

<pyarrow._parquet.ColumnChunkMetaData object at 0x7fb0b117b6d0>
  file_offset: 1045713
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: session
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x7fb0b117b720>
      has_min_max: True
      min: 0
      max: 99999
      null_count: 0
      distinct_count: 0
      num_values: 5227653
      physical_type: INT64
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE_DICTIONARY', 'PLAIN', 'RLE')
  has_dictionary_page: True
  dictionary_page_offset: 4
  data_page_offset: 400266
  total_compressed_size: 1045709
  total_uncompressed_size: 1510526
<pyarrow._parquet.ColumnChunkMetaData object at 0x7fb0b117b6d0>
  file_offset: 19895452
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: aid
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x7fb0b117b9f0>
      has_min_max: True
      min: 1
 

### Reading and Writing Single Files

In [6]:
# Convert the Arrow table to a pandas DataFrame for display.
first_ds.to_pandas()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
5227648,99999,1544954,1660373630318,clicks
5227649,99999,1032408,1660373656430,clicks
5227650,99999,1544954,1660373678083,clicks
5227651,99999,554230,1660373715477,clicks


In [7]:
# Read only the `aid` and `type` columns instead of the whole file.
pq.read_table(first_pq, columns=['aid', 'type'])

pyarrow.Table
aid: int64
type: string
----
aid: [[1517085,1563459,1309446,16246,1781822,...,833777,833777,833777,953177,1400776],[953177,1270763,953177,953177,1243640,...,445834,445834,43464,982826,1250310],...,[1155739,866939,1470432,1439071,24635,...,1733488,90153,1640659,1796075,1640659],[276502,361506,276502,762756,90929,...,1544954,1032408,1544954,554230,1544954]]
type: [["clicks","clicks","clicks","clicks","clicks",...,"carts","clicks","clicks","clicks","clicks"],["clicks","clicks","clicks","clicks","clicks",...,"carts","carts","clicks","clicks","clicks"],...,["clicks","clicks","clicks","clicks","clicks",...,"clicks","clicks","clicks","clicks","clicks"],["clicks","clicks","clicks","clicks","clicks",...,"clicks","clicks","clicks","clicks","clicks"]]

In [8]:
# Read just the `ts` column via read_pandas and convert it to a DataFrame.
pq.read_pandas(first_pq, columns=['ts']).to_pandas()

Unnamed: 0,ts
0,1659304800025
1,1659304904511
2,1659367439426
3,1659367719997
4,1659367871344
...,...
5227648,1660373630318
5227649,1660373656430
5227650,1660373678083
5227651,1660373715477


### Reading Parquet and Memory Mapping

In [9]:
# Use the `pq` alias imported at the top of the notebook, like every other
# cell, instead of reaching the submodule through `pa.parquet`.
# memory_map=True asks pyarrow to memory-map the source file when reading.
pq_array = pq.read_table(first_pq, memory_map=True)

### Finer-grained Reading and Writing

In [10]:
# Open the file as a ParquetFile (same call as in cell 5) and show its
# file-level metadata.
parquet_file = pq.ParquetFile(first_pq)
parquet_file.metadata

<pyarrow._parquet.FileMetaData object at 0x7fb0c56ea1d0>
  created_by: parquet-cpp-arrow version 10.0.1
  num_columns: 4
  num_rows: 5227653
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 2799

In [11]:
# Show the Parquet-level schema of the file.
parquet_file.schema

<pyarrow._parquet.ParquetSchema object at 0x7fb0b11773c0>
required group field_id=-1 schema {
  optional int64 field_id=-1 session;
  optional int64 field_id=-1 aid;
  optional int64 field_id=-1 ts;
  optional binary field_id=-1 type (String);
}

In [12]:
# Number of row groups in the file.
parquet_file.num_row_groups

1

In [13]:
# Read the first row group (index 0) as an Arrow table.
parquet_file.read_row_group(0)

pyarrow.Table
session: int64
aid: int64
ts: int64
type: string
----
session: [[0,0,0,0,0,...,99999,99999,99999,99999,99999]]
aid: [[1517085,1563459,1309446,16246,1781822,...,1544954,1032408,1544954,554230,1544954]]
ts: [[1659304800025,1659304904511,1659367439426,1659367719997,1659367871344,...,1660373630318,1660373656430,1660373678083,1660373715477,1660373725829]]
type: [["clicks","clicks","clicks","clicks","clicks",...,"clicks","clicks","clicks","clicks","clicks"]]