### Reading the first parquet file

In [2]:
import os

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

In [3]:
def get_first_parquet_from_path(path):
    """Return the path of the first ``.parquet`` file found under ``path``.

    The tree is walked top-down, so files directly inside a directory are
    considered before anything in its sub-directories. Directory and file
    names are visited in sorted order, which makes "first" deterministic
    instead of depending on the order the filesystem happens to list entries.

    Parameters
    ----------
    path : str or os.PathLike
        Root directory to search recursively.

    Returns
    -------
    str or None
        ``os.path.join(dir_path, file_name)`` of the first match, or ``None``
        when no file ending in ``.parquet`` exists under ``path`` (including
        when ``path`` itself does not exist, since ``os.walk`` then yields
        nothing).
    """
    for dir_path, dir_names, files in os.walk(path):
        # Sorting in place tells os.walk (top-down mode) to descend into
        # sub-directories in sorted order.
        dir_names.sort()
        for f in sorted(files):
            if f.endswith(".parquet"):
                return os.path.join(dir_path, f)
    return None

In [4]:
# Directory that is searched (recursively, via os.walk) for parquet files.
path = './train_parquet'
# Pick one parquet file below `path`; later cells read this file as the working sample.
first_pq = get_first_parquet_from_path(path)
# Bare last expression so the notebook displays the resolved file path.
first_pq

'./train_parquet\\000000000_000100000.parquet'

In [5]:
# Read the selected parquet file into a pyarrow Table.
first_ds = pq.read_table(first_pq)
# Display row count, column count and schema as a quick check of what was loaded.
first_ds.num_rows, first_ds.num_columns, first_ds.schema

(5227653,
 4,
 session: int64
 aid: int64
 ts: int64
 type: string
 -- schema metadata --
 pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 688)

In [6]:
# Open the same file as a ParquetFile to look at its metadata instead of its rows.
parquet_file = pq.ParquetFile(first_pq)
# NOTE: `ts` holds the metadata of row group 0 only; despite the name it is
# unrelated to the dataset's `ts` column.
ts=parquet_file.metadata.row_group(0)
# Print the column-chunk metadata (statistics, compression, encodings, sizes)
# for every column in that row group.
for nm in range(ts.num_columns):
    print(ts.column(nm))

<pyarrow._parquet.ColumnChunkMetaData object at 0x000001FAF4BA6B10>
  file_offset: 1045713
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: session
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x000001FAF4BA6B60>
      has_min_max: True
      min: 0
      max: 99999
      null_count: 0
      distinct_count: 0
      num_values: 5227653
      physical_type: INT64
      logical_type: None
      converted_type (legacy): NONE
  compression: SNAPPY
  encodings: ('RLE_DICTIONARY', 'PLAIN', 'RLE')
  has_dictionary_page: True
  dictionary_page_offset: 4
  data_page_offset: 400266
  total_compressed_size: 1045709
  total_uncompressed_size: 1510526
<pyarrow._parquet.ColumnChunkMetaData object at 0x000001FAF4BA6B10>
  file_offset: 19895452
  file_path: 
  physical_type: INT64
  num_values: 5227653
  path_in_schema: aid
  is_stats_set: True
  statistics:
    <pyarrow._parquet.Statistics object at 0x000001FAF4BA6E30>
      has_min_max: Tru

### Reading and Writing Single Files

In [14]:
# Convert the Arrow table to a pandas DataFrame and display it
# (the notebook renders a truncated head/tail preview of the frame).
first_ds.to_pandas()

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
5227648,99999,1544954,1660373630318,clicks
5227649,99999,1032408,1660373656430,clicks
5227650,99999,1544954,1660373678083,clicks
5227651,99999,554230,1660373715477,clicks


In [17]:
# Build time-based features on a pandas copy of the sample table.
# `np` and `pd` are imported in the notebook's top import cell.
df = first_ds.to_pandas()

# `ts` is stored as epoch milliseconds; parse it into datetime64 values.
df['ts'] = pd.to_datetime(df['ts'], unit='ms')
df['day_of_week'] = df['ts'].dt.dayofweek  # Monday=0 ... Sunday=6
df['hour_of_day'] = df['ts'].dt.hour

# Whole seconds elapsed since the session's earliest event. 'min' is used
# rather than 'first' so the result does not depend on the rows of a session
# already being ordered by time (with 'first', an out-of-order session would
# get a wrong baseline and negative elapsed times).
session_start = df.groupby('session')['ts'].transform('min')
df['elapsed_time'] = (df['ts'] - session_start).dt.total_seconds().astype(np.int64)

df

Unnamed: 0,session,aid,ts,type,day_of_week,hour_of_day,elapsed_time
0,0,1517085,2022-07-31 22:00:00.025,clicks,6,22,0
1,0,1563459,2022-07-31 22:01:44.511,clicks,6,22,104
2,0,1309446,2022-08-01 15:23:59.426,clicks,0,15,62639
3,0,16246,2022-08-01 15:28:39.997,clicks,0,15,62919
4,0,1781822,2022-08-01 15:31:11.344,clicks,0,15,63071
...,...,...,...,...,...,...,...
5227648,99999,1544954,2022-08-13 06:53:50.318,clicks,5,6,1046918
5227649,99999,1032408,2022-08-13 06:54:16.430,clicks,5,6,1046944
5227650,99999,1544954,2022-08-13 06:54:38.083,clicks,5,6,1046966
5227651,99999,554230,2022-08-13 06:55:15.477,clicks,5,6,1047003
