# Objective
Experiment with parquet and arrow formatted files

In [1]:
# import required services
import datetime
import json
import platform
import time
import warnings
from pathlib import Path
import os
import glob
import shutil
import inspect

import click
import duckdb
import ibis
import psutil
from jinja2 import Template
from memory_profiler import memory_usage

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
import pyarrow.dataset as ds

In [2]:
def platform_info():
    print(f"{inspect.stack()[0][3]}  entered")
    return {
        "machine": platform.machine(),
        "version": platform.version(),
        "platform": platform.platform(),
        "system": platform.system(),
        "cpu_count": psutil.cpu_count(),
        "memory": psutil.virtual_memory().total,
        "processor": platform.processor(),
    }

In [3]:
def setGlobals(env_in):
    # set environment characteristics. 
    # expects env_in to be a dictionary
    env = env_in
    env['data'] = 'data'
    env['perf'] = 'perf'
    env['acq'] = 'acq'
    env['files'] = {'perf': 'Performance_2000Q1.txt.parquet', 'perf2': 'Performance_2000Q2.txt.parquet', 'perf3': 'Performance_2000Q3.txt.parquet', 'perf4': 'Performance_2000Q4.txt.parquet', 'acq': 'Acquisition_2000Q1.txt.parquet'}
    env['path_separator'] = os.path.sep
    env['platform_info'] = platform_info()
    env['threads'] = env['platform_info']['cpu_count']
    
    return(env)

In [4]:
env = setGlobals({})
f1 = env['data'] + env['path_separator'] + env['perf'] + env['path_separator'] + env['files']['perf']
f2 = env['data'] + env['path_separator'] + env['perf'] + env['path_separator'] + env['files']['perf2']
f3 = env['data'] + env['path_separator'] + env['perf'] + env['path_separator'] + env['files']['perf3']
f4 = env['data'] + env['path_separator'] + env['perf'] + env['path_separator'] + env['files']['perf4']

f1_parquet = pq.read_table([f1, f2, f3, f4])
f1_pq_metadata = pq.read_metadata(f1)
print(f1_pq_metadata)

platform_info  entered
<pyarrow._parquet.FileMetaData object at 0x16a077600>
  created_by: parquet-cpp-arrow version 7.0.0
  num_columns: 31
  num_rows: 9094679
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 7413


In [5]:
print(pc.count(f1_parquet['loan_id']))

36190382


In [10]:
dataset = ds.dataset("./data/perf", format="parquet")
row_count = 0
t0 = time.time()
for record_batch in dataset.to_batches():
    loan_id = record_batch.column("loan_id")
    row_count += len(loan_id)
t1 = time.time()    
print(f"{row_count} rows in {t1-t0} seconds")

1851966762 rows in 77.74129486083984 seconds
