In [55]:
from pathlib import Path
import json
from typing import Union

import pyarrow.parquet as pq
import pyarrow.dataset as ds

from pyarrow.dataset import Dataset

In [2]:
# parent of the current working directory, used as the base for all project paths
dir_prj = Path.cwd().parent

# data directory located directly under the base directory
dir_data = dir_prj / 'data'

# location of the Foursquare GeoParquet data inside the data directory
gpqt_pth = dir_data / 'raw' / 'foursquare' / 'geoparquet'

gpqt_pth

WindowsPath('D:/projects/arcpy-parquet/data/raw/foursquare/geoparquet')

In [145]:
def get_col_max_len(col):
    """
    Get the length of the maximum-value statistic for a single column chunk.

    Args:
        col: Column chunk metadata exposing a ``statistics`` attribute, as returned by ``rg.column(idx)``.

    Returns:
        Length of ``statistics.max`` when that value is a string. ``None`` when the column chunk has no
        statistics or the maximum is not a string.
    """
    # NOTE(review): Parquet min/max statistics order strings by value, not by length, so this is the
    # length of the greatest string and may be shorter than the longest string - confirm acceptable.
    stats = col.statistics

    # statistics are optional in the metadata; a chunk without them has no usable maximum
    if stats is None:
        return None

    max_val = stats.max
    return len(max_val) if isinstance(max_val, str) else None


def get_row_group_max_lengths(rg):
    """
    Get the maximum-statistic lengths for every column in one row group.

    Args:
        rg: Row group metadata exposing ``num_columns`` and ``column(idx)``.

    Returns:
        List with one entry per column, in column order; each entry is the result of ``get_col_max_len``.
    """
    return [get_col_max_len(rg.column(idx)) for idx in range(rg.num_columns)]


def get_string_columns(dataset: Union[str, Path, Dataset]) -> list[str]:
    """
    Get the list of string column names for a Parquet dataset.

    Args:
        dataset: PyArrow Dataset, or a path (``str`` or ``Path``) to a Parquet dataset.

    Returns:
        Names of the schema fields whose type description contains ``"string"``.

    Raises:
        ValueError: If ``dataset`` is neither a PyArrow Dataset nor a path.
    """
    # open a path, given either as a string or a Path object, as an arrow dataset
    if isinstance(dataset, (str, Path)):
        dataset = ds.dataset(dataset, format='parquet')

    if not isinstance(dataset, Dataset):
        raise ValueError('dataset must be a PyArrow Dataset or path to a Parquet dataset')

    # substring match on the type description, so variants such as large_string are included as well
    return [field.name for field in dataset.schema if "string" in str(field.type)]


def get_file_max_len(pqt_file: Union[str, Path]) -> dict[str, Union[int, None]]:
    """
    Get the maximum string length for each column of a file by reading the metadata statistics.

    Only the file metadata is read. The lengths are taken from the per-row-group statistics, and the
    largest value across row groups is kept for each column.

    Args:
        pqt_file: Path to a single Parquet file.

    Returns:
        Dictionary keyed by the column names in the file metadata schema. The value is the largest
        length reported by any row group, or ``None`` when no row group reports one. A file with no
        row groups still yields one ``None`` entry per column.
    """
    # get the table metadata
    meta = pq.read_metadata(pqt_file)
    col_names = meta.schema.names

    # one slot per column, so the result always covers every column even with zero row groups
    max_lens = [None] * len(col_names)

    # fold the lengths from each row group into the running per-column maximum
    for rg_idx in range(meta.num_row_groups):
        rg_lens = get_row_group_max_lengths(meta.row_group(rg_idx))

        for col_idx, rg_len in zip(range(len(col_names)), rg_lens):
            # None means the row group has no string maximum for this column
            if rg_len is None:
                continue
            if max_lens[col_idx] is None or rg_len > max_lens[col_idx]:
                max_lens[col_idx] = rg_len

    return dict(zip(col_names, max_lens))
    

def get_parquet_max_string_lengths(parquet_dataset: Union[str, Path]) -> dict[str, Union[int, None]]:
    """
    For a Parquet dataset, get the maximum string length reported in the file metadata for every column.

    Args:
        parquet_dataset: Path to Parquet dataset.

    Returns:
        Dictionary keyed by the column names in the dataset schema. The value is the largest length
        reported across all files in the dataset, or ``None`` when no file reports one for the column
        (for example, a column whose statistics maximum is not a string).
    """
    # create a parquet dataset from the path passed in, rather than from a notebook-level variable
    dataset = ds.dataset(parquet_dataset, format='parquet')

    # every column in the dataset schema gets an entry, starting with no known length
    max_len_dict = {nm: None for nm in dataset.schema.names}

    # merge per-file maximums by column name, so files do not need identical column order or count
    for fl in dataset.files:
        for nm, file_max in get_file_max_len(fl).items():
            # skip columns outside the dataset schema and columns with no reported length
            if nm not in max_len_dict or file_max is None:
                continue

            current_max = max_len_dict[nm]
            if current_max is None or file_max > current_max:
                max_len_dict[nm] = file_max

    return max_len_dict

In [144]:
# report the per-column maximum string lengths for the dataset at gpqt_pth
get_parquet_max_string_lengths(parquet_dataset=gpqt_pth)

{'name': 42,
 'latitude': None,
 'longitude': None,
 'address': 94,
 'locality': 25,
 'region': 2,
 'postcode': 10,
 'admin_region': None,
 'post_town': None,
 'po_box': 11,
 'country': 2,
 'date_created': 10,
 'date_refreshed': 10,
 'date_closed': 10,
 'tel': 14,
 'website': 106,
 'email': 30,
 'facebook_id': None,
 'instagram': 15,
 'twitter': 13,
 'fsq_category_ids': 12,
 'fsq_category_labels': 72,
 'placemaker_url': None,
 'unresolved_flags': None,
 'dt': None,
 'geometry': 24}