In [45]:
from pathlib import Path
import json

import pyarrow.parquet as pq
import pyarrow.dataset as ds

In [18]:
dir_prj = Path.cwd().parent
dir_data = dir_prj / 'data'

gpqt_pth = dir_prj / "data/raw/foursquare/geoparquet/parquet"

gpqt_pth

WindowsPath('D:/projects/arcpy-parquet/data/raw/foursquare/geoparquet/parquet')

In [46]:
dataset = dataset = ds.dataset(gpqt_pth, format='parquet')

dataset

<pyarrow._dataset.FileSystemDataset at 0x2808c14a500>

In [None]:
class GeoParquetMetadata(object):

    def __init__(self, parquet_dataset: Path) -> None:

        if isinstance(parquet_dataset, str):
            parquet_dataset = Path(parquet_dataset)

        

In [116]:
# get the explicitly added metadata for all the files
meta_lst = [pq.read_metadata(fl).metadata for fl in dataset.files]

# get the geography information - the metadata making the parquet dataset GEOparquet
geo_binary_lst = [meta.get(b'geo') for meta in meta_lst]

# convert the binary string into a list of dictionaries
geo_lst = [json.loads(geo) for geo in geo_binary_lst]

# get the geography definitions without the bounding boxes, and convert back to strings so they can be compared in a set
geo_set = set(json.dumps({nm: {k: v for k, v in col_dict.items() if k != 'bbox'} for nm, col_dict in geo.get('columns').items()}) for geo in geo_lst)

# ensure only one geography is present
if len(geo_set) > 1:
    raise ValueError("More than one spatial reference detected. Cannot convert data.")

# convert the geography definition back to a dictionary
uniq_geo = [json.loads(geo) for geo in geo_set][0]

# get the bounding box for all the files, the entire parquet dataset
coords_lst = list(zip(*[geo.get('columns').get('geometry').get('bbox') for geo in geo_lst]))

min_coords = [min(coords) for coords in coords_lst[:2]]
max_coords = [max(coords) for coords in coords_lst[2:]]

bbox = min_coords + max_coords

# add the bounding box back onto the geometry definition
uniq_geo

{'geometry': {'encoding': 'WKB',
  'geometry_types': ['Point'],
  'crs': {'type': 'GeographicCRS',
   'name': 'GCS WGS 1984',
   'bbox': {'east_longitude': 180.0,
    'west_longitude': -180.0,
    'south_latitude': -90.0,
    'north_latitude': 90.0},
   'datum': {'type': 'GeodeticReferenceFrame',
    'name': 'D WGS 1984',
    'ellipsoid': {'name': 'WGS 1984',
     'semi_major_axis': 6378137.0,
     'inverse_flattening': 298.257223563},
    'prime_meridian': {'name': 'Greenwich', 'longitude': 0.0},
    'id': {'authority': 'EPSG', 'code': 6326}},
   'coordinate_system': {'subtype': 'ellipsoidal',
    'axis': [{'name': 'Latitude',
      'abbreviation': 'lat',
      'direction': 'north',
      'unit': 'degree'},
     {'name': 'Longitude',
      'abbreviation': 'lon',
      'direction': 'east',
      'unit': 'degree'}]},
   'area': 'World (by country)',
   'id': {'authority': 'EPSG', 'code': 4326}}}}

In [121]:
geo

{'version': '1.0.0',
 'primary_column': 'geometry',
 'columns': {'geometry': {'encoding': 'WKB',
   'geometry_types': ['Point'],
   'crs': {'type': 'GeographicCRS',
    'name': 'GCS WGS 1984',
    'bbox': {'east_longitude': 180.0,
     'west_longitude': -180.0,
     'south_latitude': -90.0,
     'north_latitude': 90.0},
    'datum': {'type': 'GeodeticReferenceFrame',
     'name': 'D WGS 1984',
     'ellipsoid': {'name': 'WGS 1984',
      'semi_major_axis': 6378137.0,
      'inverse_flattening': 298.257223563},
     'prime_meridian': {'name': 'Greenwich', 'longitude': 0.0},
     'id': {'authority': 'EPSG', 'code': 6326}},
    'coordinate_system': {'subtype': 'ellipsoidal',
     'axis': [{'name': 'Latitude',
       'abbreviation': 'lat',
       'direction': 'north',
       'unit': 'degree'},
      {'name': 'Longitude',
       'abbreviation': 'lon',
       'direction': 'east',
       'unit': 'degree'}]},
    'area': 'World (by country)',
    'id': {'authority': 'EPSG', 'code': 4326}},
   

In [114]:
coords_lst = list(zip(*[geo.get('columns').get('geometry').get('bbox') for geo in geo_lst]))

min_coords = [min(coords) for coords in coords_lst[:2]]
max_coords = [max(coords) for coords in coords_lst[2:]]

bbox = min_coords + max_coords

bbox

[-149.90083515644073,
 25.772332637530994,
 -70.11639202164899,
 61.21882061399109]

In [19]:
gpqt_file = list(gpqt_pth.glob('**/*.parquet'))[0]

In [20]:
meta = pq.read_metadata(gpqt_file)

In [44]:
[fld.describe() for fld in meta.schema]

AttributeError: 'pyarrow._parquet.ColumnSchema' object has no attribute 'describe'

In [28]:
geo_meta = meta.metadata[b'geo']

geo_meta

b'{"version":"1.0.0","primary_column":"geometry","columns":{"geometry":{"encoding":"WKB","geometry_types":["Point"],"crs":{"type":"GeographicCRS","name":"GCS WGS 1984","bbox":{"east_longitude":180.0,"west_longitude":-180.0,"south_latitude":-90.0,"north_latitude":90.0},"datum":{"type":"GeodeticReferenceFrame","name":"D WGS 1984","ellipsoid":{"name":"WGS 1984","semi_major_axis":6378137.0,"inverse_flattening":298.257223563},"prime_meridian":{"name":"Greenwich","longitude":0.0},"id":{"authority":"EPSG","code":6326}},"coordinate_system":{"subtype":"ellipsoidal","axis":[{"name":"Latitude","abbreviation":"lat","direction":"north","unit":"degree"},{"name":"Longitude","abbreviation":"lon","direction":"east","unit":"degree"}]},"area":"World (by country)","id":{"authority":"EPSG","code":4326}},"bbox":[-86.30001068115234,25.772332637530994,-80.19155100671779,32.6189679949766]}}}'

In [34]:
print(json.dumps(json.loads(geo_meta), indent=4))

{
    "version": "1.0.0",
    "primary_column": "geometry",
    "columns": {
        "geometry": {
            "encoding": "WKB",
            "geometry_types": [
                "Point"
            ],
            "crs": {
                "type": "GeographicCRS",
                "name": "GCS WGS 1984",
                "bbox": {
                    "east_longitude": 180.0,
                    "west_longitude": -180.0,
                    "south_latitude": -90.0,
                    "north_latitude": 90.0
                },
                "datum": {
                    "type": "GeodeticReferenceFrame",
                    "name": "D WGS 1984",
                    "ellipsoid": {
                        "name": "WGS 1984",
                        "semi_major_axis": 6378137.0,
                        "inverse_flattening": 298.257223563
                    },
                    "prime_meridian": {
                        "name": "Greenwich",
                        "longitude": 0.0
        