In [1]:
import time
from pathlib import Path
from datetime import datetime
import dask.dataframe as dd
from distributed import LocalCluster, Client
import spatialpandas as spd
from spatialpandas.geometry import (
    PointArray, MultiPointArray, LineArray,
    MultiLineArray, PolygonArray, MultiPolygonArray
)
from spatialpandas import GeoSeries, GeoDataFrame
%matplotlib inline

In [2]:
# Locate the data directory: a `data` folder that sits beside the directory
# the notebook is being run from (i.e. under the parent of the CWD).
base_path = Path.cwd().parent
data_dir = base_path / 'data'

In [3]:
# Spin up a local dask cluster (one worker, four threads, memory_limit of
# '4 GB') with its dashboard bound to port 8790, then connect a client to it.
# Note: a `silence_logs=logging.ERROR` argument was left disabled here.
cluster = LocalCluster(
    dashboard_address=':8790',
    n_workers=1,
    threads_per_worker=4,
    memory_limit='4 GB',
)
client = Client(cluster)

# Leave the client as the last expression so its summary is rendered.
client

0,1
Client  Scheduler: tcp://127.0.0.1:33989  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 1  Cores: 4  Memory: 3.73 GiB


In [4]:
# Open the contiguous-US point dataset as a dask dataframe, loading only the
# coordinate columns.
# Alternative input kept for reference:
#   data_dir.joinpath('contiguous_us_w_geohash.parquet')
cont_us_path = data_dir / 'contiguous_us.parquet'
coordinate_columns = ['latitude', 'longitude']
ddf = dd.read_parquet(cont_us_path, columns=coordinate_columns)

# Preview the first two rows, then show the total row count as cell output.
display(ddf.head(2))
len(ddf)

Unnamed: 0,latitude,longitude


113944489

In [5]:
def _with_point_geometry(partition):
    """Turn one partition into a spatialpandas GeoDataFrame.

    The result carries a `position` column, built by handing the partition's
    `longitude` and `latitude` columns (in that order) to PointArray, followed
    by every column of the incoming partition.
    """
    geometry = PointArray(partition[['longitude', 'latitude']])
    originals = {col: partition[col] for col in partition.columns}
    return GeoDataFrame(dict(position=geometry, **originals))


# load data into spatialpandas geodataframe, partition by partition
df = ddf.map_partitions(_with_point_geometry)

In [6]:
# Spatially sort the data, write it to parquet, and time the whole operation.
# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by wall-clock adjustments (NTP sync, DST changes)
# that may happen during a run lasting several minutes.
t0 = time.perf_counter()
savepath = data_dir.joinpath('us_cont_spatiallysorted.parquet')
# Repack using the same number of partitions as the input, shuffling via disk.
packed = df.pack_partitions(npartitions=df.npartitions, shuffle='disk')
packed.to_parquet(savepath)
dt = time.perf_counter() - t0  # elapsed time in seconds

In [7]:
# save timing info
# Build the file name from an explicit timestamp format: str(datetime.now())
# contains a space and colons, which are invalid in Windows file names and
# awkward to handle in shells.
timestamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
timing_path = f'spatial_sort_time-{timestamp}.csv'
with open(timing_path, 'w') as f:
    # One header row and one data row (elapsed minutes, partition count);
    # the trailing newline keeps the CSV well-formed for line-based tools.
    f.write(f'time_min,npartitions\n{dt/60},{df.npartitions}\n')
print('dt (s):', dt)

dt (s): 515.2166604995728


In [8]:
# Read the spatially sorted dataset back from disk so it can be compared with
# the original: preview two rows, then show the row count as the cell output.
df = spd.io.read_parquet_dask(savepath)
reloaded_preview = df.head(2)
display(reloaded_preview)
len(df)

Unnamed: 0_level_0,position,latitude,longitude
hilbert_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25629,"Point([-124.443, 24.447])",24.447,-124.443
99850,"Point([-124.532, 24.704])",24.704,-124.532


113944489

In [None]:
# Release the dask workers by scaling the cluster down to zero.
remaining_workers = 0
cluster.scale(remaining_workers)