In [1]:
import time
from pathlib import Path
from datetime import datetime
import dask.dataframe as dd
from distributed import LocalCluster, Client
import spatialpandas as spd
from spatialpandas.geometry import (
    PointArray, MultiPointArray, LineArray,
    MultiLineArray, PolygonArray, MultiPolygonArray
)
from spatialpandas import GeoSeries, GeoDataFrame
%matplotlib inline

In [2]:
# Project layout: the data folder lives beside the current working directory.
base_path = Path.cwd().parent
data_dir = base_path / 'data'

In [3]:
# Start a small single-machine dask cluster and attach a client to it.
# (Optionally pass silence_logs=logging.ERROR, after `import logging`,
# to reduce log output.)
cluster = LocalCluster(
    dashboard_address=':8790',
    n_workers=1,
    threads_per_worker=4,
    memory_limit='4 GB',
)
client = Client(cluster)

# Display the client summary (scheduler address, dashboard link, resources).
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 38979 instead


0,1
Client  Scheduler: tcp://127.0.0.1:34535  Dashboard: http://127.0.0.1:38979/status,Cluster  Workers: 1  Cores: 4  Memory: 3.73 GiB


In [4]:
# Location of the spatially sorted parquet dataset inside the data folder.
spatial_sort_path = data_dir / 'us_cont_spatiallysorted.parquet'

In [5]:
from spatialpandas.io import read_parquet, read_parquet_dask

In [6]:
# Read the single partition file `part.0.parquet` into memory and preview it.
df = read_parquet(spatial_sort_path.joinpath("part.0.parquet"))
df

Unnamed: 0_level_0,position,latitude,longitude
hilbert_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25629,"Point([-124.443, 24.447])",24.447000,-124.443000
99850,"Point([-124.532, 24.704])",24.704000,-124.532000
216826,"Point([-124.248, 24.591])",24.591000,-124.248000
253418,"Point([-124.085, 24.463])",24.463000,-124.085000
290728,"Point([-123.719, 24.554])",24.554000,-123.719000
...,...,...,...
51981088,"Point([-111.4537759, 26.0588929])",26.058893,-111.453776
51981088,"Point([-111.4547769, 26.0591679])",26.059168,-111.454777
51981088,"Point([-111.4536229, 26.0588199])",26.058820,-111.453623
51981088,"Point([-111.4541189, 26.0589329])",26.058933,-111.454119


In [7]:
import pygeos

In [8]:
def spatialpandas_to_geopandas(df):
    """Convert a spatialpandas point frame into a geopandas GeoDataFrame.

    Parameters
    ----------
    df : DataFrame
        Frame with a spatialpandas ``position`` column plus ``latitude`` and
        ``longitude`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
        The ``latitude`` and ``longitude`` columns of ``df`` with a geometry
        column of points built from ``position``. No CRS is set.
    """
    # geopandas is not imported anywhere else in the notebook, so import it
    # here; this also keeps the function self-contained when it is passed to
    # `map_partitions`.
    import geopandas

    # assumes `flat_values` holds interleaved x, y coordinates, so the reshape
    # yields one (x, y) row per point — TODO confirm against spatialpandas docs
    coords = df.position.array.flat_values.reshape(-1, 2)
    geometry = pygeos.points(coords)
    return geopandas.GeoDataFrame(df[["latitude", "longitude"]], geometry=geometry)

In [9]:
# Open the whole spatially sorted US dataset as a dask-backed frame
# and peek at its first two rows.
spatial_sort_path = data_dir / 'us_cont_spatiallysorted.parquet'
ddf = read_parquet_dask(spatial_sort_path)
ddf.head(2)

Unnamed: 0_level_0,position,latitude,longitude
hilbert_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25629,"Point([-124.443, 24.447])",24.447,-124.443
99850,"Point([-124.532, 24.704])",24.704,-124.532


In [10]:
import dask_geopandas

In [11]:
# Apply the spatialpandas -> geopandas conversion to every partition of the dask frame.
gddf = ddf.map_partitions(spatialpandas_to_geopandas)
# Show the structure of the resulting dask collection (column dtypes, partition count).
gddf

Unnamed: 0_level_0,latitude,longitude,geometry
npartitions=1218,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,float64,float64,geometry
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [12]:
# Time how long it takes to write the converted (geopandas) data to parquet.
# perf_counter() is a monotonic clock, so the measured interval is not
# affected by system clock adjustments during the long-running write.
t0 = time.perf_counter()
savepath = data_dir.joinpath('us_cont_spatiallysorted_geopandas.parquet')
gddf.to_parquet(savepath)
dt = time.perf_counter() - t0  # elapsed seconds

In [14]:
# Save timing info (elapsed minutes and partition count) to a CSV in the
# current working directory. The timestamp is formatted without spaces or
# colons, which str(datetime.now()) contains and which are not valid in file
# names on every platform.
timestamp = datetime.now().strftime('%Y%m%dT%H%M%S_%f')
with open(f'to_dask_geopandas_time-{timestamp}.csv', 'w') as f:
    # header row, then one data row terminated by a newline
    f.write(f'time_min,npartitions\n{dt/60},{gddf.npartitions}\n')
print('dt (s):', dt)

dt (s): 354.27196502685547


In [None]:
# NOTE(review): leftover introspection from development — this bare expression
# only displays the function object, and the cell has no execution count.
# Consider deleting it.
dask_geopandas.read_parquet

In [15]:
# Sanity check on the written file: reload it with dask_geopandas,
# preview the first two rows, then count all rows.
df = dask_geopandas.read_parquet(savepath)
display(df.head(n=2))
len(df)

Unnamed: 0_level_0,latitude,longitude,geometry
hilbert_distance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
25629,24.447,-124.443,POINT (-124.44300 24.44700)
99850,24.704,-124.532,POINT (-124.53200 24.70400)


113944489

In [16]:
# Release the dask workers: shut down the scheduler and workers this client
# is connected to, now that the conversion is finished.
client.shutdown()