In [1]:
from pathlib import Path
import time
import dask.dataframe as dd

In [2]:
from distributed import Client, LocalCluster

# start a local cluster with a single worker (4 threads, 4GB memory limit)
# and connect a client to it
cluster = LocalCluster(n_workers=1, threads_per_worker=4, memory_limit='4GB')
client = Client(cluster)

# display the client summary
client

0,1
Client  Scheduler: tcp://127.0.0.1:36387  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 4  Memory: 3.73 GiB


In [3]:
# set up data paths, all relative to the parent of the current working directory
base_path = Path.cwd().parent
data_dir = base_path / 'data'
raw_data_dir = base_path / '../../benchmark-data'

### Download and extract OpenStreetMap point data
Download the compressed GPS point file from this URL: `https://planet.openstreetmap.org/gps/simple-gps-points-120604.csv.xz`  
and place it in the `data` folder.

In [None]:
# unpack the xz data using the `unxz` package
! unxz -d ../data/simple-gps-points-120604.csv.xz

In [4]:
# parquet dataset that the extracted points are written to
pt_data_path = data_dir / 'contiguous_us.parquet'

In [5]:
# locate the extracted csv: look in the benchmark-data directory first, then fall
# back to the `data` folder, which is where the download/unxz steps place the file
csv_name = 'simple-gps-points-120604.csv'
csv_candidates = [raw_data_dir.joinpath(csv_name), data_dir.joinpath(csv_name)]
csv_path = next((p for p in csv_candidates if p.exists()), None)
if csv_path is None:
    raise FileNotFoundError(
        f'{csv_name} not found; looked in: ' + ', '.join(str(p) for p in csv_candidates)
    )

# read in the csv point data, parsing only the two coordinate columns
coord_cols = ['latitude', 'longitude']
ddf = dd.read_csv(str(csv_path), usecols=coord_cols)[coord_cols]
# convert to degrees by dividing the raw values by 10**7; `assign` returns a new
# dataframe instead of mutating columns in place
ddf = ddf.assign(latitude=ddf.latitude / 10**7, longitude=ddf.longitude / 10**7)

In [6]:
# display the dask dataframe; the output shows its structure (columns, dtypes, npartitions)
ddf

Unnamed: 0_level_0,latitude,longitude
npartitions=1218,Unnamed: 1_level_1,Unnamed: 2_level_1
,float64,float64
,...,...
...,...,...
,...,...
,...,...


### Extract the contiguous US and save as parquet

In [7]:
%%time 
t0 = time.time()
# reduce osm data to continguous us
usdf = ddf[ddf.latitude.between(24.396308, 49.384358) & ddf.longitude.between(-124.848974, -66.885444)]
# write to file
usdf.to_parquet(pt_data_path, engine='pyarrow', compression='snappy')
dt_hr = (time.time() - t0)/60/60

CPU times: user 1min 45s, sys: 4.62 s, total: 1min 50s
Wall time: 7min 25s
