In [1]:
import logging
import time
from datetime import datetime
from pathlib import Path

import geohash
import numpy as np
import pandas as pd
import geopandas as gpd
import dask
import dask.dataframe as dd
from distributed import LocalCluster, Client

In [2]:
# Start a local Dask cluster with a single worker that runs four threads,
# a 4 GB memory limit, and the dashboard served on port 8790.
# (Add silence_logs=logging.ERROR to the call to quieten the cluster logs.)
cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=4,
    memory_limit='4 GB',
    dashboard_address=':8790',
)
client = Client(cluster)
client

0,1
Client  Scheduler: tcp://127.0.0.1:42051  Dashboard: http://127.0.0.1:8790/status,Cluster  Workers: 1  Cores: 4  Memory: 3.73 GiB


In [3]:
# Locate the project directories relative to the current working directory:
# the base path is its parent, and the data lives in <base>/data.
base_path = Path.cwd().parent
data_dir = base_path / 'data'

In [4]:
# Open the point data set as a Dask DataFrame (pyarrow engine)
# and preview the first rows.
df = dd.read_parquet(data_dir / 'contiguous_us.parquet', engine='pyarrow')
df.head()

Unnamed: 0,latitude,longitude


In [5]:
def calculate_geohash(df):
    """Add a ``geohash`` column computed from ``latitude`` / ``longitude``.

    Intended as the per-partition function for ``map_partitions``.

    Parameters
    ----------
    df : pandas.DataFrame
        Partition with ``latitude`` and ``longitude`` columns. It is not
        modified; the result is built on a copy.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``geohash`` column:

        * rows with ``latitude == 90`` get the sentinel ``'zzzzzzzzzzzz'``
          (the original author notes that ``geohash.encode`` fails there);
        * rows with ``-90 <= latitude < 90`` and ``-180 <= longitude <= 180``
          get ``geohash.encode(latitude, longitude)``;
        * all other rows (out of range or NaN coordinates) keep the existing
          value, or ``''`` when the column had to be created.
    """
    # Work on a copy so the partition handed in by the caller is never mutated.
    out = df.copy()

    # add a dummy column if necessary
    if 'geohash' not in out.columns:
        out['geohash'] = ''

    at_north_pole = out['latitude'] == 90
    out.loc[at_north_pole, 'geohash'] = 'zzzzzzzzzzzz'  # geohash.encode fails if lat==90

    encodable = (
        out['longitude'].between(-180, 180)
        & (out['latitude'] >= -90)
        & (out['latitude'] < 90)
    )
    # Skip encoding entirely when nothing is encodable (e.g. an empty
    # partition), so no zero-row result is ever assigned into the column.
    if encodable.any():
        latitudes = out.loc[encodable, 'latitude']
        longitudes = out.loc[encodable, 'longitude']
        # Iterate the two coordinate columns directly instead of a row-wise
        # DataFrame.apply, which constructs a Series object for every row.
        out.loc[encodable, 'geohash'] = [
            geohash.encode(lat, lon) for lat, lon in zip(latitudes, longitudes)
        ]
    return out

In [6]:
# Time the whole pipeline with a monotonic clock so that system clock
# adjustments during a long run cannot skew the measurement.
t0 = time.perf_counter()
# Apply the geohashing function to each partition of data.
# `df` is rebound here, so the guard keeps this cell idempotent: re-running it
# (for example after an interrupted write) must not stack a second geohash
# pass on top of the first one.
# NOTE(review): `meta` has to list exactly the columns calculate_geohash
# returns; confirm the source data has no columns besides latitude/longitude.
if 'geohash' not in df.columns:
    df = df.map_partitions(calculate_geohash, meta={'latitude': float, 'longitude': float, 'geohash': 'object'})
# repartition the data
df_repartition = df.repartition(npartitions=200)
# save to parquet file (uncompressed)
df_repartition.to_parquet(data_dir.joinpath('contiguous_us_w_geohash.parquet'), engine='pyarrow', compression=None)
# elapsed time in hours
dt_hr = (time.perf_counter() - t0) / 3600

distributed.core - ERROR - Exception while handling op heartbeat_worker
Traceback (most recent call last):
  File "/home/joris/miniconda3/envs/geo/lib/python3.9/site-packages/distributed/core.py", line 497, in handle_comm
    result = handler(comm, **msg)
  File "/home/joris/miniconda3/envs/geo/lib/python3.9/site-packages/distributed/scheduler.py", line 3861, in heartbeat_worker
    ws._executing = {
  File "/home/joris/miniconda3/envs/geo/lib/python3.9/site-packages/distributed/scheduler.py", line 3862, in <dictcomp>
    parent._tasks[key]: duration for key, duration in executing.items()
KeyError: "('calculate_geohash-3aca8bb37c6f5e9437449ae28547861e', 151)"


KeyboardInterrupt: 



In [None]:
# Save out timing info.
# The timestamp is formatted explicitly because str(datetime.now()) contains a
# space and colons, which are awkward in shells and invalid in Windows file
# names. Microseconds are kept so that successive runs still get distinct files.
timestamp = datetime.now().strftime('%Y%m%dT%H%M%S_%f')
with open(f'us_geohash_time_{timestamp}.csv', 'w') as f:
    f.write(f'dt_hr, {dt_hr}')

In [None]:
# Release the Dask workers by scaling the local cluster down to zero workers.
# NOTE(review): only the worker count is changed here; `client` and `cluster`
# are not closed in this cell. Add client.close() / cluster.close() if a full
# shutdown is intended.
cluster.scale(0)