In [1]:
from functools import reduce 
from glob import glob
from multiprocessing import Pool

from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [2]:
# Peek at the first rows of one parquet file to see its layout.
pd.read_parquet('./parquet/2020/sg_taxi_20200324.parquet').head()

Unnamed: 0_level_0,lon,lat
ts,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-23 23:59:53+08:00,103.62149,1.2763
2020-03-23 23:59:53+08:00,103.62701,1.3027
2020-03-23 23:59:53+08:00,103.628,1.29908
2020-03-23 23:59:53+08:00,103.6314,1.31135
2020-03-23 23:59:53+08:00,103.63279,1.32907


In [3]:
def get_min_max_lon_lat(parquet_path):
    """Read one parquet file and return its column-wise extremes.

    Parameters
    ----------
    parquet_path : str
        Path of the parquet file to load with ``pd.read_parquet``.

    Returns
    -------
    numpy.ndarray
        1-D array holding every column's minimum followed by every column's
        maximum, in the frame's column order. For a frame whose columns are
        ``lon, lat`` this is ``[lon_min, lat_min, lon_max, lat_max]``.
    """
    frame = pd.read_parquet(parquet_path)
    lows = frame.min().values
    highs = frame.max().values
    return np.concatenate([lows, highs])


# Try the helper on a single file; the bare call lets the notebook display the returned array.
get_min_max_lon_lat(parquet_path='./parquet/2020/sg_taxi_20200129.parquet')

array([103.6135485,   1.23331  , 104.01705  ,   1.46998  ])

In [4]:
def get_bounding_box(paths, decimal_precision):
    """Compute a padded lon/lat bounding box covering every given parquet file.

    Each file is reduced to its per-column min/max in a worker process
    (via ``get_min_max_lon_lat``); the per-file results are then combined
    into one overall box. Each bound is rounded to ``decimal_precision``
    decimals and pushed outward by one unit of that precision, so rounding
    can never leave an extreme point outside the box.

    Parameters
    ----------
    paths : list of str
        Parquet files to scan. Must support ``len()`` and be non-empty.
    decimal_precision : int
        Number of decimals the bounds are rounded to; also sets the outward
        margin ``10 ** -decimal_precision``.

    Returns
    -------
    dict
        Keys ``'lon_min'``, ``'lat_min'``, ``'lon_max'``, ``'lat_max'``.

    Raises
    ------
    ValueError
        If ``paths`` is empty (there is nothing to bound).
    """
    if len(paths) == 0:
        raise ValueError('get_bounding_box needs at least one parquet path')

    margin = 10 ** -decimal_precision

    def _min_and_max(res):
        # One row per file; columns 0-1 are per-file minima and columns 2-3
        # per-file maxima (lon, lat order assumed from the worker's output).
        return {
            'lon_min': res[:, 0].min().round(decimal_precision) - margin,
            'lat_min': res[:, 1].min().round(decimal_precision) - margin,
            # Maxima are padded upward: subtracting here would shrink the box.
            'lon_max': res[:, 2].max().round(decimal_precision) + margin,
            'lat_max': res[:, 3].max().round(decimal_precision) + margin,
        }

    # maxtasksperchild=1 gives every file a fresh worker process.
    with Pool(maxtasksperchild=1) as pool:
        per_file = tqdm(pool.imap_unordered(get_min_max_lon_lat, paths), total=len(paths))
        return _min_and_max(np.vstack(list(per_file)))
    
# Bounding box across every file matched by ./parquet/*/*.parquet, at 4 decimal places.
bbox = get_bounding_box(paths=glob('./parquet/*/*.parquet'), decimal_precision=4)
# Bare expression so the notebook displays the resulting dict.
bbox

HBox(children=(FloatProgress(value=0.0, max=1746.0), HTML(value='')))

KeyboardInterrupt: 

In [5]:
def parquet_to_matrix(parquet_path, decimal_precision):
    """Count the rows of one parquet file per rounded (lat, lon) grid cell.

    Parameters
    ----------
    parquet_path : str
        Parquet file to load; it must expose ``lon`` and ``lat`` columns.
    decimal_precision : int
        Number of decimals the coordinates are rounded to before grouping.

    Returns
    -------
    pandas.Series
        Series named ``'num'`` indexed by ``(grid_lat, grid_lon)`` whose
        values are the number of rows that fell into each cell.
    """
    points = pd.read_parquet(parquet_path)
    gridded = points.assign(
        grid_lon=points.lon.round(decimal_precision),
        grid_lat=points.lat.round(decimal_precision),
    )
    cell_counts = gridded.groupby(['grid_lat', 'grid_lon']).size()
    return cell_counts.rename('num')

# Example: per-cell row counts for one file, rounding coordinates to 4 decimals.
parquet_to_matrix(parquet_path='./parquet/2018/sg_taxi_20180319.parquet', 
                  decimal_precision=4)

grid_lat  grid_lon
1.2336    103.6756    1
1.2337    103.6760    1
1.2338    103.6757    7
          103.6758    2
1.2339    103.6758    2
                     ..
1.4694    103.8116    1
          103.8118    2
1.4695    103.8142    1
1.4697    103.8141    1
1.4699    103.8150    1
Name: num, Length: 722221, dtype: int64

In [6]:
def _mp(args):
    """Adapter for ``Pool.imap_unordered``, which passes a single argument.

    ``args`` is a ``(parquet_path, decimal_precision)`` pair; it is unpacked
    and forwarded to ``parquet_to_matrix``, whose result is returned as is.
    """
    path, precision = args
    return parquet_to_matrix(path, precision)

def multiple_parquets_to_matrix(parquet_paths, decimal_precision):
    """Sum the per-file grid counts of many parquet files into one series.

    Every file is turned into a grid-count series in a worker process (through
    ``_mp``), and the results are folded together as they arrive. Cells missing
    from one side of an addition are treated as 0.

    Parameters
    ----------
    parquet_paths : list of str
        Parquet files to aggregate; must support ``len()``.
    decimal_precision : int
        Rounding precision forwarded to every worker.

    Returns
    -------
    pandas.Series
        The element-wise sum of all per-file series.
    """
    def _merge(running_total, file_counts):
        # fill_value=0 keeps cells that appear in only one of the two series.
        return running_total.add(file_counts, fill_value=0)

    # maxtasksperchild=1 gives every file a fresh worker process.
    with Pool(maxtasksperchild=1) as pool:
        jobs = [(path, decimal_precision) for path in parquet_paths]
        per_file = pool.imap_unordered(_mp, jobs)
        return reduce(_merge, tqdm(per_file, total=len(parquet_paths)))

In [None]:
# Aggregate the per-file grid counts over every file matched by ./parquet/*/*.parquet.
grid_all_years = multiple_parquets_to_matrix(
    parquet_paths=glob('./parquet/*/*.parquet'), 
    decimal_precision=4)
# Show only the first few entries of the combined series.
grid_all_years.head()

HBox(children=(FloatProgress(value=0.0, max=1746.0), HTML(value='')))