In [1]:
import os

os.environ['POLARS_MAX_THREADS'] = '16'

import polars as pl
import geopandas as gpd
import concurrent.futures
import tqdm
from functools import lru_cache

In [2]:
@lru_cache(maxsize=1)
def load_kommune_data():
    gdf_kom = gpd.read_parquet('data/kommune.pq').to_crs(25832)
    gdf_kom['code'] = gdf_kom['code'].astype('int')
    return gdf_kom

In [None]:
def parse_voronoi(kom: int):

    gdf_kom = load_kommune_data()    

    df = (pl.scan_parquet('data/adresser.pq', low_memory = True)
          .filter(pl.col("kommunekode")==kom)
          .select(pl.col("vejnavn", "husnr", "postnr", "kommunekode", "landsdelsnuts3"), pl.col("etrs89koordinat_øst").alias("etrs89_east"), pl.col("etrs89koordinat_nord").alias("etrs89_north"))
          .filter(pl.struct(pl.col("etrs89_east", "etrs89_north")).is_first_distinct())
          .collect(engine = 'streaming')
          .to_pandas()
    )

    gdf_adr = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(x = df.etrs89_east, y=df.etrs89_north), crs=25832)


    kom_shape = gdf_kom[gdf_kom['code']==kom].reset_index(drop=True)
    gdf_adr['voronoi'] = gdf_adr.voronoi_polygons()

    gdf_adr['points'] = gdf_adr['geometry']

    gdf_adr = gdf_adr.drop('geometry', axis=1)
    gdf_adr = gdf_adr.set_geometry('voronoi')

    
    gdf_out = gpd.overlay(gdf_adr, kom_shape, how = 'intersection')

    print(f'{kom} to voronoi done.')

    gdf_out.to_parquet(f'data/voronoi/adresser_voronoi_{kom}.pq')

# test = parse_voronoi(306)
# test.plot()


In [4]:
lf = pl.scan_parquet('data/adresser.pq').select(pl.col("kommunekode").unique())
kommunerz = lf.collect().to_series().to_list()
with concurrent.futures.ThreadPoolExecutor(max_workers = 8) as executor:
    res = list(tqdm.tqdm(executor.map(parse_voronoi, kommunerz)))

0it [00:00, ?it/s]

147 to voronoi done.
161 to voronoi done.
153 to voronoi done.
155 to voronoi done.
163 to voronoi done.
165 to voronoi done.
159 to voronoi done.
167 to voronoi done.
157 to voronoi done.
151 to voronoi done.
187 to voronoi done.
183 to voronoi done.
175 to voronoi done.
173 to voronoi done.
169 to voronoi done.
201 to voronoi done.
190 to voronoi done.
223 to voronoi done.
219 to voronoi done.
210 to voronoi done.
240 to voronoi done.
230 to voronoi done.
253 to voronoi done.
269 to voronoi done.
217 to voronoi done.
185 to voronoi done.
270 to voronoi done.
260 to voronoi done.
259 to voronoi done.
329 to voronoi done.
320 to voronoi done.
336 to voronoi done.
265 to voronoi done.
340 to voronoi done.
350 to voronoi done.
306 to voronoi done.
316 to voronoi done.
250 to voronoi done.
360 to voronoi done.
101 to voronoi done.


1it [03:23, 203.60s/it]

411 to voronoi done.
326 to voronoi done.


36it [03:40,  4.05s/it]

330 to voronoi done.


38it [03:56,  4.91s/it]

410 to voronoi done.
420 to voronoi done.
440 to voronoi done.
450 to voronoi done.
430 to voronoi done.
370 to voronoi done.


43it [05:11,  6.99s/it]

492 to voronoi done.
482 to voronoi done.
480 to voronoi done.
530 to voronoi done.
479 to voronoi done.
461 to voronoi done.
563 to voronoi done.
550 to voronoi done.
376 to voronoi done.


44it [07:24, 14.10s/it]

390 to voronoi done.


45it [07:58, 15.57s/it]

575 to voronoi done.
510 to voronoi done.
607 to voronoi done.
573 to voronoi done.


46it [08:42, 18.38s/it]

400 to voronoi done.
580 to voronoi done.
561 to voronoi done.
665 to voronoi done.
671 to voronoi done.


60it [09:12,  7.70s/it]

540 to voronoi done.
710 to voronoi done.
661 to voronoi done.
615 to voronoi done.


68it [09:46,  6.45s/it]

727 to voronoi done.
707 to voronoi done.
741 to voronoi done.
706 to voronoi done.
657 to voronoi done.
621 to voronoi done.


69it [10:07,  7.32s/it]

630 to voronoi done.


70it [10:08,  6.87s/it]

746 to voronoi done.
740 to voronoi done.
773 to voronoi done.
756 to voronoi done.
730 to voronoi done.


79it [11:01,  6.35s/it]

766 to voronoi done.
810 to voronoi done.
825 to voronoi done.
787 to voronoi done.
820 to voronoi done.
779 to voronoi done.
840 to voronoi done.
813 to voronoi done.
849 to voronoi done.
751 to voronoi done.


83it [13:20, 13.18s/it]

791 to voronoi done.
860 to voronoi done.
846 to voronoi done.


85it [13:43, 12.93s/it]

760 to voronoi done.
851 to voronoi done.


99it [17:27, 10.58s/it]


In [8]:
import glob
import pandas as pd

vo_list = glob.glob('data/voronoi/adresser_v*.pq')

gdf = gpd.read_parquet(vo_list[0])

for data in vo_list:
    if data == vo_list[0]:
        pass
    gdf2 = gpd.read_parquet(data)

    gdf = pd.concat([gdf, gdf2])

gdf = gdf.reset_index(drop = True)

gdf.to_parquet('data/adresser_voronoi.pq')

In [None]:
lf = pl.scan_parquet('data/adresser.pq').select(
    pl.col("vejnavn", "husnr", "postnr", "kommunekode", "landsdelsnuts3"), pl.col("etrs89koordinat_øst").alias("etrs89_east"), pl.col("etrs89koordinat_nord").alias("etrs89_north")
).filter(pl.struct(pl.col("etrs89_east", "etrs89_north")).is_first_distinct()).with_columns(
    pl.struct(pl.col("etrs89_east", "etrs89_north")).hash().alias("address_id").rank("dense").shrink_dtype()
)

df = lf.collect(engine = 'streaming').to_pandas()

gdf = gpd.GeoDataFrame(df, geometry = gpd.points_from_xy(x = df.etrs89_east, y=df.etrs89_north), crs = 25832)
gdf['voronoi'] = gdf.voronoi_polygons()
gdf['points'] = gdf['geometry']
gdf = gdf.drop('geometry', axis=1)
gdf = gdf.set_geometry('voronoi')
gdf.to_parquet('data/adresser_geo.pq')


Unnamed: 0,vejnavn,husnr,postnr,kommunekode,landsdelsnuts3,etrs89_east,etrs89_north,address_id,voronoi,points
0,Bakkenellikevej,2,4583,306,DK022,653487.940000,6.203741e+06,2548631,"POLYGON ((-8961.698 6202346.066, 309146.199 61...",POINT (653487.94 6203740.87)
1,Valmuestien,6,3460,230,DK013,714572.710000,6.195075e+06,624411,"POLYGON ((-8961.698 6251221.718, 193849.586 62...",POINT (714572.71 6195075.09)
2,Nyelandsvej,78,2000,147,DK011,721167.760000,6.176715e+06,1863571,"POLYGON ((439354.507 6152292.735, 441694.374 6...",POINT (721167.76 6176714.85)
3,Havrevej,10,2970,223,DK013,717609.060000,6.198722e+06,1831869,"POLYGON ((442281.922 6157271.189, 442279.449 6...",POINT (717609.06 6198722.49)
4,Tvedgade,13H,6760,561,DK032,485270.270000,6.131594e+06,300457,"POLYGON ((442279.449 6157229.122, 442281.922 6...",POINT (485270.27 6131594.06)
...,...,...,...,...,...,...,...,...,...,...
2603260,Kløvervangen,84B,8541,751,DK042,580541.961084,6.236081e+06,1856797,"POLYGON ((892459.702 6121825.714, 892461.613 6...",POINT (580541.961 6236080.534)
2603261,Helga Pedersens Gade,121,8000,751,DK042,576440.556954,6.225236e+06,2546037,"POLYGON ((1343443.947 6028854.169, 1005298.466...",POINT (576440.557 6225235.552)
2603262,Helga Pedersens Gade,125,8000,751,DK042,576429.198737,6.225231e+06,536266,"POLYGON ((892438.811 6147869.878, 892437.364 6...",POINT (576429.199 6225231.225)
2603263,Thomas Windings Gade,24,8200,751,DK042,571645.363291,6.231257e+06,1560955,"POLYGON ((892505.447 6147975.386, 892468.062 6...",POINT (571645.363 6231256.853)


In [None]:
gdf = gpd.read_parquet('data/adresser_geo.pq')
gdf_kom = gpd.read_parquet('data/kommune.pq')
gdf_kom = gdf_kom.to_crs(25832)

gdf = gpd.overlay(gdf, gdf_kom, how = 'intersection')

In [None]:
gdf.to_parquet('data/adresser_geo_voronoi_parsed.pq')

In [None]:
gdf = gpd.read_parquet('data/adresser_geo.pq')
gdf