In [1]:
import requests
import geopandas as gpd
import pandas as pd
from tqdm import tqdm
import os
from pathlib import Path

# WFS GetFeature endpoint for the 2022-2024 traffic-accident layer, returned as GeoJSON.
# NOTE(review): the request sets neither `srsName` nor `sortBy`, so coordinates arrive
# in whatever CRS the service defaults to and page order is left to the server —
# confirm that x/y really are lon/lat degrees and that paging is stable.
BASE_URL = (
    "https://geo.rijkswaterstaat.nl/services/ogc/gdr/verkeersongevallen_nederland/wfs"
    "?service=WFS&version=2.0.0&request=GetFeature"
    "&typeNames=verkeersongevallen_nederland:ongevallen_2022_2024"
    "&outputFormat=application/json"
)

CHUNK_SIZE = 10000      # features requested per page (`count`)
MAX_RECORDS = 382421    # upper bound for `startIndex`; presumably the layer size at download time — verify
REQUEST_TIMEOUT = 120   # seconds; without a timeout a stalled connection blocks the cell forever

# Resolve the output directory once, before the loop: prefer ./datasets/RawData and
# fall back to ../datasets/RawData (created if missing) when the first does not exist.
RAW_DIR = Path("datasets/RawData")
if not RAW_DIR.exists():
    RAW_DIR = Path("../datasets/RawData")
RAW_DIR.mkdir(parents=True, exist_ok=True)

part = 0  # sequential index used in the output file name

for start in tqdm(range(0, MAX_RECORDS, CHUNK_SIZE)):
    url = f"{BASE_URL}&count={CHUNK_SIZE}&startIndex={start}"

    print(f"Downloading chunk starting at {start}...")

    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on HTTP errors instead of trying to parse an error page as GeoJSON.
    response.raise_for_status()
    data = response.json()

    features = data["features"]
    if not features:
        # MAX_RECORDS is a fixed guess; if the layer holds fewer features the
        # remaining pages are empty and there is nothing left to save.
        print(f"No features returned at startIndex={start}; stopping early.")
        break

    gdf = gpd.GeoDataFrame.from_features(features)

    # Flatten the point geometry into two plain numeric columns so the chunk can be
    # stored as an ordinary (non-geo) parquet table.
    gdf["lon"] = gdf.geometry.x
    gdf["lat"] = gdf.geometry.y
    gdf = gdf.drop(columns=["geometry"])

    file = RAW_DIR / f"accidents_part_{part}.parquet"
    gdf.to_parquet(file, index=False)

    print(f"Saved {file} ({len(gdf)} rows)\n")

    part += 1

  0%|          | 0/39 [00:00<?, ?it/s]

Downloading chunk starting at 0...


  3%|▎         | 1/39 [00:04<02:55,  4.63s/it]

Saved ../datasets/RawData/accidents_part_0.parquet (10000 rows)

Downloading chunk starting at 10000...


  5%|▌         | 2/39 [00:08<02:42,  4.40s/it]

Saved ../datasets/RawData/accidents_part_1.parquet (10000 rows)

Downloading chunk starting at 20000...


  8%|▊         | 3/39 [00:17<03:40,  6.11s/it]

Saved ../datasets/RawData/accidents_part_2.parquet (10000 rows)

Downloading chunk starting at 30000...


 10%|█         | 4/39 [00:20<03:03,  5.24s/it]

Saved ../datasets/RawData/accidents_part_3.parquet (10000 rows)

Downloading chunk starting at 40000...


 13%|█▎        | 5/39 [00:23<02:29,  4.40s/it]

Saved ../datasets/RawData/accidents_part_4.parquet (10000 rows)

Downloading chunk starting at 50000...


 15%|█▌        | 6/39 [00:27<02:19,  4.21s/it]

Saved ../datasets/RawData/accidents_part_5.parquet (10000 rows)

Downloading chunk starting at 60000...


 18%|█▊        | 7/39 [00:31<02:07,  3.99s/it]

Saved ../datasets/RawData/accidents_part_6.parquet (10000 rows)

Downloading chunk starting at 70000...


 21%|██        | 8/39 [00:35<02:09,  4.18s/it]

Saved ../datasets/RawData/accidents_part_7.parquet (10000 rows)

Downloading chunk starting at 80000...


 23%|██▎       | 9/39 [00:37<01:39,  3.32s/it]

Saved ../datasets/RawData/accidents_part_8.parquet (10000 rows)

Downloading chunk starting at 90000...


 26%|██▌       | 10/39 [00:38<01:18,  2.72s/it]

Saved ../datasets/RawData/accidents_part_9.parquet (10000 rows)

Downloading chunk starting at 100000...


 28%|██▊       | 11/39 [00:45<01:51,  3.99s/it]

Saved ../datasets/RawData/accidents_part_10.parquet (10000 rows)

Downloading chunk starting at 110000...


 31%|███       | 12/39 [00:49<01:44,  3.89s/it]

Saved ../datasets/RawData/accidents_part_11.parquet (10000 rows)

Downloading chunk starting at 120000...


 33%|███▎      | 13/39 [00:54<01:55,  4.46s/it]

Saved ../datasets/RawData/accidents_part_12.parquet (10000 rows)

Downloading chunk starting at 130000...


 36%|███▌      | 14/39 [00:59<01:52,  4.52s/it]

Saved ../datasets/RawData/accidents_part_13.parquet (10000 rows)

Downloading chunk starting at 140000...


 38%|███▊      | 15/39 [01:00<01:25,  3.56s/it]

Saved ../datasets/RawData/accidents_part_14.parquet (10000 rows)

Downloading chunk starting at 150000...


 41%|████      | 16/39 [01:03<01:14,  3.25s/it]

Saved ../datasets/RawData/accidents_part_15.parquet (10000 rows)

Downloading chunk starting at 160000...


 44%|████▎     | 17/39 [01:04<00:59,  2.70s/it]

Saved ../datasets/RawData/accidents_part_16.parquet (10000 rows)

Downloading chunk starting at 170000...


 46%|████▌     | 18/39 [01:06<00:49,  2.34s/it]

Saved ../datasets/RawData/accidents_part_17.parquet (10000 rows)

Downloading chunk starting at 180000...


 49%|████▊     | 19/39 [01:09<00:54,  2.71s/it]

Saved ../datasets/RawData/accidents_part_18.parquet (10000 rows)

Downloading chunk starting at 190000...


 51%|█████▏    | 20/39 [01:15<01:08,  3.63s/it]

Saved ../datasets/RawData/accidents_part_19.parquet (10000 rows)

Downloading chunk starting at 200000...


 54%|█████▍    | 21/39 [01:18<01:01,  3.41s/it]

Saved ../datasets/RawData/accidents_part_20.parquet (10000 rows)

Downloading chunk starting at 210000...


 56%|█████▋    | 22/39 [01:22<00:58,  3.41s/it]

Saved ../datasets/RawData/accidents_part_21.parquet (10000 rows)

Downloading chunk starting at 220000...


 59%|█████▉    | 23/39 [01:24<00:51,  3.20s/it]

Saved ../datasets/RawData/accidents_part_22.parquet (10000 rows)

Downloading chunk starting at 230000...


 62%|██████▏   | 24/39 [01:26<00:40,  2.69s/it]

Saved ../datasets/RawData/accidents_part_23.parquet (10000 rows)

Downloading chunk starting at 240000...


 64%|██████▍   | 25/39 [01:27<00:32,  2.33s/it]

Saved ../datasets/RawData/accidents_part_24.parquet (10000 rows)

Downloading chunk starting at 250000...


 67%|██████▋   | 26/39 [01:29<00:27,  2.10s/it]

Saved ../datasets/RawData/accidents_part_25.parquet (10000 rows)

Downloading chunk starting at 260000...


 69%|██████▉   | 27/39 [01:30<00:23,  1.95s/it]

Saved ../datasets/RawData/accidents_part_26.parquet (10000 rows)

Downloading chunk starting at 270000...


 72%|███████▏  | 28/39 [01:33<00:23,  2.18s/it]

Saved ../datasets/RawData/accidents_part_27.parquet (10000 rows)

Downloading chunk starting at 280000...


 74%|███████▍  | 29/39 [01:35<00:19,  1.96s/it]

Saved ../datasets/RawData/accidents_part_28.parquet (10000 rows)

Downloading chunk starting at 290000...


 77%|███████▋  | 30/39 [01:40<00:26,  2.98s/it]

Saved ../datasets/RawData/accidents_part_29.parquet (10000 rows)

Downloading chunk starting at 300000...


 79%|███████▉  | 31/39 [01:41<00:20,  2.53s/it]

Saved ../datasets/RawData/accidents_part_30.parquet (10000 rows)

Downloading chunk starting at 310000...


 82%|████████▏ | 32/39 [01:43<00:15,  2.22s/it]

Saved ../datasets/RawData/accidents_part_31.parquet (10000 rows)

Downloading chunk starting at 320000...


 85%|████████▍ | 33/39 [01:47<00:16,  2.77s/it]

Saved ../datasets/RawData/accidents_part_32.parquet (10000 rows)

Downloading chunk starting at 330000...


 87%|████████▋ | 34/39 [01:53<00:18,  3.76s/it]

Saved ../datasets/RawData/accidents_part_33.parquet (10000 rows)

Downloading chunk starting at 340000...


 90%|████████▉ | 35/39 [01:57<00:15,  3.76s/it]

Saved ../datasets/RawData/accidents_part_34.parquet (10000 rows)

Downloading chunk starting at 350000...


 92%|█████████▏| 36/39 [01:59<00:10,  3.39s/it]

Saved ../datasets/RawData/accidents_part_35.parquet (10000 rows)

Downloading chunk starting at 360000...


 95%|█████████▍| 37/39 [02:06<00:08,  4.45s/it]

Saved ../datasets/RawData/accidents_part_36.parquet (10000 rows)

Downloading chunk starting at 370000...


 97%|█████████▋| 38/39 [02:10<00:04,  4.34s/it]

Saved ../datasets/RawData/accidents_part_37.parquet (10000 rows)

Downloading chunk starting at 380000...


100%|██████████| 39/39 [02:11<00:00,  3.37s/it]

Saved ../datasets/RawData/accidents_part_38.parquet (2421 rows)






In [2]:
import pandas as pd
import glob
from pathlib import Path

def _part_sort_key(path):
    """Sort key that orders part files by their numeric suffix.

    A plain string sort puts ``accidents_part_10`` before ``accidents_part_2``;
    comparing the integer suffix keeps the parts in the order they were written.
    Names without a numeric suffix sort after all numbered parts, alphabetically.
    """
    suffix = Path(path).stem.rsplit("_", 1)[-1]
    if suffix.isdecimal():
        return (0, int(suffix), "")
    return (1, 0, str(path))


# Same directory resolution as the download step: prefer ./datasets/RawData,
# otherwise use ../datasets/RawData (created if missing).
RAW_DIR = Path("datasets/RawData")
if not RAW_DIR.exists():
    RAW_DIR = Path("../datasets/RawData")
RAW_DIR.mkdir(parents=True, exist_ok=True)

files = sorted(
    (str(p) for p in RAW_DIR.glob("accidents_part_*.parquet")),
    key=_part_sort_key,
)

if not files:
    # RAW_DIR may exist but be empty while the parts live in the other candidate
    # directory, so probe both locations explicitly.
    files = sorted(
        glob.glob("datasets/RawData/accidents_part_*.parquet")
        or glob.glob("../datasets/RawData/accidents_part_*.parquet"),
        key=_part_sort_key,
    )

if not files:
    raise FileNotFoundError("No parquet part files found in datasets/RawData or ../datasets/RawData")

# Read every part and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat([pd.read_parquet(f) for f in files], ignore_index=True)

print(df.shape)
# The combined file name does not match the `accidents_part_*` pattern, so a
# re-run of this cell will not pick it up as an input.
df.to_parquet(RAW_DIR / "accidents_2022_2024_full.parquet", index=False)


(382421, 40)
