In [1]:
import requests
import pandas as pd
import re
from datetime import datetime
from dateutil.relativedelta import relativedelta

server = 'https://pubgeo.zwemwater.nl/geoserver/zwr_public/wfs'

In [2]:
# Locations
body = """
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <Query typeName="zwr_public:zwemplekken_details" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">
        </Query>
    </GetFeature>
"""
response = requests.post(server, body)
details = [item['properties'] for item in response.json()['features']]
locations = pd.DataFrame(details, columns=["zwemwaterlocatie_id", "naam", "status"])
locations["zwemwaterlocatie_id"] = pd.to_numeric(locations["zwemwaterlocatie_id"])
locations.set_index("zwemwaterlocatie_id", inplace=True)

# Clean up names for read- and searchability
locations['naam'] = locations['naam']\
    .str.strip()\
    .str.title()\
    .str.replace("^Rcn", "RCN", regex=True)\
    .str.replace("^T ", "'t ", regex=True)\
    .str.replace("['|`]T ", "'t ", regex=True)\
    .str.replace("['|`]S", "'s", regex=True)

# Deduplicate locations
locations = locations.groupby("zwemwaterlocatie_id").tail(1)


In [5]:
response.json()['features'][0]

{'type': 'Feature',
 'id': 'zwemplekken_details.fid--3f6bf812_1895fb674c5_-7f74',
 'geometry': {'type': 'Polygon',
  'coordinates': [[[190803.2582, 550453.2576],
    [190975.4582, 550543.1376],
    [190989.7382, 550592.2776],
    [190972.9382, 550608.2376],
    [190903.7432, 550600.1526],
    [190832.0282, 550573.5876],
    [190798.0607, 550556.9451],
    [190790.9732, 550540.3026],
    [190780.5257, 550500.9801],
    [190786.8782, 550488.5376],
    [190803.2582, 550453.2576]]]},
 'geometry_name': 'geometrie',
 'properties': {'adr_huisletter': None,
  'adr_huisnummer': None,
  'adr_huisnummertoevoeging': None,
  'adr_postcode': None,
  'adr_straat': 'De Dreef',
  'adr_woonplaats': 'Heerenveen',
  'datum': None,
  'info_filename': 'FR-043-001 De Heide.jpg',
  'info_id': 300003,
  'info_length': 356023,
  'info_mime_type': 'image/jpeg',
  'key_id': 1676,
  'korte_naam': 'Heide',
  'naam': 'De Heide',
  'objecttype': 'zwemplek',
  'org_bedrijfsnaam': 'Gemeente Heerenveen',
  'org_email': 

In [3]:
# EU status
body = f"""
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Query typeName="zwr_public:eustatussen" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">

    </Query>
    </GetFeature>
"""
response = requests.post(server, body)
details = [item['properties'] for item in response.json()['features']]

df = pd.DataFrame(details, columns=[
                  "zwemwaterlocatie_id", "jaar", "omschrijving"]).set_index("zwemwaterlocatie_id")
latest_status = df.sort_values('jaar')\
                    .groupby('zwemwaterlocatie_id')\
                    .tail(1)\
                    .drop("jaar", axis=1)\
                    .rename({'omschrijving': 'historie'}, axis=1)


In [4]:
data = locations.merge(latest_status, on="zwemwaterlocatie_id", how="left")
data['historie'].fillna("onbekend", inplace=True)
data

Unnamed: 0_level_0,naam,status,historie
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
955,De Heide,goed,goed
956,Spoekeplas,goed,uitstekend
957,Canadameer,goed,uitstekend
959,Smeliester Sân,goed,goed
960,De Leien,WAARSCHUWING,aanvaardbaar
...,...,...,...
7352390,De Waal Heerjansdam,WAARSCHUWING,onbekend
7949330,Willem-Alexanderbaan,goed,onbekend
7960110,Zwemsteiger Nieuwe Meer,goed,onbekend
7961040,Terherne Snitsermar,goed,onbekend


In [5]:
# Measurements
body = f"""
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Query typeName="zwr_public:resultaatsen" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">

    </Query>
    </GetFeature>
"""

time_format = "%Y-%m-%dT%H:%M:%SZ"
e_coli, inter = [], []
response = requests.post(server, body).json()['features']

# Splitting data for each measurement type
for item in response:
    item = item['properties']
    listItem = (
        item['zwemwaterlocatie_id'],
        item['numerieke_waarde'],
        datetime.strptime(item['object_begin_tijd'], time_format)
    )
    if item['type_object_code'] == "E_COLI":
        e_coli.append(listItem)
    else:
        inter.append(listItem)


In [6]:
eColiDF = pd.DataFrame(
    e_coli, columns=["zwemwaterlocatie_id", "e_coli", "e_coli_datum"]).set_index("zwemwaterlocatie_id")
eColiResults = eColiDF.sort_values(
    "e_coli_datum").groupby("zwemwaterlocatie_id").tail(1)


In [7]:
data = data.merge(eColiResults, on="zwemwaterlocatie_id", how="left")
data

Unnamed: 0_level_0,naam,status,historie,e_coli,e_coli_datum
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
955,De Heide,goed,goed,230.0,2023-07-10 08:03:00
956,Spoekeplas,goed,uitstekend,30.0,2023-07-03 07:52:00
957,Canadameer,goed,uitstekend,80.0,2023-07-03 08:27:00
959,Smeliester Sân,goed,goed,80.0,2023-07-03 05:50:00
960,De Leien,WAARSCHUWING,aanvaardbaar,180.0,2023-07-03 09:45:00
...,...,...,...,...,...
7352390,De Waal Heerjansdam,WAARSCHUWING,onbekend,30.0,2023-07-04 11:51:41
7949330,Willem-Alexanderbaan,goed,onbekend,15.0,2023-07-04 12:50:10
7960110,Zwemsteiger Nieuwe Meer,goed,onbekend,15.0,2023-07-10 08:03:45
7961040,Terherne Snitsermar,goed,onbekend,,NaT


In [8]:
intEntDF = pd.DataFrame(
    inter, columns=["zwemwaterlocatie_id", "int_ent", "int_ent_datum"]).set_index("zwemwaterlocatie_id")
intEntResults = intEntDF.sort_values(
    "int_ent_datum").groupby("zwemwaterlocatie_id").tail(1)

year_old = datetime.utcnow() - relativedelta(years=1)
mask = (intEntResults['int_ent_datum'] > year_old)
intEntResults = intEntResults.loc[(intEntResults['int_ent_datum'] > year_old)]

In [9]:
# Merge measurements with location data
data = data.merge(intEntResults, on="zwemwaterlocatie_id", how="left")

In [10]:
data.dropna(subset=["e_coli_datum", "int_ent_datum"], inplace=True)
data.sort_values("naam", inplace=True)

In [11]:
data

Unnamed: 0_level_0,naam,status,historie,e_coli,e_coli_datum,int_ent,int_ent_datum
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7350900,'s-Gravenzande,goed,uitstekend,61.0,2023-06-20 06:15:00,15.0,2023-06-20 06:15:00
6841460,'t Gasselterveld,goed,uitstekend,15.0,2023-06-19 06:45:36,15.0,2023-06-19 06:45:36
1029,'t Hilgelo Naaktstrand,goed,goed,1327.0,2023-07-03 07:45:00,61.0,2023-07-03 07:45:00
1259,'t Hoefsven,goed,uitstekend,61.0,2023-07-10 10:06:04,30.0,2023-07-10 10:06:04
1396,'t Loomeer,goed,uitstekend,15.0,2023-07-10 08:48:35,15.0,2023-07-10 08:48:35
...,...,...,...,...,...,...,...
961,Zwemplaats Earnewâld,goed,uitstekend,160.0,2023-07-10 06:36:00,110.0,2023-07-10 06:36:00
1451,Zwemplas Hommelheide,goed,uitstekend,15.0,2023-05-15 07:53:00,15.0,2023-05-15 07:53:00
5911123,Zwemplas Woude,goed,goed,77.0,2023-07-04 10:05:38,110.0,2023-07-04 10:05:38
7960110,Zwemsteiger Nieuwe Meer,goed,onbekend,15.0,2023-07-10 08:03:45,140.0,2023-07-10 08:03:45


In [12]:
data.to_csv("NL-zwemwater.csv")

In [13]:
# import sqlite3
# db = sqlite3.connect("dataset.sqlite3")
# data.to_sql("measurements", db, if_exists="append")

In [19]:
coords = pd.read_csv('locations.csv').set_index("zwemwaterlocatie_id").drop("naam", axis=1)
coords = coords.drop_duplicates(keep="first")
coords

Unnamed: 0_level_0,latitude,longitude
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1185,52.1005,5.1522
1586,52.3925,5.7259
1626,51.7219,4.4070
6249511,52.2599,5.3905
1002,52.5044,6.3819
...,...,...
7942901,51.5199,5.1278
7949330,51.9975,4.5614
7960110,52.3329,4.8465
7961040,53.0370,5.7675


In [15]:
combinedData = pd.merge(data, coords, on="zwemwaterlocatie_id", how="left")


In [16]:
combinedData

Unnamed: 0_level_0,naam,status,historie,e_coli,e_coli_datum,int_ent,int_ent_datum,latitude,longitude
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7350900,'s-Gravenzande,goed,uitstekend,61.0,2023-06-20 06:15:00,15.0,2023-06-20 06:15:00,52.0126,4.1392
6841460,'t Gasselterveld,goed,uitstekend,15.0,2023-06-19 06:45:36,15.0,2023-06-19 06:45:36,52.9722,6.7498
1029,'t Hilgelo Naaktstrand,goed,goed,1327.0,2023-07-03 07:45:00,61.0,2023-07-03 07:45:00,51.9936,6.7208
1029,'t Hilgelo Naaktstrand,goed,goed,1327.0,2023-07-03 07:45:00,61.0,2023-07-03 07:45:00,51.9918,6.7242
1259,'t Hoefsven,goed,uitstekend,61.0,2023-07-10 10:06:04,30.0,2023-07-10 10:06:04,51.6761,5.0942
...,...,...,...,...,...,...,...,...,...
961,Zwemplaats Earnewâld,goed,uitstekend,160.0,2023-07-10 06:36:00,110.0,2023-07-10 06:36:00,53.1315,5.9559
1451,Zwemplas Hommelheide,goed,uitstekend,15.0,2023-05-15 07:53:00,15.0,2023-05-15 07:53:00,51.0678,5.8867
5911123,Zwemplas Woude,goed,goed,77.0,2023-07-04 10:05:38,110.0,2023-07-04 10:05:38,51.8810,4.6143
7960110,Zwemsteiger Nieuwe Meer,goed,onbekend,15.0,2023-07-10 08:03:45,140.0,2023-07-10 08:03:45,52.3329,4.8465


In [17]:
combinedData.to_csv("dataset.csv")