In [1]:
import re
from datetime import datetime, timezone

import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

# WFS endpoint that every request in this notebook is POSTed to
server = 'https://pubgeo.zwemwater.nl/geoserver/zwr_public/wfs'

In [2]:
# Locations
# Request every feature of the zwemplekken_details layer (the query has no filter).
body = """
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
        <Query typeName="zwr_public:zwemplekken_details" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">
        </Query>
    </GetFeature>
"""
# The timeout keeps the cell from hanging indefinitely on an unreachable host;
# raise_for_status() reports HTTP errors before any JSON parsing is attempted.
response = requests.post(server, data=body, timeout=60)
response.raise_for_status()
details = [item['properties'] for item in response.json()['features']]

# Keep only the three columns used downstream and index by the numeric location id.
locations = pd.DataFrame(details, columns=["zwemwaterlocatie_id", "naam", "status"])
locations["zwemwaterlocatie_id"] = pd.to_numeric(locations["zwemwaterlocatie_id"])
locations = locations.set_index("zwemwaterlocatie_id")

# Clean up names for read- and searchability.
# title() also capitalises the letter after an apostrophe/backtick, so the
# replacements below restore the lowercase 't / 's prefixes and the RCN acronym.
# The character classes match an apostrophe or a backtick only (inside [...]
# a '|' would be a literal pipe, not alternation).
locations['naam'] = (
    locations['naam']
    .str.strip()
    .str.title()
    .str.replace("^Rcn", "RCN", regex=True)
    .str.replace("^T ", "'t ", regex=True)
    .str.replace("['`]T ", "'t ", regex=True)
    .str.replace("['`]S", "'s", regex=True)
)

# Deduplicate locations: keep the last row seen for each id
locations = locations.groupby("zwemwaterlocatie_id").tail(1)


ConnectionError: HTTPSConnectionPool(host='pubgeo.zwemwater.nl', port=443): Max retries exceeded with url: /geoserver/zwr_public/wfs (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x121b3a410>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known'))

In [4]:
# EU status
# Plain string literal: the request body contains no placeholders.
body = """
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Query typeName="zwr_public:eustatussen" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">

    </Query>
    </GetFeature>
"""
# Fail fast on an unreachable host or an HTTP error instead of hanging or
# hitting a confusing JSON decoding error further down.
response = requests.post(server, data=body, timeout=60)
response.raise_for_status()
details = [item['properties'] for item in response.json()['features']]

df = pd.DataFrame(
    details,
    columns=["zwemwaterlocatie_id", "jaar", "omschrijving"],
).set_index("zwemwaterlocatie_id")

# Per location, keep the description belonging to the highest 'jaar' value and
# expose it under the column name 'historie'.
latest_status = (
    df.sort_values('jaar')
    .groupby('zwemwaterlocatie_id')
    .tail(1)
    .drop(columns="jaar")
    .rename(columns={'omschrijving': 'historie'})
)


In [5]:
# Attach the latest EU status to every location; the left join keeps locations
# that have no status row at all.
data = locations.merge(latest_status, on="zwemwaterlocatie_id", how="left")
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form acts on an intermediate object and does
# not reliably update `data` (it warns in pandas 2.x and is a no-op under
# Copy-on-Write).
data['historie'] = data['historie'].fillna("onbekend")
data

Unnamed: 0_level_0,naam,status,historie
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7350900,'s-Gravenzande,goed,uitstekend
955,De Heide,goed,goed
1007,Camping Het Berkenven,goed,goed
1005,Recreatiepark De Tolplas,goed,aanvaardbaar
7350750,Hartje Groen,goed,uitstekend
...,...,...,...
1522,Recreatiestrand Porta Isola,goed,uitstekend
1560,Mirrorstrand Hemmeland,goed,uitstekend
1598,Strand Uitdam,goed,uitstekend
7942900,Rakelbos,goed,onbekend


In [6]:
# Measurements
# Plain string literal: the request body contains no placeholders.
body = """
    <GetFeature xmlns="http://www.opengis.net/wfs" service="WFS" version="1.1.0" outputFormat="application/json" xsi:schemaLocation="http://www.opengis.net/wfs http://schemas.opengis.net/wfs/1.1.0/wfs.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <Query typeName="zwr_public:resultaatsen" srsName="EPSG:28992" xmlns:zwr_public="https://pubgeo.zwemwater.nl/geoserver/zwr_public">

    </Query>
    </GetFeature>
"""

# Timestamps are parsed with a literal trailing 'Z', which yields naive datetimes.
time_format = "%Y-%m-%dT%H:%M:%SZ"
e_coli, inter = [], []

# Fail fast on an unreachable host or an HTTP error. The read timeout is more
# generous here because this unfiltered query presumably returns the largest
# payload of the three requests (TODO confirm).
http_response = requests.post(server, data=body, timeout=300)
http_response.raise_for_status()
response = http_response.json()['features']

# Splitting data for each measurement type
for item in response:
    item = item['properties']
    listItem = (
        item['zwemwaterlocatie_id'],
        item['numerieke_waarde'],
        datetime.strptime(item['object_begin_tijd'], time_format)
    )
    # Every record whose type code is not "E_COLI" lands in `inter`.
    if item['type_object_code'] == "E_COLI":
        e_coli.append(listItem)
    else:
        inter.append(listItem)


In [7]:
# Build the E. coli table indexed by location id.
eColiDF = pd.DataFrame(
    data=e_coli,
    columns=["zwemwaterlocatie_id", "e_coli", "e_coli_datum"],
)
eColiDF = eColiDF.set_index("zwemwaterlocatie_id")

# Order by sample time and keep the last (most recent) row of each location.
eColiResults = (
    eColiDF
    .sort_values("e_coli_datum")
    .groupby("zwemwaterlocatie_id")
    .tail(1)
)


In [8]:
# Left-join the most recent E. coli value and date onto the location table;
# locations without a measurement keep NaN / NaT in the new columns.
data = pd.merge(data, eColiResults, how="left", on="zwemwaterlocatie_id")
data

Unnamed: 0_level_0,naam,status,historie,e_coli,e_coli_datum
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7350900,'s-Gravenzande,goed,uitstekend,15.0,2022-09-13 06:28:00
955,De Heide,goed,goed,510.0,2022-09-19 09:44:00
1007,Camping Het Berkenven,goed,goed,15.0,2022-09-26 10:20:00
1005,Recreatiepark De Tolplas,goed,aanvaardbaar,15.0,2022-09-26 09:50:00
7350750,Hartje Groen,goed,uitstekend,15.0,2022-09-12 09:06:29
...,...,...,...,...,...
1522,Recreatiestrand Porta Isola,goed,uitstekend,270.0,2022-09-13 06:58:00
1560,Mirrorstrand Hemmeland,goed,uitstekend,30.0,2022-09-06 08:20:00
1598,Strand Uitdam,goed,uitstekend,45.0,2022-09-06 08:59:00
7942900,Rakelbos,goed,onbekend,,NaT


In [9]:
# Build the table for the second measurement type, indexed by location id.
intEntDF = pd.DataFrame(
    inter, columns=["zwemwaterlocatie_id", "int_ent", "int_ent_datum"]).set_index("zwemwaterlocatie_id")
# Order by sample time and keep the most recent row of each location.
intEntResults = intEntDF.sort_values(
    "int_ent_datum").groupby("zwemwaterlocatie_id").tail(1)

# Cut-off one year before now, expressed as a naive UTC datetime so it compares
# cleanly with the naive timestamps in 'int_ent_datum'.
# datetime.now(timezone.utc) replaces the deprecated datetime.utcnow().
year_old = datetime.now(timezone.utc).replace(tzinfo=None) - relativedelta(years=1)

# Keep only measurements taken within the last year.
mask = intEntResults['int_ent_datum'] > year_old
intEntResults = intEntResults.loc[mask]

In [10]:
# Left-join the recent int_ent measurements onto the location table
data = pd.merge(data, intEntResults, how="left", on="zwemwaterlocatie_id")

In [11]:
# Keep only locations that have both measurement dates, then order them by name.
data = (
    data
    .dropna(subset=["e_coli_datum", "int_ent_datum"])
    .sort_values("naam")
)

In [12]:
# Show the merged table after filtering on measurement dates and sorting by name
data

Unnamed: 0_level_0,naam,status,historie,e_coli,e_coli_datum,int_ent,int_ent_datum
zwemwaterlocatie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
7350900,'s-Gravenzande,goed,uitstekend,15.0,2022-09-13 06:28:00,30.0,2022-09-13 06:28:00
1427,'t Bovenwater,goed,uitstekend,453.0,2022-09-19 06:01:00,195.0,2022-09-19 06:01:00
6841460,'t Gasselterveld,goed,uitstekend,45.0,2022-09-19 06:28:11,15.0,2022-09-19 06:28:11
1029,'t Hilgelo Naaktstrand,goed,goed,30.0,2022-09-19 08:15:00,30.0,2022-09-19 08:15:00
1259,'t Hoefsven,goed,uitstekend,15.0,2022-09-19 06:48:20,15.0,2022-09-19 06:48:20
...,...,...,...,...,...,...,...
961,Zwemplaats Earnewâld,goed,uitstekend,80.0,2022-09-19 07:14:00,60.0,2022-09-19 07:14:00
1451,Zwemplas Hommelheide,goed,uitstekend,61.0,2022-09-12 11:34:00,30.0,2022-09-12 11:34:00
1450,Zwemplas Elfenmeer,goed,uitstekend,45.0,2022-09-13 06:27:00,15.0,2022-09-13 06:27:00
5911123,Zwemplas Woude,goed,goed,46.0,2022-09-27 06:45:49,15.0,2022-09-27 06:45:49


In [13]:
# Export the final table to a CSV file; the relative path resolves against the
# kernel's working directory.
data.to_csv("NL-zwemwater.csv")

In [14]:
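# Optional: also store the final table in a SQLite database (disabled).
# Note: if_exists="append" adds the same rows again on every re-run.
# import sqlite3
# db = sqlite3.connect("dataset.sqlite3")
# data.to_sql("measurements", db, if_exists="append")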

734