# 03 - Kanton Zürich Wärmenutzungsatlas - Geothermal Probes
### Author: Daniel Herrera-Russert
#### January 28, 2025

In [1]:
# !pip install geopandas owslib requests pyproj

In [2]:
import pandas as pd
import numpy as np
import os

from owslib.wfs import WebFeatureService
import geopandas as gpd
import requests
from io import BytesIO

import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pyproj import Transformer # for converting coordinates

# Set the default renderer for JupyterLab
pio.renderers.default = 'iframe'

## 1. Loading the dataset

We begin with a look at the [Wärmenutzungsatlas](https://maps.zh.ch/?offlayers=bezirkslabels&scale=320000&srid=2056&topic=AwelGSWaermewwwZH&x=2692500&y=1252500) of Kanton Zürich. The data is retrieved from the GIS portal via WFS service, in order to obtain the separate layers preserving the geographic data qualities. Alternatively, it could be downloaded manually directly from the GIS download section in a variety of formats for different purposes.

Following is the extraction and exploration of the dataset, corresponding to the layer containing the earth probes (Erdwärmesonden).

In [3]:
# Define the WFS endpoint
WFS_URL = "https://maps.zh.ch/wfs/AwelGSWaermewwwZHWFS"

# Connect to the WFS service
wfs = WebFeatureService(WFS_URL, version="2.0.0")

# List available layers
print("Available layers:")
for layer in list(wfs.contents):
    print(layer)

Available layers:
ms:waermenutzung-zone-a
ms:waermenutzung-zone-b
ms:waermenutzung-zone-c
ms:waermenutzung-zone-d
ms:waermenutzung-zone-e
ms:waermenutzung-zone-f
ms:erdwaermesonden-auflagen
ms:tunnels-und-stollen
ms:erdwaermesonden
ms:sonden
ms:sammelschacht
ms:grundwasserpiezometer
ms:erdregister
ms:quellfassung-waermenutzung
ms:grundwasserfassung-waermenutzung
ms:quellfassung-ohne-waermenutzung
ms:grundwasserfassung-ohne-waermenutzung


In [4]:
# Select the layer
layer_name = "ms:erdwaermesonden"

# Request data in GML format
response = wfs.getfeature(typename=layer_name, outputFormat="GML2")  # or "text/xml; subtype=gml/3.2"

# Read the response as a GeoDataFrame
gdf = gpd.read_file(BytesIO(response.read()))

# Save to file
gdf.to_file("data/raw/zh_geothermal_probes.geojson", driver="GeoJSON")

print("Data saved as 'zh_geothermal_probes.geojson'")


'crs' was not provided.  The output dataset will not have projection information defined and may not be usable in other systems.



Data saved as 'zh_geothermal_probes.geojson'


In [5]:
# Load the GeoJSON file into a GeoPandas
geojson_path = "data/raw/zh_geothermal_probes.geojson"
zh_geothermal_probes_gdf = gpd.read_file(geojson_path)

In [6]:
# Display the first few rows of each dataset to verify
zh_geothermal_probes_gdf.head(5)

Unnamed: 0,gml_id,x,y,Waermeentnahme,Waermeeintrag,Sondentiefe,Gesamtsondenzahl,GBS-Nummer,Bohrprofil,geometry
0,erdwaermesonden.1464734,2684789,1248345,20.4,0.0,250,3.0,b 00-10370,0,POINT (2684788.902 1248344.823)
1,erdwaermesonden.1464735,2684962,1248289,68.0,0.0,250,10.0,b 00-10371,1,POINT (2684961.902 1248288.82)
2,erdwaermesonden.1464736,2684633,1248535,12.5,6.0,250,2.0,b 00-10287,0,POINT (2684632.902 1248534.826)
3,erdwaermesonden.1464737,2684626,1248526,12.5,6.0,250,2.0,b 00-10288,0,POINT (2684625.901 1248525.826)
4,erdwaermesonden.1464738,2684623,1248519,12.5,6.0,250,2.0,b 00-10289,0,POINT (2684622.901 1248518.826)


In [7]:
# Check the current CRS (if None, set to Swiss LV95)
if zh_geothermal_probes_gdf.crs is None:
    zh_geothermal_probes_gdf.set_crs(epsg=2056, inplace=True)

# Initialize the transformer from CH1903+ (LV95) to WGS84
transformer = Transformer.from_crs("EPSG:2056", "EPSG:4326", always_xy=True)

# Apply coordinate transformation manually
zh_geothermal_probes_gdf[['lon', 'lat']] = zh_geothermal_probes_gdf.apply(
    lambda row: pd.Series(transformer.transform(row.geometry.x, row.geometry.y)),
    axis=1
)

# Ensure no invalid coordinates
zh_geothermal_probes_gdf = zh_geothermal_probes_gdf.dropna(subset=['lon', 'lat'])

In [8]:
# Print the number of geothermal probes in the dataset
print(f"Total geothermal probes: {len(zh_geothermal_probes_gdf)}")

Total geothermal probes: 35667


---

In [9]:
# Display the first few rows of each dataset to verify
zh_geothermal_probes_gdf.head(5)

Unnamed: 0,gml_id,x,y,Waermeentnahme,Waermeeintrag,Sondentiefe,Gesamtsondenzahl,GBS-Nummer,Bohrprofil,geometry,lon,lat
0,erdwaermesonden.1464734,2684789,1248345,20.4,0.0,250,3.0,b 00-10370,0,POINT (2684788.902 1248344.823),8.561441,47.380481
1,erdwaermesonden.1464735,2684962,1248289,68.0,0.0,250,10.0,b 00-10371,1,POINT (2684961.902 1248288.82),8.563721,47.379955
2,erdwaermesonden.1464736,2684633,1248535,12.5,6.0,250,2.0,b 00-10287,0,POINT (2684632.902 1248534.826),8.559411,47.382209
3,erdwaermesonden.1464737,2684626,1248526,12.5,6.0,250,2.0,b 00-10288,0,POINT (2684625.901 1248525.826),8.559317,47.382129
4,erdwaermesonden.1464738,2684623,1248519,12.5,6.0,250,2.0,b 00-10289,0,POINT (2684622.901 1248518.826),8.559276,47.382067


In [10]:
save_path = "data/transformed/zh_geothermal_probes.geojson"
zh_geothermal_probes_gdf.to_file(save_path, driver="GeoJSON")
print(f"File saved under {save_path}")

File saved under data/transformed/zh_geothermal_probes.geojson


## 2. Exploratory Data Analysis  

The dataset provides geospatial information about approved geothermal probes (**Erdwärmesonden**) and heat pump installations in **Kanton Zürich**. It includes two types of probes:  
- **Mastersonden**: Geothermal bores located on the same parcel as the heat pump.  
- **Sonden**: Additional bores on separate parcels.  

The dataset is maintained by the **Amt für Abfall, Wasser, Energie und Luft (AWEL)** and is updated daily.  
Originally, the dataset used the **CH1903+ / LV95 (EPSG:2056) coordinate system** with X/Y values in meters. However, these coordinates have now been **converted to WGS84 (EPSG:4326)** to provide standard **latitude/longitude** values in decimal degrees.  

### **Mastersonden Attributes**:
- `gml_id`: Unique identifier for each probe.  
- `x`, `y`: Original Swiss coordinates (CH1903+ / LV95).  
- `lon`, `lat`: Transformed WGS84 coordinates (Longitude & Latitude).  
- `Waermeentnahme`: Heat extraction (kW).  
- `Waermeeintrag`: Heat input (kW).  
- `Sondentiefe`: Bore depth (meters).  
- `Gesamtsondenanzahl`: Total number of bores at the location.  
- `GBS-Nummer`: Unique parcel number associated with the bore.  
- `Bohrprofil`: Indicates whether a bore profile is present (binary flag).  
- `geometry`: Geospatial representation of the probe as a **POINT (lon, lat)**.  

### **Sonden Attributes (Second Dataset)**:
- `GBS_ID`: Mastersonden identifier.  
- `S_ID`: Unique identifier for the Sonde.  
- `lon`, `lat`: Transformed WGS84 coordinates (Longitude & Latitude).  
- `KATASTERNUMMER`: Parcel number.  
- `GUELTIGKEIT`: Validity status.  

### **Dataset Formats & Accessibility**  
The data is available in multiple formats: **CSV**, **SHP (Shapefile)**, **GeoPackage**, **DXF**, and **ESRI File Geodatabase**. It can be accessed via the **GIS-ZH platform**, **REST API**, or **OpendataSwiss**.

### **Key Improvements & Changes**
- **Transformed coordinate system**: CH1903+ (LV95) → WGS84 (EPSG:4326).  
- **Added `lon` & `lat` columns** for standardized geospatial analysis.  
- **Retained original `x`, `y` columns** for reference.  
- **Updated column names** to match processed dataset.  

This dataset is an invaluable tool for **spatial analysis and planning**, helping to assess **geothermal potential** and manage heat pump installations within Zürich. The data from this source is available as **CSV**, **SHP (Shapefile)**, **GeoPackage**, **DXF**, and **ESRI File Geodatabase**. It is downloadable via GIS-ZH platform, REST API, or OpendataSwiss.

In [11]:
# Basic EDA for the dataset
def basic_eda(df):
    # Number of rows and columns
    num_rows, num_columns = df.shape
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}\n")
    
    # Data types of each column
    print("Data Types:")
    print(df.dtypes)
    print("\n")
    
    # Check for missing values
    print("Missing Values:")
    print(df.isnull().sum())
    print("\n")
    
    # Unique values for categorical variables
    print("Unique Values for Categorical Variables:")
    categorical_columns = df.select_dtypes(include='object').columns
    for col in categorical_columns:
        print(f"{col}: {df[col].nunique()} unique values")
    print("\n")
    
    # Summary statistics for numerical columns
    print("Summary Statistics for Numerical Variables:")
    print(df.describe())
    print("\n")

# Perform basic EDA on the dataset
basic_eda(zh_geothermal_probes_gdf)

Number of rows: 35667
Number of columns: 12

Data Types:
gml_id                object
x                      int32
y                      int32
Waermeentnahme       float64
Waermeeintrag        float64
Sondentiefe            int32
Gesamtsondenzahl     float64
GBS-Nummer            object
Bohrprofil             int32
geometry            geometry
lon                  float64
lat                  float64
dtype: object


Missing Values:
gml_id              0
x                   0
y                   0
Waermeentnahme      0
Waermeeintrag       0
Sondentiefe         0
Gesamtsondenzahl    0
GBS-Nummer          0
Bohrprofil          0
geometry            0
lon                 0
lat                 0
dtype: int64


Unique Values for Categorical Variables:
gml_id: 35667 unique values
GBS-Nummer: 35667 unique values


Summary Statistics for Numerical Variables:
                  x             y  Waermeentnahme  Waermeeintrag  \
count  3.566700e+04  3.566700e+04    35667.000000   35667.000000   
m

### **EDA Summary by Variable**  

- **`gml_id`**: Contains unique identifiers for each probe, with all 35,626 values being distinct.  

- **`x` and `y` (Swiss Coordinates, CH1903+ / LV95)**: The coordinates fall within the expected Swiss coordinate system range, with values between **669,640 and 2,714,596 (x)** and **225,567 to 1,283,045 (y)**.  

- **`lon` and `lat` (WGS84 Coordinates)**: Transformed coordinates into **longitude and latitude (EPSG:4326)**. The spatial extent is valid, ranging from **8.362 to 8.960 (longitude)** and **47.173 to 47.692 (latitude)**, corresponding to Zürich.  

- **`Waermeentnahme` (Heat Extraction in kW)**:  
  - Average extraction is **18.13 kW**, with most probes extracting between **6–20 kW**.  
  - A significant outlier is observed at **3,658 kW**, indicating potential industrial-scale use or data entry errors.  

- **`Waermeeintrag` (Heat Input in kW)**:  
  - The majority of probes report **0 kW** input.  
  - A few probes have values as high as **4,282 kW**, suggesting locations where heat is injected back into the ground.  

- **`Sondentiefe` (Probe Depth in meters)**:  
  - Average depth is **191.4 meters**, with most probes between **150 and 230 meters**.  
  - An extreme outlier exists at **2,371 meters**, which may indicate deep geothermal projects.  

- **`Gesamtsondenanzahl` (Total Probes per Location)**:  
  - Most locations have **1–3 probes**.  
  - One extreme outlier has **385 probes**, which is significantly higher than typical installations.  

- **`GBS-Nummer`**: Represents unique parcel numbers for each bore. Like `gml_id`, all **35,626 values are unique**, confirming that each probe is linked to a distinct parcel.  

- **`Bohrprofil` (Drilling Profile)**: A binary categorical variable (`0` or `1`) indicating the presence of a drilling profile.  
  - The majority (**87.6%**) have no specific profile (`0`).  
  - Only **12.4%** have a documented drilling profile (`1`).  

- **`geometry`**: Represents spatial data in **POINT (lon, lat)** format. This field ensures compatibility with GIS applications.  

### **General Observations**  
- The dataset contains **no missing values**, ensuring completeness for analysis.  
- The **spatial distribution of probes is consistent with the Zürich region**, confirming accurate geolocation after coordinate transformation.  
- A few **notable outliers exist**, particularly in **heat extraction, depth, and total probe count**, which should be further analyzed for validity.

---

# 3. Numerical data analysis

### 3.1. Depth

We now direct the attention to the `SONDENTIEFE` variable to analyze the frequency of probe depths across the dataset. This is done to identify clusters, common depth ranges, and potential outliers.

- **Histogram**: The main plot visualizes the distribution of `SONDENTIEFE` in bins, showing that the majority of probes have depths concentrated below 500 meters. 
- **Box Plot**: A marginal box plot above the histogram provides a summary of the depth distribution, highlighting the central tendency, variability, and outliers. 
- **Goal**: 
  - To understand the overall structure of the data and detect patterns or irregularities.
  - To confirm suspicions of depth clustering around specific values.
  - To identify outliers or extreme depths that may require further investigation.
- **Findings**: 
  - Most probes are concentrated in a relatively narrow range of depths (approximately 100–300 meters).
  - There are a few outliers with depths exceeding 1000 meters, which may need further exploration or validation.

This analysis helps us better understand the dataset and informs further processing or visualization steps.

In [12]:
# Import libraries
import plotly.express as px

# Create the distribution plot for the updated dataset
fig = px.histogram(
    zh_geothermal_probes_gdf,
    x="Sondentiefe",  # Adjusted to match your dataset's column naming convention
    nbins=50,  # Number of bins
    title="Distribution of Probe Depth (Sondentiefe)",
    labels={"Sondentiefe": "Probe Depth (Sondentiefe)"},  # x-axis label
    marginal="box",  # Adds a box plot above the histogram
    opacity=0.7,  # Adjusts bar opacity
    color_discrete_sequence=["teal"]  # Set bar color
)

# Show the plot
fig.show()

Upon investigating the outlier with a `Sondentiefe` of **2371 meters**, we identified that this data point corresponds to the [**Zürich Triemli geothermal project**](https://geothermie-schweiz.ch/wp_live/wp-content/uploads/2021/12/ZH_Zuerich-Triemli_DE.pdf). This initiative involves a deep borehole drilled to a depth of 2371 meters, aiming to explore the potential for geothermal energy extraction in the region. The project's comprehensive analysis includes geological assessments and temperature measurements to evaluate the feasibility of sustainable energy production.

The depth of this borehole is significantly greater than typical geothermal probes, which usually range between 100 to 500 meters. This substantial difference accounts for the outlier observed in our dataset. The detailed findings and methodologies of the Triemli project are documented in the report titled "ZH_Zuerich-Triemli_DE," available on the Geothermie-Schweiz website. This report provides in-depth insights into the project's objectives, drilling processes, and preliminary results, confirming the legitimacy of the recorded depth in our dataset.

In [13]:
# Filter the dataset for the row where SONDENTIEFE is exactly 2371
specific_depth_row = zh_geothermal_probes_gdf[zh_geothermal_probes_gdf['Sondentiefe'] == 2371]

# Check if the row exists before proceeding
if not specific_depth_row.empty:
    # Display the transposed information for better readability
    print("Information for Depth = 2371 (Transposed):")
    print(specific_depth_row.T)
else:
    # Notify if no row with SONDENTIEFE = 2371 exists
    print("No row found with Sondentiefe of exactly 2371.")

Information for Depth = 2371 (Transposed):
                                            19704
gml_id                    erdwaermesonden.1490692
x                                         2679653
y                                         1246871
Waermeentnahme                              300.0
Waermeeintrag                                 0.0
Sondentiefe                                  2371
Gesamtsondenzahl                              1.0
GBS-Nummer                             b 00-10523
Bohrprofil                                      1
geometry          POINT (2679652.888 1246870.867)
lon                                      8.493173
lat                                     47.367866


---

### 3.2. Correlation Matrix

To conclude the analysis of numerical data from the dataset, we perform a correlation matrix plot to explore relationships between variables. The plot highlights the degree to which pairs of numerical variables are linearly related, with coefficients ranging from -1 (perfect negative correlation) to 1 (perfect positive correlation). The following key insights emerge from the analysis:

- **Strong Correlations**:
  - `WAERMEENTNAHME` (Heat Extraction) and `WAERMEEINTRAG` (Heat Input) exhibit a strong positive correlation (0.86). This suggests that installations with higher heat extraction often also have higher heat input, indicating well-balanced geothermal systems.
  - `GESAMTSONDENANZAHL` (Total Number of Probes) shows moderate to strong positive correlations with both `WAERMEENTNAHME` (0.83) and `WAERMEEINTRAG` (0.71), indicating that larger installations with more probes generally have greater heat transfer capacities.

- **Weak or Negligible Correlations**:
  - Coordinate variables (`X_KOORD`, `Y_KOORD`, `E`, `N`) show weak or negligible correlations with most other variables, indicating no spatial bias in the operational parameters like depth or heat transfer.
  - `SONDENTIEFE` (Probe Depth) shows a weak positive correlation with `WAERMEENTNAHME` (0.18) and `WAERMEEINTRAG` (0.04), suggesting only a minimal relationship between depth and heat transfer efficiency.

- **Negative Correlations**:
  - `ERFASSUNG` (Timestamp) is strongly negatively correlated with `X_KOORD` (-0.94) and `Y_KOORD` (-0.94). This could indicate a chronological trend in probe installations being concentrated in certain geographical areas over time.

### Implications:
The strong positive correlations between `GESAMTSONDENANZAHL`, `WAERMEENTNAHME`, and `WAERMEEINTRAG` align with expectations, as installations with more probes are likely designed for higher energy transfer. The negligible correlation between `SONDENTIEFE` and heat transfer variables suggests that depth alone is not a key determinant of energy capacity. Meanwhile, the negative correlations involving `ERFASSUNG` highlight potential temporal shifts in the dataset that may require additional exploration.

This correlation matrix offers valuable insights into the relationships between key variables, guiding further analyses and decision-making for geothermal projects.

In [14]:
# Select numerical columns from the final dataset and exclude any irrelevant columns
numerical_columns = zh_geothermal_probes_gdf.select_dtypes(include=['number'])

# Compute the correlation matrix for the numerical variables
correlation_matrix = numerical_columns.corr().round(2)  # Round values to 2 decimal places

# Create the correlation matrix heatmap
fig = px.imshow(
    correlation_matrix,
    text_auto=".2f",  # Display correlation coefficients with 2 decimal places
    title="Correlation Matrix of Numerical Variables",
    labels=dict(color="Correlation Coefficient"),
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    color_continuous_scale="Cividis"  # Updated for better contrast and accessibility
)

# Adjust layout for improved readability
fig.update_layout(
    height=800,  # Increased height
    width=800,  # Increased width
    font=dict(size=12),  # Adjusted font size for labels and titles
    title_font=dict(size=16),  # Enhanced title prominence
    xaxis=dict(
        side="bottom",  # Position x-axis labels at the bottom
        tickangle=45,  # Rotate x-axis labels for compact display
    ),
    yaxis=dict(
        tickangle=0  # Keep y-axis labels horizontal
    ),
    coloraxis_colorbar=dict(
        title="Correlation Coefficient",  # Add title to the color bar
        tickvals=[-1, -0.5, 0, 0.5, 1],  # Explicit tick values for better scale comprehension
        len=0.8  # Adjust color bar length
    )
)

# Show the plot
fig.show()

---

## 4. Geographic Interpretation of the Data

In [15]:
# Create a copy of the main dataset to avoid modifying the original data
plot_data = zh_geothermal_probes_gdf.copy()

# Create the square root-transformed depth column
plot_data['sqrt_Sondentiefe'] = np.sqrt(plot_data['Sondentiefe'])  # Square root transformation

# Sort the dataset by depth in ascending order
plot_data_sorted = plot_data.sort_values(by="Sondentiefe", ascending=True)

# Create the scatter map with size and square root-adjusted color scale
fig = px.scatter_mapbox(
    plot_data_sorted,  # Use the sorted dataset
    lat="lat",  # Latitude column
    lon="lon",  # Longitude column
    hover_name=None,  # Disable the automatic hover name
    hover_data=None,  # Disable automatic hover data
    color="sqrt_Sondentiefe",  # Color points by square root-transformed depth
    size="Sondentiefe",  # Size points by depth
    color_continuous_scale="YlOrRd",  # Use a strong color scale
    title="Geothermal Heat Pumps in Kanton Zürich (Square Root-Adjusted)",
    mapbox_style="carto-positron",  # Use OpenStreetMap tiles
    zoom=9,  # Adjust zoom level for your data
    height=800,  # Set height of the map
    size_max=12  # Maximum size of the points for balance
)

# Update hovertemplate to show custom hover text with depth only
fig.update_traces(
    hovertemplate="Depth: %{marker.size} m<extra></extra>"  # Display depth with "m" for meters
)

# Update the color bar to display original depth values
fig.update_layout(
    coloraxis_colorbar=dict(
        title="Probe Depth (m)",  # Title for the color bar
        tickvals=np.sqrt([0, 500, 1000, 1500, 2000, 2500]),  # Square root-transformed tick positions
        ticktext=["0", "500", "1000", "1500", "2000", "2500"]  # Original depth values as labels
    )
)

# Show the map
fig.show()

In [16]:
# Create a copy of the main dataset to avoid modifying the original data
plot_data = zh_geothermal_probes_gdf.copy()

# Sort the dataset by heat extraction in ascending order (to plot higher values last)
plot_data = plot_data.sort_values(by="Waermeentnahme", ascending=True)

# Create a scaled size variable for better visibility
plot_data['scaled_size'] = np.sqrt(plot_data['Gesamtsondenzahl'] + 1) * 2  # Mild scaling factor

# Create the scatter map with adjusted size and color encoding
fig = px.scatter_mapbox(
    plot_data,  # Use the copied dataset
    lat="lat",  # Latitude column
    lon="lon",  # Longitude column
    hover_name=None,  # Disable default hover name
    hover_data=None,  # Disable default hover data
    size="scaled_size",  # Size points by scaled size variable
    color="Waermeentnahme",  # Color points by heat extraction
    color_continuous_scale="Sunsetdark",  # Use a color scale for better contrast
    opacity=1,
    size_max=12,  # Adjust maximum size of the dots
    title="Geothermal Heat Pump Locations in Kanton Zürich (Color by Heat Extraction, Size by Number of Probes)",
    mapbox_style="carto-positron",  # Use OpenStreetMap tiles
    zoom=9,  # Adjust zoom level for your data
    height=800  # Set height of the map
)

# Update hovertemplate for consistent and cleaner hover text
fig.update_traces(
    hovertemplate=(
        "Heat Extraction: %{marker.color:.1f} kW<br>"  # Display heat extraction in kW with 1 decimal place
        "Number of Probes: %{customdata[0]}<extra></extra>"  # Display number of probes
    ),
    customdata=plot_data[["Gesamtsondenzahl"]].values  # Pass the original data for number of probes
)

# Update the color bar to reflect heat extraction correctly
fig.update_layout(
    coloraxis_colorbar=dict(
        title="Heat Extraction (kW)",  # Color bar title for heat extraction
        tickformat=".0f"  # Format ticks to show whole numbers
    )
)

# Show the map
fig.show()

---