# Modeling and Insights

In [1]:
# Import necessary packages
from dbfread import DBF
import geopandas as gpd
import matplotlib.pyplot as plt
%matplotlib inline
import os
import pandas as pd
from pyhere import here
from pyproj import CRS

In [2]:
# datasets not provided as csvs. 
# screen folder for .dbf, .shp, files, load as (geo)pandas dfs, save (g)dfs in dicts with filenames as keys
# for plotting, transform to geographical crs 
dfs = {}
gdfs = {}
folder_path = here("ERGW1000/ERGW1000_v1/shp")
for file in os.listdir(folder_path):
    if file.endswith(".dbf"):
        # create short filename
        filename = file.replace("ergw1000_", "").replace(".dbf", "")
        path_to_file = os.path.join(folder_path, file)
        table = DBF(path_to_file, load=True,  encoding="utf-8")
        #convert to pandas df
        df = pd.DataFrame(iter(table))
        #save to dict
        dfs[filename] = df
    if file.endswith(".shp"):
        # create short filename
        filename = file.replace("ergw1000_", "").replace(".shp", "")
        path_to_shapefile = os.path.join(folder_path, file)
        gdf = gpd.read_file(path_to_shapefile)
        gdf = gdf.to_crs(epsg=4326)
        #save to dict
        gdfs[filename] = gdf

In [3]:
# load datasets for modeling

# mining data
mining_gdf = gdfs['bergbaugebiete__v1_poly']

# groundwater data 
water_gdf = gdfs['gwerg__v11_poly'] 


In [4]:
# Target variable: numerical oridinal encoding
water_gdf['entn_bru'] = water_gdf['entn_bru'].fillna('None') # none for no extraction at all
entn_dict = {'None': 0, 'meist < 2 (l/s)': 1, 'meist < 5 (l/s)': 2, 'meist 5 - 15 (l/s)': 3, 'meist 15 - 40 (l/s)': 4, 'meist > 40 (l/s)': 5}
y = water_gdf['entn_bru'].map(entn_dict).values

In [5]:
# convert to projected crs
projected_crs = "EPSG:32632"

# Reproject both datasets
water_gdf = water_gdf.to_crs(projected_crs)
mining_gdf = mining_gdf.to_crs(projected_crs)



In [6]:
# Feature 1: 'gestein' dummy encoded
# replace missing "gestein" with string
water_gdf['gestein'] = water_gdf['gestein'].astype('str')
# one hot encoding of gestein
gestein_encoded = pd.get_dummies(water_gdf["gestein"], prefix="gestein")
water_gdf = pd.concat([water_gdf, gestein_encoded], axis=1)

In [7]:
# Feature 2: 'Bedeutung' label encoded ordinal
# replace missing with string
water_gdf['bedeutung'] = water_gdf['bedeutung'].astype('str')
bedeut_dict = {'None': 0, 'Keine bedeutenden Grundwasservorkommen': 1, 'Weniger bedeutende Grundwasservorkommen': 2, 'Bedeutende Grundwasservorkommen': 3}
water_gdf['bedeutung_cat'] = water_gdf['bedeutung'].map(bedeut_dict)


In [8]:
# Feature 3: Distance of each groundwater geometry centroid to nearest mining polyline

# Get centroids of water source areas
water_centroid = water_gdf.geometry.centroid

# Compute distance from each centroid to the nearest mining polyline
water_gdf["distance_to_mining"] = water_centroid.apply(
    lambda point: mining_gdf.geometry.distance(point).min() / 1_0000 # to km
)

In [9]:
# Feature 4: 'entn_werk' label encoded ordinal

water_gdf['entn_werk'] = water_gdf['entn_werk'].fillna('None') # none for no extraction at all
entn_dict = {'None': 0, 'meist 0,2 - 1 (hm3/a)': 1, 'meist 1 - 5 (hm3/a)': 2, 'häufig > 5 (hm3/a)': 3}
water_gdf['entn_werk_cat'] = water_gdf['entn_werk'].map(entn_dict).values

In [10]:
# Feature 5: Proximity to rivers or lakes
bank_gdf = gdfs['uferfiltrat__v1_line']
# to projected crs
bank_gdf = bank_gdf.to_crs(projected_crs)

In [11]:
# Compute distance from each water area centroid to the nearest waterbank
water_gdf["distance_to_waterbanks"] = water_centroid.apply(
    lambda point: bank_gdf.geometry.distance(point).min() / 1_0000 # to km
)

In [12]:
water_gdf['entn_werk'] = water_gdf['entn_werk'].fillna('None')
water_gdf.ergiebigke.value_counts()

ergiebigke
weniger oder wechselnd ergiebig                                                                                             447
ergiebig                                                                                                                    283
örtliche Vorkommen können für die Versorgung wichtig sein                                                                   191
örtlich in Brunnen und Quellen große Ergiebigkeit möglich; Nutzung aus technischen und hygienischen Gründen eingeschränk    176
sehr ergiebig                                                                                                                65
Name: count, dtype: int64

In [13]:
water_gdf.erg_id.value_counts(dropna=False)

erg_id
13    447
0     311
12    283
15    191
14    176
11     65
Name: count, dtype: int64

In [14]:
# Feature 6: 'erg_id' recoding 0-5
water_gdf['ergiebigkeit_cat'] = water_gdf['erg_id'] -10
water_gdf.loc[water_gdf['ergiebigkeit_cat'] < 0, 'ergiebigkeit_cat' ] = 0

In [15]:
water_gdf

Unnamed: 0,erg_id,gestein_id,bedeutung,ergiebigke,entn_bru,entn_werk,gestein,Shape_STAr,Shape_STLe,geometry,"gestein_Kalkstein, Dolomit, Gips (Karstwasserleiter)",gestein_None,"gestein_Sand, Kies, Tuff (Porenwasserleiter)","gestein_Sandstein, Quarzit, Basalt, Kalkmergelstein (Kluftwasserleiter)",bedeutung_cat,distance_to_mining,entn_werk_cat,distance_to_waterbanks,ergiebigkeit_cat
0,13,3,Bedeutende Grundwasservorkommen,weniger oder wechselnd ergiebig,meist 5 - 15 (l/s),"meist 0,2 - 1 (hm3/a)","Sand, Kies, Tuff (Porenwasserleiter)",1.639614e+09,350189.434423,"POLYGON ((457799.062 6082107.462, 457692.664 6...",False,False,True,False,3,27.599427,1,12.776758,3
1,15,0,Keine bedeutenden Grundwasservorkommen,örtliche Vorkommen können für die Versorgung w...,meist < 2 (l/s),,,1.111391e+07,16615.602796,"POLYGON ((457799.062 6082107.462, 457675.585 6...",False,True,False,False,1,28.379206,0,16.350886,5
2,13,3,Bedeutende Grundwasservorkommen,weniger oder wechselnd ergiebig,meist 5 - 15 (l/s),"meist 0,2 - 1 (hm3/a)","Sand, Kies, Tuff (Porenwasserleiter)",9.922033e+06,23933.375167,"POLYGON ((453945.734 6076989.192, 454083.071 6...",False,False,True,False,3,27.430476,1,15.618993,3
3,11,3,Bedeutende Grundwasservorkommen,sehr ergiebig,meist > 40 (l/s),häufig > 5 (hm3/a),"Sand, Kies, Tuff (Porenwasserleiter)",1.976371e+08,72841.557107,"POLYGON ((524046.217 6076577.343, 524281.78 60...",False,False,True,False,3,28.869077,3,12.454141,1
4,12,3,Bedeutende Grundwasservorkommen,ergiebig,meist 15 - 40 (l/s),meist 1 - 5 (hm3/a),"Sand, Kies, Tuff (Porenwasserleiter)",1.688561e+07,15249.541496,"POLYGON ((497201.113 6071392.793, 496618.271 6...",False,False,True,False,3,28.714522,2,14.033133,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1468,13,1,Bedeutende Grundwasservorkommen,weniger oder wechselnd ergiebig,meist 5 - 15 (l/s),"meist 0,2 - 1 (hm3/a)","Kalkstein, Dolomit, Gips (Karstwasserleiter)",3.068118e+07,28962.942482,"POLYGON ((378876.539 5752784.527, 378392.838 5...",True,False,False,False,3,2.843375,1,2.883728,3
1469,11,3,Bedeutende Grundwasservorkommen,sehr ergiebig,meist > 40 (l/s),häufig > 5 (hm3/a),"Sand, Kies, Tuff (Porenwasserleiter)",2.948042e+09,663084.780452,"POLYGON ((327478.945 5748249.853, 328636.151 5...",False,False,True,False,3,0.046231,3,0.200276,1
1470,13,3,Bedeutende Grundwasservorkommen,weniger oder wechselnd ergiebig,meist 5 - 15 (l/s),"meist 0,2 - 1 (hm3/a)","Sand, Kies, Tuff (Porenwasserleiter)",1.482194e+07,25828.689980,"POLYGON ((348744.536 5747744.288, 348158.732 5...",False,False,True,False,3,2.314534,1,3.224579,3
1471,11,3,Bedeutende Grundwasservorkommen,sehr ergiebig,meist > 40 (l/s),häufig > 5 (hm3/a),"Sand, Kies, Tuff (Porenwasserleiter)",4.893134e+08,236721.978497,"POLYGON ((348397.403 5741919.625, 348924.29 57...",False,False,True,False,3,0.675877,3,3.686623,1


In [16]:
# Define feature array (X)
X = water_gdf.drop(columns = ['erg_id', 'gestein_id', 'ergiebigke', 'bedeutung', 'entn_bru', 'gestein', 'Shape_STLe', 'geometry', ])

In [17]:
X

Unnamed: 0,entn_werk,Shape_STAr,"gestein_Kalkstein, Dolomit, Gips (Karstwasserleiter)",gestein_None,"gestein_Sand, Kies, Tuff (Porenwasserleiter)","gestein_Sandstein, Quarzit, Basalt, Kalkmergelstein (Kluftwasserleiter)",bedeutung_cat,distance_to_mining,entn_werk_cat,distance_to_waterbanks,ergiebigkeit_cat
0,"meist 0,2 - 1 (hm3/a)",1.639614e+09,False,False,True,False,3,27.599427,1,12.776758,3
1,,1.111391e+07,False,True,False,False,1,28.379206,0,16.350886,5
2,"meist 0,2 - 1 (hm3/a)",9.922033e+06,False,False,True,False,3,27.430476,1,15.618993,3
3,häufig > 5 (hm3/a),1.976371e+08,False,False,True,False,3,28.869077,3,12.454141,1
4,meist 1 - 5 (hm3/a),1.688561e+07,False,False,True,False,3,28.714522,2,14.033133,2
...,...,...,...,...,...,...,...,...,...,...,...
1468,"meist 0,2 - 1 (hm3/a)",3.068118e+07,True,False,False,False,3,2.843375,1,2.883728,3
1469,häufig > 5 (hm3/a),2.948042e+09,False,False,True,False,3,0.046231,3,0.200276,1
1470,"meist 0,2 - 1 (hm3/a)",1.482194e+07,False,False,True,False,3,2.314534,1,3.224579,3
1471,häufig > 5 (hm3/a),4.893134e+08,False,False,True,False,3,0.675877,3,3.686623,1
