-
Notifications
You must be signed in to change notification settings - Fork 0
/
codetables.py
228 lines (182 loc) · 6.97 KB
/
codetables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import sys
import warnings
import numpy as np
from niche_vlaanderen.exception import NicheException
def package_resource(folder_paths, file_path):
    """Locate a resource file shipped inside the ``niche_vlaanderen`` package.

    Backward compatible loader: interpreters older than Python 3.10 resolve
    the resource with ``pkg_resources``, newer ones with
    ``importlib.resources``.

    Parameters
    ----------
    folder_paths : list
        List of folders and subfolders to get resources from.
    file_path : str
        File name of the package resource.

    Returns
    -------
    The result of ``pkg_resources.resource_filename`` on Python < 3.10,
    otherwise the object returned by
    ``importlib.resources.files(...).joinpath(file_path)``.
    """
    if sys.version_info >= (3, 10):
        from importlib.resources import files

        # Sub-folders are addressed as dotted sub-packages here.
        subpackage = ".".join(folder_paths)
        return files(f"niche_vlaanderen.{subpackage}").joinpath(file_path)

    from pkg_resources import resource_filename

    # pkg_resources expects a '/'-separated path relative to the package.
    subfolder = "/".join(folder_paths)
    return resource_filename("niche_vlaanderen", f"{subfolder}/{file_path}")
class CodeTableException(Exception):
    """Raised when a code table fails one of the validation checks."""
def check_lower_upper_boundaries(df, min_col, max_col, value):
    """Check that consecutive [min_col, max_col] intervals join up exactly.

    Parameters
    ==========
    df: dataframe to check, must contain min_col and max_col
    min_col, max_col: columns containing the min and max value
    value: the column containing the reclassified value

    Every column of df other than min_col, max_col and value is used as a
    grouping column. Within each group, taken in row order, the minimum of
    a row must equal the maximum of the previous row, so that classifying
    with the table leaves neither overlaps nor gaps. A table without any
    grouping column is checked as one single group.

    Returns
    =======
    None on success - raises a CodeTableException when two consecutive rows
    of a group do not line up.
    """
    excluded = {min_col, max_col, value}
    # Keep the column order of df so the grouping is deterministic.
    group_cols = [col for col in df.columns if col not in excluded]

    if group_cols:
        subtables = (subtable for _, subtable in df.groupby(group_cols))
    else:
        # Nothing to group on: the whole table is one classification.
        subtables = [df]

    for subtable in subtables:
        # Compare positionally so the check does not depend on index labels.
        lower = subtable[min_col].to_numpy()
        upper = subtable[max_col].to_numpy()
        for current_min, previous_max in zip(lower[1:], upper[:-1]):
            if current_min != previous_max:
                raise CodeTableException(
                    "Min and max values in table do not correspond"
                )
def check_join(df1, df2, f1, f2=None, inner=True):
    """
    Verify that the key values of two table columns match up.

    With 'inner' set, both columns must hold exactly the same set of values.
    With 'inner' false, every key of df1 must occur in df2; keys that only
    occur in df2 are tolerated but reported with a warning.

    Parameters
    ==========
    df1: first dataframe
    df2: second dataframe
    f1: field in first dataframe
    f2: field in second dataframe (defaults to f1 when not given)
    inner: boolean

    Returns
    =======
    None on successful usage - will raise a CodeTableException on failure
    """
    other_field = f1 if f2 is None else f2

    keys_right = np.unique(df2[other_field])
    keys_left = np.unique(df1[f1])

    # Identical key sets: nothing to report.
    if np.array_equal(keys_left, keys_right):
        return

    if inner:
        raise CodeTableException("Different keys exist in tables.")

    all_left_present = np.all(np.isin(keys_left, keys_right))
    if not all_left_present:
        raise CodeTableException("Not all codes from table 1 are in table 2")

    warnings.warn("Warning, different keys exist in tables")
def check_unique(df, col):
    """
    Ensure that column `col` of `df` contains no repeated values.

    Parameters
    ==========
    df: dataframe to check
    col: name of the column that must hold unique values

    Returns
    =======
    None when all values are distinct - raises a CodeTableException otherwise
    """
    column = df[col]
    if column.unique().size == column.size:
        return
    raise CodeTableException(f"Non unique fields in column {col}")
def validate_tables_acidity(
    ct_acidity, ct_soil_mlw_class, ct_soil_codes, lnk_acidity, ct_seepage, inner
):
    """
    Validate the code tables passed in for acidity.

    Runs, in order: uniqueness checks on the key columns, the boundary check
    on the seepage classes and the key checks between linked tables. The
    first failing check raises (the helpers raise CodeTableException).

    Parameters
    ==========
    ct_acidity, ct_soil_mlw_class, ct_soil_codes, lnk_acidity, ct_seepage:
        the tables to validate
    inner: boolean, forwarded to check_join
    """
    # Columns that must not contain duplicates.
    unique_columns = (
        (ct_soil_codes, "soil_code"),
        (ct_soil_codes, "soil_name"),
        (ct_acidity, "acidity"),
        (ct_seepage, "seepage"),
    )
    for table, column in unique_columns:
        check_unique(table, column)

    check_lower_upper_boundaries(ct_seepage, "seepage_min", "seepage_max", "seepage")

    # Links between tables: (left table, right table, shared key column).
    links = (
        (lnk_acidity, ct_acidity, "acidity"),
        (ct_soil_mlw_class, ct_soil_codes, "soil_group"),
        (lnk_acidity, ct_soil_mlw_class, "soil_mlw_class"),
        (lnk_acidity, ct_seepage, "seepage"),
    )
    for left, right, key in links:
        check_join(left, right, key, inner=inner)
def validate_tables_nutrient_level(
    ct_lnk_soil_nutrient_level,
    ct_management,
    ct_mineralisation,
    ct_soil_code,
    ct_nutrient_level,
    inner,
):
    """
    Validate the code tables passed in for nutrient level.

    The checks run in a fixed order and the first failing one raises (the
    helpers raise CodeTableException).

    Parameters
    ==========
    ct_lnk_soil_nutrient_level, ct_management, ct_mineralisation,
    ct_soil_code, ct_nutrient_level:
        the tables to validate
    inner: boolean, forwarded to check_join
    """
    # Key columns that must not contain duplicates.
    for table, column in (
        (ct_soil_code, "soil_code"),
        (ct_soil_code, "soil_name"),
        (ct_management, "code"),
    ):
        check_unique(table, column)

    # Mineralisation table: contiguous msw classes, known soil names.
    check_lower_upper_boundaries(
        ct_mineralisation, "msw_min", "msw_max", "nitrogen_mineralisation"
    )
    check_join(ct_mineralisation, ct_soil_code, "soil_name", inner=inner)

    # Link table between soil and nutrient level.
    lnk = ct_lnk_soil_nutrient_level
    check_join(lnk, ct_management, "management_influence", "influence", inner=inner)
    check_lower_upper_boundaries(
        lnk, "total_nitrogen_min", "total_nitrogen_max", "nutrient_level"
    )
    check_join(lnk, ct_soil_code, "soil_name", inner=inner)
    check_join(lnk, ct_nutrient_level, "nutrient_level", "code", inner=inner)
def validate_tables_vegetation(
    ct_vegetation,
    ct_soil_code,
    ct_inundation,
    ct_management,
    ct_acidity,
    ct_nutrient_level,
    inner,
):
    """
    Validate the vegetation table against the tables it refers to.

    First the keys of ct_vegetation are checked against the inundation,
    acidity, nutrient level and management tables. Then, for every
    (veg_code, soil_name) pair, exactly one distinct mhw/mlw combination
    must exist; otherwise the offending rows are printed and a
    CodeTableException is raised.

    Parameters
    ==========
    ct_vegetation: the vegetation table to validate
    ct_soil_code: not used by this function
    ct_inundation, ct_management, ct_acidity, ct_nutrient_level:
        tables the vegetation table is checked against
    inner: boolean, forwarded to check_join
    """
    # (other table, field in ct_vegetation, field in other table or None
    # when the field carries the same name in both tables)
    joins = (
        (ct_inundation, "inundation", None),
        (ct_acidity, "acidity", None),
        (ct_nutrient_level, "nutrient_level", "code"),
        (ct_management, "management", "code"),
    )
    for other, field, other_field in joins:
        check_join(ct_vegetation, other, field, other_field, inner=inner)

    # extra check: per vegetation type, soil_code only one mhw, mlw combination
    # is allowed. Otherwise the simple model may give unexpected results.
    key_cols = ["veg_code", "soil_name"]
    level_cols = ["mhw_min", "mhw_max", "mlw_min", "mlw_max"]
    per_type = ct_vegetation[key_cols + level_cols].groupby(key_cols)
    for _, rows in per_type:
        distinct = rows.drop_duplicates()
        if len(distinct) != 1:
            print(distinct)
            raise CodeTableException("Non unique mhw/mlw combinations")
def validate_tables_flooding(
    depths, duration, frequency, lnk_potential, potential, inner
):
    """
    Validate the flooding link table against the duration and frequency tables.

    Parameters
    ==========
    depths: not checked (see comment below)
    duration, frequency: tables whose "code" column is checked against the
        matching column of lnk_potential
    lnk_potential: the link table to validate
    potential: not checked (see comment below)
    inner: boolean, forwarded to check_join
    """
    # Two joins are deliberately disabled:
    # - check_join(lnk_potential, depths, "depth", "code"): we have a 0 code
    #   which is not in lnk_potential
    # - check_join(lnk_potential, potential, "potential", "code"): we have a
    #   code 4 which is not in lnk_potential
    for table, field in ((duration, "duration"), (frequency, "frequency")):
        check_join(lnk_potential, table, field, "code", inner)
def check_codes_used(name, used, allowed):
    """
    Check that every code in `used` is one of the `allowed` codes.

    Parameters
    ==========
    name: name of the variable being checked, used in the error message
    used: the code(s) in use; an object with a dtype (such as a numpy
        array) or a plain scalar / sequence, which is converted to an array
    allowed: iterable with the valid codes

    NaN values in floating point input are ignored. On top of `allowed`,
    -99 is always accepted, and 255 is accepted when name is "acidity" or
    "nutrient_level".

    Returns
    =======
    None when all codes are valid - raises a NicheException otherwise
    """
    # Plain Python input (int, float, str, list, ...) has no dtype; wrap it.
    # Objects that already carry a dtype are left untouched.
    if not hasattr(used, "dtype"):
        used = np.asarray(used)

    if used.dtype.kind == "f":
        # NaN never equals itself, so it has to be dropped explicitly.
        used = used[~np.isnan(used)]
    used_codes = set(np.unique(used))

    allowed_codes = set(allowed)
    allowed_codes.add(-99)  # no data when loaded from grid
    if name in ("acidity", "nutrient_level"):  # no data value when calculated
        allowed_codes.add(255)

    if not used_codes.issubset(allowed_codes):
        raise NicheException(
            f"Invalid {name} code used\n"
            f"used: {used_codes}\n"
            f"possible: {allowed_codes}"
        )