/
config.py
142 lines (122 loc) · 5.99 KB
/
config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# Copyright (c) 2022 ING Wholesale Banking Advanced Analytics
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
from fnmatch import fnmatch
from popmon.analysis.comparison.comparisons import Comparisons
# Human-readable descriptions of the profile statistics, keyed by statistic name.
profiles = {
    # Entry counts.
    "count": "Number of entries (non-NaN and NaN)",
    "distinct": "Number of distinct entries",
    "filled": "Number of non-missing entries (non-NaN)",
    "nan": "Number of missing entries (NaN)",
    # Values falling outside the histogram's bin range.
    "overflow": "Number of values larger than the maximum bin-edge of the histogram.",
    "underflow": "Number of values smaller than the minimum bin-edge of the histogram.",
    # Summary statistics of the distribution.
    "min": "Minimum value",
    "max": "Maximum value",
    "mean": "Mean value",
    "most_probable_value": "Most probable value",
    "std": "Standard deviation",
    # Correlation statistics of 2d histograms.
    "phik": "phi-k correlation between the two variables of the histogram",
    "phik_pvalue": "p-value of the contingency test of the 2d histogram",
    "phik_zscore": "Z-score of the contingency test of the 2d histogram",
}
# Description templates for the comparison statistics, keyed by comparison name.
# Each template carries a "{ref}" placeholder that get_stat_description() fills in
# with the matching entry of `references`.
comparisons = {
    # Kolmogorov-Smirnov test.
    "ks": "Kolmogorov-Smirnov test statistic comparing each time slot to {ref}",
    "ks_zscore": "Z-score of the Kolmogorov-Smirnov test, comparing each time slot with {ref}",
    "ks_pvalue": "p-value of the Kolmogorov-Smirnov test, comparing each time slot with {ref}",
    # Pearson correlation.
    "pearson": "Pearson correlation between each time slot and {ref}",
    # Chi-squared test and its derived quantities.
    "chi2": "Chi-squared test statistic, comparing each time slot with {ref}",
    "chi2_norm": "Normalized chi-squared statistic, comparing each time slot with {ref}",
    "chi2_pvalue": "p-value of the chi-squared statistic, comparing each time slot with {ref}",
    "chi2_zscore": "Z-score of the chi-squared statistic, comparing each time slot with {ref}",
    "chi2_max_residual": (
        "The largest absolute normalized residual (|chi|) observed in all bin pairs "
        "(one histogram in a time slot and one in {ref})"
    ),
    "chi2_spike_count": (
        "The number of normalized residuals of all bin pairs (one histogram in a time"
        " slot and one in {ref}) with absolute value bigger than a given threshold (default: 7)."
    ),
    # Categorical labels.
    "unknown_labels": "Are categories observed in a given time slot that are not present in {ref}?",
}

# Merge in the descriptions returned by Comparisons.get_descriptions(); on a key
# clash those entries replace the ones defined above.
comparisons.update(Comparisons.get_descriptions())
# Phrases substituted for the "{ref}" placeholder of a comparison description,
# keyed by the reference prefix of the statistic name (the part before the
# first underscore).
references = {
    "ref": "the reference data",
    "roll": "a rolling window",
    "prev1": "the preceding time slot",
    "expanding": "all preceding time slots",
}
# Human-readable descriptions of the alert statistics, keyed by statistic name.
alerts = {
    # Traffic light totals per colour.
    "n_green": "Total number of green traffic lights (observed for all statistics)",
    "n_yellow": "Total number of yellow traffic lights (observed for all statistics)",
    "n_red": "Total number of red traffic lights (observed for all statistics)",
    # Worst traffic light.
    "worst": "Worst traffic light (observed for all statistics)",
}
# Descriptive text for each section ("profiles", "comparisons", "traffic_lights",
# "alerts", "histograms"); stored under config["section_descriptions"].
# NOTE(review): the "profiles" entry is a multi-line string literal, so its line
# breaks (and any leading whitespace on its continuation lines) are part of the
# value -- do not re-indent it without checking how the text is rendered.
section_descriptions = {
"profiles": """Basic statistics of the data (profiles) calculated for each time period (a period
is represented by one bin). The yellow and red lines represent the corresponding
traffic light bounds (default: 4 and 7 standard deviations with respect to the reference data).""",
"comparisons": "Statistical comparisons of each time period (one bin) to the reference data.",
"traffic_lights": "Traffic light calculation for different statistics (based on the calculated normalized residual, a.k.a. pull). Statistics for which all traffic lights are green are hidden from view by default.",
"alerts": "Alerts aggregated by all traffic lights for each feature.",
"histograms": "Histograms of the last few time slots (default: 2).",
}
# Module-level configuration dictionary.
#   "section_descriptions": the descriptive text per section.
#   "limited_stats": wildcard patterns of statistic names (presumably matched
#       fnmatch-style by the consumer -- TODO confirm against the caller).
config = {
    "section_descriptions": section_descriptions,
    "limited_stats": [
        # Profile statistics.
        "distinct*",
        "filled*",
        "nan*",
        "mean*",
        "std*",
        "p05*",
        "p50*",
        "p95*",
        "max*",
        "min*",
        "fraction_true*",
        "phik*",
        # Comparison statistics.
        "*unknown_labels*",
        "*chi2_norm*",
        "*ks*",
        "*zscore*",
        # Alert statistics.
        "n_*",
        "worst",
    ],
}

# Add one "*<key>*" pattern per key of Comparisons.get_comparisons(). A generator
# fed to extend() is used instead of a module-level for-loop so that the loop
# variable does not linger as a public module global (which `import *` would
# otherwise export).
config["limited_stats"].extend(
    f"*{key}*" for key in Comparisons.get_comparisons().keys()
)
def get_stat_description(name: str):
    """Look up the human-readable description of a statistic.

    :param str name: the name of the statistic.
    :returns str: the description of the statistic, or an empty string if the name is not recognised.
    :raises TypeError: if ``name`` is not a string.
    """
    if not isinstance(name, str):
        raise TypeError("Statistic's name should be a string.")

    # Profile and alert statistics are stored under their full name.
    for table in (profiles, alerts):
        if name in table:
            return table[name]

    # Comparison statistics are named "<reference>_<comparison>": the text before
    # the first underscore selects the reference, the remainder the comparison.
    prefix, _, statistic = name.partition("_")
    if statistic in comparisons and prefix in references:
        return comparisons[statistic].format(ref=references[prefix])

    # Percentiles are named "p" followed by exactly two digits, e.g. "p05".
    if fnmatch(name, "p[0-9][0-9]"):
        return f"{int(name[1:])}% percentile"

    return ""
# Global configuration for the joblib parallelization. Can be used to change the number of jobs and/or to switch
# the backend from the default ('loky') to 'multiprocessing' or 'threading'.
# (see https://joblib.readthedocs.io/en/latest/generated/joblib.Parallel.html for details)
parallel_args = {"n_jobs": 1}
# Whether to use the `ing_matplotlib_theme`
themed = True