Skip to content

Commit

Permalink
chore: update example syntax to new reference and config
Browse files Browse the repository at this point in the history
  • Loading branch information
sbrugman committed Jul 5, 2022
1 parent da7b6d8 commit e4df434
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 89 deletions.
4 changes: 1 addition & 3 deletions examples/flight_delays.py
Expand Up @@ -10,16 +10,14 @@


# Configuration of the monitoring rules and report
settings = Settings()
settings = Settings(time_axis="DATE", reference_type="self")
settings.report.extended_report = False
settings.monitoring.pull_rules = {"*_pull": [10, 7, -7, -10]}

# generate stability report using automatic binning of all encountered features
# (importing popmon automatically adds this functionality to a dataframe)
report = popmon.df_stability_report(
df,
reference_type="self",
time_axis="DATE",
time_width="1w",
time_offset="2015-07-02",
settings=settings,
Expand Down
4 changes: 1 addition & 3 deletions examples/synthetic_data_streams/hyperplane.py
Expand Up @@ -33,6 +33,4 @@

# The training set for the model will be used as reference.
# The reduced time_width is because this is a smaller dataset compared to the rest
synthetic_data_stream_report(
df, features, report_file, time_width=500, reference="start", split=1000
)
synthetic_data_stream_report(df, features, report_file, time_width=500, split=1000)
57 changes: 12 additions & 45 deletions examples/synthetic_data_streams/synthetic_data_streams.py
Expand Up @@ -2,6 +2,7 @@
from scipy.io.arff import loadarff

import popmon
from popmon import Settings


def load_arff(name) -> pd.DataFrame:
Expand All @@ -20,59 +21,25 @@ def dataset_summary(df):


def synthetic_data_stream_report(
data, features, report_file, time_width=1000, reference="full", **kwargs
data, features, report_file, time_width=1000, **kwargs
):
data["index"] = data.index.values

if reference == "full":
ref_df = data
df = data
elif reference == "start":
# split at this point, by default time_width
split = kwargs.get("split", time_width)
ref_df = data[:split]
df = data[split:]
else:
raise ValueError("reference type should be 'full' or 'start'.")

hists_ref = popmon.make_histograms(
ref_df,
time_axis="index",
time_width=time_width,
features=features,
time_offset=0,
)
ref_bin_specs = popmon.get_bin_specs(hists_ref)
features = list(ref_bin_specs.keys())

df["batch"] = df.index // time_width

hists_list = [
popmon.make_histograms(df_chunk, features=features, bin_specs=ref_bin_specs)
for _, df_chunk in df.groupby("batch")
if not df_chunk.empty
]

hists = popmon.stitch_histograms(
hists_list=hists_list,
time_axis="index",
time_bin_idx=sorted(set(df["batch"].values.tolist())),
)

# generate stability report using automatic binning of all encountered features
# (importing popmon automatically adds this functionality to a dataframe)
pull_rules = {"*_pull": [7, 4, -4, -7]}
monitoring_rules = {
settings = Settings(features=features, time_axis="index")
settings.monitoring.pull_rules = {"*_pull": [7, 4, -4, -7]}
settings.monitoring.monitoring_rules = {
"*_pull": [7, 4, -4, -7],
"*_zscore": [7, 4, -4, -7],
"[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
}
report = popmon.stability_report(
hists,
pull_rules=pull_rules,
monitoring_rules=monitoring_rules,
reference_type="external",
reference=hists_ref,

report = popmon.df_stability_report(
data,
time_width=time_width,
time_offset=0,
settings=settings,
**kwargs,
)

# or save the report to file
Expand Down
27 changes: 12 additions & 15 deletions popmon/notebooks/popmon_tutorial_advanced.ipynb
Expand Up @@ -106,12 +106,10 @@
"metadata": {},
"outputs": [],
"source": [
"settings = Settings()\n",
"settings = Settings(time_axis=\"DATE\")\n",
"settings.report.extended_report = False\n",
"\n",
"df.pm_stability_report(\n",
" time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", settings=settings\n",
")"
"df.pm_stability_report(time_width=\"1w\", time_offset=\"2015-07-02\", settings=settings)"
]
},
{
Expand All @@ -128,11 +126,11 @@
"metadata": {},
"outputs": [],
"source": [
"# reuse the previous settings\n",
"settings = Settings(time_axis=\"DATE\")\n",
"settings.report.extended_report = False\n",
"settings.monitoring.pull_rules = {\"*_pull\": [10, 7, -7, -10]}\n",
"\n",
"df.pm_stability_report(\n",
" time_axis=\"DATE\",\n",
" time_width=\"1w\",\n",
" time_offset=\"2015-07-02\",\n",
" settings=settings,\n",
Expand Down Expand Up @@ -186,11 +184,10 @@
"\n",
" sdf = spark.createDataFrame(df)\n",
"\n",
" settings = Settings()\n",
" settings = Settings(time_axis=\"DATE\")\n",
" settings.report.extended_report = False\n",
"\n",
" sdf.pm_stability_report(\n",
" time_axis=\"DATE\",\n",
" time_width=\"1w\",\n",
" time_offset=\"2015-07-02\",\n",
" settings=settings,\n",
Expand All @@ -215,17 +212,15 @@
"metadata": {},
"outputs": [],
"source": [
"settings = Settings()\n",
"settings = Settings(time_axis=\"DATE\", reference_type=\"external\")\n",
"settings.report.extended_report = False\n",
"\n",
"df_ref = pd.read_csv(\n",
" resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n",
")\n",
"df.pm_stability_report(\n",
" time_axis=\"DATE\",\n",
" time_width=\"1w\",\n",
" time_offset=\"2015-07-02\",\n",
" reference_type=\"external\",\n",
" reference=df_ref,\n",
" settings=settings,\n",
")"
Expand All @@ -245,11 +240,13 @@
"metadata": {},
"outputs": [],
"source": [
"settings = Settings(time_axis=\"DATE\")\n",
"settings.report.extended_report = False\n",
"settings.reference_type = \"expanding\"\n",
"\n",
"df.pm_stability_report(\n",
" time_axis=\"DATE\",\n",
" time_width=\"1w\",\n",
" time_offset=\"2015-07-02\",\n",
" reference_type=\"expanding\",\n",
" settings=settings,\n",
")"
]
Expand All @@ -269,13 +266,13 @@
"metadata": {},
"outputs": [],
"source": [
"settings = Settings(time_axis=\"DATE\", reference_type=\"rolling\")\n",
"settings.report.extended_report = False\n",
"settings.comparison.window = 5\n",
"\n",
"df.pm_stability_report(\n",
" time_axis=\"DATE\",\n",
" time_width=\"1w\",\n",
" time_offset=\"2015-07-02\",\n",
" reference_type=\"rolling\",\n",
" settings=settings,\n",
")"
]
Expand Down
39 changes: 16 additions & 23 deletions popmon/pipeline/amazing_pipeline.py
Expand Up @@ -20,19 +20,16 @@

import logging

from popmon import resources

from ..base import Pipeline
from ..io import JsonReader
from ..pipeline.report_pipelines import SelfReference
from popmon import Settings, resources
from popmon.base import Pipeline
from popmon.io import JsonReader
from popmon.pipeline.report_pipelines import SelfReference


class AmazingPipeline(Pipeline):
def __init__(self, **kwargs):
def __init__(self, histogram_path: str, **kwargs):
modules = [
JsonReader(
file_path=kwargs["histograms_path"], store_key=kwargs["hists_key"]
),
JsonReader(file_path=histogram_path, store_key=kwargs["hists_key"]),
# Or ExternalReference, RollingReference etc.
SelfReference(**kwargs),
]
Expand All @@ -45,22 +42,18 @@ def run():
level=logging.INFO, format="%(asctime)s %(levelname)s [%(module)s]: %(message)s"
)

cfg = {
"histograms_path": resources.data("synthetic_histograms.json"),
"hists_key": "hists",
"ref_hists_key": "hists",
"datetime_name": "date",
"window": 20,
"shift": 1,
"monitoring_rules": {
"*_pull": [7, 4, -4, -7],
# "*_pvalue": [1, 0.999, 0.001, 0.0001],
"*_zscore": [7, 4, -4, -7],
},
"pull_rules": {"*_pull": [7, 4, -4, -7]},
settings = Settings(time_axis="date")
settings.comparison.window = 20
settings.comparison.shift = 1
settings.monitoring.monitoring_rules = {
"*_pull": [7, 4, -4, -7],
# "*_pvalue": [1, 0.999, 0.001, 0.0001],
"*_zscore": [7, 4, -4, -7],
}
settings.monitoring.pull_rules = {"*_pull": [7, 4, -4, -7]}

pipeline = AmazingPipeline(**cfg)
histogram_path = resources.data("synthetic_histograms.json")
pipeline = AmazingPipeline(histogram_path, hists_key="hists", settings=settings)
pipeline.transform(datastore={})


Expand Down

0 comments on commit e4df434

Please sign in to comment.