docs(synthetic): update synthetic examples (#212)

- update SEA description - full and partial reference for synthetic data streams - hyperplane logistic regression example
ing-bank · Jun 14, 2022 · 84a9331 · 84a9331
1 parent f438394
commit 84a9331
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 7 deletions.
diff --git a/examples/synthetic_data_streams/README.md b/examples/synthetic_data_streams/README.md
@@ -25,7 +25,7 @@ _Characteristics of datasets used, see the survey [Learning under Concept Drift:
 For the sudden-drift datasets, the drifting point is centred at every 5th of the instances for Sine1, Sine2 and Mixed and at each 3rd for Stagger, for a transition over 50 samples. 
 For the remaining gradually shifting datasets, Circles and LED, the drifting point is centred around every 4th, and takes place over 500 instances. 
 A noise level of 10\% is added to each dataset. 
- where the drifting points occur at each 4th. 
+For the SEA dataset, the drifting points occur at each 4th of the dataset. 
 The Hyperplane dataset that was used consists of 10,000 samples, and its drift is incremental and gradual.
 
 (adding other datasets will be simple based on the available reference configuration)

diff --git a/examples/synthetic_data_streams/hyperplane.py b/examples/synthetic_data_streams/hyperplane.py
@@ -1,6 +1,7 @@
 """
 Example configuration for the hyperplane dataset
 """
+from sklearn.linear_model import LogisticRegression
 from synthetic_data_streams import (
     dataset_summary,
     load_arff,
@@ -13,12 +14,25 @@
 # Monitor each feature w.r.t. the label
 features = [f"index:attr{i}:output" for i in range(10)]
 
+# Also monitor predictions w.r.t. the label (see below)
+features += ["index:prediction:output"]
+
 dataset_file = f"data/{dataset_name}{v}.arff"
 report_file = f"reports/{dataset_name}_{v}.html"
 
 df = load_arff(dataset_file)
 
+# Fit a logistic regression on the first 10% of the data.
+model = LogisticRegression(C=1e5)
+model.fit(df.loc[:1000, df.columns != "output"], df.loc[:1000, "output"])
+
+# Use the model to predict over the full dataset
+df["prediction"] = model.predict_proba(df.loc[:, df.columns != "output"])[:, 1]
+
 dataset_summary(df)
 
-# Reduce the time_width for this smaller dataset
-synthetic_data_stream_report(df, features, report_file, time_width=500)
+# The training set for the model will be used as reference.
+# The reduced time_width is because this is a smaller dataset compared to the rest
+synthetic_data_stream_report(
+    df, features, report_file, time_width=500, reference="start", split=1000
+)
diff --git a/examples/synthetic_data_streams/synthetic_data_streams.py b/examples/synthetic_data_streams/synthetic_data_streams.py
@@ -19,11 +19,28 @@ def dataset_summary(df):
     print(df.head(10))
 
 
-def synthetic_data_stream_report(df, features, report_file, time_width=1000):
-    df["index"] = df.index.values
+def synthetic_data_stream_report(
+    data, features, report_file, time_width=1000, reference="full", **kwargs
+):
+    data["index"] = data.index.values
+
+    if reference == "full":
+        ref_df = data
+        df = data
+    elif reference == "start":
+        # split at this point, by default time_width
+        split = kwargs.get("split", time_width)
+        ref_df = data[:split]
+        df = data[split:]
+    else:
+        raise ValueError("reference type should be 'full' or 'start'.")
 
     hists_ref = popmon.make_histograms(
-        df, time_axis="index", time_width=time_width, features=features, time_offset=0
+        ref_df,
+        time_axis="index",
+        time_width=time_width,
+        features=features,
+        time_offset=0,
     )
     ref_bin_specs = popmon.get_bin_specs(hists_ref)
     features = list(ref_bin_specs.keys())
@@ -51,7 +68,11 @@ def synthetic_data_stream_report(df, features, report_file, time_width=1000):
         "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
     }
     report = popmon.stability_report(
-        hists, pull_rules=pull_rules, monitoring_rules=monitoring_rules
+        hists,
+        pull_rules=pull_rules,
+        monitoring_rules=monitoring_rules,
+        reference_type="external",
+        reference=hists_ref,
     )
 
     # or save the report to file