diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 9890a42c..d9a28914 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -11,7 +11,6 @@ on:
jobs:
build:
-
runs-on: ubuntu-latest
steps:
@@ -35,3 +34,34 @@ jobs:
run: |
pip install pytest
pytest
+
+  examples:  # runs the example scripts and publishes the HTML reports they write
+    runs-on: ubuntu-latest
+    needs: build  # start only after the build job has succeeded
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python 3.6
+        uses: actions/setup-python@v1
+        with:
+          python-version: "3.6"  # quoted so YAML keeps a string, not a float
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .
+      - name: Run examples
+        run: |
+          cd examples
+          python synthetic_data.py
+          python flight_delays.py
+
+      - uses: actions/upload-artifact@v2
+        with:
+          name: synthetic-report
+          path: examples/test_data_report.html
+          if-no-files-found: error  # fail the job if the script wrote no report
+
+      - uses: actions/upload-artifact@v2
+        with:
+          name: flight-delays-report
+          path: examples/flight_delays_report.html
+          if-no-files-found: error  # fail the job if the script wrote no report
diff --git a/examples/flight_delays.py b/examples/flight_delays.py
new file mode 100644
index 00000000..657cff06
--- /dev/null
+++ b/examples/flight_delays.py
@@ -0,0 +1,22 @@
+import pandas as pd
+
+import popmon
+from popmon import resources
+
+# open the flight delays dataset (located via popmon's resources helper), parsing DATE as dates
+df = pd.read_csv(
+ resources.data("flight_delays.csv.gz"), index_col=0, parse_dates=["DATE"]
+)
+
+# generate stability report using automatic binning of all encountered features
+# (importing popmon automatically adds this functionality to a dataframe)
+report = df.pm_stability_report(
+ time_axis="DATE",
+ time_width="1w",
+ time_offset="2015-07-02",
+ extended_report=False,
+ pull_rules={"*_pull": [10, 7, -7, -10]},
+)
+
+# save the report to an HTML file, relative to the current working directory
+report.to_file("flight_delays_report.html")
diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py
new file mode 100644
index 00000000..b219a40b
--- /dev/null
+++ b/examples/synthetic_data.py
@@ -0,0 +1,14 @@
+import pandas as pd
+
+import popmon
+from popmon import resources
+
+# open synthetic data
+df = pd.read_csv(resources.data("test.csv.gz"), parse_dates=["date"])
+
+# generate stability report restricted to the features listed below, with "date" as time axis
+# (importing popmon automatically adds this functionality to a dataframe)
+report = df.pm_stability_report(time_axis="date", features=["date:age", "date:gender"])
+
+# save the report to an HTML file, relative to the current working directory
+report.to_file("test_data_report.html")
diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb
index 752fcda6..7e6d8aca 100644
--- a/popmon/notebooks/popmon_tutorial_advanced.ipynb
+++ b/popmon/notebooks/popmon_tutorial_advanced.ipynb
@@ -14,7 +14,8 @@
},
"outputs": [],
"source": [
- "from IPython.core.display import display, HTML\n",
+ "from IPython.core.display import HTML, display\n",
+ "\n",
"display(HTML(\"\"))\n",
"display(HTML(\"\"))"
]
@@ -27,6 +28,7 @@
"source": [
"# install popmon (if not installed yet)\n",
"import sys\n",
+ "\n",
"!{sys.executable} -m pip install popmon"
]
},
@@ -37,6 +39,7 @@
"outputs": [],
"source": [
"import pandas as pd\n",
+ "\n",
"import popmon\n",
"from popmon import resources"
]
@@ -55,7 +58,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df = pd.read_csv(resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"])"
+ "df = pd.read_csv(\n",
+ " resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n",
+ ")"
]
},
{
@@ -72,7 +77,7 @@
"metadata": {},
"outputs": [],
"source": [
- "df.pm_stability_report(time_axis='DATE')"
+ "df.pm_stability_report(time_axis=\"DATE\")"
]
},
{
@@ -92,7 +97,9 @@
"metadata": {},
"outputs": [],
"source": [
- "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False)"
+ "df.pm_stability_report(\n",
+ " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", extended_report=False\n",
+ ")"
]
},
{
@@ -109,7 +116,13 @@
"metadata": {},
"outputs": [],
"source": [
- "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, pull_rules={\"*_pull\": [10, 7, -7, -10]})"
+ "df.pm_stability_report(\n",
+ " time_axis=\"DATE\",\n",
+ " time_width=\"1w\",\n",
+ " time_offset=\"2015-07-02\",\n",
+ " extended_report=False,\n",
+ " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n",
+ ")"
]
},
{
@@ -133,7 +146,10 @@
"source": [
"# download histogrammar jar files if not already installed, used for histogramming of spark dataframe\n",
"from pyspark.sql import SparkSession\n",
- "spark = SparkSession.builder.config('spark.jars.packages','org.diana-hep:histogrammar-sparksql_2.11:1.0.4').getOrCreate()"
+ "\n",
+ "spark = SparkSession.builder.config(\n",
+ " \"spark.jars.packages\", \"org.diana-hep:histogrammar-sparksql_2.11:1.0.4\"\n",
+ ").getOrCreate()"
]
},
{
@@ -151,7 +167,9 @@
"metadata": {},
"outputs": [],
"source": [
- "sdf.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False)"
+ "sdf.pm_stability_report(\n",
+ " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", extended_report=False\n",
+ ")"
]
},
{
@@ -172,8 +190,17 @@
"metadata": {},
"outputs": [],
"source": [
- "df_ref = pd.read_csv(resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=['DATE'])\n",
- "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, reference_type='external', reference=df_ref)"
+ "df_ref = pd.read_csv(\n",
+ " resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n",
+ ")\n",
+ "df.pm_stability_report(\n",
+ " time_axis=\"DATE\",\n",
+ " time_width=\"1w\",\n",
+ " time_offset=\"2015-07-02\",\n",
+ " extended_report=False,\n",
+ " reference_type=\"external\",\n",
+ " reference=df_ref,\n",
+ ")"
]
},
{
@@ -190,7 +217,13 @@
"metadata": {},
"outputs": [],
"source": [
- "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, reference_type=\"expanding\")"
+ "df.pm_stability_report(\n",
+ " time_axis=\"DATE\",\n",
+ " time_width=\"1w\",\n",
+ " time_offset=\"2015-07-02\",\n",
+ " extended_report=False,\n",
+ " reference_type=\"expanding\",\n",
+ ")"
]
},
{
@@ -208,7 +241,14 @@
"metadata": {},
"outputs": [],
"source": [
- "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, reference_type=\"rolling\", window=5)"
+ "df.pm_stability_report(\n",
+ " time_axis=\"DATE\",\n",
+ " time_width=\"1w\",\n",
+ " time_offset=\"2015-07-02\",\n",
+ " extended_report=False,\n",
+ " reference_type=\"rolling\",\n",
+ " window=5,\n",
+ ")"
]
},
{
@@ -226,8 +266,10 @@
"metadata": {},
"outputs": [],
"source": [
- "report = df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n",
- "split_hists = report.datastore['split_hists']['DEPARTURE_DELAY']\n",
+ "report = df.pm_stability_report(\n",
+ " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n",
+ ")\n",
+ "split_hists = report.datastore[\"split_hists\"][\"DEPARTURE_DELAY\"]\n",
"split_hists"
]
},
@@ -279,9 +321,10 @@
"outputs": [],
"source": [
"import pickle\n",
- "with open('report.pkl', 'wb') as f: \n",
+ "\n",
+ "with open(\"report.pkl\", \"wb\") as f:\n",
" pickle.dump(report, f)\n",
- "report.to_file('report.html')"
+ "report.to_file(\"report.html\")"
]
},
{
@@ -298,8 +341,16 @@
"metadata": {},
"outputs": [],
"source": [
- "report.regenerate(last_n=0, skip_first_n=0, skip_last_n=0, plot_hist_n=2, skip_empty_plots=True,\n",
- " report_filepath=None, store_key='html_report', sections_key='report_sections')"
+ "report.regenerate(\n",
+ " last_n=0,\n",
+ " skip_first_n=0,\n",
+ " skip_last_n=0,\n",
+ " plot_hist_n=2,\n",
+ " skip_empty_plots=True,\n",
+ " report_filepath=None,\n",
+ " store_key=\"html_report\",\n",
+ " sections_key=\"report_sections\",\n",
+ ")"
]
},
{
@@ -322,15 +373,23 @@
"from popmon.base import Pipeline\n",
"from popmon.visualization import SectionGenerator, ReportGenerator\n",
"\n",
- "monitoring_rules = {\"*_pull\": [7, 4, -4, -7], \"*_zscore\": [7, 4, -4, -7], \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0]}\n",
+ "monitoring_rules = {\n",
+ " \"*_pull\": [7, 4, -4, -7],\n",
+ " \"*_zscore\": [7, 4, -4, -7],\n",
+ " \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0],\n",
+ "}\n",
"datastore = dict()\n",
- "datastore['hists'] = df.pm_make_histograms(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n",
+ "datastore[\"hists\"] = df.pm_make_histograms(\n",
+ " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n",
+ ")\n",
"\n",
"modules = [\n",
- " HistSplitter(read_key='hists', store_key='split_hists', feature_begins_with='DATE'),\n",
- " HistProfiler(read_key='split_hists', store_key='profiles'),\n",
- " SectionGenerator(section_name='Profiles', read_key=\"profiles\", store_key=\"report_sections\"),\n",
- " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\")\n",
+ " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n",
+ " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n",
+ " SectionGenerator(\n",
+ " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n",
+ " ),\n",
+ " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n",
"]\n",
"\n",
"pipeline = Pipeline(modules)\n",
@@ -356,15 +415,25 @@
"from popmon.analysis.comparison.hist_comparer import ReferenceHistComparer\n",
"\n",
"datastore = dict()\n",
- "datastore['hists'] = df.pm_make_histograms(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n",
+ "datastore[\"hists\"] = df.pm_make_histograms(\n",
+ " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n",
+ ")\n",
"\n",
"modules = [\n",
- " HistSplitter(read_key='hists', store_key='split_hists', feature_begins_with='DATE'),\n",
- " HistProfiler(read_key='split_hists', store_key='profiles'),\n",
- " ReferenceHistComparer(reference_key='split_hists', assign_to_key='split_hists', store_key='comparisons'),\n",
- " SectionGenerator(section_name='Profiles', read_key=\"profiles\", store_key=\"report_sections\"),\n",
- " SectionGenerator(section_name=\"Comparisons\", read_key=\"comparisons\", store_key=\"report_sections\"),\n",
- " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\")\n",
+ " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n",
+ " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n",
+ " ReferenceHistComparer(\n",
+ " reference_key=\"split_hists\",\n",
+ " assign_to_key=\"split_hists\",\n",
+ " store_key=\"comparisons\",\n",
+ " ),\n",
+ " SectionGenerator(\n",
+ " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n",
+ " ),\n",
+ " SectionGenerator(\n",
+ " section_name=\"Comparisons\", read_key=\"comparisons\", store_key=\"report_sections\"\n",
+ " ),\n",
+ " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n",
"]\n",
"\n",
"pipeline = Pipeline(modules)\n",
@@ -421,4 +490,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
-}
\ No newline at end of file
+}
diff --git a/popmon/notebooks/popmon_tutorial_basic.ipynb b/popmon/notebooks/popmon_tutorial_basic.ipynb
index cdbc4536..c1efff56 100644
--- a/popmon/notebooks/popmon_tutorial_basic.ipynb
+++ b/popmon/notebooks/popmon_tutorial_basic.ipynb
@@ -16,6 +16,7 @@
"source": [
"# (optional) Adjust the jupyter notebook style for easier navigation of the reports\n",
"from IPython.core.display import display, HTML\n",
+ "\n",
"# Wider notebook\n",
"display(HTML(\"\"))\n",
"# Cells are higher by default\n",
@@ -37,6 +38,7 @@
"source": [
"# install popmon (if not installed yet)\n",
"import sys\n",
+ "\n",
"!{sys.executable} -m pip install popmon"
]
},
@@ -79,7 +81,11 @@
"source": [
"# first we generate histograms,\n",
"# but we could load pre-generated histograms from a pickle or json file as well.\n",
- "hists = df.pm_make_histograms(time_axis=\"date\", time_width='2w', features=['date:age', 'date:gender', 'date:isActive'])"
+ "hists = df.pm_make_histograms(\n",
+ " time_axis=\"date\",\n",
+ " time_width=\"2w\",\n",
+ " features=[\"date:age\", \"date:gender\", \"date:isActive\"],\n",
+ ")"
]
},
{
@@ -109,7 +115,7 @@
},
"outputs": [],
"source": [
- "report # or report_.to_notebook_iframe()"
+ "report # or report_.to_notebook_iframe()"
]
},
{
@@ -145,7 +151,11 @@
"metadata": {},
"outputs": [],
"source": [
- "report_ = df.pm_stability_report(time_axis=\"date\", time_width='2w', features=['date:age', 'date:isActive', 'date:eyeColor'])"
+ "report_ = df.pm_stability_report(\n",
+ " time_axis=\"date\",\n",
+ " time_width=\"2w\",\n",
+ " features=[\"date:age\", \"date:isActive\", \"date:eyeColor\"],\n",
+ ")"
]
},
{
diff --git a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
index 74467f2f..a2d5b97c 100644
--- a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
+++ b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb
@@ -30,6 +30,7 @@
"source": [
"# install popmon (if not installed yet)\n",
"import sys\n",
+ "\n",
"!{sys.executable} -m pip install popmon"
]
},
@@ -65,12 +66,15 @@
"def to_month(x):\n",
" date = pd.to_datetime(x)\n",
" return str(12 * date.year + date.month)\n",
+ "\n",
+ "\n",
"def to_week(x):\n",
" date = pd.to_datetime(x)\n",
" return 52 * date.year + date.week\n",
"\n",
- "df['month'] = df['date'].apply(to_month)\n",
- "df['week'] = df['date'].apply(to_week)\n",
+ "\n",
+ "df[\"month\"] = df[\"date\"].apply(to_month)\n",
+ "df[\"week\"] = df[\"date\"].apply(to_week)\n",
"months = df.month.unique()\n",
"weeks = df.week.unique().tolist()"
]
@@ -88,9 +92,11 @@
"metadata": {},
"outputs": [],
"source": [
- "features = ['date:isActive', 'date:eyeColor', 'date:latitude', 'date:age']\n",
+ "features = [\"date:isActive\", \"date:eyeColor\", \"date:latitude\", \"date:age\"]\n",
"# weeks start on a Monday\n",
- "hists = df.pm_make_histograms(features=features, time_axis='date', time_width='1w', time_offset='2015-1-5')"
+ "hists = df.pm_make_histograms(\n",
+ " features=features, time_axis=\"date\", time_width=\"1w\", time_offset=\"2015-1-5\"\n",
+ ")"
]
},
{
@@ -131,7 +137,7 @@
"outputs": [],
"source": [
"# add up all the histograms sets\n",
- "hists2 = popmon.stitch_histograms(hists_list=hists_list, time_axis='date', mode='add')"
+ "hists2 = popmon.stitch_histograms(hists_list=hists_list, time_axis=\"date\", mode=\"add\")"
]
},
{
@@ -204,7 +210,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Now let's assume we already have a set of stitched histograms (hists3), \n",
+ "# Now let's assume we already have a set of stitched histograms (hists3),\n",
"# and we want to stitch to add another new batch to this:"
]
},
@@ -224,7 +230,9 @@
"\n",
"# when adding hists_delta, one can either \"add\" histograms to existing weeks, or \"replace\" existing weeks.\n",
"# the default is to add them.\n",
- "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, mode=\"add\")"
+ "hists4 = popmon.stitch_histograms(\n",
+ " hists_basis=hists_basis, hists_delta=hists_delta, mode=\"add\"\n",
+ ")"
]
},
{
@@ -234,7 +242,9 @@
"outputs": [],
"source": [
"# or \"replace\" histograms found in existing weeks with those in hists_delta:\n",
- "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, mode=\"replace\")"
+ "hists4 = popmon.stitch_histograms(\n",
+ " hists_basis=hists_basis, hists_delta=hists_delta, mode=\"replace\"\n",
+ ")"
]
},
{
@@ -253,7 +263,7 @@
"metadata": {},
"outputs": [],
"source": [
- "features = ['isActive', 'eyeColor', 'latitude', 'age']"
+ "features = [\"isActive\", \"eyeColor\", \"latitude\", \"age\"]"
]
},
{
@@ -289,7 +299,9 @@
"# since none of these histograms has a time-axis, in the stitching we create one (called 'batch'), and specify\n",
"# that each batch of histograms is inserted at a particular value time_bin_idx value\n",
"\n",
- "hists3 = popmon.stitch_histograms(hists_list=hists_list, time_axis='batch', time_bin_idx=weeks)"
+ "hists3 = popmon.stitch_histograms(\n",
+ " hists_list=hists_list, time_axis=\"batch\", time_bin_idx=weeks\n",
+ ")"
]
},
{
@@ -336,7 +348,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Now let's assume we already have a set of stitched histograms (hists3), \n",
+ "# Now let's assume we already have a set of stitched histograms (hists3),\n",
"# and we want to stitch to add another new batch to this:"
]
},
@@ -351,8 +363,10 @@
"# hists_delta is the new set of histograms\n",
"hists_delta = hists_list[-1]\n",
"\n",
- "# by default, the stitcher will insert the batch right after the last batch found. \n",
- "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, time_axis='batch')"
+ "# by default, the stitcher will insert the batch right after the last batch found.\n",
+ "hists4 = popmon.stitch_histograms(\n",
+ " hists_basis=hists_basis, hists_delta=hists_delta, time_axis=\"batch\"\n",
+ ")"
]
},
{
@@ -362,7 +376,12 @@
"outputs": [],
"source": [
"# one can also insert the new batch at a chosen new or existing time-bin index:\n",
- "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, time_axis='batch', time_bin_idx=200000)"
+ "hists4 = popmon.stitch_histograms(\n",
+ " hists_basis=hists_basis,\n",
+ " hists_delta=hists_delta,\n",
+ " time_axis=\"batch\",\n",
+ " time_bin_idx=200000,\n",
+ ")"
]
},
{
@@ -371,11 +390,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# when inserting at an existing time-bin index, on can either \"add\" to that index \n",
+ "# when inserting at an existing time-bin index, one can either \"add\" to that index\n",
"# or \"replace\" the existing histograms. The default setting is to \"add\" the histograms:\n",
- "mode = \"add\" # \"replace\"\n",
- "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta,\n",
- " time_axis='batch', time_bin_idx=104833, mode=mode)"
+ "mode = \"add\" # \"replace\"\n",
+ "hists4 = popmon.stitch_histograms(\n",
+ " hists_basis=hists_basis,\n",
+ " hists_delta=hists_delta,\n",
+ " time_axis=\"batch\",\n",
+ " time_bin_idx=104833,\n",
+ " mode=mode,\n",
+ ")"
]
},
{