diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 9890a42c..d9a28914 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,6 @@ on: jobs: build: - runs-on: ubuntu-latest steps: @@ -35,3 +34,34 @@ jobs: run: | pip install pytest pytest + + examples: + runs-on: ubuntu-latest + needs: build + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.6 + uses: actions/setup-python@v1 + with: + python-version: 3.6 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e . + - name: + run: | + cd examples + python synthetic_data.py + python flight_delays.py + + - uses: actions/upload-artifact@v2 + with: + name: synthetic-report + path: examples/test_data_report.html + if-no-files-found: error + + - uses: actions/upload-artifact@v2 + with: + name: flight-delays-report + path: examples/flight_delays_report.html + if-no-files-found: error diff --git a/examples/flight_delays.py b/examples/flight_delays.py new file mode 100644 index 00000000..657cff06 --- /dev/null +++ b/examples/flight_delays.py @@ -0,0 +1,22 @@ +import pandas as pd + +import popmon +from popmon import resources + +# open flight delays data +df = pd.read_csv( + resources.data("flight_delays.csv.gz"), index_col=0, parse_dates=["DATE"] +) + +# generate stability report using automatic binning of all encountered features +# (importing popmon automatically adds this functionality to a dataframe) +report = df.pm_stability_report( + time_axis="DATE", + time_width="1w", + time_offset="2015-07-02", + extended_report=False, + pull_rules={"*_pull": [10, 7, -7, -10]}, +) + +# or save the report to file +report.to_file("flight_delays_report.html") diff --git a/examples/synthetic_data.py b/examples/synthetic_data.py new file mode 100644 index 00000000..b219a40b --- /dev/null +++ b/examples/synthetic_data.py @@ -0,0 +1,14 @@ +import pandas as pd + +import popmon +from popmon import resources + +# open synthetic data +df = 
pd.read_csv(resources.data("test.csv.gz"), parse_dates=["date"]) + +# generate stability report using automatic binning of all encountered features +# (importing popmon automatically adds this functionality to a dataframe) +report = df.pm_stability_report(time_axis="date", features=["date:age", "date:gender"]) + +# or save the report to file +report.to_file("test_data_report.html") diff --git a/popmon/notebooks/popmon_tutorial_advanced.ipynb b/popmon/notebooks/popmon_tutorial_advanced.ipynb index 752fcda6..7e6d8aca 100644 --- a/popmon/notebooks/popmon_tutorial_advanced.ipynb +++ b/popmon/notebooks/popmon_tutorial_advanced.ipynb @@ -14,7 +14,8 @@ }, "outputs": [], "source": [ - "from IPython.core.display import display, HTML\n", + "from IPython.core.display import HTML, display\n", + "\n", "display(HTML(\"\"))\n", "display(HTML(\"\"))" ] @@ -27,6 +28,7 @@ "source": [ "# install popmon (if not installed yet)\n", "import sys\n", + "\n", "!{sys.executable} -m pip install popmon" ] }, @@ -37,6 +39,7 @@ "outputs": [], "source": [ "import pandas as pd\n", + "\n", "import popmon\n", "from popmon import resources" ] @@ -55,7 +58,9 @@ "metadata": {}, "outputs": [], "source": [ - "df = pd.read_csv(resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"])" + "df = pd.read_csv(\n", + " resources.data(\"flight_delays.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n", + ")" ] }, { @@ -72,7 +77,7 @@ "metadata": {}, "outputs": [], "source": [ - "df.pm_stability_report(time_axis='DATE')" + "df.pm_stability_report(time_axis=\"DATE\")" ] }, { @@ -92,7 +97,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False)" + "df.pm_stability_report(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", extended_report=False\n", + ")" ] }, { @@ -109,7 +116,13 @@ "metadata": {}, "outputs": [], "source": [ - "df.pm_stability_report(time_axis='DATE', 
time_width='1w', time_offset='2015-07-02', extended_report=False, pull_rules={\"*_pull\": [10, 7, -7, -10]})" + "df.pm_stability_report(\n", + " time_axis=\"DATE\",\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " pull_rules={\"*_pull\": [10, 7, -7, -10]},\n", + ")" ] }, { @@ -133,7 +146,10 @@ "source": [ "# download histogrammar jar files if not already installed, used for histogramming of spark dataframe\n", "from pyspark.sql import SparkSession\n", - "spark = SparkSession.builder.config('spark.jars.packages','org.diana-hep:histogrammar-sparksql_2.11:1.0.4').getOrCreate()" + "\n", + "spark = SparkSession.builder.config(\n", + " \"spark.jars.packages\", \"org.diana-hep:histogrammar-sparksql_2.11:1.0.4\"\n", + ").getOrCreate()" ] }, { @@ -151,7 +167,9 @@ "metadata": {}, "outputs": [], "source": [ - "sdf.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False)" + "sdf.pm_stability_report(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\", extended_report=False\n", + ")" ] }, { @@ -172,8 +190,17 @@ "metadata": {}, "outputs": [], "source": [ - "df_ref = pd.read_csv(resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=['DATE'])\n", - "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, reference_type='external', reference=df_ref)" + "df_ref = pd.read_csv(\n", + " resources.data(\"flight_delays_reference.csv.gz\"), index_col=0, parse_dates=[\"DATE\"]\n", + ")\n", + "df.pm_stability_report(\n", + " time_axis=\"DATE\",\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " reference_type=\"external\",\n", + " reference=df_ref,\n", + ")" ] }, { @@ -190,7 +217,13 @@ "metadata": {}, "outputs": [], "source": [ - "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, 
reference_type=\"expanding\")" + "df.pm_stability_report(\n", + " time_axis=\"DATE\",\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " reference_type=\"expanding\",\n", + ")" ] }, { @@ -208,7 +241,14 @@ "metadata": {}, "outputs": [], "source": [ - "df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02', extended_report=False, reference_type=\"rolling\", window=5)" + "df.pm_stability_report(\n", + " time_axis=\"DATE\",\n", + " time_width=\"1w\",\n", + " time_offset=\"2015-07-02\",\n", + " extended_report=False,\n", + " reference_type=\"rolling\",\n", + " window=5,\n", + ")" ] }, { @@ -226,8 +266,10 @@ "metadata": {}, "outputs": [], "source": [ - "report = df.pm_stability_report(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n", - "split_hists = report.datastore['split_hists']['DEPARTURE_DELAY']\n", + "report = df.pm_stability_report(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", + ")\n", + "split_hists = report.datastore[\"split_hists\"][\"DEPARTURE_DELAY\"]\n", "split_hists" ] }, @@ -279,9 +321,10 @@ "outputs": [], "source": [ "import pickle\n", - "with open('report.pkl', 'wb') as f: \n", + "\n", + "with open(\"report.pkl\", \"wb\") as f:\n", " pickle.dump(report, f)\n", - "report.to_file('report.html')" + "report.to_file(\"report.html\")" ] }, { @@ -298,8 +341,16 @@ "metadata": {}, "outputs": [], "source": [ - "report.regenerate(last_n=0, skip_first_n=0, skip_last_n=0, plot_hist_n=2, skip_empty_plots=True,\n", - " report_filepath=None, store_key='html_report', sections_key='report_sections')" + "report.regenerate(\n", + " last_n=0,\n", + " skip_first_n=0,\n", + " skip_last_n=0,\n", + " plot_hist_n=2,\n", + " skip_empty_plots=True,\n", + " report_filepath=None,\n", + " store_key=\"html_report\",\n", + " sections_key=\"report_sections\",\n", + ")\n", ] }, { @@ -322,15 +373,23 @@ "from popmon.base import Pipeline\n", "from 
popmon.visualization import SectionGenerator, ReportGenerator\n", "\n", - "monitoring_rules = {\"*_pull\": [7, 4, -4, -7], \"*_zscore\": [7, 4, -4, -7], \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0]}\n", + "monitoring_rules = {\n", + " \"*_pull\": [7, 4, -4, -7],\n", + " \"*_zscore\": [7, 4, -4, -7],\n", + " \"[!p]*_unknown_labels\": [0.5, 0.5, 0, 0],\n", + "}\n", "datastore = dict()\n", - "datastore['hists'] = df.pm_make_histograms(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n", + "datastore[\"hists\"] = df.pm_make_histograms(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", + ")\n", "\n", "modules = [\n", - " HistSplitter(read_key='hists', store_key='split_hists', feature_begins_with='DATE'),\n", - " HistProfiler(read_key='split_hists', store_key='profiles'),\n", - " SectionGenerator(section_name='Profiles', read_key=\"profiles\", store_key=\"report_sections\"),\n", - " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\")\n", + " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n", + " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n", + " SectionGenerator(\n", + " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n", + " ),\n", + " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n", "]\n", "\n", "pipeline = Pipeline(modules)\n", @@ -356,15 +415,25 @@ "from popmon.analysis.comparison.hist_comparer import ReferenceHistComparer\n", "\n", "datastore = dict()\n", - "datastore['hists'] = df.pm_make_histograms(time_axis='DATE', time_width='1w', time_offset='2015-07-02')\n", + "datastore[\"hists\"] = df.pm_make_histograms(\n", + " time_axis=\"DATE\", time_width=\"1w\", time_offset=\"2015-07-02\"\n", + ")\n", "\n", "modules = [\n", - " HistSplitter(read_key='hists', store_key='split_hists', feature_begins_with='DATE'),\n", - " HistProfiler(read_key='split_hists', store_key='profiles'),\n", - " 
ReferenceHistComparer(reference_key='split_hists', assign_to_key='split_hists', store_key='comparisons'),\n", - " SectionGenerator(section_name='Profiles', read_key=\"profiles\", store_key=\"report_sections\"),\n", - " SectionGenerator(section_name=\"Comparisons\", read_key=\"comparisons\", store_key=\"report_sections\"),\n", - " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\")\n", + " HistSplitter(read_key=\"hists\", store_key=\"split_hists\", feature_begins_with=\"DATE\"),\n", + " HistProfiler(read_key=\"split_hists\", store_key=\"profiles\"),\n", + " ReferenceHistComparer(\n", + " reference_key=\"split_hists\",\n", + " assign_to_key=\"split_hists\",\n", + " store_key=\"comparisons\",\n", + " ),\n", + " SectionGenerator(\n", + " section_name=\"Profiles\", read_key=\"profiles\", store_key=\"report_sections\"\n", + " ),\n", + " SectionGenerator(\n", + " section_name=\"Comparisons\", read_key=\"comparisons\", store_key=\"report_sections\"\n", + " ),\n", + " ReportGenerator(read_key=\"report_sections\", store_key=\"html_report\"),\n", "]\n", "\n", "pipeline = Pipeline(modules)\n", @@ -421,4 +490,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/popmon/notebooks/popmon_tutorial_basic.ipynb b/popmon/notebooks/popmon_tutorial_basic.ipynb index cdbc4536..c1efff56 100644 --- a/popmon/notebooks/popmon_tutorial_basic.ipynb +++ b/popmon/notebooks/popmon_tutorial_basic.ipynb @@ -16,6 +16,7 @@ "source": [ "# (optional) Adjust the jupyter notebook style for easier navigation of the reports\n", "from IPython.core.display import display, HTML\n", + "\n", "# Wider notebook\n", "display(HTML(\"\"))\n", "# Cells are higher by default\n", @@ -37,6 +38,7 @@ "source": [ "# install popmon (if not installed yet)\n", "import sys\n", + "\n", "!{sys.executable} -m pip install popmon" ] }, @@ -79,7 +81,11 @@ "source": [ "# first we generate histograms,\n", "# but we could load pre-generated histograms from a pickle or json file 
as well.\n", - "hists = df.pm_make_histograms(time_axis=\"date\", time_width='2w', features=['date:age', 'date:gender', 'date:isActive'])" + "hists = df.pm_make_histograms(\n", + " time_axis=\"date\",\n", + " time_width=\"2w\",\n", + " features=[\"date:age\", \"date:gender\", \"date:isActive\"],\n", + ")" ] }, { @@ -109,7 +115,7 @@ }, "outputs": [], "source": [ - "report # or report_.to_notebook_iframe()" + "report # or report_.to_notebook_iframe()" ] }, { @@ -145,7 +151,11 @@ "metadata": {}, "outputs": [], "source": [ - "report_ = df.pm_stability_report(time_axis=\"date\", time_width='2w', features=['date:age', 'date:isActive', 'date:eyeColor'])" + "report_ = df.pm_stability_report(\n", + " time_axis=\"date\",\n", + " time_width=\"2w\",\n", + " features=[\"date:age\", \"date:isActive\", \"date:eyeColor\"],\n", + ")" ] }, { diff --git a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb index 74467f2f..a2d5b97c 100644 --- a/popmon/notebooks/popmon_tutorial_incremental_data.ipynb +++ b/popmon/notebooks/popmon_tutorial_incremental_data.ipynb @@ -30,6 +30,7 @@ "source": [ "# install popmon (if not installed yet)\n", "import sys\n", + "\n", "!{sys.executable} -m pip install popmon" ] }, @@ -65,12 +66,15 @@ "def to_month(x):\n", " date = pd.to_datetime(x)\n", " return str(12 * date.year + date.month)\n", + "\n", + "\n", "def to_week(x):\n", " date = pd.to_datetime(x)\n", " return 52 * date.year + date.week\n", "\n", - "df['month'] = df['date'].apply(to_month)\n", - "df['week'] = df['date'].apply(to_week)\n", + "\n", + "df[\"month\"] = df[\"date\"].apply(to_month)\n", + "df[\"week\"] = df[\"date\"].apply(to_week)\n", "months = df.month.unique()\n", "weeks = df.week.unique().tolist()" ] @@ -88,9 +92,11 @@ "metadata": {}, "outputs": [], "source": [ - "features = ['date:isActive', 'date:eyeColor', 'date:latitude', 'date:age']\n", + "features = [\"date:isActive\", \"date:eyeColor\", \"date:latitude\", 
\"date:age\"]\n", "# weeks start on a Monday\n", - "hists = df.pm_make_histograms(features=features, time_axis='date', time_width='1w', time_offset='2015-1-5')" + "hists = df.pm_make_histograms(\n", + " features=features, time_axis=\"date\", time_width=\"1w\", time_offset=\"2015-1-5\"\n", + ")" ] }, { @@ -131,7 +137,7 @@ "outputs": [], "source": [ "# add up all the histograms sets\n", - "hists2 = popmon.stitch_histograms(hists_list=hists_list, time_axis='date', mode='add')" + "hists2 = popmon.stitch_histograms(hists_list=hists_list, time_axis=\"date\", mode=\"add\")" ] }, { @@ -204,7 +210,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Now let's assume we already have a set of stitched histograms (hists3), \n", + "# Now let's assume we already have a set of stitched histograms (hists3),\n", "# and we want to stitch to add another new batch to this:" ] }, @@ -224,7 +230,9 @@ "\n", "# when adding hists_delta, one can either \"add\" histograms to existing weeks, or \"replace\" existing weeks.\n", "# the default is to add them.\n", - "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, mode=\"add\")" + "hists4 = popmon.stitch_histograms(\n", + " hists_basis=hists_basis, hists_delta=hists_delta, mode=\"add\"\n", + ")" ] }, { @@ -234,7 +242,9 @@ "outputs": [], "source": [ "# or \"replace\" histograms found in existing weeks with those in hists_delta:\n", - "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, mode=\"replace\")" + "hists4 = popmon.stitch_histograms(\n", + " hists_basis=hists_basis, hists_delta=hists_delta, mode=\"replace\"\n", + ")" ] }, { @@ -253,7 +263,7 @@ "metadata": {}, "outputs": [], "source": [ - "features = ['isActive', 'eyeColor', 'latitude', 'age']" + "features = [\"isActive\", \"eyeColor\", \"latitude\", \"age\"]" ] }, { @@ -289,7 +299,9 @@ "# since none of these histograms has a time-axis, in the stitching we create one (called 'batch'), and specify\n", "# that each batch of 
histograms is inserted at a particular value time_bin_idx value\n", "\n", - "hists3 = popmon.stitch_histograms(hists_list=hists_list, time_axis='batch', time_bin_idx=weeks)" + "hists3 = popmon.stitch_histograms(\n", + " hists_list=hists_list, time_axis=\"batch\", time_bin_idx=weeks\n", + ")" ] }, { @@ -336,7 +348,7 @@ "metadata": {}, "outputs": [], "source": [ - "# Now let's assume we already have a set of stitched histograms (hists3), \n", + "# Now let's assume we already have a set of stitched histograms (hists3),\n", "# and we want to stitch to add another new batch to this:" ] }, @@ -351,8 +363,10 @@ "# hists_delta is the new set of histograms\n", "hists_delta = hists_list[-1]\n", "\n", - "# by default, the stitcher will insert the batch right after the last batch found. \n", - "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, time_axis='batch')" + "# by default, the stitcher will insert the batch right after the last batch found.\n", + "hists4 = popmon.stitch_histograms(\n", + " hists_basis=hists_basis, hists_delta=hists_delta, time_axis=\"batch\"\n", + ")" ] }, { @@ -362,7 +376,12 @@ "outputs": [], "source": [ "# one can also insert the new batch at a chosen new or existing time-bin index:\n", - "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta, time_axis='batch', time_bin_idx=200000)" + "hists4 = popmon.stitch_histograms(\n", + " hists_basis=hists_basis,\n", + " hists_delta=hists_delta,\n", + " time_axis=\"batch\",\n", + " time_bin_idx=200000,\n", + ")" ] }, { @@ -371,11 +390,16 @@ "metadata": {}, "outputs": [], "source": [ - "# when inserting at an existing time-bin index, on can either \"add\" to that index \n", + "# when inserting at an existing time-bin index, one can either \"add\" to that index\n", "# or \"replace\" the existing histograms. 
The default setting is to \"add\" the histograms:\n", - "mode = \"add\" # \"replace\"\n", - "hists4 = popmon.stitch_histograms(hists_basis=hists_basis, hists_delta=hists_delta,\n", - " time_axis='batch', time_bin_idx=104833, mode=mode)" + "mode = \"add\" # \"replace\"\n", + "hists4 = popmon.stitch_histograms(\n", + " hists_basis=hists_basis,\n", + " hists_delta=hists_delta,\n", + " time_axis=\"batch\",\n", + " time_bin_idx=104833,\n", + " mode=mode,\n", + ")" ] }, {