Commit v0.0.4 (#4)
* Release editions: add gitignore, readthedocs, update link in README, add exclude folders in pyproject, remove dev requirements

* Update README, dependencies are currently not pinned

* Change links in pyproject and the approach to showing tech documentation

* Install project dependencies before building docs

* Remove hdbscan from dependencies, import it from scikit-learn >= 1.3.0

* Add dynamic package versioning

* FIX: pydantic < 2, link to examples in README

* ADD: open-source community files

* FIX: examples link in docs

* FIX: bootstrap ratio test is for ratio metrics instead of binary; manual t-test is available for ratio metrics

* ADD: tests for initialization and evaluation, GitHub Action for CI

* FIX: statistic and p-value in tests are rounded to 5 digits; options in black CI

* UPD: refactor code with black

* FIX: typo in split builder parameters

* UPD: version up and exclude tests from package build

* UPD: target_flg -> target

* UPD: make bootstrap available for binary metrics + tests

* UPD: up minimum scipy version to 1.10.0

* FIX: zero p-value in bootstrap confidence interval

* DEL: remove unnecessary test file

* UPD: ability to save experiment plot and select its kind

* UPD: change format of report output and add warning about usage of results

* UPD: report returns dictionary with params, not just print report

* ADD: ability to plot bootstrap distribution of differences

* UPD: now injections are passed as MDEs, not multipliers of a metric

* UPD: change default treatment group name in splitter from 'target' to 'treatment'

* UPD: update docs, fix typos in docstrings

* CI: add GitHub Action for automated publishing to PyPI on release

* UPD: updates for v0.0.4

* UPD: black formatting

* UPD: update report example in README
educauchy committed Oct 5, 2023
1 parent b596d8b commit 123540c
Showing 20 changed files with 235 additions and 230 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/publish_pypi.yml
@@ -0,0 +1,29 @@
name: Publish Python Package

on:
  release:
    types: [created]

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v1

      - name: Set up Python
        uses: actions/setup-python@v1
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install setuptools build wheel twine
      - name: Build and publish
        env:
          TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
        run: |
          python -m build
          twine upload dist/*
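Note: the workflow assumes `PYPI_USERNAME` and `PYPI_PASSWORD` repository secrets are already configured. If publishing with a PyPI API token instead of account credentials, the username is the literal string `__token__` and the token value serves as the password.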
42 changes: 42 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,48 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](http://keepachangelog.com/)
and this project adheres to [Semantic Versioning](http://semver.org/).

## [0.0.4] - 2023-10-05

### Added

- Add GitHub Action for automated publishing to PyPI on release.
- Add ability to plot the bootstrap distribution of differences.
- Make bootstrap tests available for binary metrics.
- Report now returns a dictionary with parameters instead of only printing the report.
- Change the format of report output and add a warning about usage of results.
- Add ability to save the experiment plot and select its kind (see the usage sketch after this section).

### Changed

- Update the lower bound of scipy to 1.10.0.
- `target_flg` -> `target`: unify the property for setting the target variable name.
- Remove unnecessary files.
- Change the default treatment group name in the splitter from 'target' to 'treatment'.

### Fixed

- Calculation of the zero p-value in the bootstrap confidence interval: it now takes the directionality of the hypothesis into account.
- Injections are now passed as MDEs, not as multipliers of a metric.
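For reference, a minimal usage sketch of the new plotting and reporting API from this release. The `ABTest` construction (dataset and params) is assumed and omitted; the `ztest_pvalue` key is taken from the report parameters visible in the `abtest.py` diff below.

```python
# Minimal sketch, assuming `abtest` is an already initialized
# abacus.auto_ab.abtest.ABTest instance with a binary or continuous metric.

# New in v0.0.4: choose the plot kind and optionally save the figure.
abtest.plot(kind="experiment")
abtest.plot(kind="bootstrap", save_path="bootstrap_diffs.png")

# report() still prints the report, but now also returns its parameters.
report_params = abtest.report()
print(report_params["ztest_pvalue"])  # e.g. for a binary metric report
```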


## [0.0.3] - 2023-09-27

### Added

- Add tests for test initialization and statistical test execution.
- Add open-source community files (CHANGELOG, CODE_OF_CONDUCT, CONTRIBUTING, SECURITY).
- Introduce CI process for testing and linting.

### Changed

- Statistic values and p-values of statistical tests are now rounded to 5 decimal places.
- Refactor code with black.

### Fixed

- Pydantic is now pinned to 1.x versions (below 2.0).
- Typos in links in docs and README.


## [0.0.2] - 2023-08-26

2 changes: 1 addition & 1 deletion abacus/VERSION
@@ -1 +1 @@
0.0.3
0.0.4
111 changes: 70 additions & 41 deletions abacus/auto_ab/abtest.py
@@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Optional, Tuple, Dict
from typing import Optional, Tuple, Dict, Any
import copy
import warnings
import numpy as np
@@ -99,6 +99,8 @@ def __check_required_metric_type(self, method: str) -> None:
],
"binary": [
"report_binary",
"test_boot_confint",
"test_boot_fp",
"test_z_proportions",
"test_chisquare",
],
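With these two entries added, the bootstrap procedures pass the metric-type check for binary metrics as well. A sketch of what is now allowed (assuming an initialized binary-metric `ABTest` named `abtest`):

```python
# Previously restricted to continuous/ratio metrics; allowed for binary
# metrics as of v0.0.4 (per the method-to-metric-type check above).
result_confint = abtest.test_boot_confint()
result_fp = abtest.test_boot_fp()
```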
@@ -148,10 +150,8 @@ def __check_required_columns(self, df: DataFrameType, method: str) -> None:
"id_col": self.params.data_params.id_col,
"group_col": self.params.data_params.group_col,
}
if self.params.hypothesis_params.metric_type == "continuous":
if self.params.hypothesis_params.metric_type in ["continuous", "binary"]:
cols["target"] = self.params.data_params.target
elif self.params.hypothesis_params.metric_type == "binary":
cols["target_flg"] = self.params.data_params.target_flg
elif self.params.hypothesis_params.metric_type == "ratio":
cols["numerator"] = self.params.data_params.numerator
cols["denominator"] = self.params.data_params.denominator
@@ -191,16 +191,12 @@ def __get_group(
"""
x = df if df is not None else self.__dataset
group = np.array([])
if self.params.hypothesis_params.metric_type == "continuous":
if self.params.hypothesis_params.metric_type in ["continuous", "binary"]:
group = x.loc[
x[self.params.data_params.group_col] == group_label,
self.params.data_params.target,
].to_numpy()
elif self.params.hypothesis_params.metric_type == "binary":
group = x.loc[
x[self.params.data_params.group_col] == group_label,
self.params.data_params.target_flg,
].to_numpy()

return group

def __bucketize(self, x: ArrayNumType) -> np.ndarray:
@@ -384,7 +380,7 @@ def __taylor_params(self, x: DataFrameType) -> Tuple[float, float]:

return mean, var

def __report_binary(self) -> str:
def __report_binary(self) -> Tuple[str, Dict[str, Any]]:
self.__check_required_metric_type("report_binary")

hypothesis = self.params.hypothesis_params
@@ -440,6 +436,17 @@ def __report_binary(self) -> str:
- Errors: alpha = {alpha}, beta = {beta}.
- Alternative: {alternative}.
{transforms}
Following statistical tests are used:
- Z-test: {ztest_stat:.2f}, p-value = {ztest_pvalue:.4f}, {ztest_result}.
{chi_square}
{test_explanation}
Please note that you should carefully use the results of different statistical
procedures and not consider all of them at once.
Statistics of experiment groups.
Control group:
- Observations: {ctrl_obs}
- Conversion: {ctrl_conv}
@@ -448,20 +455,13 @@
- Observations: {trtm_obs}
- Conversion: {trtm_conv}
{transforms}
Following statistical tests are used:
- Z-test: {ztest_stat:.2f}, p-value = {ztest_pvalue:.4f}, {ztest_result}.
{chi_square}
{test_explanation}
""".format(
**params
)

return output
return output, params

def __report_continuous(self) -> str:
def __report_continuous(self) -> Tuple[str, Dict[str, Any]]:
self.__check_required_metric_type("report_continuous")

hypothesis = self.params.hypothesis_params
@@ -557,6 +557,18 @@ def __report_continuous(self) -> str:
- Errors: alpha = {alpha}, beta = {beta}.
- Alternative: {alternative}.
{transforms}
Number of bootstrap iterations: {n_boot_samples}.\n{bucketing_str}{metric_transform_str}{filter_outliers_str}
Following statistical tests are used:
- Welch's t-test: {welch_stat:.2f}, p-value = {welch_pvalue:.4f}, {welch_result}.
- Mann Whitney's U-test: {mwu_stat:.2f}, p-value = {mwu_pvalue:.4f}, {mwu_result}.
- Bootstrap test: {boot_result}.
{test_explanation}
Please note that you should carefully use the results of different statistical
procedures and not consider all of them at once.
Statistics of experiment groups.
Control group:
- Observations: {ctrl_obs}
- Mean: {ctrl_mean:.4f}
@@ -579,19 +591,11 @@
- St.deviation: {trtm_std:.4f}
- Variance: {trtm_var:.4f}
{transforms}
Number of bootstrap iterations: {n_boot_samples}.\n{bucketing_str}{metric_transform_str}{filter_outliers_str}
Following statistical tests are used:
- Welch's t-test: {welch_stat:.2f}, p-value = {welch_pvalue:.4f}, {welch_result}.
- Mann Whitney's U-test: {mwu_stat:.2f}, p-value = {mwu_pvalue:.4f}, {mwu_result}.
- Bootstrap test: {boot_result}.
{test_explanation}
""".format(
**params
)

return output
return output, params

def __report_ratio(self):
raise NotImplementedError("Reporting for ratio metric is still in progress..")
@@ -840,31 +844,48 @@ def metric_transform(self) -> ABTest:

return ABTest(dataset_new, params_new)

def plot(self) -> None:
def plot(self, kind: str = "experiment", save_path: Optional[str] = None) -> None:
"""Plot experiment.
Plot figure type depends on the following parameters:
Args:
kind (str): Kind of plot: 'experiment', 'bootstrap'.
save_path (str, optional): Path where to save image.
- hypothesis_params.metric_name
- hypothesis_params.strategy
Raises:
ValueError: If `kind` is not in ['experiment', 'bootstrap'].
"""
if self.params.hypothesis_params.metric_type == "continuous":
Graphics.plot_continuous_experiment(self.params)
if kind not in ["experiment", "bootstrap"]:
raise ValueError(
"`kind` parameter supports only the following values: 'experiment', 'bootstrap'"
)

if self.params.hypothesis_params.metric_type == "binary":
Graphics.plot_binary_experiment(self.params)
if kind == "experiment":
if self.params.hypothesis_params.metric_type == "continuous":
Graphics.plot_continuous_experiment(self.params, save_path)

if self.params.hypothesis_params.metric_type == "binary":
Graphics.plot_binary_experiment(self.params, save_path)

elif kind == "bootstrap" and self.params.hypothesis_params.metric_type in [
"continuous",
"binary",
]:
Graphics.plot_bootstrap_confint(self.params, save_path)

def report(self) -> None:
def report(self) -> Dict[str, Any]:
report_output = "Report for ratio metric currently not supported."
report_params = {}

if self.params.hypothesis_params.metric_type == "continuous":
report_output = self.__report_continuous()
report_output, report_params = self.__report_continuous()

if self.params.hypothesis_params.metric_type == "binary":
report_output = self.__report_binary()
report_output, report_params = self.__report_binary()

print(report_output)

return report_params

def resplit_df(self) -> ABTest:
"""Resplit dataframe.
@@ -949,7 +970,6 @@ def test_boot_confint(self) -> StatTestResultType:

boot_mean = pd_metric_diffs.mean()
boot_std = pd_metric_diffs.std()
zero_pvalue = norm.sf(0, loc=boot_mean, scale=boot_std)[0]

test_result: int = 0 # 0 - cannot reject H0, 1 - reject H0
if self.params.hypothesis_params.alternative == "two-sided":
@@ -958,18 +978,27 @@
ci = pd_metric_diffs.quantile([left_quant, right_quant])
ci_left, ci_right = float(ci.iloc[0]), float(ci.iloc[1])

one_sided_pvalue = norm.cdf(0, loc=boot_mean, scale=boot_std)[0]
zero_pvalue = min(one_sided_pvalue, 1 - one_sided_pvalue)

if ci_left > 0 or ci_right < 0: # 0 is not in critical area
test_result = 1
elif self.params.hypothesis_params.alternative == "less":
left_quant = self.params.hypothesis_params.alpha
ci = pd_metric_diffs.quantile([left_quant])
ci_left = float(ci.iloc[0])

zero_pvalue = norm.cdf(0, loc=boot_mean, scale=boot_std)[0]

if ci_left < 0: # 0 is not in critical area
test_result = 1
elif self.params.hypothesis_params.alternative == "greater":
right_quant = self.params.hypothesis_params.alpha
ci = pd_metric_diffs.quantile([right_quant])
ci_right = float(ci.iloc[0])

zero_pvalue = 1 - norm.cdf(0, loc=boot_mean, scale=boot_std)[0]

if 0 < ci_right: # 0 is not in critical area
test_result = 1
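For clarity, the directional zero p-value logic introduced in this hunk, condensed into a self-contained sketch. The standalone function name is illustrative; the normal approximation and the per-alternative branches mirror the diff above.

```python
import numpy as np
from scipy.stats import norm

def boot_zero_pvalue(boot_diffs: np.ndarray, alternative: str) -> float:
    """Illustrative: p-value of zero under a normal approximation of the
    bootstrap distribution of differences, as in test_boot_confint above."""
    mean = boot_diffs.mean()
    std = boot_diffs.std(ddof=1)  # sample std, matching pandas' .std() default
    one_sided = norm.cdf(0, loc=mean, scale=std)  # P(diff <= 0) under N(mean, std)
    if alternative == "two-sided":
        return min(one_sided, 1 - one_sided)
    if alternative == "less":
        return one_sided
    return 1 - one_sided  # alternative == "greater"

rng = np.random.default_rng(0)
# Differences centered well above zero -> zero is implausible (p ~ 0.001):
print(boot_zero_pvalue(rng.normal(0.3, 0.1, 10_000), "two-sided"))
```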

