Merge pull request #29 from FBruzzesi/dev_issue28

Issue #28, StandardizedErrorReason class
koaning · Dec 21, 2021 · 9202b8a · 9202b8a
2 parents f263524 + a1bded6
commit 9202b8a
Show file tree

Hide file tree

Showing 6 changed files with 88 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -93,6 +93,7 @@ The library implemented many "reasons" for doubt.
 
 - `AbsoluteDifferenceReason`: assign doubt when the absolute difference is too high
 - `RelativeDifferenceReason`: assign doubt when the relative difference is too high
+- `StandardizedErrorReason`: assign doubt when the absolute standardized residual is too high
 
 ## Feedback
 

diff --git a/docs/quickstart/index.md b/docs/quickstart/index.md
@@ -170,5 +170,6 @@ of reasons that this library supports.
 
 - `AbsoluteDifferenceReason`: assign doubt when the absolute difference is too high
 - `RelativeDifferenceReason`: assign doubt when the relative difference is too high
+- `StandardizedErrorReason`: assign doubt when the absolute standardized residual is too high
 
 If you think there's a reason missing, feel free to mention it on [GitHub](https://github.com/koaning/doubtlab/issues/new).
diff --git a/doubtlab/reason.py b/doubtlab/reason.py
@@ -484,7 +484,7 @@ class AbsoluteDifferenceReason:
     Assign doubt when the absolute difference between label and regression is too large.
 
     Arguments:
-        model: scikit-learn outlier model
+        model: scikit-learn regression model
         threshold: cutoff for doubt assignment
 
     Usage:
@@ -520,7 +520,7 @@ class RelativeDifferenceReason:
     Assign doubt when the relative difference between label and regression is too large.
 
     Arguments:
-        model: scikit-learn outlier model
+        model: scikit-learn regression model
         threshold: cutoff for doubt assignment
 
     Usage:
@@ -611,3 +611,60 @@ def from_proba(proba, y, min_doubt=0.5, sorted_index_method="normalized_margin")
     def __call__(self, X, y):
         probas = self.model.predict_proba(X)
         return self.from_proba(probas, y, self.min_doubt, self.sorted_index_method)
+
+
+class StandardizedErrorReason:
+    """
+    Assign doubt when the absolute standardized residual is too high.
+
+    The residual `y - prediction` of every sample is divided by the sample
+    standard deviation (`ddof=1`) of all residuals in the batch. The residuals
+    are not mean-centered before scaling. A sample gets doubt when the absolute
+    value of that ratio is greater than or equal to `threshold`.
+
+    Arguments:
+        model: scikit-learn regression model
+        threshold: cutoff for doubt assignment, must be positive
+
+    Raises:
+        ValueError: when `threshold` is zero or negative
+
+    Usage:
+
+    ```python
+    from sklearn.datasets import load_diabetes
+    from sklearn.linear_model import LinearRegression
+
+    from doubtlab.ensemble import DoubtEnsemble
+    from doubtlab.reason import StandardizedErrorReason
+
+    X, y = load_diabetes(return_X_y=True)
+    model = LinearRegression()
+    model.fit(X, y)
+
+    doubt = DoubtEnsemble(reason = StandardizedErrorReason(model, threshold=2.))
+    indices = doubt.get_indices(X, y)
+    ```
+    """
+
+    def __init__(self, model, threshold=2.0):
+        if threshold <= 0:
+            raise ValueError("threshold value should be positive")
+        self.model = model
+        self.threshold = threshold
+
+    def __call__(self, X, y):
+        """Predict on `X` with the wrapped model and return the reason array for `y`."""
+        preds = self.model.predict(X)
+        return self.from_predict(preds, y, self.threshold)
+
+    @staticmethod
+    def from_predict(pred, y, threshold=2.0):
+        """
+        Outputs a reason array from a prediction array, skipping the need for a model.
+
+        Arguments:
+            pred: array-like with the predicted values
+            y: array-like with the observed values
+            threshold: cutoff for doubt assignment
+
+        Returns:
+            A `np.float16` array holding 1.0 where the absolute standardized
+            residual is at least `threshold` and 0.0 elsewhere. When the
+            standard deviation of the residuals evaluates to zero or NaN
+            (for example with fewer than two samples, or a perfect fit)
+            the standardized residual is undefined and no doubt is assigned.
+
+        Usage:
+        ```python
+        import numpy as np
+        from doubtlab.reason import StandardizedErrorReason
+
+        y = np.random.randn(100)
+        preds = np.random.randn(100)
+
+        predicate = StandardizedErrorReason.from_predict(preds, y, threshold=2.0)
+        ```
+        """
+        # Coerce to arrays so that plain Python lists can be subtracted too.
+        res = np.asarray(y) - np.asarray(pred)
+        # A standard deviation with `ddof=1` needs at least two samples.
+        if res.size < 2:
+            return np.zeros(res.shape, dtype=np.float16)
+        scale = np.std(res, ddof=1)
+        # `not scale > 0` is true for a zero as well as a NaN scale; dividing by
+        # either would only produce NaN/inf values and runtime warnings.
+        if not scale > 0:
+            return np.zeros(res.shape, dtype=np.float16)
+        return (np.abs(res / scale) >= threshold).astype(np.float16)
diff --git a/setup.py b/setup.py
@@ -2,11 +2,7 @@
 from setuptools import setup, find_packages
 
 
-base_packages = [
-    "scikit-learn>=1.0.0",
-    "cleanlab>=1.0",
-    "pandas>=1.3.3",
-]
+base_packages = ["scikit-learn>=1.0.0", "cleanlab>=1.0", "pandas>=1.3.3"]
 
 docs_packages = [
     "mkdocs==1.1",

diff --git a/tests/test_general_reason.py b/tests/test_general_reason.py
@@ -17,6 +17,7 @@
     AbsoluteDifferenceReason,
     RelativeDifferenceReason,
     CleanlabReason,
+    StandardizedErrorReason,
 )
 
 clf_reasons = [
@@ -31,6 +32,7 @@
 regr_reasons = [
     AbsoluteDifferenceReason,
     RelativeDifferenceReason,
+    StandardizedErrorReason,
 ]
 
 clf_datasets = [

diff --git a/tests/test_reason/test_standardizederror.py b/tests/test_reason/test_standardizederror.py
@@ -0,0 +1,24 @@
+import numpy as np
+from doubtlab.reason import StandardizedErrorReason
+
+
+def test_from_predict():
+    """Test `from_predict` on an obvious example: only the last prediction is far off."""
+    y = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
+    # A fixed noise pattern (instead of unseeded random draws) keeps the test
+    # reproducible from run to run.
+    noise = np.array([-0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05])
+    preds = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 10.0]) + noise
+    predicate = StandardizedErrorReason.from_predict(pred=preds, y=y, threshold=3.0)
+    # Only the last sample, whose prediction is off by roughly 9, should get doubt.
+    assert np.all(
+        predicate == np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
+    )
+
+
+def test_from_predict_no_reason():
+    """Test that `from_predict` assigns no doubt when every residual is small."""
+    y = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
+    # A fixed noise pattern is used on purpose: with unseeded random draws a
+    # sample such as nine zeros and a single +/-0.05 has a standardized residual
+    # of sqrt(10) ~ 3.16, which exceeds the threshold and fails the test.
+    noise = np.array([-0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05])
+    preds = y + noise
+    predicate = StandardizedErrorReason.from_predict(pred=preds, y=y, threshold=3.0)
+    assert np.all(
+        predicate == np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
+    )