Skip to content

Commit

Permalink
Merge pull request #29 from FBruzzesi/dev_issue28
Browse files Browse the repository at this point in the history
Issue #28, StandardizedErrorReason class
  • Loading branch information
koaning committed Dec 21, 2021
2 parents f263524 + a1bded6 commit 9202b8a
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 7 deletions.
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -93,6 +93,7 @@ The library implemented many "reasons" for doubt.

- `AbsoluteDifferenceReason`: assign doubt when the absolute difference is too high
- `RelativeDifferenceReason`: assign doubt when the relative difference is too high
- `StandardizedErrorReason`: assign doubt when the absolute standardized residual is too high

## Feedback

Expand Down
1 change: 1 addition & 0 deletions docs/quickstart/index.md
Expand Up @@ -170,5 +170,6 @@ of reasons that this library supports.

- `AbsoluteDifferenceReason`: assign doubt when the absolute difference is too high
- `RelativeDifferenceReason`: assign doubt when the relative difference is too high
- `StandardizedErrorReason`: assign doubt when the absolute standardized residual is too high

If you think there's a reason missing, feel free to mention it on [GitHub](https://github.com/koaning/doubtlab/issues/new).
61 changes: 59 additions & 2 deletions doubtlab/reason.py
Expand Up @@ -484,7 +484,7 @@ class AbsoluteDifferenceReason:
Assign doubt when the absolute difference between label and regression is too large.
Arguments:
model: scikit-learn outlier model
model: scikit-learn regression model
threshold: cutoff for doubt assignment
Usage:
Expand Down Expand Up @@ -520,7 +520,7 @@ class RelativeDifferenceReason:
Assign doubt when the relative difference between label and regression is too large.
Arguments:
model: scikit-learn outlier model
model: scikit-learn regression model
threshold: cutoff for doubt assignment
Usage:
Expand Down Expand Up @@ -611,3 +611,60 @@ def from_proba(proba, y, min_doubt=0.5, sorted_index_method="normalized_margin")
def __call__(self, X, y):
probas = self.model.predict_proba(X)
return self.from_proba(probas, y, self.min_doubt, self.sorted_index_method)


class StandardizedErrorReason:
    """
    Assign doubt when the absolute standardized residual is too high.

    The residual `y - pred` is divided by its sample standard deviation
    (`ddof=1`, no mean-centering). A row is doubted when the absolute value
    of that ratio is greater than or equal to `threshold`.

    Arguments:
        model: scikit-learn regression model
        threshold: cutoff for doubt assignment, must be strictly positive

    Usage:

    ```python
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import LinearRegression

    from doubtlab.ensemble import DoubtEnsemble
    from doubtlab.reason import StandardizedErrorReason

    X, y = load_diabetes(return_X_y=True)
    model = LinearRegression()
    model.fit(X, y)

    doubt = DoubtEnsemble(reason = StandardizedErrorReason(model, threshold=2.))

    indices = doubt.get_indices(X, y)
    ```
    """

    def __init__(self, model, threshold=2.0):
        """
        Store the model and the cutoff.

        Raises:
            ValueError: if `threshold` is zero or negative.
        """
        if threshold <= 0:
            raise ValueError("threshold value should be positive")
        self.model = model
        self.threshold = threshold

    def __call__(self, X, y):
        """Predict on `X` with the stored model and score the residuals against `y`."""
        preds = self.model.predict(X)
        return self.from_predict(preds, y, self.threshold)

    @staticmethod
    def from_predict(pred, y, threshold=2.0):
        """
        Outputs a reason array from a prediction array, skipping the need for a model.

        Arguments:
            pred: predicted values (array-like)
            y: true values (array-like, same length as `pred`)
            threshold: cutoff for doubt assignment, defaults to 2.0

        Returns:
            A `float16` numpy array with 1.0 where the absolute standardized
            residual is at least `threshold` and 0.0 elsewhere. When the
            residual spread is zero or undefined (fewer than two values, or
            all residuals identical) no row stands out, so all zeros are
            returned.

        Usage:

        ```python
        import numpy as np
        from doubtlab.reason import StandardizedErrorReason

        y = np.random.randn(100)
        preds = np.random.randn(100)

        predicate = StandardizedErrorReason.from_predict(preds, y)
        ```
        """
        # Coerce to arrays so plain Python lists can be subtracted element-wise.
        residuals = np.asarray(y) - np.asarray(pred)

        # With ddof=1 the standard deviation needs at least two values.
        if residuals.size < 2:
            return np.zeros(residuals.shape, dtype=np.float16)

        scale = np.std(residuals, ddof=1)
        # Dividing by a zero/non-finite spread would give inf or nan scores.
        if not np.isfinite(scale) or scale == 0:
            return np.zeros(residuals.shape, dtype=np.float16)

        return (np.abs(residuals / scale) >= threshold).astype(np.float16)
6 changes: 1 addition & 5 deletions setup.py
Expand Up @@ -2,11 +2,7 @@
from setuptools import setup, find_packages


base_packages = [
"scikit-learn>=1.0.0",
"cleanlab>=1.0",
"pandas>=1.3.3",
]
base_packages = ["scikit-learn>=1.0.0", "cleanlab>=1.0", "pandas>=1.3.3"]

docs_packages = [
"mkdocs==1.1",
Expand Down
2 changes: 2 additions & 0 deletions tests/test_general_reason.py
Expand Up @@ -17,6 +17,7 @@
AbsoluteDifferenceReason,
RelativeDifferenceReason,
CleanlabReason,
StandardizedErrorReason,
)

clf_reasons = [
Expand All @@ -31,6 +32,7 @@
regr_reasons = [
AbsoluteDifferenceReason,
RelativeDifferenceReason,
StandardizedErrorReason,
]

clf_datasets = [
Expand Down
24 changes: 24 additions & 0 deletions tests/test_reason/test_standardizederror.py
@@ -0,0 +1,24 @@
import numpy as np
from doubtlab.reason import StandardizedErrorReason


def test_from_predict():
    """Test `from_predict` on an obvious example: only the last prediction is far off."""
    y = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    # Fixed small perturbation instead of an unseeded random draw, so the
    # test gives the same result on every run.
    noise = np.array([-0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05, 0.0, 0.05, 0.0])
    preds = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 10.0]) + noise
    predicate = StandardizedErrorReason.from_predict(pred=preds, y=y, threshold=3.0)
    expected = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
    assert np.array_equal(predicate, expected)


def test_from_predict_no_reason():
    """Test `from_predict` when every prediction is close: no row should be doubted."""
    y = np.array([0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    # Fixed noise rather than an unseeded random draw: some random draws
    # (e.g. nine equal residuals and a single different one) push the
    # standardized residual to about sqrt(10) ~ 3.16, above the 3.0 cutoff,
    # which made the test fail intermittently.
    noise = np.array([-0.05, 0.0, 0.05, -0.05, 0.0, 0.05, -0.05, 0.0, 0.05, 0.0])
    preds = y + noise
    predicate = StandardizedErrorReason.from_predict(pred=preds, y=y, threshold=3.0)
    assert np.array_equal(predicate, np.zeros(10))

0 comments on commit 9202b8a

Please sign in to comment.