diff --git a/doubtlab/ensemble.py b/doubtlab/ensemble.py
index 3268be2..2ce4243 100644
--- a/doubtlab/ensemble.py
+++ b/doubtlab/ensemble.py
@@ -65,16 +65,9 @@ def get_predicates(self, X, y=None):
         predicates = doubt.get_predicates(X, y)
         ```
         """
-        df = pd.DataFrame({"i": range(len(X))})
-        for name, func in self.reasons.items():
-            df[f"predicate_{name}"] = func(X, y)
-        predicates = [c for c in df.columns if "predicate" in c]
-        return (
-            df[predicates]
-            .assign(s=lambda d: d.sum(axis=1))
-            .sort_values("s", ascending=False)
-            .drop(columns=["s"])
-        )
+        df = pd.DataFrame({f"predicate_{name}": func(X, y) for name, func in self.reasons.items()})
+        sorted_index = df.sum(axis=1).sort_values(ascending=False).index
+        return df.reindex(sorted_index)
 
     def get_indices(self, X, y=None):
         """
diff --git a/doubtlab/reason.py b/doubtlab/reason.py
index 4ab2f5d..3396477 100644
--- a/doubtlab/reason.py
+++ b/doubtlab/reason.py
@@ -72,6 +72,52 @@ def __call__(self, X, y=None):
         return np.where(rvals < self.probability, rvals, 0)
 
 
+class ShannonEntropyReason:
+    """
+    Assign doubt when the normalized Shannon entropy is too high; see
+    https://math.stackexchange.com/questions/395121/how-entropy-scales-with-sample-size
+    for a discussion.
+
+    Arguments:
+        model: scikit-learn classifier that implements predict_proba and predict_log_proba
+        threshold: normalized entropy above which doubt is assigned
+
+    Usage:
+
+    ```python
+    from sklearn.datasets import load_iris
+    from sklearn.linear_model import LogisticRegression
+
+    from doubtlab.ensemble import DoubtEnsemble
+    from doubtlab.reason import ShannonEntropyReason
+
+    X, y = load_iris(return_X_y=True)
+    model = LogisticRegression(max_iter=1_000)
+    model.fit(X, y)
+
+    doubt = DoubtEnsemble(reason=ShannonEntropyReason(model=model))
+
+    indices = doubt.get_indices(X, y)
+    ```
+    """
+
+    def __init__(self, model, threshold=0.5):
+        self.model = model
+        self.threshold = threshold
+
+    def __call__(self, X, y):
+        probas = self.model.predict_proba(X)
+        log_probas = self.model.predict_log_proba(X) / np.log(len(self.model.classes_))
+        entropies = -(probas * log_probas).sum(axis=1)
+        return np.where(entropies > self.threshold, entropies, 0)
+
+    @staticmethod
+    def from_proba(proba, n_classes, threshold=0.5):
+        """Outputs a reason array from an array of predicted probabilities, skipping the need for a model."""
+        entropies = -(proba * np.log(proba) / np.log(n_classes)).sum(axis=1)
+        return np.where(entropies > threshold, entropies, 0)
+
+
 class WrongPredictionReason:
     """
     Assign doubt when the model prediction doesn't match the label.
@@ -153,7 +199,7 @@ def __call__(self, X, y):
 
 class MarginConfidenceReason:
     """
-    Assign doubt when a the difference between the top two most confident classes is too small.
+    Assign doubt when the difference between the top two most confident classes is too small.
 
     Throws an error when there are only two classes.
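
The two sketches below are review aids, not part of the patch. The first checks the entropy normalization that `ShannonEntropyReason` relies on: a uniform distribution over `n_classes` scores exactly 1.0, while a confident prediction lands well under the default `threshold=0.5`. The probability rows are made-up illustration values; the `from_proba` call matches the static method added above.

```python
import numpy as np

from doubtlab.reason import ShannonEntropyReason

# Two hypothetical probability rows over 3 classes.
proba = np.array([
    [1 / 3, 1 / 3, 1 / 3],  # uniform -> normalized entropy is exactly 1.0
    [0.98, 0.01, 0.01],     # confident -> normalized entropy is roughly 0.10
])

# Rows above the threshold keep their entropy value, the rest become 0,
# mirroring the np.where(...) logic in __call__.
print(ShannonEntropyReason.from_proba(proba, n_classes=3, threshold=0.5))
# [1. 0.]  -> only the uniform row is flagged
```

Dividing by `np.log(n_classes)` is what makes a fixed 0-to-1 threshold comparable across datasets with different numbers of classes; without it, the maximum attainable entropy would grow with `log(n_classes)`.

The second sketch checks that the `get_predicates` rewrite in `ensemble.py` preserves behavior: rows should still come out ordered by total doubt, with the original row index kept intact by `df.reindex`. The two reasons picked here are arbitrary examples from the library.

```python
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from doubtlab.ensemble import DoubtEnsemble
from doubtlab.reason import ProbaReason, WrongPredictionReason

X, y = load_iris(return_X_y=True)
model = LogisticRegression(max_iter=1_000).fit(X, y)

doubt = DoubtEnsemble(
    proba=ProbaReason(model=model),
    wrong=WrongPredictionReason(model=model),
)

df = doubt.get_predicates(X, y)
# Row sums should be non-increasing from top to bottom.
assert (df.sum(axis=1).diff().dropna() <= 0).all()
```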