# Import

## Modules

In [104]:
import sklearn.metrics as sm
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from collections.abc import Iterable, Callable
import pandas as pd
import numpy as np

from abc import ABC, abstractmethod

In [113]:
class CostFunction(ABC):
    """Abstract base for cost functions comparing true and predicted targets.

    Subclasses implement :meth:`functional`. Instances are callable on any
    iterables of numbers and can be wrapped as a scikit-learn scorer through
    :meth:`make_scorer`.
    """

    @abstractmethod
    def functional(self, y_true: 'np.ndarray[float]', y_pred: 'np.ndarray[float]') -> float:
        """Return the cost of ``y_pred`` with respect to ``y_true``.

        Args:
            y_true: Ground-truth values.
            y_pred: Predicted values.

        Returns:
            The cost value (lower is better, see :meth:`make_scorer`).
        """

    @staticmethod
    def _to_array(y: Iterable[float]) -> 'np.ndarray[float]':
        """Build a 1-D float array from an iterable of numbers."""
        return np.fromiter(y, dtype=float)

    def __call__(self, y_true: Iterable[float], y_pred: Iterable[float]) -> float:
        """Convert both inputs to float arrays and evaluate :meth:`functional`.

        Args:
            y_true: Iterable of ground-truth values.
            y_pred: Iterable of predicted values.

        Returns:
            Whatever :meth:`functional` returns for the converted arrays.
        """
        predicted = self._to_array(y_pred)
        observed = self._to_array(y_true)
        return self.functional(observed, predicted)

    def make_scorer(self) -> Callable:
        """Wrap :meth:`functional` as a scikit-learn scorer.

        ``greater_is_better=False`` marks the functional as a loss, so the
        resulting scorer reports the negated cost.

        NOTE(review): no response method is requested here, so scikit-learn
        presumably feeds the estimator's ``predict`` output to ``functional``;
        confirm this is intended for subclasses that expect probabilities.
        """
        loss = self.functional
        return sm.make_scorer(loss, greater_is_better=False)
    

In [114]:
class ClassificationCostFunction(CostFunction):
    """Weighted sum of squared distances between metric values and their optima.

    Each metric is looked up by name in ``sklearn.metrics`` and evaluated either
    on thresholded class labels or on the raw predicted probabilities, as
    dictated by ``metric_class_map``.
    """

    def __init__(self, metric_opt_val_map: Iterable[tuple[str, float, float]], metric_class_map: 'dict[str, str] | None' = None, proba_threshold: float = 0.5):
        """Defines cost functional for optimization of multiple metrics.
        Since this is defined as a loss function, cross validation returns the negative of the score [1].

        Args:
            metric_opt_val_map (Iterable[tuple[str, float, float]]): Iterable of tuples of the form (metric_name, optimal_value, weight).
                It is copied into a list so that one-shot iterables (e.g. generators) can be evaluated repeatedly.
            metric_class_map (dict[str, str], optional): Dictionary mapping metric to class or probability of the form {'metric': 'class' or 'proba'}.
                Any value other than 'class' is treated as 'proba'. Defaults to None, in which case (as for an empty dict)
                the built-in mapping below is used.
            proba_threshold (float, optional): Probability threshold used to convert probabilities into classes
                (a prediction is class 1 when it is strictly greater than the threshold). Defaults to 0.5.

        References:
            [1] https://github.com/scikit-learn/scikit-learn/issues/2439

        Example:
            >>> y_true = [0, 0, 0, 1, 1]
            >>> y_pred = [0.46, 0.6, 0.29, 0.25, 0.012]
            >>> threshold = 0.5
            >>> score_opt_val_map = [("f1_score", 1, 1),
            ... ("log_loss", 0, 1),
            ... ("roc_auc_score", 1, 1)]
            >>> cf = ClassificationCostFunction(score_opt_val_map)
            >>> round(float(cf.functional(y_true, y_pred)))
            4
            >>> score_opt_val_map = [("f1_score", 1, 1)]
            >>> cf = ClassificationCostFunction(score_opt_val_map)
            >>> X, y = make_classification()
            >>> model = LogisticRegression()
            >>> _ = model.fit(X, y)
            >>> y_proba = model.predict_proba(X)[:, 1]
            >>> class_output = cf(y, y_proba)
            >>> scorer = getattr(sm, "f1_score")
            >>> y_pred = np.where(y_proba > 0.5, 1, 0)
            >>> scorer_output = (float(scorer(y, y_pred)) - 1.0)**2
            >>> np.isclose(class_output, scorer_output)
            True
        """
        # Materialise the specification: `functional` iterates it on every call,
        # and a generator would be exhausted after the first evaluation.
        self.metric_opt_val_map = list(metric_opt_val_map)
        self.proba_threshold = proba_threshold
        # `None` (instead of a shared mutable `{}` default) selects the built-in map.
        self.metric_class_map = metric_class_map or {
            "accuracy_score": "class",
            "f1_score": "class",
            # log_loss is defined on predicted probabilities; thresholding to 0/1
            # first would only yield clipped, degenerate loss values.
            "log_loss": "proba",
            "precision_score": "class",
            "recall_score": "class",
            "roc_auc_score": "proba"
        }

    def _to_class(self, array: 'np.ndarray[float]', metric: str) -> 'np.ndarray[float]':
        """Return the predictions in the form expected by ``metric``.

        Args:
            array: Predicted values (any array-like; converted with ``np.asarray``).
            metric: Metric name, used as key into ``metric_class_map``.

        Returns:
            0/1 labels (1 where the value exceeds ``proba_threshold``) when the
            metric is mapped to "class", otherwise the values unchanged.

        Raises:
            KeyError: If ``metric`` has no entry in ``metric_class_map``.
        """
        try:
            kind = self.metric_class_map[metric]
        except KeyError:
            raise KeyError(
                f"No entry for metric {metric!r} in metric_class_map; "
                "map it to 'class' or 'proba'."
            ) from None

        # Coerce so that plain sequences (lists, tuples) can be compared element-wise.
        values = np.asarray(array)
        if kind == "class":
            return np.where(values > self.proba_threshold, 1, 0)
        return values

    def functional(self, y_true: 'np.ndarray[float]', y_pred: 'np.ndarray[float]') -> float:
        """Compute ``sum(weight * (metric(y_true, y_hat) - optimal_value) ** 2)``.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted probabilities of the positive class (array-like).

        Returns:
            The accumulated cost; 0.0 when no metrics are configured.

        Raises:
            AttributeError: If a metric name does not exist in ``sklearn.metrics``.
            KeyError: If a metric name has no entry in ``metric_class_map``.
        """
        cost = 0.0
        for score_name, opt_val, weight in self.metric_opt_val_map:
            scorer = getattr(sm, score_name)
            y_hat = self._to_class(y_pred, score_name)
            cost += weight * (scorer(y_true, y_hat) - opt_val) ** 2

        return cost
            

In [115]:
# Sanity check: with a single f1_score term (optimum 1, weight 1) the cost
# function evaluated on predicted probabilities must equal (f1 - 1)^2
# computed by hand from labels thresholded at 0.5.
score_opt_val_map = [("f1_score", 1, 1)]
cf = ClassificationCostFunction(score_opt_val_map)

# Fit a classifier on a synthetic dataset and take the positive-class probabilities.
X, y = make_classification()
model = LogisticRegression()
model.fit(X, y)
y_proba = model.predict_proba(X)[:, 1]
class_output = cf(y, y_proba)

# Reference value computed directly with sklearn's f1_score.
scorer = sm.f1_score
y_pred = (y_proba > 0.5).astype(int)
scorer_output = (float(scorer(y, y_pred)) - 1.0) ** 2

assert np.isclose(class_output, scorer_output), f"{class_output} != {scorer_output}"

In [116]:
# Combine six metrics with unit weight; every metric has optimum 1 except
# log_loss, whose optimum is 0.
score_opt_val_map = [
    (metric_name, optimum, 1)
    for metric_name, optimum in (
        ("accuracy_score", 1),
        ("f1_score", 1),
        ("log_loss", 0),
        ("precision_score", 1),
        ("recall_score", 1),
        ("roc_auc_score", 1),
    )
]

param_grid = {"C": [0.5, 1]}

# Use the combined cost as the model-selection criterion of the grid search.
scorer = ClassificationCostFunction(score_opt_val_map, proba_threshold=0.5)
cv = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    scoring=scorer.make_scorer(),
)

X, y = make_classification()
cv.fit(X, y)
pd.DataFrame.from_dict(cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.036147,0.04393,0.02857,0.016946,0.5,{'C': 0.5},-26.950154,-11.979547,-3.000088,-27.007001,-11.985865,-16.184531,9.403879,1
1,0.008821,0.002099,0.008231,0.001511,1.0,{'C': 1},-47.904298,-11.979547,-3.000088,-27.007001,-11.985865,-20.37536,15.777164,2
