In [7]:
import pandas as pd
import numpy as np

## 閾値最適化

### F1_scoreに対する閾値の最適化
- 予測確率を正例・負例に振り分ける閾値を変えると、confusion_matrixが変わる
- それに伴ってF1_scoreも変化し、Accuracyなどと違って既定の閾値(0.5)が最良とは限らないため、最も評価の高くなる閾値を探索する必要がある

In [8]:
from sklearn.metrics import f1_score
from scipy.optimize import minimize

In [9]:
# Seeded generator shared by the cells below, so the synthetic data is reproducible.
rand: np.random.RandomState = np.random.RandomState(seed=71)

In [10]:
# True positive-class probability of each sample: 10,000 evenly spaced
# values covering [0, 1], both endpoints included.
train_y_prob = np.linspace(0.0, 1.0, num=10000)
train_y_prob

array([0.00000000e+00, 1.00010001e-04, 2.00020002e-04, ...,
       9.99799980e-01, 9.99899990e-01, 1.00000000e+00])

In [15]:
# Sample the ground-truth labels: element i is True when a uniform draw
# falls below train_y_prob[i], i.e. with probability train_y_prob[i].
train_y = pd.Series(
    rand.uniform(low=0.0, high=1.0, size=train_y_prob.size) < train_y_prob
)
# Simulate model output: scale each true probability by exp(0.3 * z), with z
# drawn from a standard normal, then clamp the result into [0, 1].
train_pred_prob = np.clip(
    train_y_prob * np.exp(0.3 * rand.standard_normal(size=train_y_prob.shape)),
    0.0,
    1.0,
)
train_y

0       False
1       False
2       False
3       False
4       False
        ...  
9995     True
9996     True
9997     True
9998     True
9999     True
Length: 10000, dtype: bool

In [16]:
# Display the simulated predicted probabilities built in the previous cell.
train_pred_prob

array([0.00000000e+00, 1.23154183e-04, 2.40207041e-04, ...,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00])

In [19]:
# Baseline: F1 score when a sample is predicted positive as soon as its
# predicted probability reaches the starting threshold of 0.5.
init_threshold = 0.5
init_score = f1_score(train_y, train_pred_prob >= init_threshold)
(init_threshold, init_score)

(0.5, 0.7221206581352835)

In [20]:
# Search for the threshold that gives the best F1 score
def f1_opt(x):
    """Return the negated F1 score obtained by thresholding the predictions at ``x``.

    ``minimize`` looks for a minimum, so the score is negated to turn the
    F1 maximisation into a minimisation problem.

    Args:
        x: Candidate threshold; samples whose predicted probability is
            ``>= x`` are treated as positive. The optimizer is started from
            a one-element array, and a plain float is accepted as well.

    Returns:
        The F1 score of ``train_y`` against the thresholded
        ``train_pred_prob``, multiplied by -1.
    """
    predicted_positive = train_pred_prob >= x
    return -f1_score(train_y, predicted_positive)


# Start the Nelder-Mead search from a threshold of 0.5.
result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
# The solution is a one-element array; unwrap it into a Python float.
best_threshold = result.x.item()

# Undo the negation to report the F1 score reached at the best threshold.
best_threshold, -f1_opt(best_threshold)

(0.35624999999999984, 0.7543019336526522)