# Entry 28 notebook - Thresholds - Profit and cost

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix

from sklearn.datasets import load_digits, fetch_openml

In [5]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], 'b--', label='precision')
    plt.plot(thresholds, recalls[:-1], 'g-', label='recall')
    plt.xlabel('Threshold')
    plt.legend(loc='center right')
    plt.grid()
    plt.ylim([0, 1])
    
def plot_precision_vs_recall(precisions, recalls, thresholds):
    close_default = np.argmin(np.abs(thresholds - 0.5))
    plt.plot(precisions[close_default], recalls[close_default], 'o', markersize=10,
            label='threshold zero', fillstyle="none", c='k', mew=2)
    plt.plot(recalls, precisions, "b-", linewidth=2, label='precision recall curve')
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.legend(loc='best')
    plt.grid(True)
    
def plt_roc_curve(fpr, tpr, thresholds, label='ROC curve'):
    close_default = np.argmin(np.abs(thresholds - 0.5))
    plt.plot(fpr[close_default], tpr[close_default], 'o', markersize=10,
            label='default threshold 0.5', fillstyle="none", c='k', mew=2)
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

## Make dataset and fit model

In [6]:
titanic = fetch_openml('titanic', version=1, as_frame=True)
df_raw = titanic.data
target = titanic.target

In [7]:
features = ['pclass', 'sex', 'sibsp', 'parch']
X = df_raw.loc[:, features]
X['sex'] = X['sex'].astype('category').cat.codes
y = target.copy().astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=12)
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.7824427480916031

### y scores

In [54]:
# y_scores = cross_val_predict(pipe, X_train, y_train, cv=10, method='decision_function')

In [13]:
train_scores = cross_val_predict(pipe, X_train, y_train, cv=3, method='predict_proba')[:,1]

In [14]:
train_scores

array([0.10770435, 0.41484318, 0.10770435, ..., 0.10290484, 0.6844146 ,
       0.10290484])

In [16]:
thresholds = np.arange(0.0, 1.01, 0.01)
thresholds

array([0.  , 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ,
       0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2 , 0.21,
       0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 , 0.31, 0.32,
       0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41, 0.42, 0.43,
       0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 , 0.51, 0.52, 0.53, 0.54,
       0.55, 0.56, 0.57, 0.58, 0.59, 0.6 , 0.61, 0.62, 0.63, 0.64, 0.65,
       0.66, 0.67, 0.68, 0.69, 0.7 , 0.71, 0.72, 0.73, 0.74, 0.75, 0.76,
       0.77, 0.78, 0.79, 0.8 , 0.81, 0.82, 0.83, 0.84, 0.85, 0.86, 0.87,
       0.88, 0.89, 0.9 , 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98,
       0.99, 1.  ])

In [22]:
(train_scores >= thresholds[1]).astype(int)

array([1, 1, 1, ..., 1, 1, 1])

In [27]:
tn, fp, fn, tp = confusion_matrix(y_train, (train_scores >= thresholds[1])).ravel()
print(tn, fp, fn, tp)

0 650 0 397


In [28]:
df_matrix = pd.DataFrame(columns = ['threshold', 'tp', 'fp', 'fn'])
df_matrix.concat(pd.DataFrame())

Unnamed: 0,threshold,tp,fp,fn


In [38]:
df_matrix = pd.DataFrame([[thresholds[1], tp, fp, fn]])

In [46]:
df_matrix

Unnamed: 0,0,1,2,3
0,0.01,397,650,0


In [47]:
tn, fp, fn, tp = confusion_matrix(y_train, (train_scores >= thresholds[5])).ravel()

df2 = pd.DataFrame([[thresholds[5], tp, fp, fn]])

In [48]:
df2

Unnamed: 0,0,1,2,3
0,0.05,397,643,0


In [49]:
pd.concat([df_matrix, df2])

Unnamed: 0,0,1,2,3
0,0.01,397,650,0
0,0.05,397,643,0


In [50]:
train_scores

array([0.10770435, 0.41484318, 0.10770435, ..., 0.10290484, 0.6844146 ,
       0.10290484])

In [51]:
y_train

783     1
138     0
1272    0
141     1
562     1
       ..
278     1
432     1
1283    0
1265    0
1277    1
Length: 1047, dtype: int8

In [101]:
def create_threshold_matrix(y_train, proba_scores):
    df_matrix = pd.DataFrame()
    for th in np.arange(0.0, 1.01, 0.01):
        tn, fp, fn, tp = confusion_matrix(y_train, (proba_scores >= th)).ravel()
        df_matrix = pd.concat([df_matrix, pd.DataFrame([[th, tp, fp, fn]])])
    df_matrix.columns = ['threshold', 'tp', 'fp', 'fn']
    return df_matrix

In [103]:
threshold_matrix = create_threshold_matrix(y_train, train_scores)

In [104]:
threshold_matrix

Unnamed: 0,threshold,tp,fp,fn
0,0.00,397,650,0
0,0.01,397,650,0
0,0.02,397,650,0
0,0.03,397,649,0
0,0.04,397,644,0
...,...,...,...,...
0,0.96,0,0,397
0,0.97,0,0,397
0,0.98,0,0,397
0,0.99,0,0,397


In [84]:
import random

In [89]:
random.randrange(100)

23

In [91]:
tp_cost, fp_cost, fn_cost = [random.randrange(100) for s in [1,2,3]]

In [92]:
print(tp_cost, fp_cost, fn_cost)

72 4 93


#### Profit

$profit = xTP - yFP - zFN$

In [105]:
(threshold_matrix['tp'] * tp_cost) - (threshold_matrix['fp'] * tp_cost) - (threshold_matrix['fn'] * fn_cost)

0   -18216
0   -18216
0   -18216
0   -18144
0   -17784
     ...  
0   -36921
0   -36921
0   -36921
0   -36921
0   -36921
Length: 101, dtype: int64

In [106]:
pd.concat([threshold_matrix['threshold'], ((threshold_matrix['tp'] * tp_cost) - (threshold_matrix['fp'] * tp_cost) - (threshold_matrix['fn'] * fn_cost))], axis=1)

Unnamed: 0,threshold,0
0,0.00,-18216
0,0.01,-18216
0,0.02,-18216
0,0.03,-18144
0,0.04,-17784
...,...,...
0,0.96,-36921
0,0.97,-36921
0,0.98,-36921
0,0.99,-36921


In [99]:
def profit_calc(tp_cost, fp_cost, fn_cost, count_matrix):
    profit_df = pd.concat([count_matrix['threshold'], ((count_matrix['tp'] * tp_cost) - (count_matrix['fp'] * tp_cost) - (count_matrix['fn'] * fn_cost))], axis=1)
    profit_df.columns = ['threshold', 'profit']
    return profit_df

In [100]:
profit_calc(tp_cost, fp_cost, fn_cost, threshold_matrix)

Unnamed: 0,threshold,profit
0,0.00,-18216
0,0.01,-18216
0,0.02,-18216
0,0.03,-18144
0,0.04,-17784
...,...,...
0,0.96,-36921
0,0.97,-36921
0,0.98,-36921
0,0.99,-36921


#### Probability cost function (PCF)

$PCF = \frac{P \times C(fn)}{P \times C(fp) + (1 - P) \times C(fn)}$

Where:

- *P* is the (prior) probability of the event
- *C(fn)* is the cost of a false negative (positive observation predicted as a negative)
- *C(fp)* is the cost of a false positive (negative observation predicted as a positive)

In [110]:
event_proba = y_train.sum() / y_train.size

In [111]:
def proba_cost_calc(fn_cost, fp_cost, event_proba):
    proba_cost = (event_proba * fn_cost) / (event_proba * fp_cost + (1 - event_proba) * fn_cost)
    return proba_cost

In [113]:
pcf = proba_cost_calc(fn_cost, fp_cost, event_proba)
pcf

0.5951352396918018

#### Normalized expected cost (NEC)

$NEC = PCF \times (1-TP) + (1-PCF) \times FP$

In [118]:
def norm_expect_cost(pcf, threshold_matrix):
    pcf_matrix = pcf * (1 - threshold_matrix['tp']) + (1 - pcf) * threshold_matrix['fp']
    pcf_df = pd.concat([threshold_matrix['threshold'], pcf_matrix], axis=1)
    pcf_df.columns = ['threshold', 'nec']
    return pcf_df

In [119]:
norm_expect_cost(pcf, threshold_matrix)

Unnamed: 0,threshold,nec
0,0.00,27.488539
0,0.01,27.488539
0,0.02,27.488539
0,0.03,27.083675
0,0.04,25.059351
...,...,...
0,0.96,0.595135
0,0.97,0.595135
0,0.98,0.595135
0,0.99,0.595135
