# Entry 28 notebook - Thresholds - Profit and cost

In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_predict
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.datasets import load_digits, fetch_openml

In [81]:
def find_nearest(a, a0):
    """Return the index of the element of `a` that is closest to the scalar `a0`.

    Note that the *index* is returned, not the element itself.

    Parameters
    ----------
    a : array-like
        Values to search. Converted with ``np.asarray`` so plain lists and
        tuples are accepted as well as arrays.
    a0 : scalar
        Target value.

    Returns
    -------
    integer
        Position of the first entry minimising ``|a - a0|`` (``argmin`` picks
        the first one on ties). For multi-dimensional input this is the index
        into the flattened array.

    Raises
    ------
    ValueError
        If `a` is empty (raised by ``argmin``).
    """
    values = np.asarray(a)
    return np.abs(values - a0).argmin()

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Draw precision and recall as two curves over the decision threshold.

    Parameters
    ----------
    precisions, recalls : sequence
        Metric values; the last element of each is dropped before plotting.
        Presumably these come from ``precision_recall_curve``, which returns
        one more precision/recall value than thresholds -- confirm at the caller.
    thresholds : sequence
        Threshold values used for the x axis.

    Draws on the current matplotlib axes; returns nothing.
    """
    curves = (
        (precisions, 'b--', 'precision'),
        (recalls, 'g-', 'recall'),
    )
    for metric_values, line_format, curve_name in curves:
        # Trim the trailing element so the series lines up with `thresholds`.
        plt.plot(thresholds, metric_values[:-1], line_format, label=curve_name)

    plt.xlabel('Threshold')
    plt.legend(loc='center right')
    plt.grid()
    plt.ylim([0, 1])
    
def plot_precision_vs_recall(precisions, recalls, thresholds):
    """Plot the precision-recall curve and mark the point nearest threshold 0.5.

    Parameters
    ----------
    precisions, recalls : sequence
        Precision and recall values; recall goes on the x axis and precision
        on the y axis.
    thresholds : sequence
        Thresholds matching the leading entries of `precisions`/`recalls`;
        used only to locate the marker position.

    Draws on the current matplotlib axes; returns nothing.
    """
    # Index of the threshold closest to 0.5; the same index is used to read
    # the matching precision/recall pair.
    close_default = find_nearest(thresholds, 0.5)
    # The marker must use the same (x=recall, y=precision) orientation as the
    # curve below, otherwise it lands off the curve.
    plt.plot(recalls[close_default], precisions[close_default], 'o', markersize=10,
             label='default threshold 0.5', fillstyle="none", c='k', mew=2)
    plt.plot(recalls, precisions, "b-", linewidth=2, label='precision recall curve')
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])
    plt.legend(loc='best')
    plt.grid(True)
    
def plt_roc_curve(fpr, tpr, thresholds, label='ROC curve'):
    """Plot a ROC curve, the chance diagonal, and a marker near threshold 0.5.

    Parameters
    ----------
    fpr, tpr : sequence
        False- and true-positive rates, indexed in step with `thresholds`.
    thresholds : sequence
        Thresholds used to locate the marker position via ``find_nearest``.
    label : str, default 'ROC curve'
        Legend label for the curve.

    Draws on the current matplotlib axes; returns nothing. No legend is drawn
    here, so the labels only show if the caller adds one.
    """
    marker_idx = find_nearest(thresholds, 0.5)
    marker_style = dict(markersize=10, label='default threshold 0.5',
                        fillstyle="none", c='k', mew=2)
    # Hollow circle at the operating point closest to a 0.5 threshold.
    plt.plot(fpr[marker_idx], tpr[marker_idx], 'o', **marker_style)

    plt.plot(fpr, tpr, linewidth=2, label=label)

    # Dashed diagonal from (0, 0) to (1, 1).
    diagonal = [0, 1]
    plt.plot(diagonal, diagonal, 'k--')

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

## Make dataset and fit model

In [3]:
# Fetch version 1 of the OpenML "titanic" dataset as pandas objects.
titanic = fetch_openml(name='titanic', version=1, as_frame=True)
# Keep the untouched data and the target under their own names.
df_raw, target = titanic.data, titanic.target

In [25]:
features = ['pclass', 'sex', 'sibsp', 'parch']
# Build the feature matrix in one expression: `.assign` returns a new frame,
# so nothing is written onto a selection of `df_raw` (avoids the
# chained-assignment / SettingWithCopyWarning pattern).
X = df_raw.loc[:, features].assign(
    sex=lambda frame: frame['sex'].astype('category').cat.codes  # category -> integer codes
)
# Encode the target as integer category codes.
y = target.copy().astype('category').cat.codes

# 80/20 split with a fixed seed so the reported score is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=12)
# Median imputation -> standardisation -> logistic regression.
pipe = make_pipeline(SimpleImputer(strategy='median'), StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)
# Last expression: score on the held-out split, shown as the cell output.
pipe.score(X_test, y_test)

0.7824427480916031

In [26]:
# 10-fold cross-validation on the training split, reporting several metrics.
# NOTE(review): this scores a bare LogisticRegression, not `pipe`, so the
# imputation and scaling steps are not part of what is evaluated here --
# confirm that this is intended.
cross_validate(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    scoring=['precision', 'average_precision', 'recall',
             'f1', 'neg_log_loss', 'neg_brier_score'],
    cv=10,
)

{'fit_time': array([0.0063138 , 0.00560308, 0.00623417, 0.00379014, 0.00518179,
        0.00485587, 0.00396514, 0.00403786, 0.0036459 , 0.00383782]),
 'score_time': array([0.00787735, 0.00784707, 0.0074482 , 0.00722694, 0.00602198,
        0.00595188, 0.00528598, 0.00574708, 0.00532413, 0.00531793]),
 'test_precision': array([0.8125    , 0.87179487, 0.8       , 0.68571429, 0.75609756,
        0.76923077, 0.8       , 0.60526316, 0.71875   , 0.72727273]),
 'test_average_precision': array([0.79230071, 0.90616753, 0.70881106, 0.79966404, 0.82336772,
        0.81621793, 0.8053233 , 0.69845842, 0.74554448, 0.71403329]),
 'test_recall': array([0.65      , 0.85      , 0.6       , 0.6       , 0.775     ,
        0.75      , 0.7       , 0.58974359, 0.58974359, 0.61538462]),
 'test_f1': array([0.72222222, 0.86075949, 0.68571429, 0.64      , 0.7654321 ,
        0.75949367, 0.74666667, 0.5974026 , 0.64788732, 0.66666667]),
 'test_neg_log_loss': array([-0.46846179, -0.34750687, -0.56200828, -0.46758

### y scores

In [54]:
# Alternative (not used below): raw decision-function scores instead of probabilities.
# y_scores = cross_val_predict(pipe, X_train, y_train, cv=10, method='decision_function')

In [59]:
# Out-of-fold predicted probabilities from 10-fold CV of `pipe` on the training split.
# Column 1 is kept -- presumably the probability of the class encoded as 1; confirm
# against the fitted estimator's `classes_`.
y_scores = cross_val_predict(
    estimator=pipe,
    X=X_train,
    y=y_train,
    cv=10,
    method='predict_proba',
)[:, 1]