# Threshold Optimization

In [1]:
import numpy as np, pandas as pd
import joblib

from sklearn.model_selection import train_test_split, cross_val_score

from src.helper_utilities import load_data
from src.modeling_utilities import f2_scorer, optimize_threshold

In [2]:
random_state = 42
cv = 5

In [3]:
# load the original dataset as df
X, y = load_data(mode='modeling', format='dataframe', introduce_nans=0.01, random_state=random_state)

# the "orange" dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)

# the "green" dataset
X_dev, X_val, y_dev, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=random_state)

# X_... is a pd.DataFrame
X_test[:3]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
690,A11,15.0,A34,A42,975.0,A61,A73,2.0,A91,A101,3.0,A122,25.0,A143,A152,2.0,A173,1.0,A191,A201
296,A14,12.0,A32,A41,4675.0,A65,A72,,A92,A101,4.0,A123,20.0,A143,A151,1.0,A173,1.0,A191,A201
672,A14,60.0,A32,A40,10366.0,A61,A75,2.0,A93,A101,4.0,A122,42.0,A143,A152,1.0,A174,1.0,A192,A201


In [4]:
best_estimators_and_ensembles = joblib.load("models/best_estimators_and_ensembles")

In [5]:
for model in best_estimators_and_ensembles:
    threshold, best_score = optimize_threshold(model, X, y, val_size=0.2, metric='f2', random_state=random_state)
    print(threshold, best_score)

0.23232323232323235 0.7658959537572254
0.393939393939394 0.7627118644067796
0.30303030303030304 0.7671232876712328
0.37373737373737376 0.7725947521865889
0.4040404040404041 0.771513353115727


In [10]:
best_model = best_estimators_and_ensembles[3]  

joblib.dump(best_model, 'models/best_model.pkl')

def predict(X, model=best_model, threshold=0.37373737373737376):
    return (model.predict_proba(X)[:, -1] >= threshold).astype(int)



class BestModel:
    model = best_model
    threshold = 0.37373737373737376

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X, y=None):
        return (self.model.predict_proba(X)[:, -1] >= self.threshold).astype(int)



bestmodel = BestModel()



bestmodel.fit(X_train, y_train)

joblib.dump(bestmodel, 'models/bestmodel.pkl')

y_true = y_test.astype(int).values
y_pred = bestmodel.predict(X_test)

print(f2_scorer(y_true, y_pred).round(2))

best_model



0.69


In [14]:
model = joblib.load('models/bestmodel.pkl')

model.predict(X_train)


array([0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1,