# Modifying Classifier Threshold in scikit-learn


The discrimination threshold cannot be updated for scikit-learn classifiers.
However, it is possible to access the decision scores which are used to assign class labels to observations. 
Decision scores can be obtained by calling the classifier's `decision_function` method, which returns
a score for each observation. The decisions scores can then be used to update model predictions based on the
desired criteria. 


In [4]:
import os
import os.path
import sys
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 500)
np.set_printoptions(
    edgeitems=5, linewidth=200, suppress=True, nanstr='NaN',
    infstr='Inf', precision=5
    )

RANDOM_STATE = 516
TRAIN_SIZE   = .70
TEST_SIZE    = 1 - TRAIN_SIZE
ID           = "id"

# fpath = "S:\\public\\Actuarial\\DSSG\\20170721_Materials\\mw.data"
# hdrs  = ["ID", "GENDER", "HEIGHT", "HAND_LENGTH", "FOREARM_LENGTH"]
# df    = pd.read_table(fpath, sep="\s+", names=hdrs)


data_path = "https://gist.githubusercontent.com/jtrive84/c1e23acb2624733ada178260cc3c683c/raw/3cf7e81263de12727ac4d6b7391bf6766f942f2b/admissions.csv"

dfall = pd.read_csv(data_path)


categorical_vars = ["rank"]
continuous_vars  = ["gre", "gpa",]
response         = "admit"


# Split data into train, validation and test cohorts. 
dftrain, dftest0, ytrain, ytest0 = \
    train_test_split(
        dfall[categorical_vars + continuous_vars], 
        dfall[response], test_size=TEST_SIZE, random_state=RANDOM_STATE
        )

# Split dftest0 into validation and test cohorts.
dfvalidate, dftest, yvalidate, ytest = \
    train_test_split(
        dftest0, ytest0, test_size=.30, random_state=RANDOM_STATE
        )


# Recombine dftrain + ytrain, dfvalidate + yvalidate and dftest + ytest.
dftrain = dftrain.join(ytrain).reset_index(drop=True)
dfvalidate = dfvalidate.join(yvalidate).reset_index(drop=True)
dftest = dftest.join(ytest).reset_index(drop=True)


In [5]:
# ------------------------------------------------------------------------------------
# Transform training data using same objects from training data. Note that we call   |
# only `transform` on test dataset (not fit_transform).                              |
# ------------------------------------------------------------------------------------
dfcategorical_train = dftrain[categorical_vars]
dfcontinuous_train = dftrain[continuous_vars]

categorical_imputer = SimpleImputer(missing_values=np.NaN, strategy="most_frequent")
continuous_imputer = SimpleImputer(missing_values=np.NaN, strategy="median")

train_categorical_arr = categorical_imputer.fit_transform(dfcategorical_train[categorical_vars])
train_continuous_arr = continuous_imputer.fit_transform(dfcontinuous_train[continuous_vars])

# One-hot encode categorical features.
dfcategorical_train = pd.DataFrame(
    train_categorical_arr, columns=categorical_vars
    )

dfcategorical_train = pd.get_dummies(
    dfcategorical_train, columns=categorical_vars, drop_first=True
    )

# Scale continuous features to eliminate magnitude bias.
std_scaler = StandardScaler()
dfcontinuous_train = pd.DataFrame(
    std_scaler.fit_transform(train_continuous_arr), columns=dfcontinuous_train.columns
    )

# Recombine dfcategorical and dfcontinuous into single DataFrame with index.
dftrain = pd.concat(
    [dftrain[["id"]], dfcategorical_train, dfcontinuous_train, dftrain[[response]]], 
    axis=1
    )


# ------------------------------------------------------------------------------------
# Transform validation data using same objects from training data. Note that we call |
# only `transform` on test dataset (not fit_transform).                              |
# ------------------------------------------------------------------------------------
dfcategorical_validate = dfvalidate[categorical_vars]
dfcontinuous_validate = dfvalidate[continuous_vars]

validate_categorical_arr = categorical_imputer.transform(dfcategorical_validate[categorical_vars])
validate_continuous_arr = continuous_imputer.transform(dfcontinuous_validate[continuous_vars])

# One-hot encode categorical features.
dfcategorical_validate = pd.DataFrame(
    validate_categorical_arr, columns=categorical_vars
    )

dfcategorical_validate = pd.get_dummies(
    dfcategorical_validate, columns=categorical_vars, drop_first=True
    )

# Scale continuous features to eliminate magnitude bias.
dfcontinuous_validate = pd.DataFrame(
    std_scaler.transform(validate_continuous_arr), columns=dfcontinuous_validate.columns
    )

# Recombine dfcategorical and dfcontinuous into single DataFrame with index.
dfvalidate = pd.concat(
    [dfvalidate[["id"]], dfcategorical_validate, dfcontinuous_validate, dfvalidate[[response]]], 
    axis=1
    )


# ------------------------------------------------------------------------------------
# Transform test data using same objects from training data. Note that we call only  |
# `transform` on test dataset (not fit_transform).                                   |
# ------------------------------------------------------------------------------------
dfcategorical_test = dftest[categorical_vars]
dfcontinuous_test = dftest[continuous_vars]
    
test_categorical_arr = categorical_imputer.transform(dfcategorical_test[categorical_vars])
test_continuous_arr = continuous_imputer.transform(dfcontinuous_test[continuous_vars])

# One-hot encode categorical features.
dfcategorical_test = pd.DataFrame(
    test_categorical_arr, columns=categorical_vars
    )

dfcategorical_test = pd.get_dummies(
    dfcategorical_test, columns=categorical_vars, drop_first=True
    )

# Scale continuous features to eliminate magnitude bias.
dfcontinuous_test = pd.DataFrame(
    std_scaler.transform(test_continuous_arr), columns=dfcontinuous_test.columns
    )

# Recombine dfcategorical and dfcontinuous into single DataFrame with index.
dftest = pd.concat(
    [dftest[["id"]], dfcategorical_test, dfcontinuous_test, dftest[[response]]], 
    axis=1
    )


In [12]:
# Fit logistic regression classifier to training data. Determine optimal
# parameters using GridSearchCV.
feature_columns = [i for i in dftrain.columns if i not in ["id", response]]
X_train = dftrain[feature_columns]
y_train = dftrain[response].values


# Determine optimal parameters using GridSearchCV.
lrc = linear_model.LogisticRegression()

param_grid = [{
    "fit_intercept":[True, False],
    "penalty"      :["elasticnet"],
    "solver"       :["saga"],
    "C"            :[1/10000, 1/1000, 1/100, 1/10, 1, 10, 100, 1000, 10000],
    "l1_ratio"     :np.arange(0, 1.1, .1)
    }]

# scoring can be one of ["accuracy", "precision", "recall", "f1_macro", "roc_auc"]
grid_search = GridSearchCV(lrc, param_grid, cv=5, scoring="recall")
grid_search.fit(X_train, y_train)
lrc_params = grid_search.best_params_
lrc_best = grid_search.best_estimator_
print(lrc_best)


LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=False,
                   intercept_scaling=1, l1_ratio=0.1, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)




In [None]:
lrc_params = grid_search.best_params_
clfbest = grid_search.best_estimator_
clf_pred = clfbest.predict(Xtest)
clf_prob = clfbest.predict_proba(Xtest)

In [None]:
# split explanatory variables from response, and convert
# 1/2 response to 0/1 =>
X = df.drop(['GENDER','ID'], axis=1)
y = df['GENDER'].map(lambda x: 0 if x==2 else x).values

# use `model_selection` in the latest release of scikit-learn =>
X_train, X_test, y_train, y_test = train_test_split(
                                X, y, test_size=.33, random_state=16)

# scale explanatory variables ================================================>
sclr    = StandardScaler()
X_train = sclr.fit_transform(X_train)
X_test  = sclr.transform(X_test)

# instantiate model ==========================================================>
lr = LogisticRegression(C=1.0).fit(X_train, y_train)

# get predictions (y_hat) and probabilities (p_hat) =>
y_hat = lr.predict(X_test)
p_hat = lr.predict_proba(X_test)[:,[1]]



# evaluate logistic regression model =========================================>
lr_score  = lr.score(X_test, y_test)
cm        = confusion_matrix(actual_response, predicted_response)
cr        = classification_report(y_test, lr_y_hat, target_names=['Male', 'Female'])
auc_score = roc_auc_score(y_test, p_hat)



# ============================================================================>
# Calling classifier's `decision_function` method to adjust the
# discrimination threshold ===================================================>

y_scores = lr.decision_function(X_test)

# combine original classification with y_scores:
sp = sorted(list(zip(y_scores.tolist(), y_hat.tolist())), key=lambda x: x[0])

# set new thresholds =>
new_threshold1 = 1.95
new_predict1   = (y_scores>new_threshold1)*1

new_threshold2 = 2.35
new_predict2   = (y_scores>new_threshold2)*1

new_threshold3 = 3.0
new_predict3   = (y_scores>new_threshold3)*1


# create a dict of updated predicitions based on updated threshold =>
thresholds = {
        '1.95':new_predict1,
        '2.35':new_predict2,
        '3.00':new_predict3
        }