In [1]:
%load_ext autoreload
%autoreload 2

# Testing the benchmark

Hello Danit and Chen, in this notebook I'll show you how to use the consept of "Constraint" how to use benchmark algorithms.

In [6]:
#The right importing order is first the libraries, then the local files
import sys; sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import time
import logging


from sklearn.model_selection import StratifiedKFold
from sklearn.base import ClassifierMixin
from xgboost import XGBClassifier

from ada_csl_wrc.utils import filter_only_worst_features
from ada_csl_wrc import Constraint, AbsoluteConstraint, RelativeConstraint

from ada_csl_wrc.models import ConstrainedXGB
from ada_csl_wrc.utils import benchmark_algorithm, is_satisfy_constraint

from ada_csl_wrc.evaluation import evaluate
from ada_csl_wrc.logger import get_logger

In [8]:

logger=get_logger("Notebook")
logger.setLevel(logging.DEBUG)

In [9]:
df = pd.read_csv('../data/marketing_campaign.csv', sep=";")
df = df.drop(['Z_CostContact', 'Z_Revenue', 'Income', 'Dt_Customer', 'ID'], axis = 1)
full_X = df.drop(labels = 'Response', axis=1)
y = df['Response']

#Transforming categorial features into numerical
categorial_col = full_X.select_dtypes(include='object').columns
full_X[categorial_col] = full_X[categorial_col].astype('category').apply(lambda x: x.cat.codes)

In [10]:
### Some configurations:

#Only 3.75% of the population can be positive (at most), it is equivalent to 25% of 15%
CONSTRAINT_RATIO = 0.25 * y.mean()
constraint = RelativeConstraint(global_constraint=CONSTRAINT_RATIO)

FEATURES_RATIO = 0.50
COST_MATRIX = np.array([[ 0,  1],
                        [10,  0]])

X = filter_only_worst_features(full_X, y, FEATURES_RATIO)

# Benchmark

### Function definitions 

In [11]:
def run_constrainted_experiment(model: ClassifierMixin,
                                X: np.ndarray,
                                y: np.ndarray,
                                cost_matrix: np.ndarray,
                                constraint = Constraint,
                                random_state = 42,
                                n_splits=3):
    fit_params = {}
    out = {}
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(kf.split(X, y)):

        #The ordinary Kfold, but with the constraint
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train, **fit_params)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        #Transforming the probabilities into 0 and 1 according to the constraint
        y_pred = benchmark_algorithm(y_pred=y_pred_proba, group_ids=None, constraint=constraint) 

        out[fold] = evaluate(y_test, y_pred, cost_matrix)
        logger.debug(f"Number of positives: {y_pred.sum()}")
        logger.debug(f"Ratio of positives: {y_pred.mean()}")
        logger.debug(f"constraint: {constraint.to_dict()}")
    return pd.DataFrame(out).T.mean(axis=0).to_dict()


 #You can use AbsoluteConstraint or RelativeConstraint

#run_constrainted_experiment(DecisionTreeClassifier(**dt_best_params), full_X.values, y.values, COST_MATRIX, constraint=constraint)

In [12]:
run_constrainted_experiment(XGBClassifier(max_depth=5, scale_pos_weight=1e1),
                            X.values, y.values, COST_MATRIX, constraint=constraint)

[38;20m2023-12-12 21:21:11,973 - Notebook - DEBUG - Number of positives: 27[0m
[38;20m2023-12-12 21:21:11,973 - Notebook - DEBUG - Ratio of positives: 0.03614457831325301[0m
[38;20m2023-12-12 21:21:11,974 - Notebook - DEBUG - constraint: {'global_constraint': 0.037276785714285714}[0m
[38;20m2023-12-12 21:21:12,151 - Notebook - DEBUG - Number of positives: 27[0m
[38;20m2023-12-12 21:21:12,152 - Notebook - DEBUG - Ratio of positives: 0.03614457831325301[0m
[38;20m2023-12-12 21:21:12,153 - Notebook - DEBUG - constraint: {'global_constraint': 0.037276785714285714}[0m
[38;20m2023-12-12 21:21:12,329 - Notebook - DEBUG - Number of positives: 27[0m
[38;20m2023-12-12 21:21:12,330 - Notebook - DEBUG - Ratio of positives: 0.036193029490616625[0m
[38;20m2023-12-12 21:21:12,331 - Notebook - DEBUG - constraint: {'global_constraint': 0.037276785714285714}[0m


{'cost': 971.6666666666666,
 'accuracy': 0.8558033145390618,
 'precision': 0.5679012345679012,
 'recall': 0.13773595023595023,
 'f1': 0.22170090014944566,
 'g_mean': 0.36765701230716524}

## Constrained XGB

### Function definitions 

In [15]:
def run_XGB_experiment(model: ClassifierMixin,
                       X: np.ndarray,
                       y: np.ndarray,
                       cost_matrix: np.ndarray,
                       constraint = Constraint,
                       random_state = 42,
                       n_splits=3):
    
    
    
    fit_params = {}
    out = {}
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for (fold, (train_index, test_index)) in enumerate(kf.split(X, y)):
        model = ConstrainedXGB(XGBClassifier(), constraint=constraint)
        #The ordinary Kfold, but with the constraint
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train, X_test, **fit_params)
        y_pred = model.predict(X_test)
        assert is_satisfy_constraint(y_pred, constraint)
        out[fold] = evaluate(y_test, y_pred, cost_matrix)
    return pd.DataFrame(out).T.mean(axis=0).to_dict()


 #You can use AbsoluteConstraint or RelativeConstraint

#run_constrainted_experiment(DecisionTreeClassifier(**dt_best_params), full_X.values, y.values, COST_MATRIX, constraint=constraint)

In [16]:
run_XGB_experiment(XGBClassifier(max_depth=5, scale_pos_weight=1e1),
                            X.values, y.values, COST_MATRIX, constraint=constraint)

[38;20m2023-12-12 21:22:02,563 - ada_csl_wrc.models - INFO - Finding the best estimator from history[0m
[38;20m2023-12-12 21:22:02,563 - ada_csl_wrc.models - INFO - Finding the best estimator from history[0m


{'cost': 989.6666666666666,
 'accuracy': 0.8517878484447173,
 'precision': 0.5132953466286799,
 'recall': 0.12277456027456028,
 'f1': 0.19814475300115353,
 'g_mean': 0.3465871945292354}