In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.compose import ColumnTransformer

In [5]:
from sklearn.pipeline import Pipeline

In [6]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [7]:
from sklearn.linear_model import LogisticRegression

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [9]:
from xgboost import XGBClassifier

In [27]:
from sklearn.metrics import (
    roc_auc_score, average_precision_score, confusion_matrix,
    classification_report, precision_score, recall_score, f1_score
)


In [58]:
import os, joblib

In [11]:
import warnings
# Silences every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation/convergence warnings raised by
# the libraries used below are hidden too — consider filtering specific categories instead.
warnings.filterwarnings('ignore')

In [12]:
# Seed passed to train_test_split and to every model constructor so runs are repeatable.
RANDOM_STATE = 42

In [26]:
# Fraction of rows held out as the validation split in run_scenario.
TEST_SIZE = 0.2

In [14]:
# Weight of one false positive in expected_cost (total = fp * cost_fp + fn * cost_fn).
COST_FP = 1.0

In [15]:
# Weight of one false negative in expected_cost (total = fp * cost_fp + fn * cost_fn).
COST_FN = 6.0

In [16]:
# Location of the raw CSV, relative to the notebook's working directory.
DATA_PATH = 'data/raw/bank-additional-full.csv'

In [17]:
# Load the raw table; the file is semicolon-delimited, hence the explicit separator.
df = pd.read_csv(DATA_PATH, sep = ';')

In [18]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [20]:
df.isna().sum().sort_values(ascending=False)

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [21]:
df.describe().T.head(20)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,41188.0,40.02406,10.42125,17.0,32.0,38.0,47.0,98.0
duration,41188.0,258.28501,259.279249,0.0,102.0,180.0,319.0,4918.0
campaign,41188.0,2.567593,2.770014,1.0,1.0,2.0,3.0,56.0
pdays,41188.0,962.475454,186.910907,0.0,999.0,999.0,999.0,999.0
previous,41188.0,0.172963,0.494901,0.0,0.0,0.0,0.0,7.0
emp.var.rate,41188.0,0.081886,1.57096,-3.4,-1.8,1.1,1.4,1.4
cons.price.idx,41188.0,93.575664,0.57884,92.201,93.075,93.749,93.994,94.767
cons.conf.idx,41188.0,-40.5026,4.628198,-50.8,-42.7,-41.8,-36.4,-26.9
euribor3m,41188.0,3.621291,1.734447,0.634,1.344,4.857,4.961,5.045
nr.employed,41188.0,5167.035911,72.251528,4963.6,5099.1,5191.0,5228.1,5228.1


In [22]:
# Share of missing values per column (0.0-1.0), largest first.
missing_ratio = df.isnull().mean().sort_values(ascending=False)

In [23]:
missing_ratio

age               0.0
job               0.0
marital           0.0
education         0.0
default           0.0
housing           0.0
loan              0.0
contact           0.0
month             0.0
day_of_week       0.0
duration          0.0
campaign          0.0
pdays             0.0
previous          0.0
poutcome          0.0
emp.var.rate      0.0
cons.price.idx    0.0
cons.conf.idx     0.0
euribor3m         0.0
nr.employed       0.0
y                 0.0
dtype: float64

In [24]:
# Encode the target as 0/1 (1 == 'yes').
# Guarded so that re-running this cell is a no-op: once `y` is numeric, comparing it
# with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [25]:
df['y'].value_counts(), df['y'].value_counts(normalize = True)

(y
 0    36548
 1     4640
 Name: count, dtype: int64,
 y
 0    0.887346
 1    0.112654
 Name: proportion, dtype: float64)

In [28]:
def build_preprocess(X_train:pd.DataFrame):
    """Create an unfitted ColumnTransformer tailored to the dtypes of ``X_train``.

    Columns with a numeric dtype are routed to a ``StandardScaler``; all remaining
    columns are routed to a ``OneHotEncoder`` that ignores categories it did not
    see during fitting.

    Returns
    -------
    tuple
        ``(transformer, numeric_columns, categorical_columns)`` where the two
        column collections are the pandas Index objects used for the routing.
    """
    numeric_columns = X_train.select_dtypes(include = 'number').columns
    categorical_columns = X_train.select_dtypes(exclude = 'number').columns

    # Step order matters: the transformed matrix has the scaled numeric block first,
    # followed by the one-hot block.
    scaling_step = ('num', StandardScaler(), numeric_columns)
    encoding_step = ('cat', OneHotEncoder(handle_unknown = 'ignore'), categorical_columns)

    column_transformer = ColumnTransformer(transformers = [scaling_step, encoding_step])
    return column_transformer, numeric_columns, categorical_columns

In [38]:
def fit_and_eval_models(X_train,y_train,X_valid,y_valid,preprocess,models:dict):
    """Fit one ``preprocess`` + model pipeline per entry of ``models`` and rank them.

    Parameters
    ----------
    X_train, y_train
        Training features and target.
    X_valid, y_valid
        Hold-out features and target, used only for scoring.
    preprocess
        Unfitted transformer used as the first step of every pipeline.
    models : dict
        Maps a display name to an unfitted classifier exposing ``predict_proba``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model``, ``roc_auc``, ``pr_auc`` and
        ``pipe`` (the fitted Pipeline), sorted by ``roc_auc`` in descending order.
        When ``models`` is empty the frame is empty but keeps those columns.

    Notes
    -----
    Each pipeline is built from clones of ``preprocess`` and of the model.
    ``Pipeline.fit`` fits its steps in place, so without cloning every returned
    pipeline would share the caller's estimator objects, and a later call with the
    same ``models`` dict would refit the estimators inside pipelines returned earlier.
    As a consequence the objects passed in are left unfitted.
    """
    # Local import: `clone` is not among the notebook's top-level imports.
    from sklearn.base import clone

    rows = []
    for name, model in models.items():
        # Independent copies so each returned pipeline owns its fitted state.
        pipe = Pipeline([('prep', clone(preprocess)), ('model', clone(model))])
        pipe.fit(X_train,y_train)
        # Column 1 holds the predicted probability of the positive class.
        proba = pipe.predict_proba(X_valid)[:,1]
        rows.append({
            'model':name,
            'roc_auc':roc_auc_score(y_valid,proba),
            'pr_auc':average_precision_score(y_valid,proba),
            'pipe':pipe
        })
    # Explicit columns keep the sort below valid even when no model was supplied.
    res = pd.DataFrame(rows, columns = ['model', 'roc_auc', 'pr_auc', 'pipe'])
    res = res.sort_values('roc_auc',ascending = False).reset_index(drop = True)
    return res

In [30]:
def threshold_table(y_true,proba,thresholds = None):
    """Tabulate precision, recall, F1 and predicted-positive rate over a threshold grid.

    Parameters
    ----------
    y_true
        True binary labels.
    proba
        Predicted positive-class probabilities (any array-like).
    thresholds
        Iterable of cut-offs; a row is predicted positive when ``proba >= threshold``.
        Defaults to 0.05, 0.10, ..., 0.95.

    Returns
    -------
    pandas.DataFrame
        One row per threshold, in the order given, with the columns ``threshold``,
        ``precision``, ``recall``, ``f1`` and ``positive_rate``. The ``threshold``
        column holds exactly the value that was applied, so it can be passed on to
        other helpers and reproduce the same predictions.
    """
    if thresholds is None:
        # np.arange yields values such as 0.15000000000000002; round the grid once so
        # the applied and the reported threshold are the same clean number.
        thresholds = np.round(np.arange(0.05, 0.96, 0.05), 2)
    proba = np.asarray(proba)
    rows = []
    for t in thresholds:
        t = float(t)
        pred = (proba >= t).astype(int)
        rows.append({
            'threshold': t,
            'precision': precision_score(y_true,pred,zero_division = 0),
            'recall': recall_score(y_true, pred, zero_division = 0),
            'f1': f1_score(y_true, pred, zero_division = 0),
            'positive_rate': pred.mean()
        })
    # Explicit columns keep the schema stable even for an empty grid.
    return pd.DataFrame(
        rows, columns = ['threshold', 'precision', 'recall', 'f1', 'positive_rate']
    )

In [47]:
def expected_cost(y_true,proba,threshold, cost_fp = COST_FP, cost_fn =COST_FN):
    """Total misclassification cost when predicting positive for ``proba >= threshold``.

    Parameters
    ----------
    y_true
        True binary labels (0/1).
    proba
        Predicted positive-class probabilities (any array-like).
    threshold
        Decision cut-off.
    cost_fp, cost_fn
        Cost of a single false positive / false negative. The defaults are the values
        COST_FP and COST_FN had when this cell was executed.

    Returns
    -------
    ``fp * cost_fp + fn * cost_fn``
    """
    pred = (np.asarray(proba) >= threshold).astype(int)
    # Pin the label set so the matrix is always 2x2; otherwise inputs in which only one
    # class occurs produce a 1x1 matrix and the unpacking below fails.
    _, fp, fn, _ = confusion_matrix(y_true, pred, labels = [0, 1]).ravel()
    return fp * cost_fp + fn * cost_fn

In [48]:
def eval_at_threshold(y_true,proba,threshold):
    """Summarise classification quality at a single decision threshold.

    Parameters
    ----------
    y_true
        True binary labels (0/1).
    proba
        Predicted positive-class probabilities (any array-like).
    threshold
        Cut-off; a row is predicted positive when ``proba >= threshold``.

    Returns
    -------
    dict
        ``threshold``, ``precision``, ``recall``, ``f1`` (each 0 when undefined),
        the confusion-matrix counts ``tn``, ``fp``, ``fn``, ``tp`` as ints, and
        ``positive_rate`` (share of rows predicted positive).
    """
    pred = (np.asarray(proba) >= threshold).astype(int)
    # Pin the label set so the matrix is always 2x2; otherwise inputs in which only one
    # class occurs produce a 1x1 matrix and the unpacking below fails.
    tn,fp,fn,tp = confusion_matrix(y_true, pred, labels = [0, 1]).ravel()
    return {
        'threshold':float(threshold),
        'precision': precision_score(y_true,pred,zero_division = 0),
        'recall': recall_score(y_true,pred,zero_division = 0),
        'f1': f1_score(y_true,pred,zero_division = 0),
        'tn': int(tn), 'fp':int(fp), 'fn':int(fn), 'tp': int(tp),
        'positive_rate': float(pred.mean())
    }

In [49]:
# Candidate classifiers, keyed by the short name shown in the result tables.
# The logistic regression and the random forest reweight classes via `class_weight`;
# the XGBoost model is configured without any class-weighting argument.
models = dict(
    logreg = LogisticRegression(
        max_iter = 2000,
        class_weight = 'balanced',
        random_state = RANDOM_STATE,
    ),
    rf = RandomForestClassifier(
        n_estimators = 400,
        class_weight = 'balanced_subsample',
        n_jobs = -1,  # use every available core
        random_state = RANDOM_STATE,
    ),
    xgb = XGBClassifier(
        n_estimators = 600,
        learning_rate = 0.05,
        max_depth = 5,
        subsample = 0.9,
        colsample_bytree = 0.9,
        eval_metric = 'logloss',
        random_state = RANDOM_STATE,
    ),
)

In [50]:
def run_scenario(df,drop_cols = None, scenario_name = 'scenario'):
    """Compare the candidate models on one feature set and pick operating thresholds.

    The frame is split into stratified train/validation parts (TEST_SIZE,
    RANDOM_STATE), every estimator of the module-level ``models`` dict is fitted
    through ``fit_and_eval_models``, and the pipeline with the highest validation
    ROC AUC is kept. A threshold grid is then scanned on that pipeline's validation
    probabilities to find the threshold with the best F1 and the one with the
    lowest expected cost (COST_FP / COST_FN).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``y`` with values castable to int.
    drop_cols : str or iterable of str, optional
        Feature column(s) to exclude in addition to ``y``.
    scenario_name : str
        Label stored in the result.

    Returns
    -------
    dict
        ``scenario``, ``dropped_cols``, ``res_df`` (model / roc_auc / pr_auc table),
        ``best_model``, ``best_pipe``, ``y_valid``, ``best_proba``, ``best_t_f1``,
        ``best_t_cost``, ``eval_f1`` and ``eval_cost`` (``eval_at_threshold`` dicts).

    Notes
    -----
    Both thresholds are selected and evaluated on the same validation split, so
    ``eval_f1`` and ``eval_cost`` are optimistic estimates.
    """
    # Normalise drop_cols to a fresh list; a bare string means a single column.
    if drop_cols is None:
        drop_cols = []
    elif isinstance(drop_cols, str):
        drop_cols = [drop_cols]
    else:
        drop_cols = list(drop_cols)

    X = df.drop(columns = ['y'] + drop_cols)
    y = df['y'].astype(int)
    X_train,X_valid,y_train,y_valid = train_test_split(
        X,y,test_size = TEST_SIZE, stratify = y, random_state = RANDOM_STATE
    )
    preprocess, _, _ = build_preprocess(X_train)
    res = fit_and_eval_models(X_train,y_train,X_valid,y_valid,preprocess,models)

    # `res` is sorted by ROC AUC (descending), so the first row is the winner.
    best = res.iloc[0]
    best_pipe = best['pipe']
    best_proba = best_pipe.predict_proba(X_valid)[:,1]

    tt = threshold_table(y_valid,best_proba)
    tt_cost = tt.copy()
    tt_cost['expected_cost'] = [
        expected_cost(y_valid,best_proba,t,cost_fp = COST_FP,cost_fn = COST_FN)
        for t in tt_cost['threshold']
    ]
    # idxmax/idxmin return the first extreme row, so ties are resolved
    # deterministically in favour of the earliest (lowest) threshold of the grid;
    # sort_values with its default algorithm gives no such guarantee.
    best_t_f1 = tt.loc[tt['f1'].idxmax(), 'threshold']
    best_t_cost = tt_cost.loc[tt_cost['expected_cost'].idxmin(), 'threshold']

    out = {
        'scenario':scenario_name,
        'dropped_cols':drop_cols,
        'res_df':res[['model', 'roc_auc', 'pr_auc']],
        'best_model':best['model'],
        'best_pipe': best_pipe,
        'y_valid':y_valid,
        'best_proba':best_proba,
        'best_t_f1':best_t_f1,
        'best_t_cost': best_t_cost,
        'eval_f1': eval_at_threshold(y_valid,best_proba,best_t_f1),
        'eval_cost':eval_at_threshold(y_valid,best_proba,best_t_cost),
    }
    return out

In [51]:
# Scenario 1: keep every feature, including `duration`.
scenario_post_call = run_scenario(df, drop_cols = [], scenario_name = 'Post_call (with duration)')

In [53]:
# Scenario 2: same data without `duration` — presumably because call length is not
# known before the call is made (see the scenario name); confirm against the data dictionary.
scenario_pre_call = run_scenario(df, drop_cols = ['duration'], scenario_name = 'Pre-call targeting (no duration)')

In [54]:
display(scenario_post_call['res_df'])

Unnamed: 0,model,roc_auc,pr_auc
0,xgb,0.954407,0.690706
1,rf,0.948976,0.677853
2,logreg,0.943838,0.622248


In [55]:
display(pd.DataFrame([scenario_post_call['eval_f1'], scenario_post_call['eval_cost']]))

Unnamed: 0,threshold,precision,recall,f1,tn,fp,fn,tp,positive_rate
0,0.35,0.609862,0.759698,0.676583,6859,451,223,705,0.140325
1,0.15,0.487121,0.917026,0.636262,6414,896,77,851,0.212066


In [56]:
display(scenario_pre_call['res_df'])

Unnamed: 0,model,roc_auc,pr_auc
0,xgb,0.807845,0.483658
1,logreg,0.800942,0.459982
2,rf,0.784606,0.431273


In [57]:
display(pd.DataFrame([scenario_pre_call["eval_f1"], scenario_pre_call["eval_cost"]]))

Unnamed: 0,threshold,precision,recall,f1,tn,fp,fn,tp,positive_rate
0,0.25,0.511928,0.554957,0.532575,6819,491,413,515,0.122117
1,0.15,0.438066,0.625,0.515098,6566,744,348,580,0.160719


In [59]:
# Create the output directory for the serialized pipelines; no error if it already exists.
os.makedirs('models', exist_ok = True)

In [60]:
# Persist the best post-call pipeline (preprocessing + model) as a single artifact.
joblib.dump(scenario_post_call['best_pipe'], 'models/model_post_call_with_duration.joblib')

['models/model_post_call_with_duration.joblib']

In [61]:
# Persist the best pre-call pipeline (preprocessing + model) as a single artifact.
# NOTE(review): the file name spells "duraion" (sic). Any code that loads this artifact
# must use the same spelling — rename here and in every loader together.
joblib.dump(scenario_pre_call['best_pipe'], 'models/model_pre_call_no_duraion.joblib')

['models/model_pre_call_no_duraion.joblib']

In [62]:
def get_xgb_feature_importance(pipe:Pipeline, topn= 15):
    """Return the ``topn`` most important features of a fitted pipeline.

    Parameters
    ----------
    pipe : Pipeline
        Fitted pipeline with a ``'prep'`` step whose first transformer handles the
        numeric columns and whose second is a one-hot encoder for the categorical
        columns, and a ``'model'`` step exposing ``feature_importances_``.
    topn : int
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by importance (descending).

    Raises
    ------
    AttributeError
        If the final estimator has no usable ``feature_importances_`` (for example a
        linear model, or an estimator that has not been fitted).
    ValueError
        If the number of importances differs from the number of feature names
        produced by the preprocessor.
    """
    prep = pipe.named_steps['prep']
    model = pipe.named_steps['model']
    if not hasattr(model, 'feature_importances_'):
        raise AttributeError(
            f"{type(model).__name__} has no usable feature_importances_ "
            "(is it fitted and tree-based?)"
        )

    # Positional access relies on the transformer order: numeric first, one-hot second.
    num_features = prep.transformers_[0][2]
    ohe = prep.transformers_[1][1]
    cat_features = prep.transformers_[1][2]
    cat_feature_names = ohe.get_feature_names_out(cat_features)
    # Same order as the transformed matrix: numeric block, then one-hot block.
    feature_names = np.concatenate([num_features, cat_feature_names])

    importances = model.feature_importances_
    if len(importances) != len(feature_names):
        raise ValueError(
            f"model reports {len(importances)} importances but the preprocessor "
            f"produces {len(feature_names)} features; the two pipeline steps were "
            "not fitted together"
        )
    fi = pd.DataFrame({'feature':feature_names, 'importance': importances})
    return fi.sort_values('importance', ascending = False).head(topn)

In [63]:
# Top features of the best pre-call pipeline. The helper reads `feature_importances_`,
# so this only works when the winning model provides that attribute.
fi_pre = get_xgb_feature_importance(scenario_pre_call['best_pipe'], topn = 15)

In [64]:
fi_pre

Unnamed: 0,feature,importance
8,nr.employed,0.417713
61,poutcome_success,0.059688
52,month_oct,0.041447
4,emp.var.rate,0.033491
2,pdays,0.024105
6,cons.conf.idx,0.020251
50,month_may,0.019039
42,contact_cellular,0.015429
7,euribor3m,0.012577
43,contact_telephone,0.012216
