## External Data

- Spending now or investing for the future
- price, inflation
- income, employment, labour supply
- consumption level, GDP
- Composite leading indicator
- adjust interest rate
- Portugal bond return and rating
- German bond return
- Stock market return and volatility
- Gold price

In [1]:
from bank_marketing import *

In [2]:
# Load the bank marketing data from the CSV file; `import_dataset` is not defined in
# this notebook (presumably it comes from the `bank_marketing` star import - confirm).
bank_mkt = import_dataset("../data/BankMarketing.csv")

## Year

In [3]:
# Columns handed to `cat_encode` through its "drop" keyword in the Year section.
drop_features = [
    "age",
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "duration",
    "y",
]

# Wrap `cat_encode` as a transformer; "year" is the only entry requested via "external".
year_encoder = FunctionTransformer(
    cat_encode,
    kw_args=dict(drop=drop_features, external=["year"]),
)


In [4]:
# CatBoost classifier scored on AUC, with class weights 1 and 8 and training logs silenced.
cat_clf = CatBoostClassifier(
    eval_metric="AUC",
    class_weights=[1, 8],
    verbose=False,
)

# The returned metric table is the cell's displayed output.
benchmark(bank_mkt, year_encoder, cat_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.86332,0.867431,0.861248
TPR,0.642977,0.681941,0.640086
bACC,0.753148,0.774686,0.750667
ROC,0.820415,0.835277,0.80416
REC,0.642977,0.681941,0.640086
PRE,0.373947,0.395004,0.369403
AP,0.504129,0.513574,0.457063


## Economic Indicators

In [33]:

def benchmark(data, preprocessor, clf):
    """
    Benchmark preprocessor and clf's performance on train, validation and test sets.
    All the data transformation should be handled by preprocessor and estimation should be handled by clf.

    Parameters
    ----------
        data : DataFrame
            Dataset forwarded to `split_dataset` together with `preprocessor`.

        preprocessor : Pipeline
            Transformer forwarded to `split_dataset`.

        clf : estimator
            Classifier exposing `fit`, `predict` and either `decision_function`
            or `predict_proba`. It is fitted in place on the training split.

    Returns
    -------
        DataFrame
            One row per metric (TNR, TPR, bACC, ROC, REC, PRE, AP) and one
            column per evaluated set (Train, Validate, Test).
    """
    # The first two values returned by split_dataset are not needed for the benchmark.
    _, _, X_test, y_test, X_ttrain, y_ttrain, X_validate, y_validate = split_dataset(data, preprocessor)

    # First try the fit signature with a validation set (the keywords CatBoost accepts);
    # estimators that reject those arguments are fitted on the training split alone.
    try:
        clf.fit(X_ttrain, y_ttrain, eval_set=(X_validate, y_validate), use_best_model=True, verbose=False)
    except (ValueError, TypeError):
        clf.fit(X_ttrain, y_ttrain)

    metric_names = ["TNR", "TPR", "bACC", "ROC", "REC", "PRE", "AP"]
    evaluation_sets = {
        "Train": (X_ttrain, y_ttrain),
        "Validate": (X_validate, y_validate),
        "Test": (X_test, y_test),
    }

    results = {}
    for set_name, (X, y) in evaluation_sets.items():
        y_pred = clf.predict(X)
        # Prefer the decision function for ranking metrics; fall back to the
        # probability of the positive class when the estimator has none.
        try:
            y_score = clf.decision_function(X)
        except AttributeError:
            y_score = clf.predict_proba(X)[:, 1]

        # TPR and REC are the same quantity (recall of the positive class),
        # so it is computed once and reported under both names.
        positive_recall = recall_score(y, y_pred)
        results[set_name] = [
            recall_score(y, y_pred, pos_label=0),  # TNR: recall of the negative class
            positive_recall,
            balanced_accuracy_score(y, y_pred),
            roc_auc_score(y, y_score),
            positive_recall,
            precision_score(y, y_pred),
            average_precision_score(y, y_score),
        ]

    return pd.DataFrame(results, index=metric_names, columns=list(evaluation_sets))

In [34]:
from sklearn.impute import SimpleImputer

In [40]:
# Columns whose -1 entries are replaced by the column's most frequent value.
freq_features = ["job", "marital", "education", "default", "housing", "loan"]

# Only `freq_features` are imputed; every other column is passed through unchanged.
freq_imputer = ColumnTransformer(
    transformers=[
        (
            "freq_imputer",
            SimpleImputer(missing_values=-1, strategy="most_frequent"),
            freq_features,
        ),
    ],
    remainder="passthrough",
)

# Columns handed to `cat_encode` through its "drop" keyword in this section.
drop_features = ["duration", "y"]

econ_encoder = FunctionTransformer(
    cat_encode,
    kw_args=dict(drop=drop_features, external=["year"]),
)

# Encoding first, then most-frequent imputation.
econ_transformer = make_pipeline(econ_encoder, freq_imputer)

In [41]:
# Fit the encoder + imputer pipeline on `bank_mkt` and display the transformed result.
econ_transformer.fit_transform(bank_mkt)

array([[3.0, 1.0, 1.0, ..., 4.857, 5191.0, 2008],
       [7.0, 1.0, 4.0, ..., 4.857, 5191.0, 2008],
       [7.0, 1.0, 4.0, ..., 4.857, 5191.0, 2008],
       ...,
       [5.0, 1.0, 6.0, ..., 1.028, 4963.6, 2010],
       [9.0, 1.0, 5.0, ..., 1.028, 4963.6, 2010],
       [5.0, 1.0, 5.0, ..., 1.028, 4963.6, 2010]], dtype=object)

In [42]:
# Fresh CatBoost classifier with the same settings as the Year benchmark.
cat_clf = CatBoostClassifier(
    eval_metric="AUC",
    class_weights=[1, 8],
    verbose=False,
)

# NOTE(review): this benchmarks `econ_encoder` alone, not the `econ_transformer`
# pipeline (encoder + most-frequent imputer) built earlier - confirm this is intended.
benchmark(bank_mkt, econ_encoder, cat_clf)

Unnamed: 0,Train,Validate,Test
TNR,0.861224,0.854259,0.855501
TPR,0.658471,0.672507,0.649784
bACC,0.759847,0.763383,0.752643
ROC,0.840873,0.81338,0.80421
REC,0.658471,0.672507,0.649784
PRE,0.375962,0.369356,0.363472
AP,0.52372,0.505116,0.449931
