### Libraries & Parameters

In [12]:
import pandas
import numpy
import os

In [13]:
# Notebook parameters.
# outputpath: folder that receives the train/validate/test CSV splits.
outputpath = "splits"
# dataset_path: semicolon-separated source CSV read by the load cell.
dataset_path = "0_bank_additional_full.csv"

### Load data (local storage)

In [14]:
# Display limits used whenever a frame is rendered in this notebook.
display_limits = {'display.max_columns': 60, 'display.max_rows': 20}
for option_name, option_value in display_limits.items():
    pandas.set_option(option_name, option_value)

# The source file uses ';' as the field delimiter.
df = pandas.read_csv(dataset_path, sep=';')

### Pre-process

In [15]:
def _label_first(split_df, label='y_yes', redundant='y_no'):
    """Return a new frame with `label` as the first column.

    The complementary dummy column `redundant` is removed; every other
    column keeps its original order after the label.
    """
    features = split_df.drop([label, redundant], axis=1)
    return pandas.concat([split_df[label], features], axis=1)


# Normalize string values: every '.' becomes '_' and a trailing '_' is then
# stripped (a value like 'a.b.' ends up as 'a_b'). Only cell values are
# touched; column names keep their dots, which the drop below relies on.
df = df.replace(regex=r'\.', value='_')
df = df.replace(regex=r'\_$', value='')

# Add two indicator features (1/0):
#   no_previous_contact - rows whose pdays equals the 999 sentinel
#   not_working         - job is student, retired or unemployed
df["no_previous_contact"] = (df["pdays"] == 999).astype(int)
df["not_working"] = df["job"].isin(["student", "retired", "unemployed"]).astype(int)

# Drop columns that are not needed for training
df = df.drop(['duration', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'day_of_week', 'month'], axis=1)

# Encode categorical features.
# dtype=int is explicit because pandas >= 2.0 defaults to bool dummies, which
# would be written to the CSV splits as True/False instead of 1/0.
df = pandas.get_dummies(df, dtype=int)

# Train, validation, test split (70% / 20% / 10%) on a reproducibly shuffled copy.
# Positional slicing gives the same partitions as numpy.split(frame, [a, b])
# without going through the deprecated DataFrame.swapaxes path.
suffled_df = df.sample(frac=1, random_state=42)
n_rows = len(suffled_df)
train_end = int(0.7 * n_rows)
validation_end = int(0.9 * n_rows)
train_data_split = suffled_df.iloc[:train_end]
validation_data_split = suffled_df.iloc[train_end:validation_end]
test_data_split = suffled_df.iloc[validation_end:]

# Clean up the categorical encoding of Y: keep y_yes as the first column and
# drop the redundant y_no.
train_data_df = _label_first(train_data_split)
validation_data_df = _label_first(validation_data_split)
test_data_df = _label_first(test_data_split)

  return bound(*args, **kwds)


### Write splits (local storage)

In [16]:
# Create the destination folder; an already existing folder is fine.
os.makedirs(outputpath, exist_ok=True)

# Train and validation sets: label-first frames, written without header or index.
for split_frame, file_name in ((train_data_df, 'train.csv'), (validation_data_df, 'validate.csv')):
    split_frame.to_csv(os.path.join(outputpath, file_name), index=False, header=False)

# Test set: labels and features go to separate files.
test_labels = test_data_split['y_yes']
test_features = test_data_split.drop(['y_yes', 'y_no'], axis=1)
test_labels.to_csv(os.path.join(outputpath, 'test_y.csv'), index=False, header=False)
test_features.to_csv(os.path.join(outputpath, 'test_x.csv'), index=False, header=False)

### Train

In [6]:
import xgboost

In [17]:
# Train matrix: column 0 is the label, the remaining columns are the features.
# The features are sliced out instead of dropping the label in place, so
# train_data_df is left untouched and re-running this cell gives the same result.
y_train = train_data_df.iloc[:, 0].to_numpy()
x_train = train_data_df.iloc[:, 1:].to_numpy()

train_dmatrix = xgboost.DMatrix(x_train, label=y_train)

# Validation matrix: same layout and same non-mutating slicing.
y_validation = validation_data_df.iloc[:, 0].to_numpy()
x_validation = validation_data_df.iloc[:, 1:].to_numpy()

validation_dmatrix = xgboost.DMatrix(x_validation, label=y_validation)

# Parameters
num_round = 50  # upper bound on boosting rounds
evaluation_results = {}  # filled by xgboost.train with the per-round metric history of each eval set

# Hyperparameters
hyperparameters = {
    'objective': 'binary:logistic',
    'max_depth': 5,
    'eta': 0.2,
    'gamma': 4,
    'min_child_weight': 6,
    'subsample': 0.7,
}

# Execute training. Both sets are evaluated every round; training stops early
# when the metric has not improved for 5 consecutive rounds.
xgboost_model = xgboost.train(
    hyperparameters,
    train_dmatrix,
    num_round,
    evals=[(train_dmatrix, "train"), (validation_dmatrix, "validation")],
    early_stopping_rounds=5,
    evals_result=evaluation_results,
)

[0]	train-logloss:0.57520	validation-logloss:0.57459
[1]	train-logloss:0.49645	validation-logloss:0.49524
[2]	train-logloss:0.44194	validation-logloss:0.44038
[3]	train-logloss:0.40333	validation-logloss:0.40130
[4]	train-logloss:0.37553	validation-logloss:0.37327
[5]	train-logloss:0.35483	validation-logloss:0.35228
[6]	train-logloss:0.34002	validation-logloss:0.33718
[7]	train-logloss:0.32900	validation-logloss:0.32584
[8]	train-logloss:0.32111	validation-logloss:0.31779
[9]	train-logloss:0.31559	validation-logloss:0.31209
[10]	train-logloss:0.31144	validation-logloss:0.30769
[11]	train-logloss:0.30823	validation-logloss:0.30435
[12]	train-logloss:0.30597	validation-logloss:0.30187
[13]	train-logloss:0.30436	validation-logloss:0.30026
[14]	train-logloss:0.30306	validation-logloss:0.29915
[15]	train-logloss:0.30198	validation-logloss:0.29825
[16]	train-logloss:0.30125	validation-logloss:0.29754
[17]	train-logloss:0.30043	validation-logloss:0.29706
[18]	train-logloss:0.30000	validation-

In [8]:
#xgboost_model.save_model('xgboost-model')

In [9]:
    
#model_xgb_2 = xgboost.Booster()
#model_xgb_2.load_model("xgboost-model")

### Evaluate

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

In [19]:
# Test matrix: column 0 is the label, the remaining columns are the features.
# The features are sliced out instead of dropping the label in place, so
# test_data_df is left untouched and re-running this cell gives the same result.
y_test = test_data_df.iloc[:, 0].to_numpy()
x_test = test_data_df.iloc[:, 1:].to_numpy()

# Model outputs for the test features (the model is trained with the
# 'binary:logistic' objective).
predictions = xgboost_model.predict(xgboost.DMatrix(x_test))

# Mean squared error between labels and raw predictions, and the standard
# deviation of the residuals.
mse = mean_squared_error(y_test, predictions)
std = numpy.std(y_test - predictions)

# Accuracy after rounding predictions to 0/1.
# NOTE(review): acc is computed but not included in report_dict below.
acc = accuracy_score(y_test, predictions.round())

report_dict = {
    "regression_metrics": {
        "mse": {"value": mse, "standard_deviation": std},
    },
}

print(f"evaluation report: {report_dict}")

evaluation report: {'regression_metrics': {'mse': {'value': 0.08701103, 'standard_deviation': 0.29495305}}}
