In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
import matplotlib.pyplot as plt
import category_encoders as ce
from datetime import datetime

In [None]:
# Performance Report
def evaluate(y_test, y_pred):
    """Print a short performance summary for a set of predictions.

    Three sections are written to standard output, in this order: the
    confusion matrix, the accuracy expressed as a percentage, and the
    scikit-learn classification report.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Returns:
        None. Everything is printed; nothing is returned.
    """
    print("Confusion Matrix: ")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    print("Accuracy : ")
    # accuracy_score yields a fraction; scale it to a percentage for display.
    accuracy_pct = accuracy_score(y_test, y_pred) * 100
    print(accuracy_pct)

    print("Report : ")
    print(classification_report(y_test, y_pred))

In [None]:
# Load the raw dataset; the relative path is resolved against the notebook's
# working directory.
data = pd.read_csv("bank.csv")

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

<h3>Preprocessing data</h3>

In [None]:
# Fix the misspelled target column name, then map every 'yes'/'no' value in
# the frame (whichever column it appears in) to 1/0.
# NOTE(review): this cell rewrites `data` in place of the raw frame, so it is
# not idempotent - running it a second time feeds integer months to strptime.
data = (
    data
    .rename(columns={'subcribed': 'subscribed'})
    .replace({'yes': 1, 'no': 0})
)

# Integer codes for the education levels.
data['education'] = data['education'].replace(
    {
        'tertiary': 1,
        'secondary': 2,
        'primary': 3,
        'unknown': 4,
    }
)

# '%b' parses an abbreviated month name; keep only the month number.
data['month'] = data['month'].transform(
    lambda month_abbr: datetime.strptime(month_abbr, '%b').month
)

In [None]:
# Two-stage categorical encoding: one-hot for 'marital', 'contact' and
# 'poutcome', followed by a James-Stein (target-based) encoding of 'job'.
# NOTE(review): the encoders are fitted on the full frame - target column
# included - before any train/test split, so the 'job' encoding has seen the
# test labels. Also, `data_transformed` is not referenced by any later cell
# visible here - confirm whether this path is still needed.
encoding_pipeline = Pipeline(
    steps=[
        (
            'encode_others',
            ce.OneHotEncoder(
                cols=['marital', 'contact', 'poutcome'],
                use_cat_names=True,
                return_df=True,
            ),
        ),
        (
            'encode_job',
            ce.JamesSteinEncoder(cols=['job'], return_df=True),
        ),
    ]
)

data_transformed = encoding_pipeline.fit_transform(data, y=data['subscribed'])

In [None]:
# Encoding features: fix the target column name, map 'yes'/'no' to 1/0, then
# one-hot encode the six categorical columns listed below.
# NOTE(review): when the earlier preprocessing cell has already run,
# 'education' and 'month' hold integer codes by now, so their one-hot columns
# are built from those codes. `data` is overwritten here, so the cell is
# presumably not safe to run twice - confirm.
data = (
    data
    .rename(columns={'subcribed': 'subscribed'})
    .replace({"yes": 1, "no": 0})
)

ohe = ce.OneHotEncoder(
    use_cat_names=True,
    return_df=True,
    cols=[
        "job",
        "marital",
        "education",
        "contact",
        "month",
        "poutcome",
    ],
)
data = ohe.fit_transform(data)

In [None]:
# Separate the target vector from the feature matrix.
y = data['subscribed']
X = data.drop('subscribed', axis=1)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 100)

In [None]:
from sklearn import decomposition, datasets
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [43]:
# Baseline XGBoost classifier; it also serves as the estimator handed to the
# grid search in the following cell.
xgb_classifier = xgb.XGBClassifier(
    objective="binary:logistic",
    max_depth=4,
    min_child_weight=1,
    learning_rate=0.05,
    use_label_encoder=False,
)
# Kept as the cell's last expression so the fitted estimator is displayed.
xgb_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.05, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, use_label_encoder=False, verbosity=1)

In [46]:
# Exhaustive search over 7 depths x 9 child weights x 5 gamma values
# (315 combinations), run on all available cores.
# NOTE(review): `cv` and `scoring` are not set, so the fold count and metric
# are whatever the installed scikit-learn defaults to.
param_dict = dict(
    max_depth=range(3, 10),
    min_child_weight=range(1, 10),
    gamma=np.arange(0, 5),
)

grid = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_dict,
    n_jobs=-1,
    verbose=1,
)
grid.fit(X_train, y_train)

Fitting 3 folds for each of 315 candidates, totalling 945 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   34.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 945 out of 945 | elapsed:  3.2min finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.05, max_delta_step=0,
                                     max_depth=4, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=1, seed=None, silent=None,
                                     subsample=1, use_label_encoder=False,
                                     verbosity=1),
             iid='warn', n_jobs=-1,
             param_grid={'gamma': array([0, 1, 2, 3, 4]),
                         'max_depth': range

In [47]:
# Parameter combination that scored best in the grid search above.
grid.best_params_

{'gamma': 0, 'max_depth': 9, 'min_child_weight': 3}

In [41]:
# Estimator built from the best parameter combination.
# NOTE(review): the saved output under this cell shows a RandomForestClassifier,
# while `grid` is constructed around the XGBoost classifier, and this cell's
# execution count (41) is lower than the grid-search cell's (46). The output
# looks stale - re-run this cell after the search.
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=7, max_features='log2', max_leaf_nodes=9,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=4, min_samples_split=9,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=100,
                       verbose=0, warm_start=False)

In [40]:
# Cross-validated score of the best parameter combination.
# NOTE(review): this cell's execution count (40) is lower than the grid-search
# cell's (46), so the saved value may belong to an earlier search - re-run.
grid.best_score_

0.8868520859671302

<h3>Decision Tree Model</h3>

In [None]:
# Shallow Gini decision tree (depth <= 4, at least 5 samples per leaf) with a
# fixed seed; fit on the training split and score on the held-out split.
tree_classifier = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    min_samples_leaf=5,
    random_state=100,
)
tree_classifier.fit(X_train, y_train)

y_pred_tree = tree_classifier.predict(X_test)
evaluate(y_test, y_pred_tree)


In [None]:
# Ten highest feature importances of the decision tree, labelled by column.
feature_imp_tree = (
    pd.Series(data=tree_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)[:10]
)
print(feature_imp_tree)

In [None]:
# Draw the fitted decision tree on a wide canvas so the node text stays legible.
plt.figure(figsize=(20, 10))

plot_tree(
    tree_classifier,
    # Pass a plain list rather than the pandas Index: some scikit-learn
    # releases validate `feature_names` strictly and reject an Index.
    feature_names=list(X_train.columns),
    # Labels follow ascending class order; the target was encoded no=0, yes=1.
    class_names=['no', 'yes'],
    rounded=True,
)

plt.show()

<h3>Random Forest Model</h3>

In [None]:
# Random forest with the same depth / leaf-size limits and seed as the single
# tree; fit on the training split and score on the held-out split.
rf_classifier = RandomForestClassifier(
    criterion="gini",
    max_depth=4,
    min_samples_leaf=5,
    random_state=100,
)
# The labels are flattened to a 1-D array before fitting.
rf_classifier.fit(X_train, np.ravel(y_train))

y_pred_rf = rf_classifier.predict(X_test)
evaluate(y_test, y_pred_rf)


In [None]:
# Ten highest feature importances of the random forest, labelled by column.
feature_imp_rf = (
    pd.Series(data=rf_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)[:10]
)
print(feature_imp_rf)

<h3>XGBoost</h3>


In [None]:
# XGBoost model evaluated on the held-out split.
# NOTE(review): this rebinds `xgb_classifier` (previously the grid-search base
# estimator) with learning_rate=0.1 and max_depth=4; the parameters found by
# the grid search are not applied here - confirm that is intended.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
)
xgb_classifier.fit(X_train, y_train)

y_pred_xgb = xgb_classifier.predict(X_test)
evaluate(y_test, y_pred_xgb)

In [None]:
# Ten highest feature importances of the XGBoost model, labelled by column.
feature_imp_xgb = (
    pd.Series(data=xgb_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)[:10]
)
print(feature_imp_xgb)

In [None]:
# Bar chart of the top XGBoost feature importances.
plt.figure(figsize=(10, 5))

# One bar per feature, placed at consecutive integer positions along the x-axis.
y_pos = np.arange(len(feature_imp_xgb))
plt.bar(y_pos, feature_imp_xgb, align='center', color='purple')
# Rotate and right-align the feature names so long labels do not overlap.
plt.xticks(y_pos, feature_imp_xgb.index, rotation=30, ha='right')
plt.xlabel('Features')
# Label the value axis too, so the figure can be read on its own.
plt.ylabel('Importance')
plt.title("Feature Importance", weight='bold', fontsize=18, pad=20)
plt.show()

<h3>Voting</h3>

In [None]:
# Ensemble of the three models defined above. No `voting` argument is passed,
# so the library's default voting rule applies.
vt_classifier = VotingClassifier(
    estimators=[
        ("tree", tree_classifier),
        ("rf", rf_classifier),
        ("xgb", xgb_classifier),
    ]
)
vt_classifier.fit(X_train, y_train)

y_pred_vt = vt_classifier.predict(X_test)
evaluate(y_test, y_pred_vt)