In [1]:
from datetime import datetime

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [2]:
# Performance Report
def evaluate(y_test, y_pred):
    """Print a short classification performance report to stdout.

    Three sections are written in a fixed order: the confusion matrix, the
    accuracy expressed as a percentage, and the text classification report.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_test``.

    Returns
    -------
    None
        The report is only printed; nothing is returned.
    """
    # (heading, metric) pairs; each metric is evaluated only after its heading
    # has been printed, so output order matches computation order.
    sections = (
        ("Confusion Matrix: ", confusion_matrix),
        ("Accuracy : ", lambda truth, guess: accuracy_score(truth, guess) * 100),
        ("Report : ", classification_report),
    )
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, y_pred))

In [3]:
# Load the raw dataset from "bank.csv" (path is relative to the current working directory).
data = pd.read_csv("bank.csv")

<h3>Preprocessing data</h3>

In [None]:
# Fix the misspelled target column name. Reassigning instead of using
# inplace=True keeps the step explicit and safe to re-run.
data = data.rename(columns={'subcribed': 'subscribed'})

# Encode every 'yes' / 'no' cell in the frame as 1 / 0. infer_objects() turns the
# affected object columns into a numeric dtype explicitly instead of relying on
# replace()'s silent downcasting, which pandas has deprecated.
data = data.replace({'yes': 1, 'no': 0}).infer_objects()

# Ordinal code for the education level (1 = tertiary ... 4 = unknown).
data['education'] = (
    data['education']
    .replace({'tertiary': 1, 'secondary': 2, 'primary': 3, 'unknown': 4})
    .infer_objects()
)

# Parse month abbreviations ('%b') into month numbers. The guard skips the
# conversion when the column is already numeric, so re-running this cell does
# not fail on values that were converted by a previous run.
if not pd.api.types.is_numeric_dtype(data['month']):
    data['month'] = data['month'].map(lambda abbr: datetime.strptime(abbr, '%b').month)

In [None]:
# Two-stage categorical encoding: one-hot columns for 'marital', 'contact' and
# 'poutcome' (named after the category values), then James-Stein encoding of 'job'.
encoding_pipeline = Pipeline([
    ('encode_others', ce.OneHotEncoder(cols=['marital', 'contact', 'poutcome'], use_cat_names=True, return_df=True)),
    ('encode_job', ce.JamesSteinEncoder(cols=['job'], return_df=True))
])

# NOTE(review): the pipeline is fitted on the full frame, which still contains the
# 'subscribed' column, with 'subscribed' as the target and before any train/test
# split. If the James-Stein step uses the target, this leaks test labels — confirm.
# NOTE(review): `data_transformed` is not referenced again in the visible cells;
# the later cells build X from `data` instead. Verify which one was intended.
data_transformed = encoding_pipeline.fit_transform(data, data['subscribed'])

In [None]:
# Make sure the target column carries the corrected spelling (a no-op when the
# column has already been renamed).
data.rename(columns={'subcribed': 'subscribed'}, inplace=True)

# Feature matrix = every column except the target; target vector = 'subscribed'.
# NOTE(review): X is derived from `data`, not from `data_transformed`.
X, y = data.drop('subscribed', axis=1), data['subscribed']

In [None]:
# One-hot encode the categorical feature columns (no `cols` is given, so the
# encoder chooses the columns itself), naming new columns after category values.
# category_encoders has no 'ignore' option for handle_unknown (that is the
# scikit-learn spelling); 'value' is the documented setting that encodes
# categories unseen during fit as an all-zero row.
ohe = ce.OneHotEncoder(handle_unknown='value', use_cat_names=True)
X = ohe.fit_transform(X)

# Encode the target as integer class labels.
te = LabelEncoder()
y = te.fit_transform(y)

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on y — consider stratify=y if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 100)

<h3>Decision Tree Model</h3>

In [None]:
# Shallow Gini decision tree: depth capped at 4, at least 5 samples per leaf,
# fixed seed for reproducibility.
tree_classifier = DecisionTreeClassifier(
    criterion="gini",
    max_depth=4,
    min_samples_leaf=5,
    random_state=100,
)
tree_classifier.fit(X_train, y_train)

# Predict on the held-out split and print the performance report.
y_pred_tree = tree_classifier.predict(X_test)
evaluate(y_test, y_pred_tree)


In [None]:
# Ten largest decision-tree feature importances, labelled by column name, highest first.
feature_imp_tree = (
    pd.Series(tree_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(10)
)
print(feature_imp_tree)

In [None]:
# Draw the fitted decision tree on a wide canvas so the node text stays readable.
plt.figure(figsize=(20,10))

# Pass the feature names as a plain list: some scikit-learn releases validate
# this argument strictly and reject a pandas Index.
plot_tree(tree_classifier, feature_names=list(X_train.columns), class_names=['no', 'yes'], rounded=True)

plt.show()

<h3>Random Forest Model</h3>

In [None]:
# Random forest using the Gini criterion, depth capped at 4, at least 5 samples
# per leaf, fixed seed for reproducibility.
rf_classifier = RandomForestClassifier(
    criterion="gini",
    max_depth=4,
    min_samples_leaf=5,
    random_state=100,
)
# np.ravel hands the labels to fit() as a flat 1-D array.
rf_classifier.fit(X_train, np.ravel(y_train))

# Predict on the held-out split and print the performance report.
y_pred_rf = rf_classifier.predict(X_test)
evaluate(y_test, y_pred_rf)


In [None]:
# Ten largest random-forest feature importances, labelled by column name, highest first.
feature_imp_rf = (
    pd.Series(rf_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(10)
)
print(feature_imp_rf)

<h3>XGBoost</h3>


In [None]:
# Gradient-boosted trees for binary classification (logistic objective).
# NOTE(review): `use_label_encoder` is reported as deprecated in newer xgboost
# releases — confirm against the installed version before removing it.
xgb_classifier = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=4,
    learning_rate=0.1,
    use_label_encoder=False,
)
xgb_classifier.fit(X_train, y_train)

# Predict on the held-out split and print the performance report.
y_pred_xgb = xgb_classifier.predict(X_test)
evaluate(y_test, y_pred_xgb)

In [None]:
# Ten largest XGBoost feature importances, labelled by column name, highest first.
feature_imp_xgb = (
    pd.Series(xgb_classifier.feature_importances_, index=X_train.columns)
    .sort_values(ascending=False)
    .head(10)
)
print(feature_imp_xgb)

In [None]:
# Bar chart of the XGBoost feature importances held in `feature_imp_xgb`.
plt.figure(figsize=(10, 5))

# One bar slot per feature; the feature names become the x tick labels.
y_pos = np.arange(feature_imp_xgb.size)
plt.bar(y_pos, feature_imp_xgb, color='purple', align='center')
plt.xticks(y_pos, feature_imp_xgb.index, ha='right', rotation=30)

plt.title("Feature Importance", fontsize=18, weight='bold', pad=20)
plt.xlabel('Features')
plt.show()

<h3>Voting</h3>

In [None]:
# Ensemble of the three models above. No `voting` argument is given, so
# scikit-learn's default voting rule applies.
vt_classifier = VotingClassifier(
    estimators=[
        ("tree", tree_classifier),
        ("rf", rf_classifier),
        ("xgb", xgb_classifier),
    ]
)
vt_classifier.fit(X_train, y_train)

# Predict on the held-out split and print the performance report.
y_pred_vt = vt_classifier.predict(X_test)
evaluate(y_test, y_pred_vt)