# Bank Marketing Campaigns - Predicting client subscription

## Reading and exploring the dataset

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

In [2]:
df = pd.read_csv("bank-full.csv", delimiter=';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
df['job'].value_counts()

blue-collar      9732
management       9458
technician       7597
admin.           5171
services         4154
retired          2264
self-employed    1579
entrepreneur     1487
unemployed       1303
housemaid        1240
student           938
unknown           288
Name: job, dtype: int64

In [6]:
df['marital'].value_counts()

married     27214
single      12790
divorced     5207
Name: marital, dtype: int64

In [7]:
df['education'].value_counts()

secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: education, dtype: int64

In [8]:
df['loan'].value_counts()

no     37967
yes     7244
Name: loan, dtype: int64

In [9]:
df.shape

(45211, 17)

In [10]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

## Preparing the Dataset

In [11]:
from sklearn.preprocessing import LabelEncoder

#### Convert objects to categorical

In [12]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns

    def fit(self,X,y=None):
        return self

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [13]:
df1 = MultiColumnLabelEncoder(columns = ['job','marital','education','default','housing','loan',
                                         'contact','month','poutcome','y']).fit_transform(df)

In [14]:
df1.shape

(45211, 17)

In [15]:
df1.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [16]:
X = df1.drop('y', axis =1)
y = df1['y']

In [17]:
X.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3


In [18]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

#### Split the data into train and testing 70/30

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
Seed = 8

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=Seed)

## Building Models and Evaluate Algorithms

In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict

In [37]:
num_folds = 10
scoring = 'accuracy'
kfold = KFold(n_splits=num_folds, random_state=Seed)

In [40]:
model = AdaBoostClassifier()
BTmodel = BaggingClassifier()

## AdaBoost Classifier

In [41]:
model.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [42]:
def print_score(model, X_train, y_train, X_test, y_test, train=True):
    
    if train:
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, model.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, model.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, model.predict(X_train))))

        res = cross_val_score(model, X_train, y_train, cv =kfold, scoring=scoring)
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, model.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, model.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, model.predict(X_test))))    

In [43]:
print_score(model, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.8974

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94     27970
           1       0.60      0.35      0.44      3677

   micro avg       0.90      0.90      0.90     31647
   macro avg       0.76      0.66      0.69     31647
weighted avg       0.88      0.90      0.88     31647


Confusion Matrix: 
 [[27127   843]
 [ 2405  1272]]

Average Accuracy: 	 0.8962
Accuracy SD: 		 0.0040


In [44]:
print_score(model, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8961

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.97      0.94     11952
           1       0.61      0.36      0.45      1612

   micro avg       0.90      0.90      0.90     13564
   macro avg       0.76      0.66      0.70     13564
weighted avg       0.88      0.90      0.88     13564


Confusion Matrix: 
 [[11577   375]
 [ 1034   578]]



## Bagging Classifier 

In [45]:
BTmodel.fit(X_train, y_train)

BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=None, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

In [46]:
def print_score(BTmodel, X_train, y_train, X_test, y_test, train=True):
    
    if train:
        print("Train Result:\n")
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_train, BTmodel.predict(X_train))))
        print("Classification Report: \n {}\n".format(classification_report(y_train, BTmodel.predict(X_train))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_train, BTmodel.predict(X_train))))

        res = cross_val_score(BTmodel, X_train, y_train, cv =kfold, scoring=scoring)
        print("Average Accuracy: \t {0:.4f}".format(np.mean(res)))
        print("Accuracy SD: \t\t {0:.4f}".format(np.std(res)))
        
    elif train==False:
        print("Test Result:\n")        
        print("accuracy score: {0:.4f}\n".format(accuracy_score(y_test, BTmodel.predict(X_test))))
        print("Classification Report: \n {}\n".format(classification_report(y_test, BTmodel.predict(X_test))))
        print("Confusion Matrix: \n {}\n".format(confusion_matrix(y_test, BTmodel.predict(X_test))))    

In [47]:
print_score(BTmodel, X_train, y_train, X_test, y_test, train=True)

Train Result:

accuracy score: 0.9924

Classification Report: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     27970
           1       0.99      0.94      0.97      3677

   micro avg       0.99      0.99      0.99     31647
   macro avg       0.99      0.97      0.98     31647
weighted avg       0.99      0.99      0.99     31647


Confusion Matrix: 
 [[27952    18]
 [  224  3453]]

Average Accuracy: 	 0.8988
Accuracy SD: 		 0.0044


In [48]:
print_score(BTmodel, X_train, y_train, X_test, y_test, train=False)

Test Result:

accuracy score: 0.8958

Classification Report: 
               precision    recall  f1-score   support

           0       0.92      0.96      0.94     11952
           1       0.59      0.39      0.47      1612

   micro avg       0.90      0.90      0.90     13564
   macro avg       0.76      0.68      0.71     13564
weighted avg       0.88      0.90      0.89     13564


Confusion Matrix: 
 [[11518   434]
 [  980   632]]

