In [81]:
import pandas as pd
import numpy as np

# Read data

In [102]:
bank = pd.read_csv('bank-additional-full.csv',delimiter =';')
bank.head(10)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
5,45,services,married,basic.9y,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
6,59,admin.,married,professional.course,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
7,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
8,24,technician,single,professional.course,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
9,25,services,single,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [83]:
bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

**y is the output of the model**

In [84]:
bank['y']

0         no
1         no
2         no
3         no
4         no
        ... 
41183    yes
41184     no
41185     no
41186    yes
41187     no
Name: y, Length: 41188, dtype: object

# Split Data

In [87]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

y = bank['y']
y = LabelEncoder().fit_transform(y)

X = bank.drop(columns=['y'])

# Devide training data and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Define Processing Steps

In [88]:
ordinal_cols = ['month','day_of_week','education','housing','loan','default']
onehot_cols = ['marital', 'poutcome', 'contact','job']
minmax_cols = ['age', 'cons.price.idx', 'cons.conf.idx', 'nr.employed']
duration_col = ['duration']

In [89]:
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing for oridinal encoded data
ordinal_transformer = OrdinalEncoder()

# Preprocessing for one hot encoded data
onehot_transformer = OneHotEncoder(handle_unknown='ignore')

# Preprocessing for duration column
duration_transformer = Pipeline(steps=[
    ('transform',PowerTransformer(method='yeo-johnson')),
    ('scale',MinMaxScaler())
])

minmax_transfomer = MinMaxScaler()

# Bundle preprocessing for all groups
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal',ordinal_transformer,ordinal_cols),
        ('onehot',onehot_transformer,onehot_cols),
        ('duration',duration_transformer,duration_col),
        ('min max',minmax_transfomer,minmax_cols)
])

# Define the model

In [90]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100,random_state=0)

# Create and Evaluate the Pipeline

In [92]:
from sklearn.metrics import accuracy_score

# create pipeline
bank_pipeline = Pipeline(steps= [
                        ('preprocessor', preprocessor),
                        ('model', model)
                        ])

bank_pipeline.fit(X_train,y_train)

preds = bank_pipeline.predict(X_test)

# Evaluate the model
score = accuracy_score(y_test,preds)
print(f'Accuracy: {score*100} %')

Accuracy: 91.61204175770818 %
