# Bank dataset preprocessing

See: Deep Learning with TensorFlow, Md. Rezaul Karim, Giancarlo Zaccone

## Loading, converting to numeric

In [74]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [75]:
data = pd.read_csv("data/bank/bank-additional-full.csv", sep = ';')

In [76]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [77]:
data.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [78]:
data.isna().sum(axis = 0)

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [79]:
varNames = data.columns.tolist()

In [80]:
categs = [colName for colName, colType in zip(varNames, data.dtypes) if colType == object and colName != 'y']
# duration???

In [81]:
categs

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'day_of_week',
 'poutcome']

In [82]:
dummyVars = [pd.get_dummies(data[varName]) for varName in categs]

In [83]:
dictMap = dict()
yMap = {'yes': 1, 'no': 0}
dictMap['y'] = yMap
data.replace(inplace = True, to_replace = dictMap)
label = data['y']

In [84]:
quantit = [i for i in varNames if i not in categs]
dfNumerical = data[quantit]

In [85]:
dfNames = dfNumerical.keys().tolist()

In [86]:
dfNames

['age',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'y']

## Normalizing

In [87]:
minMaxScaler = preprocessing.MinMaxScaler()
xScaled = minMaxScaler.fit_transform(dfNumerical);

  return self.partial_fit(X, y)


In [88]:
dfTemp = pd.DataFrame(data = xScaled, columns = dfNames)

In [89]:
dfTemp.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0.481481,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.493827,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.246914,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.283951,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.481481,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [90]:
normalizedDf = pd.concat([dfTemp] + dummyVars, axis = 1)

In [91]:
normalizedDf.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,oct,sep,fri,mon,thu,tue,wed,failure,nonexistent,success
0,0.481481,0.05307,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,0,0,0,1,0,0,0,0,1,0
1,0.493827,0.030297,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,0,0,0,1,0,0,0,0,1,0
2,0.246914,0.045954,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,0,0,0,1,0,0,0,0,1,0
3,0.283951,0.030704,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,0,0,0,1,0,0,0,0,1,0
4,0.481481,0.062424,0.0,1.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,...,0,0,0,1,0,0,0,0,1,0


In [92]:
normalizedDf.to_csv('data/bank/bank_normalized.csv', index = False)