# Less attributes

We are going to drop some attribute we consider can be drop. We are going to stay with attributes related to **personal information only**.

We will drop: month, duration, campaign, pdays, previous, poutcome, emp.var.rate, cons.price.idx, euribor3m, nr.employed

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [2]:
df_encoded = pd.read_csv('df_encoded.csv',delimiter=';')
df_encoded.dtypes

age                 int64
job                 int64
marital             int64
education           int64
default             int64
housing             int64
loan                int64
contact             int64
month               int64
day_of_week         int64
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome            int64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object

In [3]:
#to drop
to_drop = ['month','campaign', 'pdays', 'previous']

In [4]:
df_encoded = df_encoded.drop(to_drop, axis=1)
df_encoded.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,day_of_week,duration,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,3,1,0,0,0,0,1,1,261,1,1.1,93.994,-36.4,4.857,5191.0,0
1,57,7,1,3,1,0,0,1,1,149,1,1.1,93.994,-36.4,4.857,5191.0,0
2,37,7,1,3,0,2,0,1,1,226,1,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,1,1,0,0,0,1,1,151,1,1.1,93.994,-36.4,4.857,5191.0,0
4,56,7,1,3,0,0,2,1,1,307,1,1.1,93.994,-36.4,4.857,5191.0,0


**SAVE**

In [5]:
#df.to_csv(r'df_encoded_personalinfo.csv')

### OneHot Encoding

In [6]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()

#### JOB

In [7]:
job_encoded = pd.get_dummies(df_encoded['job'], prefix='job', drop_first=True)
job_encoded.head()

Unnamed: 0,job_1,job_2,job_3,job_4,job_5,job_6,job_7,job_8,job_9,job_10,job_11
0,0,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0


#### MARITAL

In [8]:
marital_encoded = pd.get_dummies(df_encoded['marital'], prefix='marital', drop_first=True)
marital_encoded.head()

Unnamed: 0,marital_1,marital_2,marital_3
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


#### EDUCATION

In [9]:
education_encoded = pd.get_dummies(df_encoded['education'], prefix='education', drop_first=True)
education_encoded.head()

Unnamed: 0,education_1,education_2,education_3,education_4,education_5,education_6,education_7
0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0
4,0,0,1,0,0,0,0


#### DEFAULT

In [10]:
default_encoded = pd.get_dummies(df_encoded['default'], prefix='default', drop_first=True)
default_encoded.head()

Unnamed: 0,default_1,default_2
0,0,0
1,1,0
2,0,0
3,0,0
4,0,0


#### HOUSING

In [11]:
housing_encoded = pd.get_dummies(df_encoded['housing'], prefix='housing', drop_first=True)
housing_encoded.head()

Unnamed: 0,housing_1,housing_2
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0


#### LOAN

In [12]:
loan_encoded = pd.get_dummies(df_encoded['loan'], prefix='loan', drop_first=True)
loan_encoded.head()

Unnamed: 0,loan_1,loan_2
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1


#### DAY OF WEEK

In [13]:
day_of_week_encoded = pd.get_dummies(df_encoded['day_of_week'], prefix='day_of_week', drop_first=True)
day_of_week_encoded.head()

Unnamed: 0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0


#### POUTCOME

In [14]:
poutcome_encoded = pd.get_dummies(df_encoded['poutcome'], prefix='poutcome', drop_first=True)
poutcome_encoded.head()

Unnamed: 0,poutcome_1,poutcome_2
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


#### NOTES

There are som columns that doesn't need a one hot encoding, such as contact and the numeric classes

## Now let's concat all these dataframes

In [30]:
dfs = [df_encoded['age'],job_encoded,marital_encoded,education_encoded,default_encoded,
       housing_encoded,loan_encoded,poutcome_encoded,df_encoded['contact'],df_encoded['duration'],day_of_week_encoded,df_encoded['emp.var.rate'],
       df_encoded['cons.price.idx'],df_encoded['cons.conf.idx'],df_encoded['euribor3m'],df_encoded['nr.employed'],
       df_encoded['y']]

df_onehot = pd.concat(dfs,axis =1)
df_onehot.head()

Unnamed: 0,age,job_1,job_2,job_3,job_4,job_5,job_6,job_7,job_8,job_9,...,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,0,0,1,0,0,0,0,0,0,...,1,0,0,0,1.1,93.994,-36.4,4.857,5191.0,0
1,57,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1.1,93.994,-36.4,4.857,5191.0,0
2,37,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1.1,93.994,-36.4,4.857,5191.0,0
3,40,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1.1,93.994,-36.4,4.857,5191.0,0
4,56,0,0,0,0,0,0,1,0,0,...,1,0,0,0,1.1,93.994,-36.4,4.857,5191.0,0


In [31]:
df_onehot.dtypes

age                 int64
job_1               uint8
job_2               uint8
job_3               uint8
job_4               uint8
job_5               uint8
job_6               uint8
job_7               uint8
job_8               uint8
job_9               uint8
job_10              uint8
job_11              uint8
marital_1           uint8
marital_2           uint8
marital_3           uint8
education_1         uint8
education_2         uint8
education_3         uint8
education_4         uint8
education_5         uint8
education_6         uint8
education_7         uint8
default_1           uint8
default_2           uint8
housing_1           uint8
housing_2           uint8
loan_1              uint8
loan_2              uint8
poutcome_1          uint8
poutcome_2          uint8
contact             int64
duration            int64
day_of_week_1       uint8
day_of_week_2       uint8
day_of_week_3       uint8
day_of_week_4       uint8
emp.var.rate      float64
cons.price.idx    float64
cons.conf.id

**SAVE**

In [32]:
#Save
#df_onehot.to_csv('df_onehot_personalinfo.csv',sep=';')

## Normilizing

In [33]:
x = df_onehot.values #returns a numpy array
mms = preprocessing.MinMaxScaler()
x_scaled = mms.fit_transform(x)
df_norm = pd.DataFrame(x_scaled)
df_norm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.481481,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.493827,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.246914,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.283951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.481481,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


Getting Column names

In [34]:
columns = list(df_onehot.columns)

In [35]:
df_norm.columns = columns
df_norm.head()

Unnamed: 0,age,job_1,job_2,job_3,job_4,job_5,job_6,job_7,job_8,job_9,...,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,0.481481,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
1,0.493827,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
2,0.246914,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
3,0.283951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0
4,0.481481,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.9375,0.698753,0.60251,0.957379,0.859735,0.0


In [36]:
df_norm.dtypes

age               float64
job_1             float64
job_2             float64
job_3             float64
job_4             float64
job_5             float64
job_6             float64
job_7             float64
job_8             float64
job_9             float64
job_10            float64
job_11            float64
marital_1         float64
marital_2         float64
marital_3         float64
education_1       float64
education_2       float64
education_3       float64
education_4       float64
education_5       float64
education_6       float64
education_7       float64
default_1         float64
default_2         float64
housing_1         float64
housing_2         float64
loan_1            float64
loan_2            float64
poutcome_1        float64
poutcome_2        float64
contact           float64
duration          float64
day_of_week_1     float64
day_of_week_2     float64
day_of_week_3     float64
day_of_week_4     float64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.id

In [37]:
df_norm.to_csv(r'df_onehot_norm.csv',sep=';')