<a href="https://colab.research.google.com/github/jermanalopes/FeatureEng/blob/main/FeatureEngineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [170]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [171]:
# Load the credit dataset and split it into features (x) and target (y).
dataset = pd.read_csv('credit_simple.csv', sep=';')
# Every column except the last one is used as a feature. The explicit .copy()
# makes x an independent frame, so the cleaning steps that modify x later on
# do not operate on a slice of `dataset` (no SettingWithCopyWarning, no
# ambiguity about whether `dataset` is modified too).
x = dataset.iloc[:, :-1].copy()
# Target column, selected by name.
y = dataset['CLASSE']

In [172]:
# Pre-processing: fill missing values in SALDO_ATUAL and ESTADOCIVIL.

# Numeric column: impute with the column median.
SaldoAtualMedian = x['SALDO_ATUAL'].median()
# Assign the filled Series back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form emits a FutureWarning in
# pandas 2.x and does not update the frame when Copy-on-Write is enabled.
x['SALDO_ATUAL'] = x['SALDO_ATUAL'].fillna(SaldoAtualMedian)

# Row count per ESTADOCIVIL category (kept for inspection).
grup = x.groupby(['ESTADOCIVIL']).size()
# Categorical column: impute with a fixed category.
# NOTE(review): presumably the most frequent category according to `grup` — verify.
x['ESTADOCIVIL'] = x['ESTADOCIVIL'].fillna('masculino solteiro')

In [173]:
# Outlier handling for SALDO_ATUAL: every value at or above twice the
# column's standard deviation is overwritten with the column median.
# NOTE(review): the cut-off is 2*std on its own, not mean + 2*std — confirm
# that this is the intended rule.
SaldoAtualstd = x['SALDO_ATUAL'].std()
# The median is taken before any value is replaced.
NewSaldoAtualMedian = x['SALDO_ATUAL'].median()
outlier_rows = x['SALDO_ATUAL'] >= 2 * SaldoAtualstd
x.loc[outlier_rows, 'SALDO_ATUAL'] = NewSaldoAtualMedian



In [174]:
# Category binning: fold selected PROPOSITO categories into 'outros'.
merged_purposes = ['Eletrodomésticos', 'qualificação']
x.loc[x['PROPOSITO'].isin(merged_purposes), 'PROPOSITO'] = 'outros'
# Row count per PROPOSITO category after the merge (displayed as cell output).
group_proposito = x.groupby(['PROPOSITO']).size()
group_proposito


PROPOSITO
carro novo             234
carro usado            103
educação                50
mobilia/equipamento    181
negócios                97
obras                   22
outros                  33
radio/tv               280
dtype: int64

In [175]:
# Parse DATA (day/month/year text) once, then derive calendar features from it.
parsed_dates = pd.to_datetime(x['DATA'], format="%d/%m/%Y")
x['DATA'] = parsed_dates
x['ANO'] = parsed_dates.dt.year              # calendar year
x['MES'] = parsed_dates.dt.month             # month number
x['DIASEMANA'] = parsed_dates.dt.day_name()  # weekday name
x['DIASEMANA']

0        Tuesday
1      Wednesday
2       Thursday
3      Wednesday
4      Wednesday
         ...    
995       Friday
996     Saturday
997      Tuesday
998     Thursday
999     Thursday
Name: DIASEMANA, Length: 1000, dtype: object

In [176]:
# Standardize the numeric features with StandardScaler.
# The columns are selected by name rather than by position (x.iloc[:, 0:3]),
# so the scaler cannot silently pick up different columns if the column
# order of x ever changes.
numeric_cols = ['SALDO_ATUAL', 'RESIDENCIADESDE', 'IDADE']
sc = StandardScaler()
# sc_data is a NumPy array with one column per entry of numeric_cols.
sc_data = sc.fit_transform(x[numeric_cols])
sc_data

array([[-0.74551643,  1.04698668,  1.6392759 ],
       [ 0.95774038, -0.76597727, -0.74024139],
       [-0.41533679,  0.14050471,  0.68746898],
       ...,
       [-0.87552244,  1.04698668,  0.1058092 ],
       [-0.50473818,  1.04698668, -0.68736323],
       [ 0.46799171,  1.04698668, -0.47585058]])

In [177]:
# Label-encode the categorical columns and one-hot encode OUTROSPLANOSPGTO.

# One LabelEncoder per column: refitting a single shared instance discards
# the classes learned for the previous column, so the integer codes of
# ESTADOCIVIL and PROPOSITO could no longer be mapped back to their labels.
label_encoders = {}
for col in ['ESTADOCIVIL', 'PROPOSITO', 'DIASEMANA']:
    encoder = LabelEncoder()
    x[col] = encoder.fit_transform(x[col])
    label_encoders[col] = encoder
# Kept for backward compatibility: the original shared encoder ended up
# fitted on DIASEMANA.
labelencoder1 = label_encoders['DIASEMANA']

# One indicator column per OUTROSPLANOSPGTO category, prefixed with 'OUTROS'.
hot_encoder_data = pd.get_dummies(x['OUTROSPLANOSPGTO'], prefix='OUTROS')
hot_encoder_data

Unnamed: 0,OUTROS_banco,OUTROS_nenhum,OUTROS_stores
0,False,True,False
1,False,True,False
2,False,True,False
3,False,True,False
4,False,True,False
...,...,...,...
995,False,True,False
996,False,True,False
997,False,True,False
998,False,True,False


In [178]:
# Concatenate the original features, the one-hot columns and the scaled columns.
# sc_data is a plain array, so wrapping it in a DataFrame would give it a fresh
# 0..n-1 index; pd.concat(axis=1) aligns on the index, so x's index is reused
# explicitly to keep every scaled row attached to the row it was computed from.
scaled_df = pd.DataFrame(
    sc_data,
    columns=['SALDO_ATUAL_NORM', 'RESIDENCEDESDE_NORM', 'IDADE_NORM'],
    index=x.index,
)
x = pd.concat([x, hot_encoder_data, scaled_df], axis=1)
x.head()


Unnamed: 0,SALDO_ATUAL,RESIDENCIADESDE,IDADE,OUTROSPLANOSPGTO,DATA,ESTADOCIVIL,PROPOSITO,ANO,MES,DIASEMANA,OUTROS_banco,OUTROS_nenhum,OUTROS_stores,SALDO_ATUAL_NORM,RESIDENCEDESDE_NORM,IDADE_NORM
0,1169.0,4,67,nenhum,2019-01-01,3,7,2019,1,5,False,True,False,-0.745516,1.046987,1.639276
1,5951.0,2,22,nenhum,2020-01-01,0,7,2020,1,6,False,True,False,0.95774,-0.765977,-0.740241
2,2096.0,3,49,nenhum,2020-01-02,3,2,2020,1,4,False,True,False,-0.415337,0.140505,0.687469
3,7882.0,4,45,nenhum,2019-01-02,3,3,2019,1,6,False,True,False,1.645526,1.046987,0.475956
4,4870.0,4,53,nenhum,2018-01-03,3,0,2018,1,6,False,True,False,0.572709,1.046987,0.898982


In [181]:
# Build the final feature frame: drop the raw columns that now have scaled,
# one-hot or date-derived counterparts, plus the OUTROS_banco indicator.
# NOTE(review): OUTROS_banco is presumably dropped as the redundant dummy level — confirm.
columns_to_drop = [
    'SALDO_ATUAL',
    'RESIDENCIADESDE',
    'IDADE',
    'OUTROSPLANOSPGTO',
    'DATA',
    'OUTROS_banco',
]
new_x = x.drop(columns=columns_to_drop)
new_x









Unnamed: 0,ESTADOCIVIL,PROPOSITO,ANO,MES,DIASEMANA,OUTROS_nenhum,OUTROS_stores,SALDO_ATUAL_NORM,RESIDENCEDESDE_NORM,IDADE_NORM
0,3,7,2019,1,5,True,False,-0.745516,1.046987,1.639276
1,0,7,2020,1,6,True,False,0.957740,-0.765977,-0.740241
2,3,2,2020,1,4,True,False,-0.415337,0.140505,0.687469
3,3,3,2019,1,6,True,False,1.645526,1.046987,0.475956
4,3,0,2018,1,6,True,False,0.572709,1.046987,0.898982
...,...,...,...,...,...,...,...,...,...,...
995,0,3,2018,6,0,True,False,-0.543562,1.046987,-0.264338
996,2,1,2018,6,2,True,False,0.211898,1.046987,0.211566
997,3,7,2018,7,5,True,False,-0.875522,1.046987,0.105809
998,3,7,2019,7,4,True,False,-0.504738,1.046987,-0.687363
