## Baseline: Feature Engineering
- Convert datetime features into integers (day counts)
- Label-encode categorical features into numeric codes
- Create the target column

In [1]:
import pandas as pd
import numpy as np

# Load the cleaned train and test tables (paths are relative to the notebook's
# working directory).
trn, tst = (
    pd.read_csv(path)
    for path in ('data/train_clean.csv', 'data/test_clean.csv')
)

In [3]:
# Convert date columns to integers.
# Each date column is replaced by the number of days between that date and the
# row's snapshot date (`fecha_dato`); the value is negative when the date lies
# after the snapshot. `.astype(int)` raises on missing values, so this assumes
# the cleaned files contain no empty dates -- TODO confirm upstream.
one_day = np.timedelta64(1, 'D')

for col in ('fecha_alta', 'ult_fec_cli_1t'):
    for frame in (trn, tst):
        elapsed = pd.to_datetime(frame['fecha_dato']) - pd.to_datetime(frame[col])
        frame[col] = (elapsed / one_day).astype(int)

In [4]:
# Drop the snapshot date and the customer identifier from both frames.
# `fecha_dato` has already been used to derive the day-count features above.
drop_cols = ['fecha_dato', 'ncodpers']
trn = trn.drop(columns=drop_cols)
tst = tst.drop(columns=drop_cols)

In [5]:
from sklearn.preprocessing import LabelEncoder

# Use LabelEncoder to turn categorical (object-dtype) columns into integer codes.
# The encoder is fitted on train and test values together so that both frames
# share one label -> code mapping and transform() never meets an unseen label.
for col in trn.columns:
    if trn[col].dtype != 'object':
        continue  # already numeric, nothing to encode

    # Every object column in train is looked up in test as well.
    lb = LabelEncoder().fit(pd.concat([trn[col], tst[col]]))
    trn[col] = lb.transform(trn[col])
    tst[col] = lb.transform(tst[col])

In [6]:
# Show column dtypes and non-null counts of the training frame after encoding.
trn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155461 entries, 0 to 1155460
Data columns (total 70 columns):
ind_empleado                   1155461 non-null int64
pais_residencia                1155461 non-null int64
sexo                           1155461 non-null int64
age                            1155461 non-null int64
fecha_alta                     1155461 non-null int64
ind_nuevo                      1155461 non-null int64
antiguedad                     1155461 non-null int64
indrel                         1155461 non-null int64
ult_fec_cli_1t                 1155461 non-null int64
indrel_1mes                    1155461 non-null int64
tiprel_1mes                    1155461 non-null int64
indresi                        1155461 non-null int64
indext                         1155461 non-null int64
conyuemp                       1155461 non-null int64
canal_entrada                  1155461 non-null int64
indfall                        1155461 non-null int64
cod_prov               

In [7]:
# Show column dtypes and non-null counts of the test frame after encoding.
tst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 21 columns):
ind_empleado             929615 non-null int64
pais_residencia          929615 non-null int64
sexo                     929615 non-null int64
age                      929615 non-null int64
fecha_alta               929615 non-null int64
ind_nuevo                929615 non-null int64
antiguedad               929615 non-null int64
indrel                   929615 non-null int64
ult_fec_cli_1t           929615 non-null int64
indrel_1mes              929615 non-null int64
tiprel_1mes              929615 non-null int64
indresi                  929615 non-null int64
indext                   929615 non-null int64
conyuemp                 929615 non-null int64
canal_entrada            929615 non-null int64
indfall                  929615 non-null int64
cod_prov                 929615 non-null int64
nomprov                  929615 non-null int64
ind_actividad_cliente    929615 non-null in

In [11]:
# Remove variables that do not exist in the test data.
# Keep the test-set columns (in test-set order) followed by every training
# column whose name contains 'purchased'.
feature_cols = tst.columns.tolist()
purchased_cols = [name for name in trn.columns if 'purchased' in name]
trn = trn[feature_cols + purchased_cols]

In [12]:
# Display (rows, columns) of the training frame after the column selection.
trn.shape

(1155461, 45)

In [15]:
# Convert the new-purchase indicator columns into a single `target` column.
#
# Layout expected on entry: the leading len(tst.columns) columns of `trn` are
# the features, and every column after them is a 0/1 purchase indicator.
# Each indicator equal to 1 yields one output row: the customer's features plus
# `target`, the 0-based position of that indicator among the indicator columns.
# A row with k purchases therefore appears k times; rows with none are dropped.
#
# This is vectorised instead of looping with iterrows(): iterating over a
# million rows in Python is very slow, and `row[21 + i]` relied on integer keys
# being treated as positions on a string-labelled Series, which pandas has
# deprecated (since 2.1).
n_features = len(tst.columns)
purchased = trn.iloc[:, n_features:].to_numpy() == 1

# np.nonzero walks the 2-D mask in row-major order, so the pairs come out
# sorted by row and then by indicator position -- the order the nested loops
# used to produce.
row_pos, product_pos = np.nonzero(purchased)

cols = tst.columns.tolist()
cols.append('target')

trn = (
    trn.iloc[row_pos, :n_features]
    .reset_index(drop=True)
    .assign(target=product_pos.astype('int64'))
)
trn.columns = cols

In [16]:
# Save the baseline feature-engineered data.
# Both frames are written as CSV without the index column.
for frame, path in (
    (trn, 'data/train_feng.baseline.csv'),
    (tst, 'data/test_feng.baseline.csv'),
):
    frame.to_csv(path, index=False)

In [21]:
# Write the test features to CSV without the index column.
# NOTE(review): the preceding save cell already writes `tst` to this same path,
# so this cell looks redundant -- confirm and consider removing it.
tst.to_csv('data/test_feng.baseline.csv', index=False)

In [17]:
# Preview the first rows of the reshaped training frame (features + target).
trn.head()

Unnamed: 0,ind_empleado,pais_residencia,sexo,age,fecha_alta,ind_nuevo,antiguedad,indrel,ult_fec_cli_1t,indrel_1mes,...,indext,conyuemp,canal_entrada,indfall,cod_prov,nomprov,ind_actividad_cliente,renta,segmento,target
0,3,36,0,35,16,0,6,1,-153,1,...,0,2,153,0,29,31,1,87218,1,2
1,3,36,1,23,901,0,35,1,-153,1,...,1,2,150,0,13,16,0,35548,2,2
2,3,36,1,23,901,0,35,1,-153,1,...,0,2,150,0,13,16,0,122179,2,2
3,3,36,0,22,901,0,35,1,-153,1,...,0,2,149,0,50,51,0,119775,2,9
4,3,36,1,23,901,0,35,1,-153,1,...,0,2,150,0,50,51,1,0,2,2


In [18]:
# Show column dtypes and memory usage of the final training frame.
trn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1778496 entries, 0 to 1778495
Data columns (total 22 columns):
ind_empleado             int64
pais_residencia          int64
sexo                     int64
age                      int64
fecha_alta               int64
ind_nuevo                int64
antiguedad               int64
indrel                   int64
ult_fec_cli_1t           int64
indrel_1mes              int64
tiprel_1mes              int64
indresi                  int64
indext                   int64
conyuemp                 int64
canal_entrada            int64
indfall                  int64
cod_prov                 int64
nomprov                  int64
ind_actividad_cliente    int64
renta                    int64
segmento                 int64
target                   int64
dtypes: int64(22)
memory usage: 298.5 MB


In [20]:
# Count missing values per column in the final training frame.
trn.isnull().sum()

ind_empleado             0
pais_residencia          0
sexo                     0
age                      0
fecha_alta               0
ind_nuevo                0
antiguedad               0
indrel                   0
ult_fec_cli_1t           0
indrel_1mes              0
tiprel_1mes              0
indresi                  0
indext                   0
conyuemp                 0
canal_entrada            0
indfall                  0
cod_prov                 0
nomprov                  0
ind_actividad_cliente    0
renta                    0
segmento                 0
target                   0
dtype: int64

In [19]:
# Show column dtypes and non-null counts of the final test frame.
tst.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 929615 entries, 0 to 929614
Data columns (total 21 columns):
ind_empleado             929615 non-null int64
pais_residencia          929615 non-null int64
sexo                     929615 non-null int64
age                      929615 non-null int64
fecha_alta               929615 non-null int64
ind_nuevo                929615 non-null int64
antiguedad               929615 non-null int64
indrel                   929615 non-null int64
ult_fec_cli_1t           929615 non-null int64
indrel_1mes              929615 non-null int64
tiprel_1mes              929615 non-null int64
indresi                  929615 non-null int64
indext                   929615 non-null int64
conyuemp                 929615 non-null int64
canal_entrada            929615 non-null int64
indfall                  929615 non-null int64
cod_prov                 929615 non-null int64
nomprov                  929615 non-null int64
ind_actividad_cliente    929615 non-null in