In [1]:
import multiprocessing
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, f1_score, roc_auc_score

In [2]:
# Read the three competition CSVs (training features, training labels, test features).
# All files are decoded as cp949 because the column names and categories are Korean.
X_train, y_train, X_test = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in ("data/X_train.csv", "data/y_train.csv", "data/X_test.csv")
)

In [3]:
# Preview the first rows of the training features read from data/X_train.csv.
X_train.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.5,0.0,1
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.0,0.0,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,,보석,본 점,2,1.5,0.0,85


In [4]:
# Encode gender as an integer flag in a single pass: 0 stays 0 and every other
# value becomes 1. Mapping the codes to string labels and straight back to 0/1
# yields exactly this result, so the intermediate string step is not needed.
y_train['gender'] = y_train['gender'].apply(lambda code: 0 if code == 0 else 1)
y_train.head(50)

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,1


In [5]:
# Replace the 0/1 codes with readable labels.
# Assign through the column label rather than iloc[:, 1]: a positional setter
# tries to write the strings into the existing integer column (newer pandas
# versions warn about or reject that dtype change), and it would also hit the
# wrong column if the column order ever changed.
y_train['gender'] = y_train['gender'].map({1: 'male', 0: 'female'})
y_train

Unnamed: 0,cust_id,gender
0,0,female
1,1,female
2,2,male
3,3,male
4,4,female
...,...,...
3495,3495,male
3496,3496,male
3497,3497,female
3498,3498,female


In [6]:
def _gender_to_code(label):
    """Return 1 when the label is 'male', otherwise 0."""
    if label == 'male':
        return 1
    return 0


# Turn the string labels back into the integer target.
y_train['gender'] = y_train['gender'].apply(_gender_to_code)

In [7]:
# Display the label frame to check that gender holds integer codes again.
y_train

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
3495,3495,1
3496,3496,1
3497,3497,0
3498,3498,0


In [8]:
# Keep only the target column as a Series.
# Selecting by label instead of by position means a change in column order
# cannot silently make cust_id the target.
y_train = y_train['gender']
y_train

0       0
1       0
2       1
3       1
4       0
       ..
3495    1
3496    1
3497    0
3498    0
3499    0
Name: gender, Length: 3500, dtype: int64

In [9]:
# Fill missing refund amounts with 0.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a .loc selection: that selection may be a temporary object, in which case the
# in-place fill never reaches X_train (always the case under Copy-on-Write).
X_train['환불금액'] = X_train['환불금액'].fillna(0)
X_train['환불금액']

0       6860000.0
1        300000.0
2             0.0
3             0.0
4             0.0
          ...    
3495          0.0
3496    6049600.0
3497          0.0
3498          0.0
3499    5973000.0
Name: 환불금액, Length: 3500, dtype: float64

In [10]:
# One-hot encode the main purchased product category (one indicator column per category).
dum_1 = pd.get_dummies(data=X_train['주구매상품'])

In [11]:
# One-hot encode the main purchase store (one indicator column per store).
dum_2 = pd.get_dummies(data=X_train['주구매지점'])

In [12]:
# Append the product and store indicator columns to the right of the existing features.
X_train = pd.concat(objs=[X_train, dum_1, dum_2], axis='columns')

In [13]:
# Remove the customer identifier and the two raw categorical columns.
X_train = X_train.drop(columns=['cust_id', '주구매상품', '주구매지점'])

In [14]:
# Display the feature frame after the cust_id, 주구매상품 and 주구매지점 columns were dropped.
X_train

Unnamed: 0,총구매액,최대구매액,환불금액,내점일수,내점당구매건수,주말방문비율,구매주기,가공식품,가구,건강식품,...,안양점,영등포점,울산점,인천점,일산점,잠실점,전주점,창원점,청량리점,포항점
0,68282840,11264000,6860000.0,19,3.894737,0.527027,17,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2136000,2136000,300000.0,2,1.500000,0.000000,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3197000,1639000,0.0,2,2.000000,0.000000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16077620,4935000,0.0,18,2.444444,0.318182,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,29050000,24000000,0.0,2,1.500000,0.000000,85,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3175200,3042900,0.0,1,2.000000,1.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,29628600,7200000,6049600.0,8,1.625000,0.461538,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3497,75000,75000,0.0,1,1.000000,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3498,1875000,1000000,0.0,2,1.000000,0.000000,39,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Min-max scale every feature column; the result is a NumPy array, so the
# column labels are no longer attached to X_train after this cell.
# NOTE(review): the scaler is fitted on all rows before the train_test_split
# call made later in the notebook, so hold-out rows contribute to the min/max.
# Consider fitting on the training portion only.
scale = MinMaxScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_train

array([[5.08100182e-02, 2.00895971e-02, 1.21684497e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.29657701e-02, 7.22639268e-03, 5.32147944e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.34123938e-02, 6.52601882e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.20982004e-02, 4.32202542e-03, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.28559032e-02, 5.62553814e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.32818176e-01, 5.30198514e-02, 1.05950656e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
# Display the target Series once more before it is split.
y_train

0       0
1       0
2       1
3       1
4       0
       ..
3495    1
3496    1
3497    0
3498    0
3499    0
Name: gender, Length: 3500, dtype: int64

In [17]:
# Hold out 20% of the labelled rows for evaluation.
# random_state makes the split reproducible across kernel restarts, and
# stratify keeps the gender ratio the same in both parts.
# NOTE(review): X_test is rebound here, so the frame read from data/X_test.csv
# is no longer reachable under that name once this cell has run.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

In [18]:
# Show the held-out features and labels produced by train_test_split as a tuple.
# Here X_test is the 20% hold-out slice, not the frame read from data/X_test.csv.
X_test, y_test

(array([[6.75572750e-02, 5.61454636e-02, 1.20265435e-02, ...,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
        [2.50368245e-02, 1.41596711e-02, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.19325783e-01, 4.33724481e-02, 2.20543394e-02, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        ...,
        [2.21087872e-02, 4.30370578e-03, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [2.30348053e-02, 7.45750196e-03, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [1.07226637e-01, 1.99430400e-02, 4.07980091e-04, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]]),
 1104    0
 2327    1
 1557    0
 1749    0
 2572    0
        ..
 2488    0
 1051    0
 2860    1
 3393    1
 2394    0
 Name: gender, Length: 700, dtype: int64)