In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import confusion_matrix, precision_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression

In [2]:
X_train= pd.read_csv("data/X_train.csv", encoding='cp949')
y_train= pd.read_csv("data/y_train.csv", encoding='cp949')
X_test= pd.read_csv("data/X_test.csv", encoding='cp949')

In [3]:
X_train.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0,68282840,11264000,6860000.0,기타,강남점,19,3.894737,0.527027,17
1,1,2136000,2136000,300000.0,스포츠,잠실점,2,1.5,0.0,1
2,2,3197000,1639000,,남성 캐주얼,관악점,2,2.0,0.0,1
3,3,16077620,4935000,,기타,광주점,18,2.444444,0.318182,16
4,4,29050000,24000000,,보석,본 점,2,1.5,0.0,85


In [4]:
y_train.gender = y_train.gender.apply(lambda x : '여자' if x ==0 else '남자')
y_train['gender'] = y_train.loc[:,'gender'].apply(lambda x: 1 if x=='남자' else 0)
y_train.head(50)

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
5,5,0
6,6,0
7,7,0
8,8,0
9,9,1


In [5]:
y_train.iloc[:,1] = y_train.gender.map({1:'male', 0:'female'})
y_train

Unnamed: 0,cust_id,gender
0,0,female
1,1,female
2,2,male
3,3,male
4,4,female
...,...,...
3495,3495,male
3496,3496,male
3497,3497,female
3498,3498,female


In [6]:
y_train.gender=y_train.gender.apply(lambda x : 1 if x=='male' else 0)

In [7]:
y_train

Unnamed: 0,cust_id,gender
0,0,0
1,1,0
2,2,1
3,3,1
4,4,0
...,...,...
3495,3495,1
3496,3496,1
3497,3497,0
3498,3498,0


In [8]:
y_train = y_train.iloc[:,1]
y_train

0       0
1       0
2       1
3       1
4       0
       ..
3495    1
3496    1
3497    0
3498    0
3499    0
Name: gender, Length: 3500, dtype: int64

In [9]:
X_train.loc[:,'환불금액'].fillna(0, inplace=True)
X_train.환불금액

0       6860000.0
1        300000.0
2             0.0
3             0.0
4             0.0
          ...    
3495          0.0
3496    6049600.0
3497          0.0
3498          0.0
3499    5973000.0
Name: 환불금액, Length: 3500, dtype: float64

In [10]:
dum_1=pd.get_dummies(X_train['주구매상품'])

In [11]:
dum_2=pd.get_dummies(X_train['주구매지점'])

In [12]:
X_train=pd.concat([X_train, dum_1, dum_2], axis=1)

In [13]:
X_train.drop(['cust_id','주구매상품','주구매지점'], axis=1, inplace=True)

In [14]:
X_train

Unnamed: 0,총구매액,최대구매액,환불금액,내점일수,내점당구매건수,주말방문비율,구매주기,가공식품,가구,건강식품,...,안양점,영등포점,울산점,인천점,일산점,잠실점,전주점,창원점,청량리점,포항점
0,68282840,11264000,6860000.0,19,3.894737,0.527027,17,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2136000,2136000,300000.0,2,1.500000,0.000000,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3197000,1639000,0.0,2,2.000000,0.000000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16077620,4935000,0.0,18,2.444444,0.318182,16,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,29050000,24000000,0.0,2,1.500000,0.000000,85,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,3175200,3042900,0.0,1,2.000000,1.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3496,29628600,7200000,6049600.0,8,1.625000,0.461538,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3497,75000,75000,0.0,1,1.000000,0.000000,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3498,1875000,1000000,0.0,2,1.000000,0.000000,39,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
scale= MinMaxScaler()
X_train=scale.fit_transform(X_train)
X_train

array([[5.08100182e-02, 2.00895971e-02, 1.21684497e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.29657701e-02, 7.22639268e-03, 5.32147944e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.34123938e-02, 6.52601882e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [2.20982004e-02, 4.32202542e-03, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.28559032e-02, 5.62553814e-03, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.32818176e-01, 5.30198514e-02, 1.05950656e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [16]:
y_train

0       0
1       0
2       1
3       1
4       0
       ..
3495    1
3496    1
3497    0
3498    0
3499    0
Name: gender, Length: 3500, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2)

In [18]:
X_test, y_test

(array([[0.05819492, 0.04473092, 0.04642991, ..., 0.        , 0.        ,
         0.        ],
        [0.02583292, 0.01682476, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.03118621, 0.01873676, 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.02211841, 0.00438967, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.03538433, 0.01103688, 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.07973453, 0.0446661 , 0.08285739, ..., 0.        , 0.        ,
         0.        ]]),
 1309    0
 505     0
 3246    1
 2696    0
 707     0
        ..
 3275    1
 1943    0
 246     1
 191     0
 2933    0
 Name: gender, Length: 700, dtype: int64)

In [24]:
k_f = KFold(n_splits=5, shuffle=True, random_state=123)

In [20]:
model = LogisticRegression()
model.fit(X_train, y_train)
model.score(X_train,y_train)

0.6610714285714285

In [21]:
model.score(X_test,y_test)

0.6585714285714286

In [27]:
cross_val_score(model,X_test, y_test, cv=k_f)

array([0.66428571, 0.62857143, 0.64285714, 0.65      , 0.62142857])

In [22]:
print(model.coef_)

[[-1.39494087 -1.1095445  -0.95913398 -1.86942138 -0.6420595   0.44072754
   0.05298088 -0.11728495 -0.26807729  0.23672262  0.29188302  0.17513118
  -0.13773777  1.43561834  0.5370395   1.25114233 -0.40046993  0.35906906
  -1.3848726  -0.31932865 -0.02564334 -0.73870953 -0.22764276  0.37668256
   0.05555621  0.50042849 -0.01615918 -0.15683646  0.2474194  -1.05887865
   0.35085956 -0.52243517  0.62549222 -0.69200901 -0.04423444 -0.59694606
   0.4660167   1.19298292  0.90972864  0.04272101 -0.12360056  0.14368487
  -0.887109   -0.06471542 -0.44106275 -0.02730759 -0.26240168 -0.05178479
  -0.62864801 -0.66330114 -0.04324331  0.17610096 -0.42910879  0.75256824
   0.01060791 -0.07190866  0.07107619 -0.01608755 -0.39773231 -0.59396624
  -0.24681431  0.          0.30111734  0.48839695 -0.04026485  0.36863932
   0.48114729 -0.10345784 -0.28474628  1.13074435 -0.38873513 -0.30645883
  -0.19029026]]


In [23]:
    print(model.intercept_)

[0.01589855]


In [28]:
predict=model.predict(X_test)

In [29]:
roc_auc_score(predict, y_test)

0.6129646360351059