In [1]:
import sys
import traceback
import pandas as pd
import numpy as np
import multiprocessing
import platform

import sklearn
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [2]:
pd.set_option("display.max_rows",99)
pd.set_option("display.max_columns",99)

In [3]:
##### Load DataSet
X_train_all = pd.read_csv("data/X_train.csv", encoding='cp949')
y_train_all = pd.read_csv("data/y_train.csv", encoding='cp949')
X_test_all = pd.read_csv("data/X_test.csv", encoding='cp949')

In [4]:
print(X_train_all.info())
print(X_test_all.info())
print(y_train_all.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  3500 non-null   int64  
 1   총구매액     3500 non-null   int64  
 2   최대구매액    3500 non-null   int64  
 3   환불금액     1205 non-null   float64
 4   주구매상품    3500 non-null   object 
 5   주구매지점    3500 non-null   object 
 6   내점일수     3500 non-null   int64  
 7   내점당구매건수  3500 non-null   float64
 8   주말방문비율   3500 non-null   float64
 9   구매주기     3500 non-null   int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 273.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2482 entries, 0 to 2481
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   cust_id  2482 non-null   int64  
 1   총구매액     2482 non-null   int64  
 2   최대구매액    2482 non-null   int64  
 3   환불금액     871 non-null    float64
 4   주구매상품    2482 non-null   obj

In [5]:
print(y_train_all['gender'].value_counts())

0    2184
1    1316
Name: gender, dtype: int64


In [6]:
##### Edit DataSet
X_train_all_size= len(X_train_all)
X_all = pd.concat([X_train_all, X_test_all])
X_all = X_all.reset_index()
X_all['gender'] = y_train_all['gender']

In [7]:
print(X_all.info())
print(X_all.head())
print(X_all.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5982 entries, 0 to 5981
Data columns (total 12 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   index    5982 non-null   int64  
 1   cust_id  5982 non-null   int64  
 2   총구매액     5982 non-null   int64  
 3   최대구매액    5982 non-null   int64  
 4   환불금액     2076 non-null   float64
 5   주구매상품    5982 non-null   object 
 6   주구매지점    5982 non-null   object 
 7   내점일수     5982 non-null   int64  
 8   내점당구매건수  5982 non-null   float64
 9   주말방문비율   5982 non-null   float64
 10  구매주기     5982 non-null   int64  
 11  gender   3500 non-null   float64
dtypes: float64(4), int64(6), object(2)
memory usage: 560.9+ KB
None
   index  cust_id      총구매액     최대구매액       환불금액   주구매상품 주구매지점  내점일수  \
0      0        0  68282840  11264000  6860000.0      기타   강남점    19   
1      1        1   2136000   2136000   300000.0     스포츠   잠실점     2   
2      2        2   3197000   1639000        NaN  남성 캐주얼   관악점     2   
3   

In [8]:
print(X_all.describe())

             index      cust_id          총구매액         최대구매액          환불금액  \
count  5982.000000  5982.000000  5.982000e+03  5.982000e+03  2.076000e+03   
mean   1538.310097  2990.500000  9.569838e+07  2.053814e+07  2.469452e+07   
std     934.514523  1726.998987  1.676480e+08  3.330805e+07  5.281222e+07   
min       0.000000     0.000000 -5.242152e+07 -3.744000e+07  5.600000e+03   
25%     747.250000  1495.250000  4.867800e+06  2.875000e+06  2.304000e+06   
50%    1495.000000  2990.500000  2.898500e+07  1.019760e+07  7.627000e+06   
75%    2242.750000  4485.750000  1.142893e+08  2.447250e+07  2.353250e+07   
max    3499.000000  5981.000000  2.861238e+09  7.066290e+08  8.715144e+08   

              내점일수      내점당구매건수       주말방문비율         구매주기       gender  
count  5982.000000  5982.000000  5982.000000  5982.000000  3500.000000  
mean     19.362922     2.828501     0.301672    20.679371     0.376000  
std      26.681326     1.848389     0.286858    24.485411     0.484449  
min       1.00

In [9]:
print(y_train_all['gender'].value_counts())
print(X_all['gender'].value_counts())
print(X_all.groupby('gender')['주구매상품'].value_counts())
print(X_all.groupby('gender')['주구매지점'].value_counts())

0    2184
1    1316
Name: gender, dtype: int64
0.0    2184
1.0    1316
Name: gender, dtype: int64
gender  주구매상품 
0.0     기타        375
        가공식품      317
        농산물       247
        화장품       196
        시티웨어      169
        디자이너      168
        수산품        92
        명품         61
        캐주얼        60
        섬유잡화       48
        일용잡화       43
        모피/피혁      39
        골프         38
        스포츠        33
        육류         31
        구두         26
        아동         26
        차/커피       26
        건강식품       22
        피혁잡화       21
        축산가공       19
        주방용품       18
        젓갈/반찬      14
        남성 캐주얼     13
        트래디셔널      12
        셔츠         11
        생활잡화        8
        주방가전        8
        란제리/내의      6
        커리어         6
        가구          5
        남성정장        4
        대형가전        4
        액세서리        4
        침구/수예       4
        주류          3
        보석          2
        식기          2
        남성 트랜디      1
        소형가전        1
       

In [10]:
print(X_all.groupby('gender')['환불금액'].value_counts())

gender  환불금액       
0.0     250000.0       3
        420000.0       3
        1050000.0      3
        3080000.0      3
        59000.0        2
                      ..
1.0     153190000.0    1
        174984000.0    1
        188570000.0    1
        210195200.0    1
        210771400.0    1
Name: 환불금액, Length: 1147, dtype: int64


In [11]:
##### 결측치
X_all['환불금액']= X_all['환불금액'].fillna(0)

In [12]:
print(X_all.groupby('gender')['환불금액'].value_counts())

gender  환불금액       
0.0     0.0            1269
        250000.0          3
        420000.0          3
        1050000.0         3
        3080000.0         3
                       ... 
1.0     153190000.0       1
        174984000.0       1
        188570000.0       1
        210195200.0       1
        210771400.0       1
Name: 환불금액, Length: 1149, dtype: int64


In [13]:
##### feature Encoding
X_all['주구매상품']=LabelEncoder().fit_transform(X_all['주구매상품'])
X_all['주구매지점']=LabelEncoder().fit_transform(X_all['주구매지점'])

In [14]:
print(X_all.head())

   index  cust_id      총구매액     최대구매액       환불금액  주구매상품  주구매지점  내점일수  \
0      0        0  68282840  11264000  6860000.0      5      0    19   
1      1        1   2136000   2136000   300000.0     21     19     2   
2      2        2   3197000   1639000        0.0      6      1     2   
3      3        3  16077620   4935000        0.0      5      2    18   
4      4        4  29050000  24000000        0.0     15      8     2   

    내점당구매건수    주말방문비율  구매주기  gender  
0  3.894737  0.527027    17     0.0  
1  1.500000  0.000000     1     0.0  
2  2.000000  0.000000     1     1.0  
3  2.444444  0.318182    16     1.0  
4  1.500000  0.000000    85     0.0  


In [15]:
##### feature Scaling
X_all = pd.DataFrame(StandardScaler().fit_transform(X_all), columns=X_all.columns)

In [16]:
print(X_all)

         index   cust_id      총구매액     최대구매액      환불금액     주구매상품     주구매지점  \
0    -1.646244 -1.731761 -0.163544 -0.278459 -0.051427 -0.725353 -1.912205   
1    -1.645174 -1.731182 -0.558135 -0.552530 -0.248711  0.511904  1.441974   
2    -1.644103 -1.730603 -0.551806 -0.567452 -0.257733 -0.648024 -1.735670   
3    -1.643033 -1.730024 -0.474968 -0.468489 -0.257733 -0.725353 -1.559134   
4    -1.641963 -1.729445 -0.397583  0.103943 -0.257733  0.047933 -0.499919   
...        ...       ...       ...       ...       ...       ...       ...   
5977  1.004552  1.729445 -0.078247  0.103223 -0.257733 -0.880010 -0.323383   
5978  1.005622  1.730024 -0.568014 -0.602251 -0.257733  0.202590 -1.559134   
5979  1.006692  1.730603  0.980144  0.156487 -0.257733 -0.648024 -0.499919   
5980  1.007762  1.731182 -0.040009 -0.072605 -0.257733  0.975876 -0.499919   
5981  1.008833  1.731761 -0.567156 -0.610388 -0.257733 -1.111996  0.735831   

          내점일수   내점당구매건수    주말방문비율      구매주기    gender  
0    -

In [18]:
##### feature Selection
X_all = X_all.drop(['cust_id', 'gender'], axis=1)
print(X_all)

         index      총구매액     최대구매액      환불금액     주구매상품     주구매지점      내점일수  \
0    -1.646244 -0.163544 -0.278459 -0.051427 -0.725353 -1.912205 -0.013603   
1    -1.645174 -0.558135 -0.552530 -0.248711  0.511904  1.441974 -0.650806   
2    -1.644103 -0.551806 -0.567452 -0.257733 -0.648024 -1.735670 -0.650806   
3    -1.643033 -0.474968 -0.468489 -0.257733 -0.725353 -1.559134 -0.051086   
4    -1.641963 -0.397583  0.103943 -0.257733  0.047933 -0.499919 -0.650806   
...        ...       ...       ...       ...       ...       ...       ...   
5977  1.004552 -0.078247  0.103223 -0.257733 -0.880010 -0.323383 -0.425911   
5978  1.005622 -0.568014 -0.602251 -0.257733  0.202590 -1.559134 -0.688289   
5979  1.006692  0.980144  0.156487 -0.257733 -0.648024 -0.499919 -0.013603   
5980  1.007762 -0.040009 -0.072605 -0.257733  0.975876 -0.499919 -0.538359   
5981  1.008833 -0.567156 -0.610388 -0.257733 -1.111996  0.735831 -0.650806   

       내점당구매건수    주말방문비율      구매주기  
0     0.576894  0.785662 -

In [20]:
##### train, val, test set
X_train_all = X_all.iloc[:X_train_all_size, :]
X_test_all = X_all.iloc[X_train_all_size:, :]
y_train_oc = y_train_all['gender']
print(X_train_all)
print(X_test_all)
print(y_train_oc)

         index      총구매액     최대구매액      환불금액     주구매상품     주구매지점      내점일수  \
0    -1.646244 -0.163544 -0.278459 -0.051427 -0.725353 -1.912205 -0.013603   
1    -1.645174 -0.558135 -0.552530 -0.248711  0.511904  1.441974 -0.650806   
2    -1.644103 -0.551806 -0.567452 -0.257733 -0.648024 -1.735670 -0.650806   
3    -1.643033 -0.474968 -0.468489 -0.257733 -0.725353 -1.559134 -0.051086   
4    -1.641963 -0.397583  0.103943 -0.257733  0.047933 -0.499919 -0.650806   
...        ...       ...       ...       ...       ...       ...       ...   
3495  2.093979 -0.551936 -0.525300 -0.257733 -0.880010 -0.499919 -0.688289   
3496  2.095049 -0.394131 -0.400482 -0.075798  0.589233 -0.323383 -0.425911   
3497  2.096119 -0.570430 -0.614412 -0.257733  1.362518  1.795046 -0.688289   
3498  2.097189 -0.559692 -0.586638 -0.257733  2.058475 -0.499919 -0.650806   
3499  2.098259  0.998623  0.423172 -0.078102 -0.725353 -0.499919  0.698565   

       내점당구매건수    주말방문비율      구매주기  
0     0.576894  0.785662 -

In [26]:
##### split
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_oc, test_size=0.2, random_state=123)

In [35]:
##### model
model = RandomForestClassifier(n_estimators=100, max_depth=5, min_samples_leaf=8, n_jobs=-1)

In [36]:
model.fit(X_train, y_train)
scores= cross_val_score(model, X_train_all, y_train_oc, cv=5, verbose=1, n_jobs=-1, scoring='accuracy'  )
predicts= model.predict(X_val)
print("정확도:", scores.mean())
print("rocauc:", roc_auc_score(y_val, predicts))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 32 concurrent workers.


정확도: 0.643142857142857
rocauc: 0.5912144702842378


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.8s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished
