In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [29]:
# scikit-learn commonly used classes
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline

In [33]:
# scikit-learn popular classifier
from sklearn.ensemble import RandomForestClassifier

In [35]:
# # XGBoost
from xgboost import XGBClassifier

In [4]:
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD, Adam
from keras.utils import np_utils
import matplotlib.pyplot as plt

# Fix the seed of NumPy's global random number generator so NumPy-based randomness repeats across runs.
np.random.seed(1671)

In [5]:
# Training hyper-parameter constants.
# NOTE(review): none of these names is referenced in the visible cells; presumably
# they configure a Keras model built later in the notebook - confirm.
BATCH_SIZE = 128
NB_EPOCH = 30
N_HIDDEN = 120
VALIDATION_SPLIT = 0.2  # how much TRAIN is reserved for VALIDATION
VERBOSE = 1
OPTIMIZER = Adam()  # Keras Adam optimizer with its default settings
# NB_CLASSES = 10  # disabled

In [7]:
# Load the click-stream table into a DataFrame.
# The file is decoded with Python's 'cp949' codec (Korean Windows code page).
log = pd.read_csv(
    './data/ML_data_clickStreams.csv',
    encoding='cp949',
)

In [8]:
# Summarize the raw frame: index range, column dtypes and non-null counts.
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 40 columns):
CUS_ID            2000 non-null int64
CT_PORTAL         2000 non-null float64
CT_SEARCH         2000 non-null float64
CT_SHOP           2000 non-null float64
CT_NEWS           2000 non-null float64
CT_MAIL           2000 non-null float64
CT_COMMUNITY      2000 non-null float64
CT_BLOG_SNS       2000 non-null float64
CT_ENTERTAIN      2000 non-null float64
CT_FINANCE        2000 non-null float64
CT_SHOP_INFO      2000 non-null float64
CT_BUSINESS       2000 non-null float64
CT_INTERNET       2000 non-null float64
CT_DOWNLOAD       2000 non-null float64
CT_PUBLIC         2000 non-null float64
CT_GAME           2000 non-null float64
CT_EDU            2000 non-null float64
CT_ETC            2000 non-null float64
COVERAGE          2000 non-null float64
DWELLTIME         2000 non-null int64
PAGEVIEWS         2000 non-null int64
HF0005            2000 non-null float64
HF0611            2

In [9]:
# Preview the first rows of the raw frame.
log.head()

Unnamed: 0,CUS_ID,CT_PORTAL,CT_SEARCH,CT_SHOP,CT_NEWS,CT_MAIL,CT_COMMUNITY,CT_BLOG_SNS,CT_ENTERTAIN,CT_FINANCE,...,DF_SAT,DF_SUN,VSITES,SITECOV,VDAYS,DAYTIME,DAYCOV,SCH_KEYWORDS,SCH_TOPKEYWORD,GENDER
0,1,0.093,0.136,0.001,0.28,0.029,0.192,0.029,0.205,0.001,...,0.081,0.129,201,4.272,205,1746,1.481,399,타이젬,남자
1,2,0.059,0.118,0.323,0.028,0.119,0.017,0.072,0.015,0.011,...,0.007,0.0,573,5.434,105,7962,0.484,1346,티몬,여자
2,3,0.43,0.107,0.179,0.019,0.164,0.012,0.017,0.007,0.001,...,0.026,0.038,270,5.965,105,3699,0.7,329,야놀자,여자
3,4,0.218,0.078,0.039,0.26,0.041,0.173,0.066,0.004,0.037,...,0.054,0.037,166,4.309,107,2575,1.144,134,아이엠간지,남자
4,5,0.332,0.223,0.033,0.2,0.003,0.039,0.039,0.003,0.046,...,0.132,0.028,328,5.75,151,3324,1.058,511,에스티아이,남자


In [10]:
# (number of rows, number of columns) of the raw frame.
log.shape

(2000, 40)

In [11]:
# Column labels of the raw frame (described in the data dictionary that follows).
log.columns

Index(['CUS_ID', 'CT_PORTAL', 'CT_SEARCH', 'CT_SHOP', 'CT_NEWS', 'CT_MAIL',
       'CT_COMMUNITY', 'CT_BLOG_SNS', 'CT_ENTERTAIN', 'CT_FINANCE',
       'CT_SHOP_INFO', 'CT_BUSINESS', 'CT_INTERNET', 'CT_DOWNLOAD',
       'CT_PUBLIC', 'CT_GAME', 'CT_EDU', 'CT_ETC', 'COVERAGE', 'DWELLTIME',
       'PAGEVIEWS', 'HF0005', 'HF0611', 'HF1217', 'HF1823', 'DF_MON', 'DF_TUE',
       'DF_WED', 'DF_THU', 'DF_FRI', 'DF_SAT', 'DF_SUN', 'VSITES', 'SITECOV',
       'VDAYS', 'DAYTIME', 'DAYCOV', 'SCH_KEYWORDS', 'SCH_TOPKEYWORD',
       'GENDER'],
      dtype='object')

 - CT_xxx :: 웹사이트 카테고리 별 체류시간 비율, 즉, 총 17개 카테고리 중에 특정 카테고리에 얼마나 머물렀는가를 비율로 계산한 값
 - COVERAGE :: 서로 다른 웹 사이트에 얼마나 다양하게 접속했는지에 대한 비율('서로 다른 카테고리 수/17'로 계산)
 - DWELLTIME :: 총 체류시간
 - PAGEVIEWS :: 총 페이지뷰
 - HFxxxx :: 시간대별(0-5시, 6-11시, 12-17시, 18-23시) 체류시간 비율 (HF0005, HF0611, HF1217, HF1823)
 - DF_xxx :: 요일별 체류시간 비율
 - VSITES :: 접속한 서로 다른 웹사이트의 수
 - SITECOV :: 웹사이트 카테고리 별 체류시간 변동계수(카테고리별 체류시간의 '표준편차/평균' 값)
 - VDAYS :: 총 접속일수
 - DAYTIME :: 하루 평균 체류시간
 - DAYCOV :: 일별 변동계수(일일 체류시간의 '표준편차/평균' 값)
 - SCH_KEYWORDS :: 네이버에서 검색한 검색량
 - SCH_TOPKEYWORD :: 네이버에서 가장 많이 검색한 검색어
 - GENDER :: 고객성별(남자/여자). 예측하고자 하는 값


In [12]:
# Work on an independent (deep) copy so the categorical encoding applied next
# never modifies the raw `log` frame.
encoded_log = log.copy(deep=True)

In [13]:
# Label-encode the top search keyword: convert the column to a pandas categorical
# and keep only its integer category codes.
# NOTE(review): the codes are arbitrary identifiers; their numeric order carries
# no meaning for the keywords themselves.
encoded_log['SCH_TOPKEYWORD'] = (
    encoded_log['SCH_TOPKEYWORD']
    .astype('category')
    .cat.codes
)

In [19]:
# Preview the encoded frame.
# NOTE(review): the stored output already shows the GENDER_여자 column, which is only
# created by the pd.get_dummies cell further down - this cell was executed out of order.
# On a fresh top-to-bottom run the preview here would still show the GENDER column.
encoded_log.head()

Unnamed: 0,CUS_ID,CT_PORTAL,CT_SEARCH,CT_SHOP,CT_NEWS,CT_MAIL,CT_COMMUNITY,CT_BLOG_SNS,CT_ENTERTAIN,CT_FINANCE,...,DF_SAT,DF_SUN,VSITES,SITECOV,VDAYS,DAYTIME,DAYCOV,SCH_KEYWORDS,SCH_TOPKEYWORD,GENDER_여자
0,1,0.093,0.136,0.001,0.28,0.029,0.192,0.029,0.205,0.001,...,0.081,0.129,201,4.272,205,1746,1.481,399,1240,0
1,2,0.059,0.118,0.323,0.028,0.119,0.017,0.072,0.015,0.011,...,0.007,0.0,573,5.434,105,7962,0.484,1346,1274,1
2,3,0.43,0.107,0.179,0.019,0.164,0.012,0.017,0.007,0.001,...,0.026,0.038,270,5.965,105,3699,0.7,329,852,1
3,4,0.218,0.078,0.039,0.26,0.041,0.173,0.066,0.004,0.037,...,0.054,0.037,166,4.309,107,2575,1.144,134,824,0
4,5,0.332,0.223,0.033,0.2,0.003,0.039,0.039,0.003,0.046,...,0.132,0.028,328,5.75,151,3324,1.058,511,872,0


In [16]:
# Encode the GENDER column using one-hot encoding.
# drop_first=True drops the first category's indicator, so only one indicator
# column remains (GENDER_여자 in the notebook's outputs).
# The guard makes the cell safe to re-run: get_dummies consumes the GENDER column,
# so calling it a second time on the already-encoded frame would raise a KeyError.
if 'GENDER' in encoded_log.columns:
    encoded_log = pd.get_dummies(encoded_log, columns=['GENDER'], drop_first=True)

In [17]:
# Preview the raw frame again; the encoding was applied to the `encoded_log` copy,
# and the output shows `log` still holding the original string values.
log.head()

Unnamed: 0,CUS_ID,CT_PORTAL,CT_SEARCH,CT_SHOP,CT_NEWS,CT_MAIL,CT_COMMUNITY,CT_BLOG_SNS,CT_ENTERTAIN,CT_FINANCE,...,DF_SAT,DF_SUN,VSITES,SITECOV,VDAYS,DAYTIME,DAYCOV,SCH_KEYWORDS,SCH_TOPKEYWORD,GENDER
0,1,0.093,0.136,0.001,0.28,0.029,0.192,0.029,0.205,0.001,...,0.081,0.129,201,4.272,205,1746,1.481,399,타이젬,남자
1,2,0.059,0.118,0.323,0.028,0.119,0.017,0.072,0.015,0.011,...,0.007,0.0,573,5.434,105,7962,0.484,1346,티몬,여자
2,3,0.43,0.107,0.179,0.019,0.164,0.012,0.017,0.007,0.001,...,0.026,0.038,270,5.965,105,3699,0.7,329,야놀자,여자
3,4,0.218,0.078,0.039,0.26,0.041,0.173,0.066,0.004,0.037,...,0.054,0.037,166,4.309,107,2575,1.144,134,아이엠간지,남자
4,5,0.332,0.223,0.033,0.2,0.003,0.039,0.039,0.003,0.046,...,0.132,0.028,328,5.75,151,3324,1.058,511,에스티아이,남자


In [20]:
# Preview the fully encoded frame (integer keyword codes and the GENDER_여자 indicator).
encoded_log.head()

Unnamed: 0,CUS_ID,CT_PORTAL,CT_SEARCH,CT_SHOP,CT_NEWS,CT_MAIL,CT_COMMUNITY,CT_BLOG_SNS,CT_ENTERTAIN,CT_FINANCE,...,DF_SAT,DF_SUN,VSITES,SITECOV,VDAYS,DAYTIME,DAYCOV,SCH_KEYWORDS,SCH_TOPKEYWORD,GENDER_여자
0,1,0.093,0.136,0.001,0.28,0.029,0.192,0.029,0.205,0.001,...,0.081,0.129,201,4.272,205,1746,1.481,399,1240,0
1,2,0.059,0.118,0.323,0.028,0.119,0.017,0.072,0.015,0.011,...,0.007,0.0,573,5.434,105,7962,0.484,1346,1274,1
2,3,0.43,0.107,0.179,0.019,0.164,0.012,0.017,0.007,0.001,...,0.026,0.038,270,5.965,105,3699,0.7,329,852,1
3,4,0.218,0.078,0.039,0.26,0.041,0.173,0.066,0.004,0.037,...,0.054,0.037,166,4.309,107,2575,1.144,134,824,0
4,5,0.332,0.223,0.033,0.2,0.003,0.039,0.039,0.003,0.046,...,0.132,0.028,328,5.75,151,3324,1.058,511,872,0


In [23]:
# Split data
# Target: the GENDER_여자 indicator column.
dfy = encoded_log['GENDER_여자']
# Predictors: every column except the customer identifier and the target itself.
dfX = encoded_log.drop(labels=['CUS_ID', 'GENDER_여자'], axis='columns')

# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    dfX,
    dfy,
    test_size=0.2,
    random_state=0,
)

In [30]:
# Tune models: hyper-parameter grids for the grid search.
# Every key has the form '<pipeline step name>__<estimator parameter>'.

# Random forest grid.
rf_params = dict(
    rf__max_features=np.arange(5, 10),   # number of features
    rf__n_estimators=[100, 300, 500],    # number of trees
)

# XGBoost grid.
# colsample_bytree is deliberately not searched; to add it, the key must be
# 'xgb__colsample_bytree' (double underscore), e.g. np.arange(.1, 1.05, .05).
xgb_params = dict(
    xgb__subsample=np.arange(.5, 1.0, .2),
    xgb__max_depth=np.arange(3, 10, 3),
)

In [36]:
# Make pipelines: each one standardizes the features and then fits a classifier.
# The step names ('rf', 'xgb') are the prefixes expected by the parameter grids.
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(n_jobs=-1, random_state=0)),
])

xgb_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(n_jobs=-1, random_state=0)),
])

In [37]:
# Candidates for tuning, each a (name, pipeline, parameter grid) triple.
rf_candidate = ('rf', rf_pipeline, rf_params)
xgb_candidate = ('xgb', xgb_pipeline, xgb_params)
models = [rf_candidate, xgb_candidate]

In [39]:
# Run grid search & CV
# For every candidate pipeline: tune its hyper-parameters with 5-fold cross-validation
# on the training split, keep the fitted search object, and remember the best candidate.
#
# Results:
#   tuned_models - dict mapping candidate name -> fitted GridSearchCV object
#   best_model   - name of the candidate with the highest mean cross-validated score
#   best_score   - held-out test score of that candidate (-1 if `models` is empty)
#
# The winner is chosen by the cross-validated score, not by the test score: picking
# the model that happens to score best on X_test would leak the test set into model
# selection and make the reported test score optimistically biased.
tuned_models = {}
best_score = -1
best_model = None
best_cv_score = -1
for name, model, param in models:
    grid_search = GridSearchCV(model, param, cv=5).fit(X_train, y_train)
    # Mean cross-validated score of the best parameter setting (training data only).
    cv_score = grid_search.best_score_
    # Held-out score of the refitted best estimator; reported, never used for selection.
    score = grid_search.score(X_test, y_test)
    print("{} ==> {} {}".format(name, score, grid_search.best_params_))
    print("{} cv score ==> {}".format(name, cv_score))
    tuned_models[name] = grid_search
    if cv_score > best_cv_score:
        best_cv_score = cv_score
        best_score = score
        best_model = name

rf ==> 0.68 {'rf__max_features': 8, 'rf__n_estimators': 300}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


xgb ==> 0.685 {'xgb__max_depth': 3, 'xgb__subsample': 0.5}


  if diff:
  if diff:
  if diff:
