### 시험 주관사 제공 예시문제: 기계적 처리법
##### 도메인 이해, 데이터 이해, 시각화, 모델 기획 단계 skip.
##### numerical, categorical 컬럼 처리 또한 기계적으로 진행합니다.
* 차후 분석 보고서 작성시 문제가 생길 수 있습니다.(해석력 확보 X)
* 실기시험 초단기 패스를 위한 접근법.
* 이 방식을 적용하면 데이터셋이 바뀌어도
  feature / label 재정의 + classifier -> regressor 교체만으로 그대로 적용 가능.

In [None]:
# feature / label 설정과 분석 방향성 설정이 마무리 되었다고 가정.
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings("ignore")

# Load the three EUC-KR encoded CSV files and print the shape of each one.
X_train, y_train, X_test = (
    pd.read_csv(csv_name, encoding='euc_kr')
    for csv_name in ('X_train.csv', 'y_train.csv', 'X_test.csv')
)
for frame in (X_train, y_train, X_test):
    print(frame.shape)

(3500, 10)
(3500, 2)
(2482, 10)


##### 탐색 step

In [None]:
# Transposed summary of every column (numeric and categorical): one row per column.
desc_t = X_train.describe(include='all').transpose()
desc_t.loc[:, 'count']
# The refund-amount column (환불금액) contains nulls.
# -> Its minimum is non-zero, so a null is assumed to mean "no refund" -> fillna(0).
# The main-product (주구매상품) and main-store (주구매지점) columns are categorical
# -> they get one-hot encoded further down.

cust_id    3500
총구매액       3500
최대구매액      3500
환불금액       1205
주구매상품      3500
주구매지점      3500
내점일수       3500
내점당구매건수    3500
주말방문비율     3500
구매주기       3500
Name: count, dtype: object

In [None]:
# describe() with default arguments summarises the numeric columns; keep their names.
numeric_summary = X_train.describe()
num_X_cols = numeric_summary.columns
len(num_X_cols)

8

In [None]:
# log-transformation: applied because the columns are extremely positively skewed.
# cust_id and the weekend-visit ratio (주말방문비율) are excluded from the transform.
not_tgt = ['cust_id', '주말방문비율']
log_cols = [col for col in num_X_cols if col not in not_tgt]
for frame in (X_train, X_test):
    # +1 keeps zero inside the domain of log (log(0 + 1) == 0).
    # NOTE(review): values <= -1 still yield NaN/-inf here — confirm none exist.
    frame[log_cols] = np.log(frame[log_cols] + 1)

In [None]:
# Cardinality check: distinct-value count per column and its share of the row count.
rows = len(X_train)
all_pct = []
for col in X_train.columns:
    # nunique() ignores NaN. np.unique could count every NaN as its own value
    # (NumPy < 1.21), which inflated the figure for columns with missing data.
    uniq_s = X_train[col].nunique()
    # Percentage truncated (not rounded) to one decimal place.
    col_uniq_pct = int((uniq_s / rows) * 1000) / 10
    all_pct.append({'col': col, 'unique_counts': uniq_s, 'uniq_pct': col_uniq_pct})
uniq_df = pd.DataFrame(all_pct).sort_values(by='uniq_pct', ascending=False)
uniq_df
# cust_id is unique per row (primary-key like), so it is not used as a feature.

Unnamed: 0,col,unique_counts,uniq_pct
0,cust_id,3500,100.0
3,환불금액,3409,97.4
1,총구매액,3396,97.0
2,최대구매액,2576,73.6
8,주말방문비율,1142,32.6
7,내점당구매건수,1107,31.6
6,내점일수,147,4.2
9,구매주기,135,3.8
4,주구매상품,42,1.2
5,주구매지점,24,0.6


In [None]:
# Replace every missing value with 0 in both splits.
X_train, X_test = (frame.fillna(0) for frame in (X_train, X_test))

In [None]:
# Frequency encoding
import numpy as np

interested = ['주구매상품', '주구매지점']


def _levels_past_cumulative_share(column, threshold=0.7):
    """Return the levels of *column* whose cumulative frequency exceeds *threshold*.

    Levels are visited from most to least frequent (``value_counts`` order) and
    their relative frequencies are accumulated in that order.
    """
    running, kept = 0, []
    for level, ratio in column.value_counts(normalize=True).items():
        running = running + ratio
        if running > threshold:
            kept.append(level)
    return kept


# Both lists are derived from the training split only.
# NOTE(review): the kept levels are the ones reached AFTER the cumulative share passes
# 0.7 (the less frequent tail); the most frequent levels are the ones replaced by
# 'other'. Confirm this direction is intended given the `top_70` naming.
top_70_1 = _levels_past_cumulative_share(X_train[interested[0]])
top_70_2 = _levels_past_cumulative_share(X_train[interested[1]])

# Every level outside the kept list collapses into the single level 'other'.
for frame in (X_train, X_test):
    for col, kept_levels in zip(interested, (top_70_1, top_70_2)):
        frame[col] = frame[col].where(frame[col].isin(kept_levels), 'other')

# Group-mean features: for every numeric column (except cust_id) add the mean of that
# column within each level of the two categorical columns, named
# '<categorical>_<numeric>_mean'. Each split uses its own group means.
num_cols = [col for col in X_train.describe().columns if col != 'cust_id']
X_dataset = [X_train, X_test]
for dataset in X_dataset:
    for col in num_cols:
        for key in interested:
            # transform() returns one value per original row, so row order and index
            # are untouched. The previous implicit-key inner merges gave no such
            # guarantee, and later cells join the labels by position.
            dataset[key + '_' + col + '_mean'] = dataset.groupby(key)[col].transform('mean')

X_train, X_test = X_dataset

In [None]:
# Target Encoding
label = 'gender'

# Pair every training row with its label. When both frames carry cust_id the join is
# keyed on it, so it stays correct even if X_train's row order no longer matches
# y_train; otherwise fall back to the original positional (index) alignment.
if 'cust_id' in X_train.columns and 'cust_id' in y_train.columns:
    all_train = X_train[['cust_id'] + interested].merge(
        y_train[['cust_id', label]], on='cust_id', how='left')
else:
    all_train = pd.concat([X_train[interested], y_train], axis=1)

# Mean label per level of each categorical column, plus the overall mean as fallback.
# NOTE(review): the rates are computed from all training labels (no out-of-fold scheme).
global_rate = all_train[label].mean()
prod_grouped, spot_grouped = [
    all_train.groupby(key)[label].mean().rename(key + '_target_pct').reset_index()
    for key in interested
]


def _attach_target_rates(frame):
    """Add both '<col>_target_pct' columns to *frame* and return it sorted by cust_id.

    A left merge keeps every row; a level without a training rate receives the
    overall training mean instead of having its rows dropped.
    """
    for table in (prod_grouped, spot_grouped):
        key, rate_col = table.columns
        frame = frame.merge(table, on=key, how='left')
        frame[rate_col] = frame[rate_col].fillna(global_rate)
    return frame.sort_values(by='cust_id').reset_index(drop=True)


X_train = _attach_target_rates(X_train)
X_test = _attach_target_rates(X_test)

In [None]:
# one-hot encoding
# dtype='uint8' keeps the indicator columns numeric. pandas >= 2.0 defaults to bool,
# and bool columns are skipped by describe(), which selects the feature list below.
train_dummies = pd.concat(
    [pd.get_dummies(X_train[col], drop_first=True, prefix=col, dtype='uint8')
     for col in interested],
    axis=1)
# Encode the test split without drop_first and then align it to the training layout:
# the same baseline level is dropped, levels absent from the test split become
# all-zero columns, and levels never seen in training are discarded.
test_dummies = pd.concat(
    [pd.get_dummies(X_test[col], prefix=col, dtype='uint8') for col in interested],
    axis=1).reindex(columns=train_dummies.columns, fill_value=0)

X_train = pd.concat([X_train, train_dummies], axis=1)
X_test = pd.concat([X_test, test_dummies], axis=1)

# Model features = numeric columns of the training frame without the cust_id key.
train_features = [col for col in X_train.describe().columns if col != 'cust_id']
test_features = list(X_test.describe().columns)
# Only the training frame is narrowed here; X_test keeps all of its columns.
X_train = X_train[train_features]

In [None]:
# Scaling with RobustScaler (the import alias `ss` is kept from the original cell).
from sklearn.preprocessing import RobustScaler as ss
scal = ss()
X_train_num_cols = list(X_train.describe().columns)
X_train_fit = pd.DataFrame(scal.fit_transform(X_train[X_train_num_cols]),
                           columns=X_train_num_cols)
X_test_num_cols = list(X_test.describe().columns)
# The test split is transformed with the statistics learned on the training split;
# refitting on it would put the two splits on different scales. It is first aligned
# to the training columns (a missing column is filled with 0), so X_test_fit holds
# exactly the training feature columns and no longer a scaled cust_id.
X_test_fit = pd.DataFrame(
    scal.transform(X_test.reindex(columns=X_train_num_cols, fill_value=0)),
    columns=X_train_num_cols)

In [None]:
# MINI-SMOTE
from sklearn.cluster import KMeans


def _pick_cluster_count(n_rows, class_diff, max_divisor=10):
    """Choose how many synthetic rows (cluster centres) to create.

    Candidates are ``n_rows // k`` for k = 1..max_divisor. The scan stops at the
    first k whose candidate is no closer to *class_diff* than the previous one and
    returns that previous candidate; if every step improves, the last candidate is
    returned. The result is never below 1.
    """
    chosen, chosen_err = max(1, n_rows), float('inf')
    for divisor in range(1, max_divisor + 1):
        candidate = n_rows // divisor
        err = abs(candidate - class_diff)
        if err >= chosen_err:
            break
        chosen, chosen_err = candidate, err
    return max(1, chosen)


class_counts = y_train[label].value_counts()
# The less frequent class is detected instead of being assumed to be 1.
minority_label = class_counts.idxmin()
class_diff = int(class_counts.max() - class_counts.min())

all_train = pd.concat([X_train_fit[train_features], y_train[label]], axis=1)
if class_diff > 0:  # nothing to add when the classes are already balanced
    clusters = _pick_cluster_count(len(y_train), class_diff)
    # random_state makes the synthetic rows reproducible between runs.
    model = KMeans(n_clusters=clusters, random_state=1)
    # NOTE(review): the centres come from clustering ALL training rows, not only the
    # minority class, yet each centre is labelled as minority — confirm this is intended.
    model.fit(X_train_fit[train_features])
    cc = pd.DataFrame(model.cluster_centers_, columns=train_features)
    cc[label] = minority_label
    all_train = pd.concat([all_train, cc]).reset_index(drop=True)
# From here on y_train is a Series of labels rather than a DataFrame.
X_train_fit, y_train = all_train[train_features], all_train[label]

##### 변수 선택

In [None]:
# Wrapper
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier as rf
# Recursive feature elimination around a random forest, with RFE's default
# number of features to keep.
estimator = rf(n_jobs=-1, random_state=1)
selector = RFE(estimator=estimator)
selector.fit(X_train_fit, y_train)
# The support mask is paired with the columns of the frame that was actually fitted,
# so the names cannot drift from the mask. Rows are sorted by (selected, name) as before.
select_or_not = pd.DataFrame(
    sorted([bool(kept), name]
           for kept, name in zip(selector.support_, X_train_fit.columns)),
    columns=['select', 'feature'])
rfe_features = list(select_or_not.loc[select_or_not['select'], 'feature'])
len(rfe_features)

38

In [None]:
# Embed
# Refit the forest on the RFE survivors and keep the top half by feature_importances_.
estimator = rf(n_jobs=-1, random_state=1)
estimator.fit(X_train_fit[rfe_features], y_train)
imp = pd.DataFrame({
    'score': estimator.feature_importances_,
    'feature': list(rfe_features),
}).sort_values(by='score', ascending=False)
keep_n = len(imp) // 2
embed_features = imp['feature'].iloc[:keep_n].tolist()
len(embed_features)

19

In [None]:
# Disabled hold-out split: the code below sits inside a string literal, so none of it
# is executed and `tts` / the *_hold variables are never defined.
'''
from sklearn.model_selection import train_test_split

def tts(feature_sub):
    return train_test_split(X_train_fit[feature_sub], y_train, test_size=0.3, random_state=1)

X_train_hold, X_test_hold, y_train_hold, y_test_hold = tts(rfe_features)
'''

'\nfrom sklearn.model_selection import train_test_split\n\ndef tts(feature_sub):\n    return train_test_split(X_train_fit[feature_sub], y_train, test_size=0.3, random_state=1)\n\nX_train_hold, X_test_hold, y_train_hold, y_test_hold = tts(rfe_features)\n'

In [None]:
# XAI
from sklearn.inspection import permutation_importance as pi
# Permutation importance of the forest fitted on the RFE features, measured on the
# same training data; the top half by mean importance is kept.
perm = pi(estimator, X_train_fit[rfe_features], y_train, random_state=1)
perm_df = pd.DataFrame({'feature': rfe_features, 'perm_imp': perm.importances_mean})
top_half = len(perm_df) // 2
ranked = perm_df.sort_values(by='perm_imp', ascending=False)
perm_features = list(ranked[:top_half]['feature'])
print(len(perm_features))

19


In [None]:
# Keep the features chosen by BOTH rankings. Filtering embed_features (instead of
# listing a set intersection) keeps the column order deterministic: set iteration
# order for strings can change from one interpreter run to the next.
perm_lookup = set(perm_features)
inter_features = [name for name in embed_features if name in perm_lookup]
print(len(inter_features))

17


##### Model fitting step

##### Hyperparameter tuning - Random search
* Coarse search

In [None]:
''' Docstring 이용하여 hyperparameter 정의역 확인
max_depth : int
 |          Maximum tree depth for base learners.
 |      learning_rate : float
 |          Boosting learning rate (xgb's "eta")
 |      verbosity : int
 |          The degree of verbosity. Valid values are 0 (silent) - 3 (debug).
 |      objective : string or callable
 |          Specify the learning task and the corresponding learning objective or
 |          a custom objective function to be used (see note below).
 |      booster: string
 |          Specify which booster to use: gbtree, gblinear or dart.
 |      tree_method: string
 |          Specify which tree method to use.  Default to auto.  If this parameter
 |          is set to default, XGBoost will choose the most conservative option
 |          available.  It's recommended to study this option from parameters
 |          document.
 |      n_jobs : int
 |          Number of parallel threads used to run xgboost.  When used with other Scikit-Learn
 |          algorithms like grid search, you may choose which algorithm to parallelize and
 |          balance the threads.  Creating thread contention will significantly slow down both
 |          algorithms.
 |      gamma : float
 |          Minimum loss reduction required to make a further partition on a leaf
 |          node of the tree.
 |      min_child_weight : float
 |          Minimum sum of instance weight(hessian) needed in a child.
 |      max_delta_step : float
 |          Maximum delta step we allow each tree's weight estimation to be.
 |      subsample : float
 |          Subsample ratio of the training instance.
 |      colsample_bytree : float
 |          Subsample ratio of columns when constructing each tree.
 |      colsample_bylevel : float
 |          Subsample ratio of columns for each level.
 |      colsample_bynode : float
 |          Subsample ratio of columns for each split.
 |      reg_alpha : float (xgb's alpha)
 |          L1 regularization term on weights
 |      reg_lambda : float (xgb's lambda)
 |          L2 regularization term on weights
 |      scale_pos_weight : float
 |          Balancing of positive and negative weights.
 |      base_score:
 |          The initial prediction score of all instances, global bias.
 |      random_state : int
'''
from xgboost import XGBClassifier as xg
from sklearn.metrics import roc_auc_score as roc
from sklearn.model_selection import cross_val_score as cv

def hyper_cv_scorer(model):
    """Return the mean cross-validated ROC-AUC of *model*.

    The score is computed with ``cv`` (cross_val_score) on the module-level
    ``X_train_fit[features]`` and ``y_train``.
    """
    fold_scores = cv(model, X_train_fit[features], y_train, scoring='roc_auc')
    return fold_scores.mean()

from random import randint
import time

features = inter_features

all_waits = 3600  # total time budget (seconds) for hyperparameter tuning
wait_time = all_waits / 3  # one third of the budget is spent in this search loop
t0 = time.time()
results = []
# The clock is checked only before a trial starts, so the last trial may overrun.
while time.time() - t0 <= wait_time:
    # One random configuration; fractional parameters are drawn on a 0.001 grid.
    trial = {
        'max_depth': randint(2, 100),
        'n_estimators': randint(100, 300),
        'learning_rate': randint(1, 999) / 1000,
        'gamma': randint(1, 999) / 1000,
        'min_child_weight': randint(1, 999) / 1000,
        'reg_alpha': randint(1, 999) / 1000,
        'subsample': randint(1, 999) / 1000,
        'colsample_bytree': randint(1, 999) / 1000,
    }
    model = xg(n_jobs=-1, eval_metric='auc', use_label_encoder=False, **trial)
    results.append({'score': hyper_cv_scorer(model), **trial})

# Ten best trials; their per-parameter min/max become the next search range.
params = pd.DataFrame(results).sort_values(by='score', ascending=False)[:10]
param_desc = params.describe()
suggestion = param_desc.loc[['min', 'max']]
params

Unnamed: 0,score,max_depth,n_estimators,learning_rate,gamma,min_child_weight,reg_alpha,subsample,colsample_bytree
176,0.721365,8,181,0.043,0.171,0.914,0.984,0.272,0.068
58,0.714348,2,165,0.199,0.323,0.448,0.41,0.561,0.429
55,0.714098,2,171,0.104,0.99,0.37,0.215,0.295,0.705
196,0.713289,2,112,0.498,0.842,0.195,0.686,0.845,0.192
104,0.712615,42,219,0.065,0.635,0.021,0.806,0.272,0.078
285,0.710904,83,186,0.049,0.649,0.479,0.435,0.885,0.084
267,0.709405,11,119,0.137,0.431,0.088,0.25,0.775,0.113
158,0.709242,2,202,0.244,0.201,0.958,0.2,0.359,0.291
23,0.709221,89,141,0.076,0.015,0.804,0.666,0.775,0.041
169,0.704498,21,219,0.079,0.408,0.839,0.659,0.103,0.116


##### Finer_1 Search
* suggestion 으로부터 범위 가져와 Finer Search 수행

In [None]:
from random import randint

def _grid_bounds(table, name, scale=1):
    """Return integer (low, high) bounds for *name* taken from the min/max rows of *table*.

    describe() yields floats, and random.randint needs true integers (float bounds
    raise TypeError on Python 3.12+). Values are rounded after scaling so float noise
    such as 60.999999 does not truncate to the wrong grid point.
    """
    low = int(round(float(table.loc['min', name]) * scale))
    high = int(round(float(table.loc['max', name]) * scale))
    return low, high


# Search ranges come from the min/max of the best trials of the previous stage.
int_ranges = {name: _grid_bounds(suggestion, name)
              for name in ('max_depth', 'n_estimators')}
# Fractional parameters are sampled on a 0.001 grid, hence the scale of 1000.
frac_ranges = {name: _grid_bounds(suggestion, name, 1000)
               for name in ('learning_rate', 'gamma', 'min_child_weight',
                            'reg_alpha', 'subsample', 'colsample_bytree')}

t0 = time.time()
results = []
# The clock is checked only before a trial starts, so the last trial may overrun.
while time.time() - t0 <= wait_time:
    trial = {name: randint(low, high) for name, (low, high) in int_ranges.items()}
    trial.update({name: randint(low, high) / 1000
                  for name, (low, high) in frac_ranges.items()})
    model = xg(n_jobs=-1, eval_metric='auc', use_label_encoder=False, **trial)
    results.append({'score': hyper_cv_scorer(model), **trial})

# Ten best trials; their per-parameter min/max become the next search range.
params = pd.DataFrame(results).sort_values(by='score', ascending=False)[:10]
param_desc = params.describe()
suggestion = param_desc.loc[['min', 'max']]
suggestion

Unnamed: 0,score,max_depth,n_estimators,learning_rate,gamma,min_child_weight,reg_alpha,subsample,colsample_bytree
min,0.707432,2.0,119.0,0.061,0.201,0.08,0.36,0.175,0.058
max,0.724578,53.0,210.0,0.197,0.948,0.934,0.97,0.855,0.391


##### Finer_2

In [None]:
def _grid_bounds(table, name, scale=1):
    """Return integer (low, high) bounds for *name* taken from the min/max rows of *table*.

    describe() yields floats, and random.randint needs true integers (float bounds
    raise TypeError on Python 3.12+). Values are rounded after scaling so float noise
    such as 60.999999 does not truncate to the wrong grid point.
    """
    low = int(round(float(table.loc['min', name]) * scale))
    high = int(round(float(table.loc['max', name]) * scale))
    return low, high


# Search ranges come from the min/max of the best trials of the previous stage.
int_ranges = {name: _grid_bounds(suggestion, name)
              for name in ('max_depth', 'n_estimators')}
# Fractional parameters are sampled on a 0.001 grid, hence the scale of 1000.
frac_ranges = {name: _grid_bounds(suggestion, name, 1000)
               for name in ('learning_rate', 'gamma', 'min_child_weight',
                            'reg_alpha', 'subsample', 'colsample_bytree')}

t0 = time.time()
results = []
# The clock is checked only before a trial starts, so the last trial may overrun.
while time.time() - t0 <= wait_time:
    trial = {name: randint(low, high) for name, (low, high) in int_ranges.items()}
    trial.update({name: randint(low, high) / 1000
                  for name, (low, high) in frac_ranges.items()})
    model = xg(n_jobs=-1, eval_metric='auc', use_label_encoder=False, **trial)
    results.append({'score': hyper_cv_scorer(model), **trial})

# Only the single best trial is kept; it parameterises the final model.
params = pd.DataFrame(results).sort_values(by='score', ascending=False)[:1]
params

Unnamed: 0,score,max_depth,n_estimators,learning_rate,gamma,min_child_weight,reg_alpha,subsample,colsample_bytree
259,0.732913,3,209,0.089,0.349,0.482,0.602,0.776,0.084


##### Final model

In [None]:
# Rebuild the classifier from the first row of `params` and fit it on the
# training features selected for the search.
tuned_names = ['n_estimators', 'reg_alpha', 'max_depth', 'learning_rate', 'gamma',
               'min_child_weight', 'subsample', 'colsample_bytree']
# Values are read column by column so each parameter keeps its own dtype.
best_kwargs = {name: params[name].values[0] for name in tuned_names}
final_model = xg(n_jobs=-1, eval_metric='auc', use_label_encoder=False, **best_kwargs)

final_model.fit(X_train_fit[features], y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.084, eval_metric='auc',
              gamma=0.349, gpu_id=-1, importance_type='gain',
              interaction_constraints=None, learning_rate=0.089,
              max_delta_step=0, max_depth=3, min_child_weight=0.482,
              missing=nan, monotone_constraints=None, n_estimators=209,
              n_jobs=-1, num_parallel_tree=1, random_state=0, reg_alpha=0.602,
              reg_lambda=1, scale_pos_weight=1, subsample=0.776,
              tree_method=None, use_label_encoder=False,
              validate_parameters=False, verbosity=None)

##### make submit_file

In [None]:
# Submission: cust_id plus the second predict_proba column, stored as 'gender'.
interested = ['cust_id', 'gender']
for_submit = X_test.copy()
class_proba = pd.DataFrame(
    final_model.predict_proba(X_test_fit[features]),
    columns=['no_use', 'gender'])
# Assigned by index: both frames are expected to carry the same 0..n-1 index.
for_submit['gender'] = class_proba['gender']
submission = for_submit[interested]
submission.to_csv('submit.csv', index=False)
submission.head()

Unnamed: 0,cust_id,gender
0,3500,0.613101
1,3501,0.187606
2,3502,0.31775
3,3503,0.574963
4,3504,0.627964
