In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import datetime
import locale                                         
import pickle
from tqdm import tqdm
import warnings
import math
warnings.filterwarnings("ignore")
locale.setlocale(locale.LC_ALL, 'ko_KR.UTF-8') 

'ko_KR.UTF-8'

In [2]:
with open('../data/merged_data_concat_0926.pickle', 'rb') as f:
    performance_data = pickle.load(f)

In [3]:
performance_data['마더코드'] = performance_data['마더코드'].map(str)
performance_data['상품코드'] = performance_data['상품코드'].map(str)

## **성능 Test**

### **라벨인코딩**

In [4]:
# 라벨 인코딩
test_data = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# test_data['상품코드'] = test_data['상품코드'].map(int)
for feat in ['상품명','상품코드','마더코드','prime_time','중분류','요일','season','남여','muil','브랜드','season_prod','소분류']:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)




In [5]:
prod_group = test_data["상품군"].unique()
prod_group_dct = {v:k for k, v in enumerate(prod_group)}

In [6]:
## 추후 전체 set에 대한 mape를 구하기 위해서 split을 해줌.

predict_data = test_data[test_data.취급액 == -1].reset_index(drop=True)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

In [7]:
test_len = [len(X_test[X_test["상품군"] == prod_group[i]]) for i in range(len(prod_group))]
print(test_len)

[1083, 978, 1643, 971, 326, 1291, 692, 196, 924, 575, 166]


In [8]:
# K-Fold용 데이터 만들기
def make_grp_data(X,idx):
    X_data = X[X["상품군"] == prod_group[idx]].drop(["상품군"], axis=1)
#     y_data = y[X["상품군"] == prod_group[idx]]['취급액']
    return X_data

clothes= make_grp_data(train_set,0)
inner = make_grp_data(train_set,1)
kitchen = make_grp_data(train_set,2)
food = make_grp_data(train_set,3)
beauty = make_grp_data(train_set,4)
elec = make_grp_data(train_set,5)
goods = make_grp_data(train_set,6)
health = make_grp_data(train_set,7)
etc = make_grp_data(train_set,8)
furn = make_grp_data(train_set,9)
bed = make_grp_data(train_set,10)

In [9]:
def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    result = np.mean(np.abs((actual - pred) / actual)) * 100
    return result

In [10]:
original_colnames = list(clothes.drop('취급액',axis=1).columns)
new_colnames = []
for i in range(len(clothes.drop('취급액',axis=1).columns)):
    new_colnames.append(i)
colnames_dic = {}
for i in range(len(original_colnames)):
    colnames_dic[str(new_colnames[i])] = original_colnames[i]

In [11]:
original_colnames = list(X_train.columns)
new_colnames = []
for i in range(len(X_train.columns)):
    new_colnames.append(i)
colnames_dic2 = {}
for i in range(len(original_colnames)):
    colnames_dic2[str(new_colnames[i])] = original_colnames[i]

In [12]:
# 각 상품군별 삭제할 피쳐
all_del_colmns = [33,10,37,35,16,18,9,14,15,21,24,17]
clothes_del_columns = [37,34,33,32,26,36,20,12,13,16,18]
inner_del_columns = [34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16]
kitchen_del_columns = [25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11]
food_del_columns = [25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14]
beauty_del_columns = [11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15]
elec_del_columns = [8,10,11,12,13,14,17,18,19,25,32,33,34,35,36]
goods_del_columns = [13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37]
health_del_columns = [8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37]
etc_del_columns = [9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37]
furn_del_columns = [8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36]
bed_del_columns = [8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36]

all_del_colmns = [colnames_dic2[str(i)] for i in all_del_colmns]
clothes_del_columns = [colnames_dic[str(i)] for i in clothes_del_columns]
inner_del_columns = [colnames_dic[str(i)] for i in inner_del_columns]
kitchen_del_columns = [colnames_dic[str(i)] for i in kitchen_del_columns]
food_del_columns = [colnames_dic[str(i)] for i in food_del_columns]
beauty_del_columns = [colnames_dic[str(i)] for i in beauty_del_columns]
elec_del_columns = [colnames_dic[str(i)] for i in elec_del_columns]
goods_del_columns = [colnames_dic[str(i)] for i in goods_del_columns]
health_del_columns = [colnames_dic[str(i)] for i in health_del_columns]
etc_del_columns = [colnames_dic[str(i)] for i in etc_del_columns]
furn_del_columns = [colnames_dic[str(i)] for i in furn_del_columns]
bed_del_columns = [colnames_dic[str(i)] for i in bed_del_columns]

In [13]:
train_set = train_set.drop(all_del_colmns,axis=1)

clothes = clothes.drop(clothes_del_columns,axis=1)
inner = inner.drop(inner_del_columns,axis=1)
kitchen = kitchen.drop(kitchen_del_columns,axis=1)
food = food.drop(food_del_columns,axis=1)
beauty = beauty.drop(beauty_del_columns,axis=1)
elec = elec.drop(elec_del_columns,axis=1)
goods = goods.drop(goods_del_columns,axis=1)
health = health.drop(health_del_columns,axis=1)
etc = etc.drop(etc_del_columns,axis=1)
furn = furn.drop(furn_del_columns,axis=1)
bed = bed.drop(bed_del_columns,axis=1)

In [14]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    new_X_train = X_train[X_train["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_X_test = X_test[X_test["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_y_train = y_train[X_train["상품군"] == prod_group[grp_index]]
    new_y_test = y_test[X_test["상품군"] == prod_group[grp_index]]
    return new_X_train, new_X_test, new_y_train, new_y_test

X_train_clothes, X_test_clothes, y_train_clothes, y_test_clothes = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 0)
X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 1)
X_train_kitchen, X_test_kitchen, y_train_kitchen, y_test_kitchen = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 2)
X_train_food, X_test_food, y_train_food, y_test_food = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 3)
X_train_beauty, X_test_beauty, y_train_beauty, y_test_beauty = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 4)
X_train_elec, X_test_elec, y_train_elec, y_test_elec = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 5)
X_train_goods, X_test_goods, y_train_goods, y_test_goods = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 6)
X_train_health, X_test_health, y_train_health, y_test_health = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 7)
X_train_etc, X_test_etc, y_train_etc, y_test_etc = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 8)
X_train_furn, X_test_furn, y_train_furn, y_test_furn = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 9)
X_train_bed, X_test_bed, y_train_bed, y_test_bed = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 10)

In [15]:


X_train_clothes = X_train_clothes.drop(clothes_del_columns,axis=1)
X_train_inner = X_train_inner.drop(inner_del_columns,axis=1)
X_train_kitchen = X_train_kitchen.drop(kitchen_del_columns,axis=1)
X_train_food = X_train_food.drop(food_del_columns,axis=1)
X_train_beauty = X_train_beauty.drop(beauty_del_columns,axis=1)
X_train_elec = X_train_elec.drop(elec_del_columns,axis=1)
X_train_goods = X_train_goods.drop(goods_del_columns,axis=1)
X_train_health = X_train_health.drop(health_del_columns,axis=1)
X_train_etc = X_train_etc.drop(etc_del_columns,axis=1)
X_train_furn = X_train_furn.drop(furn_del_columns,axis=1)
X_train_bed = X_train_bed.drop(bed_del_columns,axis=1)


X_test_clothes = X_test_clothes.drop(clothes_del_columns,axis=1)
X_test_inner = X_test_inner.drop(inner_del_columns,axis=1)
X_test_kitchen = X_test_kitchen.drop(kitchen_del_columns,axis=1)
X_test_food = X_test_food.drop(food_del_columns,axis=1)
X_test_beauty = X_test_beauty.drop(beauty_del_columns,axis=1)
X_test_elec = X_test_elec.drop(elec_del_columns,axis=1)
X_test_goods = X_test_goods.drop(goods_del_columns,axis=1)
X_test_health = X_test_health.drop(health_del_columns,axis=1)
X_test_etc = X_test_etc.drop(etc_del_columns,axis=1)
X_test_furn = X_test_furn.drop(furn_del_columns,axis=1)
X_test_bed = X_test_bed.drop(bed_del_columns,axis=1)

In [16]:
test_data['상품군'] = lbe.fit_transform(test_data['상품군'].astype(str).values)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

# grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

X_train = X_train.drop(all_del_colmns,axis=1)
X_test = X_test.drop(all_del_colmns,axis=1)

In [17]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

X_train_clothes_scaled = pd.DataFrame(scaler.fit_transform(X_train_clothes))
X_test_clothes_scaled = pd.DataFrame(scaler.transform(X_test_clothes))
X_train_inner_scaled = pd.DataFrame(scaler.fit_transform(X_train_inner))
X_test_inner_scaled = pd.DataFrame(scaler.transform(X_test_inner))
X_train_kitchen_scaled = pd.DataFrame(scaler.fit_transform(X_train_kitchen))
X_test_kitchen_scaled = pd.DataFrame(scaler.transform(X_test_kitchen))
X_train_food_scaled = pd.DataFrame(scaler.fit_transform(X_train_food))
X_test_food_scaled = pd.DataFrame(scaler.transform(X_test_food))
X_train_beauty_scaled = pd.DataFrame(scaler.fit_transform(X_train_beauty))
X_test_beauty_scaled = pd.DataFrame(scaler.transform(X_test_beauty))
X_train_elec_scaled = pd.DataFrame(scaler.fit_transform(X_train_elec))
X_test_elec_scaled = pd.DataFrame(scaler.transform(X_test_elec))
X_train_goods_scaled = pd.DataFrame(scaler.fit_transform(X_train_goods))
X_test_goods_scaled = pd.DataFrame(scaler.transform(X_test_goods))
X_train_health_scaled = pd.DataFrame(scaler.fit_transform(X_train_health))
X_test_health_scaled = pd.DataFrame(scaler.transform(X_test_health))
X_train_etc_scaled = pd.DataFrame(scaler.fit_transform(X_train_etc))
X_test_etc_scaled = pd.DataFrame(scaler.transform(X_test_etc))
X_train_furn_scaled = pd.DataFrame(scaler.fit_transform(X_train_furn))
X_test_furn_scaled = pd.DataFrame(scaler.transform(X_test_furn))
X_train_bed_scaled = pd.DataFrame(scaler.fit_transform(X_train_bed))
X_test_bed_scaled = pd.DataFrame(scaler.transform(X_test_bed))

#### **튜닝 전 성능**

mape: 22.1579455684485  
mape: 26.5763001623147  
mape: 27.49715254148381  
mape: 15.874558341118911  
mape: 19.217135630635212  
mape: 40.07481282197277  
mape: 31.623747738429696  
mape: 21.80460073752754  
mape: 38.29687137332689  
mape: 41.68921261024886  
mape: 25.20877367938055  

최종: 29.200866247957723

## Tunning

### **LGBM**

In [18]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation

In [19]:
def lgb_eval(num_leaves,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = train_set    
    LGBM = LGBMRegressor(objective = 'gamma',
        booster='dart',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
#         feature_fraction=feature_fraction,
#         bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
#         'feature_fraction': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2500),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

In [673]:
# all
bo = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo.maximize(n_iter = 200, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-44.74   [0m | [0m 0.1323  [0m | [0m 16.3    [0m | [0m 60.28   [0m | [0m 54.49   [0m | [0m 423.7   [0m | [0m 0.06494 [0m | [0m 1.375e+0[0m | [0m 914.5   [0m | [0m 0.9891  [0m |
| [95m 2       [0m | [95m-44.63   [0m | [95m 0.1075  [0m | [95m 17.29   [0m | [95m 52.89   [0m | [95m 56.8    [0m | [95m 925.6   [0m | [95m 0.008033[0m | [95m 674.3   [0m | [95m 32.46   [0m | [95m 0.9498  [0m |
| [0m 3       [0m | [0m-45.48   [0m | [0m 0.1667  [0m | [0m 18.31   [0m | [0m 97.86   [0m | [0m 79.92   [0m | [0m 461.5   [0m | [0m 0.07827 [0m | [0m 736.5   [0m | [0m 659.6   [0m | [0m 0.743   [0m |
| [95m 4       [0m | [95m-44.36   [0m | [95m 0.191

In [674]:
def make_param(x):
    best_params = x.max['params']
    best_params['random_state'] = 0
    if x == (bo_6 or bo_10):
        best_params['objective'] = 'mape'
    else:
        best_params['objective'] = 'gamma'
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])
    best_params['min_child_samples'] = int(best_params['min_child_samples'])
    best_params['num_leaves'] = int(best_params['num_leaves'])
    best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])
    return best_params

In [675]:
LGBMRegressor(**make_param(bo))

LGBMRegressor(learning_rate=0.1641933472071217, max_depth=20,
              min_child_samples=57, min_data_in_leaf=208, min_split_gain=0.001,
              n_estimators=1240, num_leaves=118, objective='gamma',
              random_state=0, sub_sample=0.7)

In [215]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = clothes    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

In [216]:
# clothes
bo_1 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_1.maximize(n_iter = 300, init_points = 20, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-31.15   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-44.97   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-45.01   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [146]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = inner    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
 
mape: 26.5763001623147  


In [147]:
# inner
bo_2 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_2.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-39.94   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-59.64   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-58.01   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [148]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = kitchen    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
 
mape: 27.49715254148381  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [149]:
# kitchen
bo_3 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_3.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-34.78   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-41.53   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-44.98   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [150]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = food    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**

mape: 15.874558341118911  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [151]:
# food
bo_4 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_4.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-22.47   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-30.19   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-30.54   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [152]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = beauty    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
  
mape: 19.217135630635212  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [153]:
# beauty
bo_5 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_5.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-76.2    [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-133.0   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-129.2   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [154]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = elec    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,               
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**

mape: 40.07481282197277  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [155]:
# elec
bo_6 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_6.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-54.37   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-56.83   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-57.54   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [156]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = goods    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**

mape: 31.623747738429696  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [157]:
# goods
bo_7 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_7.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-75.31   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-153.2   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-105.7   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [158]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = health    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
 
mape: 21.80460073752754  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [159]:
# health
bo_8 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_8.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-129.4   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-129.4   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-129.4   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [217]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = etc    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,                 
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
 
mape: 38.29687137332689  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [218]:
# etc
bo_9 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_9.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-44.01   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-58.64   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-55.66   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [219]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = furn    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,               
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
  
mape: 41.68921261024886  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [220]:
# furn
bo_10 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_10.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-59.32   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-119.3   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-66.61   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [221]:
def lgb_eval(num_leaves,feature_fraction, bagging_fraction,max_depth,learning_rate,min_split_gain,min_child_samples,
             sub_sample,n_estimators,min_data_in_leaf,min_child_weight):

    data = bed    
    LGBM = LGBMRegressor(objective = 'tweedie',
        num_leaves = int(num_leaves),
        max_depth = int(max_depth),
        min_child_samples = int(min_child_samples),
        min_data_in_leaf= int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,              
        feature_fraction=feature_fraction,
        bagging_fraction= bagging_fraction,
        learning_rate= learning_rate,
        min_split_gain = min_split_gain,
        sub_sample = sub_sample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        LGBM.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)

        test_pred[valid_idx] = LGBM.predict(valid_x, num_iteration = LGBM.best_iteration_)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'num_leaves': (12, 1024),
        'feature_fraction': (0.1, 0.9),
        'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'learning_rate': (0.05, 0.2),
        'min_split_gain': (0.001, 0.1),
        'min_child_samples': (0, 100),
        'sub_sample': (0.7, 1),
        'n_estimators': (500, 2000),
        'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 100)}

#### **튜닝 전 성능**
 
mape: 25.20877367938055  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [222]:
# bed
bo_11 = BayesianOptimization(lgb_eval, pbounds = params,random_state=0)
bo_11.maximize(n_iter = 300, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | baggin... | featur... | learni... | max_depth | min_ch... | min_ch... | min_da... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-130.5   [0m | [0m 0.7744  [0m | [0m 0.6722  [0m | [0m 0.1404  [0m | [0m 14.08   [0m | [0m 42.37   [0m | [0m 64.59   [0m | [0m 437.6   [0m | [0m 0.08929 [0m | [0m 1.945e+0[0m | [0m 400.0   [0m | [0m 0.9375  [0m |
| [0m 2       [0m | [0m-130.5   [0m | [0m 0.7644  [0m | [0m 0.5544  [0m | [0m 0.1888  [0m | [0m 7.923   [0m | [0m 8.713   [0m | [0m 2.023   [0m | [0m 832.6   [0m | [0m 0.07804 [0m | [0m 1.805e+0[0m | [0m 1.002e+0[0m | [0m 0.9397  [0m |
| [0m 3       [0m | [0m-130.5   [0m | [0m 0.7307  [0m | [0m 0.7244  [0m | [0m 0.06774 [0m | [0m 15.32   [0m | [0m 14.34   [0m | [0m 94.47   [0m 

In [223]:
def make_param(x):
    best_params = x.max['params']
    best_params['random_state'] = 0
    if x == (bo_6 or bo_10):
        best_params['objective'] = 'mape'
    else:
        best_params['objective'] = 'tweedie'
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])
    best_params['min_child_samples'] = int(best_params['min_child_samples'])
    best_params['num_leaves'] = int(best_params['num_leaves'])
    best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])
    return best_params

In [224]:
bo_list = [bo_1,bo_2,bo_3,bo_4,bo_5,bo_6,bo_7,bo_8,bo_9,bo_10,bo_11]
for i in range(len(bo_list)):
    globals()['lgbm{}'.format(i+1)] = LGBMRegressor(**make_param(bo_list[i]))

In [491]:
bo_1.max['params']

{'feature_fraction': 0.11044585954726314,
 'learning_rate': 0.09234645565909387,
 'max_depth': 7.757582362185127,
 'min_child_weight': 951.5931410110306,
 'min_split_gain': 0.06921673321404705,
 'n_estimators': 1121.2775603673385,
 'num_leaves': 283.4873502495312,
 'sub_sample': 0.7745809267896179}

### **XGB**

In [700]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = train_set    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

In [701]:
# clothes
bo = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo.maximize(n_iter = 100, init_points = 20, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | max_le... | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-34.43   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 0.06067 [0m | [0m 14.08   [0m | [0m 440.7   [0m | [0m 645.9   [0m | [0m 1.156e+0[0m | [0m 0.9459  [0m |
| [0m 2       [0m | [0m-34.69   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 0.07938 [0m | [0m 13.88   [0m | [0m 586.9   [0m | [0m 925.6   [0m | [0m 606.6   [0m | [0m 0.5436  [0m |
| [0m 3       [0m | [0m-35.56   [0m | [0m 0.1162  [0m | [0m 0.1749  [0m | [0m 0.07804 [0m | [0m 18.31   [0m | [0m 1.002e+0[0m | [0m 799.2   [0m | [0m 1.192e+0[0m | [0m 0.8903  [0m |
| [0m 4       [0m | [0m-34.74   [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 0.01519 [0m | [0m 19.28   [0m | [0m 540.1   [0m | [0m 414

In [704]:
def make_param_xgb(x):
    best_params = x.max['params']
    best_params['random_state'] = 0
    if x == (bo_6 or bo_10):
        best_params['objective'] = mape_objective_function
    else:
        best_params['objective'] = 'reg:tweedie'

    
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])
#     best_params['min_child_samples'] = int(best_params['min_child_samples'])
    best_params['max_leaves'] = int(best_params['max_leaves'])
#     best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])
    return best_params
xgb_all = make_param_xgb(bo)

In [707]:
XGBRegressor(**xgb_all)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.4876619535400296,
             eta=0.05669375986990678, gamma=0.007357900012337478, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=7,
             max_leaves=96, min_child_weight=136.74405777597332, missing=nan,
             monotone_constraints=None, n_estimators=1963, n_jobs=None,
             num_parallel_tree=None, objective='reg:tweedie', random_state=0,
             reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
             subsample=0.7165695744381803, tree_method=None,
             validate_parameters=None, verbosity=None)

In [610]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = clothes    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        booster = 'dart',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

In [611]:
def mape_objective_function(labels,preds):
    
    grad = (preds - labels) / (0.2 + labels * np.abs(preds - labels))
    hess = 0.1 + np.zeros(len(preds));
    return grad, hess

In [614]:
# clothes
bo_1 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_1.maximize(n_iter = 100, init_points = 20, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | max_le... | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-28.11   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 0.06067 [0m | [0m 14.08   [0m | [0m 440.7   [0m | [0m 645.9   [0m | [0m 1.156e+0[0m | [0m 0.9459  [0m |
| [95m 2       [0m | [95m-27.55   [0m | [95m 0.8709  [0m | [95m 0.1075  [0m | [95m 0.07938 [0m | [95m 13.88   [0m | [95m 586.9   [0m | [95m 925.6   [0m | [95m 606.6   [0m | [95m 0.5436  [0m |
| [95m 3       [0m | [95m-26.0    [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 0.07804 [0m | [95m 18.31   [0m | [95m 1.002e+0[0m | [95m 799.2   [0m | [95m 1.192e+0[0m | [95m 0.8903  [0m |
| [0m 4       [0m | [0m-28.95   [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 0.01519 [0m | [0m 19.28   [0m | [0m 540

KeyboardInterrupt: 

In [228]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = inner    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        booster = 'dart',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
 
mape: 26.5763001623147  


In [229]:
# inner
bo_2 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_2.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-30.79   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [0m 2       [0m | [0m-31.77   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 17.29   [0m | [0m 528.9   [0m | [0m 0.05724 [0m | [0m 1.888e+0[0m | [0m 83.89   [0m | [0m 0.7261  [0m |
| [0m 3       [0m | [0m-31.05   [0m | [0m 0.1162  [0m | [0m 0.1749  [0m | [0m 17.12   [0m | [0m 870.0   [0m | [0m 0.09788 [0m | [0m 1.699e+0[0m | [0m 479.0   [0m | [0m 0.9342  [0m |
| [95m 4       [0m | [95m-25.52   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [95m 0.05266 [0m | 

In [230]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = kitchen    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
 
mape: 27.49715254148381  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [231]:
# kitchen
bo_3 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_3.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-33.26   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [0m 2       [0m | [0m-34.33   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 17.29   [0m | [0m 528.9   [0m | [0m 0.05724 [0m | [0m 1.888e+0[0m | [0m 83.89   [0m | [0m 0.7261  [0m |
| [95m 3       [0m | [95m-29.65   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [95m 4       [0m | [95m-29.45   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [95m 0.0526

In [232]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = food    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**

mape: 15.874558341118911  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [233]:
# food
bo_4 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_4.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-20.41   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [0m 2       [0m | [0m-21.65   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 17.29   [0m | [0m 528.9   [0m | [0m 0.05724 [0m | [0m 1.888e+0[0m | [0m 83.89   [0m | [0m 0.7261  [0m |
| [95m 3       [0m | [95m-20.25   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [95m 4       [0m | [95m-16.84   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [95m 0.0526

In [234]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):
    data = beauty    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )
    
    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
  
mape: 19.217135630635212  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [235]:
# beauty
bo_5 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_5.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-21.0    [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [0m 2       [0m | [0m-21.83   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 17.29   [0m | [0m 528.9   [0m | [0m 0.05724 [0m | [0m 1.888e+0[0m | [0m 83.89   [0m | [0m 0.7261  [0m |
| [95m 3       [0m | [95m-19.31   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [95m 4       [0m | [95m-19.12   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [95m 0.0526

In [376]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = elec    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**

mape: 40.07481282197277  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [377]:
# elec
bo_6 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_6.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | colsam... |    eta    |   gamma   | max_depth | max_le... | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-49.16   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 0.06067 [0m | [0m 14.08   [0m | [0m 440.7   [0m | [0m 645.9   [0m | [0m 1.156e+0[0m | [0m 0.9459  [0m |
| [0m 2       [0m | [0m-49.94   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 0.07938 [0m | [0m 13.88   [0m | [0m 586.9   [0m | [0m 925.6   [0m | [0m 606.6   [0m | [0m 0.5436  [0m |
| [0m 3       [0m | [0m-54.84   [0m | [0m 0.1162  [0m | [0m 0.1749  [0m | [0m 0.07804 [0m | [0m 18.31   [0m | [0m 1.002e+0[0m | [0m 799.2   [0m | [0m 1.192e+0[0m | [0m 0.8903  [0m |
| [0m 4       [0m | [0m-53.65   [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 0.01519 [0m | [0m 19.28   [0m | [0m 540.1   [0m | [0m 414

In [238]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = goods    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**

mape: 31.623747738429696  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [239]:
# goods
bo_7 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_7.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-38.73   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [95m 2       [0m | [95m-38.67   [0m | [95m 0.8709  [0m | [95m 0.1075  [0m | [95m 17.29   [0m | [95m 528.9   [0m | [95m 0.05724 [0m | [95m 1.888e+0[0m | [95m 83.89   [0m | [95m 0.7261  [0m |
| [95m 3       [0m | [95m-38.02   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [95m 4       [0m | [95m-33.96   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [

In [240]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = health    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
 
mape: 21.80460073752754  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [241]:
# health
bo_8 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_8.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-27.12   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [0m 2       [0m | [0m-30.13   [0m | [0m 0.8709  [0m | [0m 0.1075  [0m | [0m 17.29   [0m | [0m 528.9   [0m | [0m 0.05724 [0m | [0m 1.888e+0[0m | [0m 83.89   [0m | [0m 0.7261  [0m |
| [95m 3       [0m | [95m-21.7    [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [0m 4       [0m | [0m-23.02   [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 8.864   [0m | [0m 944.7   [0m | [0m 0.05266 [0m 

In [242]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):
    data = etc    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
 
mape: 38.29687137332689  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [243]:
# etc
bo_9 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_9.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-38.7    [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [95m 2       [0m | [95m-38.69   [0m | [95m 0.8709  [0m | [95m 0.1075  [0m | [95m 17.29   [0m | [95m 528.9   [0m | [95m 0.05724 [0m | [95m 1.888e+0[0m | [95m 83.89   [0m | [95m 0.7261  [0m |
| [95m 3       [0m | [95m-37.17   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [95m 4       [0m | [95m-35.93   [0m | [95m 0.1946  [0m | [95m 0.146   [0m | [95m 8.864   [0m | [95m 944.7   [0m | [

In [244]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = furn    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
  
mape: 41.68921261024886  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [245]:
# furn
bo_10 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_10.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-51.22   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [95m 2       [0m | [95m-50.71   [0m | [95m 0.8709  [0m | [95m 0.1075  [0m | [95m 17.29   [0m | [95m 528.9   [0m | [95m 0.05724 [0m | [95m 1.888e+0[0m | [95m 83.89   [0m | [95m 0.7261  [0m |
| [0m 3       [0m | [0m-56.42   [0m | [0m 0.1162  [0m | [0m 0.1749  [0m | [0m 17.12   [0m | [0m 870.0   [0m | [0m 0.09788 [0m | [0m 1.699e+0[0m | [0m 479.0   [0m | [0m 0.9342  [0m |
| [0m 4       [0m | [0m-53.03   [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 8.864   [0m | [0m 944.7   [0m | [0m 0.05266 [0m 

In [246]:
def xgb_eval(max_leaves,colsample_bytree,max_depth,eta,gamma,
             subsample,n_estimators,min_child_weight):

    data = bed    
    xgb = XGBRegressor(objective = 'reg:tweedie',
        max_leaves  = int(max_leaves),
        max_depth = int(max_depth),
#         min_child_samples = int(min_child_samples),
#         min_child_weight = int(min_data_in_leaf),
        n_jobs=6,
        random_state=0,
        colsample_bytree=colsample_bytree,
#         bagging_fraction= bagging_fraction,
        eta= eta,
        gamma = gamma,
        subsample = subsample,
        n_estimators= int(n_estimators),
        min_child_weight= min_child_weight
    )

    folds = KFold(n_splits = 5, shuffle = True, random_state = 0)
        
    test_pred = np.zeros(data.shape[0])
    
    feats = [f for f in data.columns if f != '취급액']
    
    scaler = StandardScaler()
    
    
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(data[feats], data['취급액'])):
        train_x, train_y = data[feats].iloc[train_idx], data['취급액'].iloc[train_idx]
        train_x = scaler.fit_transform(train_x)
        valid_x, valid_y = data[feats].iloc[valid_idx], data['취급액'].iloc[valid_idx]
        valid_x = scaler.transform(valid_x)
        xgb.fit(train_x, train_y, 
                eval_set = [(train_x, train_y), (valid_x, valid_y)],
                verbose=False,early_stopping_rounds = 100)
        
        best_iteration = xgb.get_booster().best_ntree_limit
        test_pred[valid_idx] = xgb.predict(valid_x, ntree_limit = best_iteration)
        del train_x, train_y, valid_x, valid_y
   
  
    return -mape(data['취급액'],test_pred)

params = {'max_leaves': (12, 1024),
        'colsample_bytree': (0.1, 0.9),
#         'bagging_fraction': (0.5, 1),
        'max_depth': (7, 20),
        'eta': (0.05, 0.2),
        'gamma': (0.001, 0.1),
#         'min_child_samples': (0, 100),
        'subsample': (0.5, 1),
        'n_estimators': (500, 2000),
#         'min_data_in_leaf': (0, 1000),
        'min_child_weight': (0.001, 1000)}

#### **튜닝 전 성능**
 
mape: 25.20877367938055  


clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [247]:
# bed
bo_11 = BayesianOptimization(xgb_eval, pbounds = params,random_state=0)
bo_11.maximize(n_iter = 100, init_points = 10, acq = 'ei', xi = 0.01, random_state = 0)

|   iter    |  target   | featur... | learni... | max_depth | min_ch... | min_sp... | n_esti... | num_le... | sub_sa... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m-28.55   [0m | [0m 0.5391  [0m | [0m 0.1573  [0m | [0m 14.84   [0m | [0m 544.9   [0m | [0m 0.04294 [0m | [0m 1.469e+0[0m | [0m 454.8   [0m | [0m 0.9675  [0m |
| [95m 2       [0m | [95m-27.78   [0m | [95m 0.8709  [0m | [95m 0.1075  [0m | [95m 17.29   [0m | [95m 528.9   [0m | [95m 0.05724 [0m | [95m 1.888e+0[0m | [95m 83.89   [0m | [95m 0.7261  [0m |
| [95m 3       [0m | [95m-22.48   [0m | [95m 0.1162  [0m | [95m 0.1749  [0m | [95m 17.12   [0m | [95m 870.0   [0m | [95m 0.09788 [0m | [95m 1.699e+0[0m | [95m 479.0   [0m | [95m 0.9342  [0m |
| [0m 4       [0m | [0m-24.4    [0m | [0m 0.1946  [0m | [0m 0.146   [0m | [0m 8.864   [0m | [0m 944.7   [0m | [0m 0.0

In [261]:
def make_param_xgb(x):
    best_params = x.max['params']
    best_params['random_state'] = 0
    if x == (bo_6 or bo_10):
        best_params['objective'] = mape_objective_function
    else:
        best_params['objective'] = 'reg:tweedie'
    best_params['max_leaves'] = best_params.pop('num_leaves')
    best_params['colsample_bytree'] = best_params.pop('feature_fraction')
    best_params['eta'] = best_params.pop('learning_rate')
    best_params['gamma'] = best_params.pop('min_split_gain')
    best_params['subsample'] = best_params.pop('sub_sample')
    
    best_params['max_depth'] = int(best_params['max_depth'])
    best_params['n_estimators'] = int(best_params['n_estimators'])
#     best_params['min_child_samples'] = int(best_params['min_child_samples'])
    best_params['max_leaves'] = int(best_params['max_leaves'])
#     best_params['min_data_in_leaf'] = int(best_params['min_data_in_leaf'])
    return best_params

In [262]:
bo_list = [bo_1,bo_2,bo_3,bo_4,bo_5,bo_6,bo_7,bo_8,bo_9,bo_10,bo_11]
for i in range(len(bo_list)):
    globals()['xgb{}'.format(i+1)] = XGBRegressor(**make_param_xgb(bo_list[i]))

### **CATBOOST**

In [16]:
from catboost import CatBoostRegressor, Pool

In [15]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    new_X_train = X_train[X_train["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_X_test = X_test[X_test["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_y_train = y_train[X_train["상품군"] == prod_group[grp_index]]
    new_y_test = y_test[X_test["상품군"] == prod_group[grp_index]]
    return new_X_train, new_X_test, new_y_train, new_y_test

X_train_clothes_cat, X_test_clothes_cat, y_train_clothes_cat, y_test_clothes_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 0)
X_train_inner_cat, X_test_inner_cat, y_train_inner_cat, y_test_inner_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 1)
X_train_kitchen_cat, X_test_kitchen_cat, y_train_kitchen_cat, y_test_kitchen_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 2)
X_train_food_cat, X_test_food_cat, y_train_food_cat, y_test_food_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 3)
X_train_beauty_cat, X_test_beauty_cat, y_train_beauty_cat, y_test_beauty_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 4)
X_train_elec_cat, X_test_elec_cat, y_train_elec_cat, y_test_elec_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 5)
X_train_goods_cat, X_test_goods_cat, y_train_goods_cat, y_test_goods_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 6)
X_train_health_cat, X_test_health_cat, y_train_health_cat, y_test_health_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 7)
X_train_etc_cat, X_test_etc_cat, y_train_etc_cat, y_test_etc_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 8)
X_train_furn_cat, X_test_furn_cat, y_train_furn_cat, y_test_furn_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 9)
X_train_bed_cat, X_test_bed_cat, y_train_bed_cat, y_test_bed_cat = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 10)

#### **clothes**

##### **튜닝 전**  
25.145399979198285

In [24]:
CatBoost_clothes = CatBoostRegressor(n_estimators = 3000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=1,
                             random_seed = 0,
                             task_type='CPU')

In [None]:
pred_clothes_cat = CatBoost_clothes.fit(X_train_clothes_cat, y_train_clothes_cat).predict(X_test_clothes_cat)

In [26]:
mape(pred_clothes_cat,y_test_clothes_cat)

25.145399979198285

##### **튜닝 후**  
24.23689010234943

In [41]:
CatBoost_clothes = CatBoostRegressor(n_estimators = 3200,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 8,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_clothes_cat = CatBoost_clothes.fit(X_train_clothes_cat, y_train_clothes_cat).predict(X_test_clothes_cat)
print(mape(pred_clothes_cat,y_test_clothes_cat))

24.23689010234943


#### **inner**

##### **튜닝 전**  
43.99056799986272

In [43]:
CatBoost_inner = CatBoostRegressor(n_estimators = 3000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_inner_cat = CatBoost_inner.fit(X_train_inner_cat, y_train_inner_cat).predict(X_test_inner_cat)
print(mape(pred_inner_cat,y_test_inner_cat))

43.99056799986272


##### **튜닝 후**  
29.53169226635896

In [52]:
CatBoost_inner = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_inner_cat = CatBoost_inner.fit(X_train_inner_cat, y_train_inner_cat).predict(X_test_inner_cat)
print(mape(pred_inner_cat,y_test_inner_cat))

29.53169226635896


#### **kitchen**

##### **튜닝 전**  
64.55274205390863

In [51]:
CatBoost_kitchen = CatBoostRegressor(n_estimators = 3000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_kitchen_cat = CatBoost_kitchen.fit(X_train_kitchen_cat, y_train_kitchen_cat).predict(X_test_kitchen_cat)
print(mape(pred_kitchen_cat,y_test_kitchen_cat))

64.55274205390863


##### **튜닝 후**  
30.24951438451378

In [56]:
CatBoost_kitchen = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_kitchen_cat = CatBoost_kitchen.fit(X_train_kitchen_cat, y_train_kitchen_cat).predict(X_test_kitchen_cat)
print(mape(pred_kitchen_cat,y_test_kitchen_cat))

30.24951438451378


#### **food**

##### **튜닝 전**  
19.704997926203458

In [57]:
CatBoost_food = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_food_cat = CatBoost_food.fit(X_train_food_cat, y_train_food_cat).predict(X_test_food_cat)
print(mape(pred_food_cat,y_test_food_cat))

19.704997926203458


##### **튜닝 후**  
17.998711705582476

In [65]:
CatBoost_food = CatBoostRegressor(n_estimators = 6000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 9,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_food_cat = CatBoost_food.fit(X_train_food_cat, y_train_food_cat).predict(X_test_food_cat)
print(mape(pred_food_cat,y_test_food_cat))

17.998711705582476


#### **beauty**

##### **튜닝 전**  
18.052581659186103

In [66]:
CatBoost_beauty = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_beauty_cat = CatBoost_beauty.fit(X_train_beauty_cat, y_train_beauty_cat).predict(X_test_beauty_cat)
print(mape(pred_beauty_cat,y_test_beauty_cat))

18.052581659186103


##### **튜닝 후**  
18.052581659186103

In [73]:
CatBoost_beauty = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_beauty_cat = CatBoost_beauty.fit(X_train_beauty_cat, y_train_beauty_cat).predict(X_test_beauty_cat)
print(mape(pred_beauty_cat,y_test_beauty_cat))

18.052581659186103


#### **elec**

##### **튜닝 전**  
42.00516138344872

In [74]:
CatBoost_elec = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_elec_cat = CatBoost_elec.fit(X_train_elec_cat, y_train_elec_cat).predict(X_test_elec_cat)
print(mape(pred_elec_cat,y_test_elec_cat))

42.00516138344872


##### **튜닝 후**  
40.749478469398454

In [78]:
CatBoost_elec = CatBoostRegressor(n_estimators = 6000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 9,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_elec_cat = CatBoost_elec.fit(X_train_elec_cat, y_train_elec_cat).predict(X_test_elec_cat)
print(mape(pred_elec_cat,y_test_elec_cat))

40.749478469398454


#### **goods**

##### **튜닝 전**  
31.892540437625026

In [79]:
CatBoost_goods = CatBoostRegressor(n_estimators = 4500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 8,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_goods_cat = CatBoost_goods.fit(X_train_goods_cat, y_train_goods_cat).predict(X_test_goods_cat)
print(mape(pred_goods_cat,y_test_goods_cat))

31.892540437625026


##### **튜닝 후**  
31.70004831869703

In [82]:
CatBoost_goods = CatBoostRegressor(n_estimators = 5500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 8,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_goods_cat = CatBoost_goods.fit(X_train_goods_cat, y_train_goods_cat).predict(X_test_goods_cat)
print(mape(pred_goods_cat,y_test_goods_cat))

31.70004831869703


#### **health**

##### **튜닝 전**  
25.5302415747131

In [83]:
CatBoost_health = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_health_cat = CatBoost_health.fit(X_train_health_cat, y_train_health_cat).predict(X_test_health_cat)
print(mape(pred_health_cat,y_test_health_cat))

25.5302415747131


##### **튜닝 후**  
25.513938134667402

In [91]:
CatBoost_health = CatBoostRegressor(n_estimators = 5200,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_health_cat = CatBoost_health.fit(X_train_health_cat, y_train_health_cat).predict(X_test_health_cat)
print(mape(pred_health_cat,y_test_health_cat))

25.513938134667402


#### **etc**

##### **튜닝 전**  
32.83289173591722

In [88]:
CatBoost_etc = CatBoostRegressor(n_estimators = 3500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_etc_cat = CatBoost_etc.fit(X_train_etc_cat, y_train_etc_cat).predict(X_test_etc_cat)
print(mape(pred_etc_cat,y_test_etc_cat))

32.83289173591722


##### **튜닝 후**  
32.378940364044254

In [94]:
CatBoost_etc = CatBoostRegressor(n_estimators = 5000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 8,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_etc_cat = CatBoost_etc.fit(X_train_etc_cat, y_train_etc_cat).predict(X_test_etc_cat)
print(mape(pred_etc_cat,y_test_etc_cat))

32.378940364044254


#### **furn**

##### **튜닝 전**  
41.95334916764361

In [95]:
CatBoost_furn = CatBoostRegressor(n_estimators = 3500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_furn_cat = CatBoost_furn.fit(X_train_furn_cat, y_train_furn_cat).predict(X_test_furn_cat)
print(mape(pred_furn_cat,y_test_furn_cat))

41.95334916764361


##### **튜닝 후**  
41.95317552001951

In [103]:
CatBoost_furn = CatBoostRegressor(n_estimators = 4000,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_furn_cat = CatBoost_furn.fit(X_train_furn_cat, y_train_furn_cat).predict(X_test_furn_cat)
print(mape(pred_furn_cat,y_test_furn_cat))

41.95317552001951


#### **bed**

##### **튜닝 전**  
25.25815984598247

In [104]:
CatBoost_bed = CatBoostRegressor(n_estimators = 3500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_bed_cat = CatBoost_bed.fit(X_train_bed_cat, y_train_bed_cat).predict(X_test_bed_cat)
print(mape(pred_bed_cat,y_test_bed_cat))

25.25815984598247


##### **튜닝 후**  
25.180334318321613

In [111]:
CatBoost_bed = CatBoostRegressor(n_estimators = 4500,
                             loss_function = 'MAE',
#                              loss_function = 'Tweedie:variance_power=1.1',
                             eval_metric = 'MAPE',
#                              learning_rate = 0.1,
                             depth = 7,
                             cat_features = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
pred_bed_cat = CatBoost_bed.fit(X_train_bed_cat, y_train_bed_cat).predict(X_test_bed_cat)
print(mape(pred_bed_cat,y_test_bed_cat))

25.180334318321613


모델 저장

In [135]:
CatBoost_clothes.save_model("../model/boosting_model/catboost_clothes.cbm")
CatBoost_inner.save_model("../model/boosting_model/catboost_inner.cbm")
CatBoost_kitchen.save_model("../model/boosting_model/catboost_kitchen.cbm")
CatBoost_food.save_model("../model/boosting_model/catboost_food.cbm")
CatBoost_beauty.save_model("../model/boosting_model/catboost_beauty.cbm")
CatBoost_elec.save_model("../model/boosting_model/catboost_elec.cbm")
CatBoost_goods.save_model("../model/boosting_model/catboost_goods.cbm")
CatBoost_health.save_model("../model/boosting_model/catboost_health.cbm")
CatBoost_etc.save_model("../model/boosting_model/catboost_etc.cbm")
CatBoost_furn.save_model("../model/boosting_model/catboost_furn.cbm")
CatBoost_bed.save_model("../model/boosting_model/catboost_bed.cbm")

In [136]:
CatBoost_clothes = CatBoostRegressor()
CatBoost_inner = CatBoostRegressor()
CatBoost_kitchen = CatBoostRegressor()
CatBoost_food = CatBoostRegressor()
CatBoost_beauty = CatBoostRegressor()
CatBoost_elec = CatBoostRegressor()
CatBoost_goods = CatBoostRegressor()
CatBoost_health = CatBoostRegressor()
CatBoost_etc = CatBoostRegressor()
CatBoost_furn = CatBoostRegressor()
CatBoost_bed = CatBoostRegressor()

In [137]:
CatBoost_clothes.load_model("../model/boosting_model/catboost_clothes.cbm")
CatBoost_inner.load_model("../model/boosting_model/catboost_inner.cbm")
CatBoost_kitchen.load_model("../model/boosting_model/catboost_kitchen.cbm")
CatBoost_food.load_model("../model/boosting_model/catboost_food.cbm")
CatBoost_beauty.load_model("../model/boosting_model/catboost_beauty.cbm")
CatBoost_elec.load_model("../model/boosting_model/catboost_elec.cbm")
CatBoost_goods.load_model("../model/boosting_model/catboost_goods.cbm")
CatBoost_health.load_model("../model/boosting_model/catboost_health.cbm")
CatBoost_etc.load_model("../model/boosting_model/catboost_etc.cbm")
CatBoost_furn.load_model("../model/boosting_model/catboost_furn.cbm")
CatBoost_bed.load_model("../model/boosting_model/catboost_bed.cbm")

<catboost.core.CatBoostRegressor at 0x7f12c4916eb0>

### **튜닝 성능 확인**

### LGBM

In [562]:

lgbm6 = LGBMRegressor(n_estimators = 500,  num_leaves = 2048, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')

lgbm10 = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', num_leaves = 127, random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape', learning_rate = 0.09)


In [463]:
X_train_clothes_scaled.shape

(3248, 27)

In [563]:
# model fitting

pred1 = lgbm1.fit(X_train_clothes_scaled, y_train_clothes).predict(X_test_clothes_scaled)
pred2 = lgbm2.fit(X_train_inner_scaled, y_train_inner).predict(X_test_inner_scaled)
pred3 = lgbm3.fit(X_train_kitchen_scaled, y_train_kitchen).predict(X_test_kitchen_scaled)
pred4 = lgbm4.fit(X_train_food_scaled, y_train_food).predict(X_test_food_scaled)
pred5 = lgbm5.fit(X_train_beauty_scaled, y_train_beauty).predict(X_test_beauty_scaled)
pred6 = lgbm6.fit(X_train_elec_scaled, y_train_elec).predict(X_test_elec_scaled)
pred7 = lgbm7.fit(X_train_goods_scaled, y_train_goods).predict(X_test_goods_scaled)
pred8 = lgbm8.fit(X_train_health_scaled, y_train_health).predict(X_test_health_scaled)
pred9 = lgbm9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)
pred10 = lgbm10.fit(X_train_furn_scaled, y_train_furn).predict(X_test_furn_scaled)
pred11 = lgbm11.fit(X_train_bed_scaled, y_train_bed).predict(X_test_bed_scaled)

In [559]:
predictions = [pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11] # selection한 결과물
trues = [y_test_clothes, y_test_inner, y_test_kitchen, y_test_food, y_test_beauty, y_test_elec, y_test_goods, y_test_health, y_test_etc, y_test_furn, y_test_bed]

#### **튜닝 전 성능**

mape: 22.1579455684485  
mape: 26.5763001623147  
mape: 27.49715254148381  
mape: 15.874558341118911  
mape: 19.217135630635212  
mape: 40.07481282197277  
mape: 31.623747738429696  
mape: 21.80460073752754  
mape: 38.29687137332689  
mape: 41.68921261024886  
mape: 25.20877367938055  

최종: 29.200866247957723

In [466]:
### LGBM 튜닝 후
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

mape: 21.091434819839204
mape: 25.350279271126702
mape: 26.949307400008422
mape: 15.947975277414944
mape: 17.977532308273123
mape: 40.58600581168992
mape: 31.38656967075713
mape: 19.32000060644314
mape: 38.620812710486796
mape: 41.68921261024886
mape: 24.90286754227887


### 기존 format으로 변경 후 mape 계산

In [467]:
y_pred_lgbm = np.zeros_like(y_test)

In [468]:
# y_test format으로 다시 넣어주기
for i in range(len(prod_group)):
      y_pred_lgbm[X_test["상품군"] == prod_group[i]] = predictions[i]

In [469]:
# Model Save
def model_save(today, best_tot_mape):
    import joblib

    joblib.dump(lgbm1, f"../model/boosting_model/lgbm1.pickle")
    joblib.dump(lgbm2, f"../model/boosting_model/lgbm2.pickle")
    joblib.dump(lgbm3, f"../model/boosting_model/lgbm3.pickle")
    joblib.dump(lgbm4, f"../model/boosting_model/lgbm4.pickle")
    joblib.dump(lgbm5, f"../model/boosting_model/lgbm5.pickle")
    joblib.dump(lgbm6, f"../model/boosting_model/lgbm6.pickle")
    joblib.dump(lgbm7, f"../model/boosting_model/lgbm7.pickle")
    joblib.dump(lgbm8, f"../model/boosting_model/lgbm8.pickle")
    joblib.dump(lgbm9, f"../model/boosting_model/lgbm9.pickle")
    joblib.dump(lgbm10, f"../model/boosting_model/lgbm10.pickle")
    joblib.dump(lgbm11, f"../model/boosting_model/lgbm11.pickle")

  ## result 저장

In [470]:
tot_mape = mape(y_test, y_pred_lgbm)

if tot_mape < best_tot_mape:
    best_tot_mape = tot_mape
    model_save(today, best_tot_mape)


print(f"MAPE calculated over total valid set is {tot_mape:.4f}\nThe current best mape score is {best_tot_mape:2f}")


MAPE calculated over total valid set is 28.8244
The current best mape score is 28.824424


### XGB

In [334]:
xgb6 = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6224866603723187,
             eta=0.08799374038096733, gamma=0.016737988780906453, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.087993741, max_delta_step=0, max_depth=13,
             max_leaves=676, min_child_weight=244.42634757601076, 
             monotone_constraints='()', n_estimators=665, n_jobs=0,
             num_parallel_tree=1,
             objective='reg:tweedie',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=0.7414548854045842, tree_method='exact',
             validate_parameters=1, verbosity=None)

xgb10 = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.789956577441571,
             eta=0.1409391962108311, gamma=0.06058367377353676, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.140939191, max_delta_step=0, max_depth=10,
             max_leaves=279, min_child_weight=568.9477690044143,
             monotone_constraints='()', n_estimators=1399, n_jobs=0,
             num_parallel_tree=1, objective='reg:tweedie', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
             subsample=0.7017180830644052, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [414]:
# model fitting

pred1 = xgb1.fit(X_train_clothes_scaled, y_train_clothes).predict(X_test_clothes_scaled)
pred2 = xgb2.fit(X_train_inner_scaled, y_train_inner).predict(X_test_inner_scaled)
pred3 = xgb3.fit(X_train_kitchen_scaled, y_train_kitchen).predict(X_test_kitchen_scaled)
pred4 = xgb4.fit(X_train_food_scaled, y_train_food).predict(X_test_food_scaled)
pred5 = xgb5.fit(X_train_beauty_scaled, y_train_beauty).predict(X_test_beauty_scaled)
pred6 = xgb6.fit(X_train_elec_scaled, y_train_elec).predict(X_test_elec_scaled)
pred7 = xgb7.fit(X_train_goods_scaled, y_train_goods).predict(X_test_goods_scaled)
pred8 = xgb8.fit(X_train_health_scaled, y_train_health).predict(X_test_health_scaled)
pred9 = xgb9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)
pred10 = xgb10.fit(X_train_furn_scaled, y_train_furn).predict(X_test_furn_scaled)
pred11 = xgb11.fit(X_train_bed_scaled, y_train_bed).predict(X_test_bed_scaled)

Parameters: { scale_pos_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [415]:
predictions = [pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11] # selection한 결과물
trues = [y_test_clothes, y_test_inner, y_test_kitchen, y_test_food, y_test_beauty, y_test_elec, y_test_goods, y_test_health, y_test_etc, y_test_furn, y_test_bed]

#### **튜닝 전 성능**

mape: 22.1579455684485  
mape: 26.5763001623147  
mape: 27.49715254148381  
mape: 15.874558341118911  
mape: 19.217135630635212  
mape: 40.07481282197277  
mape: 31.623747738429696  
mape: 21.80460073752754  
mape: 38.29687137332689  
mape: 41.68921261024886  
mape: 25.20877367938055  

최종: 29.200866247957723

In [466]:
### LGBM 튜닝 후
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

mape: 21.091434819839204
mape: 25.350279271126702
mape: 26.949307400008422
mape: 15.947975277414944
mape: 17.977532308273123
mape: 40.58600581168992
mape: 31.38656967075713
mape: 19.32000060644314
mape: 38.620812710486796
mape: 41.68921261024886
mape: 24.90286754227887


In [416]:
### XGB 튜닝 후
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

mape: 23.178305630641383
mape: 26.38440757520433
mape: 27.09342939819902
mape: 16.6015020906853
mape: 18.98967970295486
mape: 48.445263584961076
mape: 33.80773135042142
mape: 21.126713947368547
mape: 44.98154157361393
mape: 49.87014608522391
mape: 24.172910365655706


### 기존 format으로 변경 후 mape 계산

In [417]:
y_pred = np.zeros_like(y_test)

In [418]:
# y_test format으로 다시 넣어주기
for i in range(len(prod_group)):
      y_pred[X_test["상품군"] == prod_group[i]] = predictions[i]

In [419]:
# Model Save
def model_save(today, best_tot_mape):
    import joblib

    joblib.dump(xgb1, f"../model/boosting_model/xgb1.pickle")
    joblib.dump(xgb2, f"../model/boosting_model/xgb2.pickle")
    joblib.dump(xgb3, f"../model/boosting_model/xgb3.pickle")
    joblib.dump(xgb4, f"../model/boosting_model/xgb4.pickle")
    joblib.dump(xgb5, f"../model/boosting_model/xgb5.pickle")
    joblib.dump(xgb6, f"../model/boosting_model/xgb6.pickle")
    joblib.dump(xgb7, f"../model/boosting_model/xgb7.pickle")
    joblib.dump(xgb8, f"../model/boosting_model/xgb8.pickle")
    joblib.dump(xgb9, f"../model/boosting_model/xgb9.pickle")
    joblib.dump(xgb10, f"../model/boosting_model/xgb10.pickle")
    joblib.dump(xgb11, f"../model/boosting_model/xgb11.pickle")

  ## result 저장

In [420]:
tot_mape = mape(y_test, y_pred)

if tot_mape < best_tot_mape:
    best_tot_mape = tot_mape
    model_save(today, best_tot_mape)


print(f"MAPE calculated over total valid set is {tot_mape:.4f}\nThe current best mape score is {best_tot_mape:2f}")


MAPE calculated over total valid set is 31.8893
The current best mape score is 29.219109
