In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
import datetime
import locale                                         
import pickle
from tqdm import tqdm
import warnings
import math
import joblib
warnings.filterwarnings("ignore")
locale.setlocale(locale.LC_ALL, 'ko_KR.UTF-8') 

In [None]:
from datetime import datetime


# Two-digit month followed by two-digit day of the current local date (e.g. "0926").
today = format(datetime.today(), "%m%d")

In [None]:
# Read the pickled, already-merged dataset into `performance_data`.
# NOTE(review): pickle.load can run arbitrary code while deserialising; only point this
# at a file produced by a trusted pipeline.
with open('../data/merged_data_concat_0926.pickle', 'rb') as pickle_file:
    performance_data = pickle.load(pickle_file)

In [None]:
# Turn both code columns into Python strings, element by element.
for code_column in ('마더코드', '상품코드'):
    performance_data[code_column] = performance_data[code_column].map(str)

## **성능 Test**

### **라벨인코딩**

In [None]:
# Display the column index of the loaded frame; the drop list and the
# label-encoded feature list in the next cell refer to these names.
performance_data.columns

In [None]:
# Label encoding
# Columns removed from both modelling frames.
_excluded_columns = [
    '방송일시', '판매량', 'holiday', '대비', 'date', 'mean_rating',
    '배당수익률(%)', '주가자산비율', '고가지수', '저가지수',
    '거래량(천주)', '거래대금(백만원)', '상장시가총액(백만원)',
]

# `test_data` is label-encoded below; `test_data_cat` keeps the raw values.
test_data = performance_data.drop(columns=_excluded_columns)
test_data_cat = performance_data.drop(columns=_excluded_columns)

_label_encoded_features = [
    '상품명', '상품코드', '마더코드', 'prime_time', '중분류', '요일',
    'season', '남여', 'muil', '브랜드', 'season_prod', '소분류',
]

# A fresh encoder is fitted per feature on the string form of the column;
# after the loop `lbe` holds the encoder of the last feature only.
for feat in _label_encoded_features:
    raw_labels = test_data[feat].astype(str).values
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(raw_labels)




## Data Preparation for Modeling

In [None]:
# Distinct product-group values in order of first appearance, and a
# value -> position lookup for them.
prod_group = test_data["상품군"].unique()
prod_group_dct = {group_name: position for position, group_name in enumerate(prod_group)}

In [None]:
## Split once here so that a MAPE over the whole held-out set can be computed later.

# Rows whose 취급액 equals -1 are set aside as `predict_data`; all other rows are the training pool.
is_placeholder = test_data["취급액"] == -1
predict_data = test_data[is_placeholder].reset_index(drop=True)
train_set = test_data[~is_placeholder]

y = train_set["취급액"]
X = train_set.drop(columns=["취급액"])

# Stratify on the product-group position so the group mix is carried into both splits.
grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=grp_idx, random_state=0)

In [None]:
# Number of test rows per product group, listed in `prod_group` order.
test_len = [len(X_test[X_test["상품군"] == group_name]) for group_name in prod_group]
print(test_len)

In [None]:
def mape(actual, pred):
    """Mean absolute percentage error, expressed in percent.

    Both arguments are converted with ``np.array``; the result is
    ``mean(|actual - pred| / |actual|) * 100``. No guard is applied for
    zero entries in ``actual``.
    """
    truth = np.array(actual)
    estimate = np.array(pred)
    relative_error = np.abs((truth - estimate) / truth)
    return np.mean(relative_error) * 100

In [None]:
# Feature names of X_train without the product-group column, plus a lookup from the
# string form of each column position ("0", "1", ...) to the column name.
original_colnames = list(X_train.drop('상품군', axis=1).columns)
new_colnames = list(range(len(original_colnames)))
colnames_dic = {
    str(position): column_name
    for position, column_name in zip(new_colnames, original_colnames)
}

In [None]:
# Features to delete for each product group.
# The lists hold column positions (keys of `colnames_dic`); they are translated
# to column names right away, so each *_del_columns ends up as a list of names.


def _positions_to_names(positions):
    """Translate column positions into column names via `colnames_dic`."""
    return [colnames_dic[str(position)] for position in positions]


clothes_del_columns = _positions_to_names([37,34,33,32,26,36,20,12,13,16,18])
# NOTE(review): position 28 is listed twice for the inner group; kept exactly as written.
inner_del_columns = _positions_to_names([34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16])
kitchen_del_columns = _positions_to_names([25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11])
food_del_columns = _positions_to_names([25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14])
beauty_del_columns = _positions_to_names([11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15])
elec_del_columns = _positions_to_names([8,10,11,12,13,14,17,18,19,25,32,33,34,35,36])
goods_del_columns = _positions_to_names([13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37])
health_del_columns = _positions_to_names([8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37])
etc_del_columns = _positions_to_names([9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37])
furn_del_columns = _positions_to_names([8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36])
bed_del_columns = _positions_to_names([8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36])

In [None]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    """Restrict an existing train/test split to a single product group.

    The group is ``prod_group[grp_index]``; rows are selected where the
    "상품군" column of the corresponding X frame equals it.

    Returns ``(X_train, X_test, y_train, y_test)`` for that group, with the
    "상품군" column removed from both X frames.
    """
    wanted_group = prod_group[grp_index]
    in_train = X_train["상품군"] == wanted_group
    in_test = X_test["상품군"] == wanted_group
    return (
        X_train[in_train].drop("상품군", axis=1),
        X_test[in_test].drop("상품군", axis=1),
        y_train[in_train],
        y_test[in_test],
    )

# One (X_train, X_test, y_train, y_test) quadruple per product group, taken in
# prod_group order (positions 0..10) and bound to the per-group names below.
_per_group_split = [
    train_test_grp(X_train, X_test, y_train, y_test, prod_group, group_no)
    for group_no in range(11)
]
(
    (X_train_clothes, X_test_clothes, y_train_clothes, y_test_clothes),
    (X_train_inner, X_test_inner, y_train_inner, y_test_inner),
    (X_train_kitchen, X_test_kitchen, y_train_kitchen, y_test_kitchen),
    (X_train_food, X_test_food, y_train_food, y_test_food),
    (X_train_beauty, X_test_beauty, y_train_beauty, y_test_beauty),
    (X_train_elec, X_test_elec, y_train_elec, y_test_elec),
    (X_train_goods, X_test_goods, y_train_goods, y_test_goods),
    (X_train_health, X_test_health, y_train_health, y_test_health),
    (X_train_etc, X_test_etc, y_train_etc, y_test_etc),
    (X_train_furn, X_test_furn, y_train_furn, y_test_furn),
    (X_train_bed, X_test_bed, y_train_bed, y_test_bed),
) = _per_group_split

In [None]:
# Remove each group's delete-list columns from its train and test frames.
# The names are rebound in place, so running this cell a second time would try
# to drop columns that are already gone.
X_train_clothes, X_test_clothes = [part.drop(columns=clothes_del_columns) for part in (X_train_clothes, X_test_clothes)]
X_train_inner, X_test_inner = [part.drop(columns=inner_del_columns) for part in (X_train_inner, X_test_inner)]
X_train_kitchen, X_test_kitchen = [part.drop(columns=kitchen_del_columns) for part in (X_train_kitchen, X_test_kitchen)]
X_train_food, X_test_food = [part.drop(columns=food_del_columns) for part in (X_train_food, X_test_food)]
X_train_beauty, X_test_beauty = [part.drop(columns=beauty_del_columns) for part in (X_train_beauty, X_test_beauty)]
X_train_elec, X_test_elec = [part.drop(columns=elec_del_columns) for part in (X_train_elec, X_test_elec)]
X_train_goods, X_test_goods = [part.drop(columns=goods_del_columns) for part in (X_train_goods, X_test_goods)]
X_train_health, X_test_health = [part.drop(columns=health_del_columns) for part in (X_train_health, X_test_health)]
X_train_etc, X_test_etc = [part.drop(columns=etc_del_columns) for part in (X_train_etc, X_test_etc)]
X_train_furn, X_test_furn = [part.drop(columns=furn_del_columns) for part in (X_train_furn, X_test_furn)]
X_train_bed, X_test_bed = [part.drop(columns=bed_del_columns) for part in (X_train_bed, X_test_bed)]

In [None]:
# scaling
from sklearn.preprocessing import StandardScaler

# A single StandardScaler instance is re-fitted for every group, so after this
# cell `scaler` only carries the statistics of the last group (bed).
scaler = StandardScaler()


def _scale_group(train_frame, test_frame):
    """Re-fit the shared `scaler` on the train frame and transform both frames.

    The results are wrapped in new DataFrames built from the scaled arrays, so
    they carry default integer column labels and a default index.
    """
    scaled_train = pd.DataFrame(scaler.fit_transform(train_frame))
    scaled_test = pd.DataFrame(scaler.transform(test_frame))
    return scaled_train, scaled_test


X_train_clothes_scaled, X_test_clothes_scaled = _scale_group(X_train_clothes, X_test_clothes)
X_train_inner_scaled, X_test_inner_scaled = _scale_group(X_train_inner, X_test_inner)
X_train_kitchen_scaled, X_test_kitchen_scaled = _scale_group(X_train_kitchen, X_test_kitchen)
X_train_food_scaled, X_test_food_scaled = _scale_group(X_train_food, X_test_food)
X_train_beauty_scaled, X_test_beauty_scaled = _scale_group(X_train_beauty, X_test_beauty)
X_train_elec_scaled, X_test_elec_scaled = _scale_group(X_train_elec, X_test_elec)
X_train_goods_scaled, X_test_goods_scaled = _scale_group(X_train_goods, X_test_goods)
X_train_health_scaled, X_test_health_scaled = _scale_group(X_train_health, X_test_health)
X_train_etc_scaled, X_test_etc_scaled = _scale_group(X_train_etc, X_test_etc)
X_train_furn_scaled, X_test_furn_scaled = _scale_group(X_train_furn, X_test_furn)
X_train_bed_scaled, X_test_bed_scaled = _scale_group(X_train_bed, X_test_bed)

### Catboost용 데이터 처리

In [None]:
# Distinct product-group values of the un-encoded frame, in order of first
# appearance, and a value -> position lookup for them.
prod_group_cat = test_data_cat["상품군"].unique()
prod_group_dct_cat = {group_name: position for position, group_name in enumerate(prod_group_cat)}

In [None]:
## Split once here so that a MAPE over the whole held-out set can be computed later.

# Rows whose 취급액 equals -1 are set aside as `predict_data_cat`; all other rows are the training pool.
is_placeholder_cat = test_data_cat["취급액"] == -1
predict_data_cat = test_data_cat[is_placeholder_cat].reset_index(drop=True)
train_set_cat = test_data_cat[~is_placeholder_cat]

y_cat = train_set_cat["취급액"]
X_cat = train_set_cat.drop(columns=["취급액"])

# Same random_state and stratification scheme as the split used for the other models.
grp_idx_cat = train_set_cat['상품군'].map(prod_group_dct_cat)
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    X_cat, y_cat, stratify=grp_idx_cat, random_state=0
)

In [None]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    """Return the rows of a train/test split that belong to one product group.

    NOTE: this re-defines ``train_test_grp`` with the same behaviour as the
    definition earlier in this notebook; the redefinition is redundant.

    Rows are kept where the "상품군" column equals ``prod_group[grp_index]``.
    The returned tuple is ``(X_train, X_test, y_train, y_test)`` for that
    group, and "상품군" is dropped from both X frames.
    """
    group_value = prod_group[grp_index]
    train_rows = X_train["상품군"] == group_value
    test_rows = X_test["상품군"] == group_value

    group_X_train = X_train[train_rows].drop("상품군", axis=1)
    group_X_test = X_test[test_rows].drop("상품군", axis=1)
    return group_X_train, group_X_test, y_train[train_rows], y_test[test_rows]

# One (X_train, X_test, y_train, y_test) quadruple per product group for the
# CatBoost data, taken in prod_group_cat order (positions 0..10).
_per_group_split_cat = [
    train_test_grp(X_train_cat, X_test_cat, y_train_cat, y_test_cat, prod_group_cat, group_no)
    for group_no in range(11)
]
(
    (X_train_clothes_cat, X_test_clothes_cat, y_train_clothes_cat, y_test_clothes_cat),
    (X_train_inner_cat, X_test_inner_cat, y_train_inner_cat, y_test_inner_cat),
    (X_train_kitchen_cat, X_test_kitchen_cat, y_train_kitchen_cat, y_test_kitchen_cat),
    (X_train_food_cat, X_test_food_cat, y_train_food_cat, y_test_food_cat),
    (X_train_beauty_cat, X_test_beauty_cat, y_train_beauty_cat, y_test_beauty_cat),
    (X_train_elec_cat, X_test_elec_cat, y_train_elec_cat, y_test_elec_cat),
    (X_train_goods_cat, X_test_goods_cat, y_train_goods_cat, y_test_goods_cat),
    (X_train_health_cat, X_test_health_cat, y_train_health_cat, y_test_health_cat),
    (X_train_etc_cat, X_test_etc_cat, y_train_etc_cat, y_test_etc_cat),
    (X_train_furn_cat, X_test_furn_cat, y_train_furn_cat, y_test_furn_cat),
    (X_train_bed_cat, X_test_bed_cat, y_train_bed_cat, y_test_bed_cat),
) = _per_group_split_cat

### **모델지정**

In [None]:
# One LGBMRegressor per product group. The fitting cell further down pairs them as:
# lgbm1=clothes, lgbm2=inner, lgbm3=kitchen, lgbm4=food, lgbm5=beauty, lgbm6=elec,
# lgbm7=goods, lgbm8=health, lgbm9=etc, lgbm10=furn, lgbm11=bed.
# All models use random_state=0; lgbm6 and lgbm10 use dart boosting with the 'mape'
# objective, the others use the 'tweedie' objective.
# NOTE(review): `sub_sample` does not appear among the parameter names/aliases in the
#   LightGBM documentation (the documented spelling is `subsample`) — confirm whether
#   it has any effect.
# NOTE(review): `min_child_samples` and `min_data_in_leaf` are documented as aliases of
#   the same setting, yet both are passed with different values — confirm which one
#   LightGBM honours.
# NOTE(review): `bagging_fraction` is passed without `bagging_freq`; presumably bagging
#   stays disabled in that case — verify against the LightGBM docs.

# clothes
lgbm1 = LGBMRegressor(bagging_fraction=0.7307137280834164,
              feature_fraction=0.17885571364318312,
              learning_rate=0.05399675356928432, max_depth=12,
              min_child_samples=73, min_child_weight=97.77473030676221,
              min_data_in_leaf=3, min_split_gain=0.07355236386628859,
              n_estimators=1207, num_leaves=55, objective='tweedie',
              sub_sample=0.8782282595207351, random_state=0)
# inner
lgbm2 = LGBMRegressor(bagging_fraction=0.9278033560669966,
              feature_fraction=0.2723596633675743,
              learning_rate=0.08501364792701771, max_depth=8,
              min_child_samples=87, min_child_weight=76.73725399140089,
              min_data_in_leaf=1, min_split_gain=0.07961193791689251,
              n_estimators=1210, num_leaves=24, objective='tweedie',
              sub_sample=0.9183149006037761, random_state=0)
# kitchen
lgbm3 = LGBMRegressor(bagging_fraction=0.5699760518309701,
              feature_fraction=0.11616629778968468,
              learning_rate=0.17549876792458874, max_depth=7,
              min_child_samples=84, min_child_weight=5.811554492331914,
              min_data_in_leaf=3, min_split_gain=0.04789200310107769,
              n_estimators=1439, num_leaves=637, objective='tweedie',
              sub_sample=0.7038030128410426, random_state=0)
# food
lgbm4 = LGBMRegressor(bagging_fraction=0.5079841780274699,
              feature_fraction=0.15001505244731972,
              learning_rate=0.05640935206317574, max_depth=7,
              min_child_samples=88, min_child_weight=3.821270143808374,
              min_data_in_leaf=0, min_split_gain=0.06344199624185946,
              n_estimators=1747, num_leaves=225, objective='tweedie',
              sub_sample=0.7022885680852412, random_state=0)
# beauty
lgbm5 = LGBMRegressor(bagging_fraction=0.6364000144639433,
              feature_fraction=0.2577693146770039,
              learning_rate=0.05830893786024687, max_depth=19,
              min_child_samples=9, min_child_weight=33.15266461422622,
              min_data_in_leaf=0, min_split_gain=0.022674218802850026,
              n_estimators=1656, num_leaves=15, objective='tweedie',
              sub_sample=0.8467825255492876, random_state=0)
# elec (dart boosting, 'mape' objective)
lgbm6 = LGBMRegressor(n_estimators = 500,  num_leaves = 2048, 
                      boosting_type = "dart", random_state = 0, max_depth = 11, 
                      verbose = 0, n_jobs = -1, objective = 'mape')
# goods
lgbm7 = LGBMRegressor(bagging_fraction=0.9999999999997822, feature_fraction=0.1,
              learning_rate=0.05, max_depth=7, min_child_samples=100,
              min_data_in_leaf=0, min_split_gain=0.0010000000003967224,
              n_estimators=1744, num_leaves=653, objective='tweedie',
              sub_sample=0.9222227228407224, random_state=0)
# health
lgbm8 = LGBMRegressor(bagging_fraction=0.5975849393619003, feature_fraction=0.1,
              learning_rate=0.05, max_depth=7, min_child_samples=100,
              min_child_weight=100.0, min_data_in_leaf=0,
              min_split_gain=0.0010000000000559605, n_estimators=1564,
              num_leaves=12, objective='tweedie', sub_sample=0.7, random_state=0)
# etc
lgbm9 = LGBMRegressor(bagging_fraction=0.9984805691308754,
              feature_fraction=0.30088006750171986,
              learning_rate=0.14620335385613942, max_depth=12,
              min_child_samples=2, min_child_weight=85.43148764226444,
              min_data_in_leaf=67, min_split_gain=0.06823262603393153,
              n_estimators=1183, num_leaves=993, objective='tweedie',
              sub_sample=0.8276883785558009, random_state=0)
# furn (dart boosting, 'mape' objective)
lgbm10 = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', 
                       num_leaves = 127, random_state = 0, max_depth = 11, 
                       verbose = 0, n_jobs = -1, objective = 'mape', 
                       learning_rate = 0.09)
# bed
lgbm11 = LGBMRegressor(bagging_fraction=0.6046157978828623, feature_fraction=0.1,
              learning_rate=0.08138473936485872, max_depth=7,
              min_child_samples=0, min_data_in_leaf=11,
              min_split_gain=0.0687946498411097, n_estimators=971,
              num_leaves=471, objective='tweedie', sub_sample=0.7, random_state=0)

In [None]:
# One XGBRegressor per product group. The fitting cell further down pairs them as:
# xgb1=clothes, xgb2=inner, xgb3=kitchen, xgb4=food, xgb5=beauty, xgb6=elec,
# xgb7=goods, xgb8=health, xgb9=etc, xgb10=furn, xgb11=bed.

# Keyword arguments that every model shares (xgb6 overrides scale_pos_weight).
_XGB_SHARED_KWARGS = dict(
    base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1,
    gpu_id=-1, importance_type='gain', interaction_constraints='',
    max_delta_step=0, monotone_constraints='()', n_jobs=0, num_parallel_tree=1,
    objective='reg:tweedie', random_state=0, reg_alpha=0, reg_lambda=1,
    scale_pos_weight=None, tree_method='exact', validate_parameters=1,
    verbosity=None,
)


def _make_xgb(**tuned):
    """Build an XGBRegressor from the shared kwargs plus per-model overrides."""
    return XGBRegressor(**{**_XGB_SHARED_KWARGS, **tuned})


# NOTE(review): every model passes both `eta` and `learning_rate` with slightly
# different values; the XGBoost docs list `eta` as an alias of `learning_rate` —
# confirm which of the two is actually used.
xgb1 = _make_xgb(colsample_bytree=0.11044585954726314, eta=0.09234645565909387,
                 gamma=0.06921673321404705, learning_rate=0.0923464522,
                 max_depth=7, max_leaves=283, min_child_weight=951.5931410110306,
                 n_estimators=1121, subsample=0.7745809267896179)
xgb2 = _make_xgb(colsample_bytree=0.4876619535400296, eta=0.05669375986990678,
                 gamma=0.009309295139350023, learning_rate=0.0566937588,
                 max_depth=7, max_leaves=999, min_child_weight=49.71317626175442,
                 n_estimators=705, subsample=0.8299417446629082)
xgb3 = _make_xgb(colsample_bytree=0.16210634090565457, eta=0.08736750768138013,
                 gamma=0.05817626686767469, learning_rate=0.0873675048,
                 max_depth=7, max_leaves=471, min_child_weight=861.8835231037181,
                 n_estimators=1676, subsample=0.848851356748161)
# NOTE(review): xgb4 carries exactly the same values as xgb1 — confirm this is intended.
xgb4 = _make_xgb(colsample_bytree=0.11044585954726314, eta=0.09234645565909387,
                 gamma=0.06921673321404705, learning_rate=0.0923464522,
                 max_depth=7, max_leaves=283, min_child_weight=951.5931410110306,
                 n_estimators=1121, subsample=0.7745809267896179)
xgb5 = _make_xgb(colsample_bytree=0.3196831742620959, eta=0.05209505656752034,
                 gamma=0.04514823863223244, learning_rate=0.0520950556,
                 max_depth=7, max_leaves=484, min_child_weight=105.71598217494693,
                 n_estimators=684, subsample=0.8664474611577506)
# xgb6 is the only model that sets scale_pos_weight=1 instead of None.
xgb6 = _make_xgb(colsample_bytree=0.6224866603723187, eta=0.08799374038096733,
                 gamma=0.016737988780906453, learning_rate=0.087993741,
                 max_depth=13, max_leaves=676, min_child_weight=244.42634757601076,
                 n_estimators=665, scale_pos_weight=1,
                 subsample=0.7414548854045842)
xgb7 = _make_xgb(colsample_bytree=0.21538834418148778, eta=0.15120509904373458,
                 gamma=0.06208137181317149, learning_rate=0.151205093,
                 max_depth=7, max_leaves=957, min_child_weight=648.5462494115064,
                 n_estimators=1569, subsample=0.9116377368732753)
xgb8 = _make_xgb(colsample_bytree=0.11313066967872505, eta=0.08340025045635116,
                 gamma=0.05858427776109298, learning_rate=0.0834002495,
                 max_depth=13, max_leaves=325, min_child_weight=709.9495700733073,
                 n_estimators=1113, subsample=0.8429703306961998)
xgb9 = _make_xgb(colsample_bytree=0.11122047820184111, eta=0.07486537100643052,
                 gamma=0.023776481238799444, learning_rate=0.074865371,
                 max_depth=9, max_leaves=275, min_child_weight=963.1135002728994,
                 n_estimators=1126, subsample=0.9866813645370839)
xgb10 = _make_xgb(colsample_bytree=0.789956577441571, eta=0.1409391962108311,
                  gamma=0.06058367377353676, learning_rate=0.140939191,
                  max_depth=10, max_leaves=279, min_child_weight=568.9477690044143,
                  n_estimators=1399, subsample=0.7017180830644052)
xgb11 = _make_xgb(colsample_bytree=0.11617471795226059, eta=0.17489297683219074,
                  gamma=0.09788321588104364, learning_rate=0.174892977,
                  max_depth=17, max_leaves=479, min_child_weight=870.0122782346709,
                  n_estimators=1698, subsample=0.9341587528859366)

In [None]:
# Categorical columns handed to every per-group CatBoost model.
# NOTE(review): 'season' is listed twice; the list is kept exactly as it was written.
_GROUP_CAT_FEATURES = ['상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드']


def _make_catboost(n_trees, tree_depth, categorical=None):
    """Build a CatBoostRegressor that differs only in tree count, depth and cat_features.

    Every model uses MAE loss, MAPE as eval metric, verbose=0, random_seed=0 and
    the CPU task type. Each model receives its own copy of the feature list.
    """
    chosen_features = _GROUP_CAT_FEATURES if categorical is None else categorical
    return CatBoostRegressor(n_estimators=n_trees,
                             loss_function='MAE',
                             eval_metric='MAPE',
                             depth=tree_depth,
                             cat_features=list(chosen_features),
                             verbose=0,
                             random_seed=0,
                             task_type='CPU')


# Per-group models: (n_estimators, depth).
CatBoost_clothes = _make_catboost(3200, 8)
CatBoost_inner = _make_catboost(4000, 7)
CatBoost_kitchen = _make_catboost(4000, 7)
CatBoost_food = _make_catboost(6000, 9)
CatBoost_beauty = _make_catboost(4000, 7)
CatBoost_elec = _make_catboost(6000, 9)
CatBoost_goods = _make_catboost(5500, 8)
CatBoost_health = _make_catboost(5200, 7)
CatBoost_etc = _make_catboost(5000, 8)
CatBoost_furn = _make_catboost(4000, 7)
CatBoost_bed = _make_catboost(4500, 7)

# Model over all groups: the product-group column is added as a categorical feature.
CatBoost = _make_catboost(3000, 7, categorical=['상품군'] + _GROUP_CAT_FEATURES)

### fitting

In [None]:
# Fit every LightGBM model on its group's scaled training data and predict
# that group's scaled test data.
_lgbm_jobs = (
    (lgbm1, X_train_clothes_scaled, y_train_clothes, X_test_clothes_scaled),
    (lgbm2, X_train_inner_scaled, y_train_inner, X_test_inner_scaled),
    (lgbm3, X_train_kitchen_scaled, y_train_kitchen, X_test_kitchen_scaled),
    (lgbm4, X_train_food_scaled, y_train_food, X_test_food_scaled),
    (lgbm5, X_train_beauty_scaled, y_train_beauty, X_test_beauty_scaled),
    (lgbm6, X_train_elec_scaled, y_train_elec, X_test_elec_scaled),
    (lgbm7, X_train_goods_scaled, y_train_goods, X_test_goods_scaled),
    (lgbm8, X_train_health_scaled, y_train_health, X_test_health_scaled),
    (lgbm9, X_train_etc_scaled, y_train_etc, X_test_etc_scaled),
    (lgbm10, X_train_furn_scaled, y_train_furn, X_test_furn_scaled),
    (lgbm11, X_train_bed_scaled, y_train_bed, X_test_bed_scaled),
)
(
    pred1_lgb, pred2_lgb, pred3_lgb, pred4_lgb, pred5_lgb, pred6_lgb,
    pred7_lgb, pred8_lgb, pred9_lgb, pred10_lgb, pred11_lgb,
) = [
    estimator.fit(train_X, train_y).predict(test_X)
    for estimator, train_X, train_y, test_X in _lgbm_jobs
]

In [None]:
# Fit every XGBoost model on its group's scaled training data and predict
# that group's scaled test data.
_xgb_jobs = (
    (xgb1, X_train_clothes_scaled, y_train_clothes, X_test_clothes_scaled),
    (xgb2, X_train_inner_scaled, y_train_inner, X_test_inner_scaled),
    (xgb3, X_train_kitchen_scaled, y_train_kitchen, X_test_kitchen_scaled),
    (xgb4, X_train_food_scaled, y_train_food, X_test_food_scaled),
    (xgb5, X_train_beauty_scaled, y_train_beauty, X_test_beauty_scaled),
    (xgb6, X_train_elec_scaled, y_train_elec, X_test_elec_scaled),
    (xgb7, X_train_goods_scaled, y_train_goods, X_test_goods_scaled),
    (xgb8, X_train_health_scaled, y_train_health, X_test_health_scaled),
    (xgb9, X_train_etc_scaled, y_train_etc, X_test_etc_scaled),
    (xgb10, X_train_furn_scaled, y_train_furn, X_test_furn_scaled),
    (xgb11, X_train_bed_scaled, y_train_bed, X_test_bed_scaled),
)
(
    pred1_xgb, pred2_xgb, pred3_xgb, pred4_xgb, pred5_xgb, pred6_xgb,
    pred7_xgb, pred8_xgb, pred9_xgb, pred10_xgb, pred11_xgb,
) = [
    estimator.fit(train_X, train_y).predict(test_X)
    for estimator, train_X, train_y, test_X in _xgb_jobs
]

In [None]:
# Fit every per-group CatBoost model on the un-encoded, unscaled group data
# and predict that group's test rows.
_catboost_jobs = (
    (CatBoost_clothes, X_train_clothes_cat, y_train_clothes_cat, X_test_clothes_cat),
    (CatBoost_inner, X_train_inner_cat, y_train_inner_cat, X_test_inner_cat),
    (CatBoost_kitchen, X_train_kitchen_cat, y_train_kitchen_cat, X_test_kitchen_cat),
    (CatBoost_food, X_train_food_cat, y_train_food_cat, X_test_food_cat),
    (CatBoost_beauty, X_train_beauty_cat, y_train_beauty_cat, X_test_beauty_cat),
    (CatBoost_elec, X_train_elec_cat, y_train_elec_cat, X_test_elec_cat),
    (CatBoost_goods, X_train_goods_cat, y_train_goods_cat, X_test_goods_cat),
    (CatBoost_health, X_train_health_cat, y_train_health_cat, X_test_health_cat),
    (CatBoost_etc, X_train_etc_cat, y_train_etc_cat, X_test_etc_cat),
    (CatBoost_furn, X_train_furn_cat, y_train_furn_cat, X_test_furn_cat),
    (CatBoost_bed, X_train_bed_cat, y_train_bed_cat, X_test_bed_cat),
)
(
    pred1_cat, pred2_cat, pred3_cat, pred4_cat, pred5_cat, pred6_cat,
    pred7_cat, pred8_cat, pred9_cat, pred10_cat, pred11_cat,
) = [
    estimator.fit(train_X, train_y).predict(test_X)
    for estimator, train_X, train_y, test_X in _catboost_jobs
]


In [None]:
# Per-group predictions of each model family and the matching true targets,
# all listed in the same group order (clothes ... bed).
predictions_lgb = [
    pred1_lgb, pred2_lgb, pred3_lgb, pred4_lgb, pred5_lgb, pred6_lgb,
    pred7_lgb, pred8_lgb, pred9_lgb, pred10_lgb, pred11_lgb,
]  # results after feature selection
predictions_xgb = [
    pred1_xgb, pred2_xgb, pred3_xgb, pred4_xgb, pred5_xgb, pred6_xgb,
    pred7_xgb, pred8_xgb, pred9_xgb, pred10_xgb, pred11_xgb,
]
predictions_cat = [
    pred1_cat, pred2_cat, pred3_cat, pred4_cat, pred5_cat, pred6_cat,
    pred7_cat, pred8_cat, pred9_cat, pred10_cat, pred11_cat,
]
trues = [
    y_test_clothes, y_test_inner, y_test_kitchen, y_test_food, y_test_beauty,
    y_test_elec, y_test_goods, y_test_health, y_test_etc, y_test_furn, y_test_bed,
]

### 기존 format으로 변경 후 mape 계산

In [None]:
# Buffer for the LightGBM predictions of all groups, one slot per row of y_test.
# dtype=float is forced: zeros_like would otherwise inherit y_test's dtype, and an
# integer-typed target would silently truncate the fractional predictions.
y_pred_lgbm = np.zeros_like(y_test, dtype=float)

In [None]:
# Display the product-group values; their order is the order in which the
# per-group predictions are written back below.
prod_group

In [None]:
# Write each group's predictions back into the y_test layout.
for group_no, group_name in enumerate(prod_group):
    y_pred_lgbm[X_test["상품군"] == group_name] = predictions_lgb[group_no]

In [None]:
# Buffer for the XGBoost predictions of all groups, one slot per row of y_test.
# dtype=float is forced: zeros_like would otherwise inherit y_test's dtype, and an
# integer-typed target would silently truncate the fractional predictions.
y_pred_xgb = np.zeros_like(y_test, dtype=float)

In [None]:
# Write each group's predictions back into the y_test layout.
for group_no, group_name in enumerate(prod_group):
    y_pred_xgb[X_test["상품군"] == group_name] = predictions_xgb[group_no]

In [None]:
# Buffer for the CatBoost predictions of all groups. It is shaped from y_test_cat,
# the target of the CatBoost split whose X_test_cat masks are used to fill it.
# dtype=float is forced: zeros_like would otherwise inherit the target's dtype, and an
# integer-typed target would silently truncate the fractional predictions.
y_pred_cat = np.zeros_like(y_test_cat, dtype=float)

In [None]:
# Write each group's predictions back into the layout of the CatBoost test split.
for group_no, group_name in enumerate(prod_group_cat):
    y_pred_cat[X_test_cat["상품군"] == group_name] = predictions_cat[group_no]

In [None]:
### LGBM: MAPE of each product group, in group order
mape_list = [mape(true, pred) for pred, true in zip(predictions_lgb, trues)]
for mape_res in mape_list:
    print(f"mape: {mape_res}" )

In [None]:
### XGB: MAPE of each product group, in group order
mape_list = [mape(true, pred) for pred, true in zip(predictions_xgb, trues)]
for mape_res in mape_list:
    print(f"mape: {mape_res}" )

In [None]:
### CAT: MAPE of each product group, in group order
# The CatBoost predictions come from the separate CatBoost split, so they are scored
# against that split's targets rather than against `trues` (targets of the other split).
trues_cat = [
    y_test_clothes_cat, y_test_inner_cat, y_test_kitchen_cat, y_test_food_cat,
    y_test_beauty_cat, y_test_elec_cat, y_test_goods_cat, y_test_health_cat,
    y_test_etc_cat, y_test_furn_cat, y_test_bed_cat,
]
mape_list = []
for pred, true in zip(predictions_cat, trues_cat):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

In [None]:
# LightGBM: MAPE over the whole test set, using the predictions written back
# into the y_test layout.
mape(y_test,y_pred_lgbm)

In [None]:
# XGBoost: MAPE over the whole test set, using the predictions written back
# into the y_test layout.
mape(y_test,y_pred_xgb)

In [None]:
# CatBoost: MAPE over the whole test set. y_pred_cat is filled with masks built from
# X_test_cat, so it is scored against y_test_cat, the target of that same split.
mape(y_test_cat, y_pred_cat)

# **SAVE && LOAD MODEL**

In [None]:
# Model Save
def model_save():
    """Persist the 11 per-group LightGBM and 11 per-group XGBoost models with joblib.

    Reads the notebook-level names lgbm1..lgbm11 and xgb1..xgb11 and writes each one to
    ../model/boosting_model/<name>.pickle (lgbm1.pickle ... xgb11.pickle).

    All 22 names are resolved before the first file is written, so a missing model
    raises NameError without leaving a half-written set of files behind.

    Returns:
        None
    """
    models_by_prefix = {
        "lgbm": (lgbm1, lgbm2, lgbm3, lgbm4, lgbm5, lgbm6, lgbm7, lgbm8, lgbm9, lgbm10, lgbm11),
        "xgb": (xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11),
    }
    for prefix, group_models in models_by_prefix.items():
        # File numbering is 1-based to match the variable names.
        for number, model in enumerate(group_models, start=1):
            joblib.dump(model, f"../model/boosting_model/{prefix}{number}.pickle")

  ## result 저장

In [None]:
# Model Load
def model_load():
    """Load the 11 per-group LightGBM and 11 per-group XGBoost models saved by joblib.

    Reads ../model/boosting_model/lgbm1.pickle ... lgbm11.pickle and
    xgb1.pickle ... xgb11.pickle.

    Returns:
        A 22-tuple (lgbm1, ..., lgbm11, xgb1, ..., xgb11). The models are NOT bound to
        notebook-level names by this function; the caller must unpack the result.
    """
    model_dir = "../model/boosting_model"
    # joblib.load unpickles the file contents, so only load files from a trusted source.
    lgbm_models = [joblib.load(f"{model_dir}/lgbm{number}.pickle") for number in range(1, 12)]
    xgb_models = [joblib.load(f"{model_dir}/xgb{number}.pickle") for number in range(1, 12)]
    return tuple(lgbm_models + xgb_models)

  ## result 저장

In [None]:
# Save each per-group CatBoost model to its own .cbm file via CatBoost's save_model.
CatBoost_clothes.save_model("../model/boosting_model/catboost_clothes.cbm")
CatBoost_inner.save_model("../model/boosting_model/catboost_inner.cbm")
CatBoost_kitchen.save_model("../model/boosting_model/catboost_kitchen.cbm")
CatBoost_food.save_model("../model/boosting_model/catboost_food.cbm")
CatBoost_beauty.save_model("../model/boosting_model/catboost_beauty.cbm")
CatBoost_elec.save_model("../model/boosting_model/catboost_elec.cbm")
CatBoost_goods.save_model("../model/boosting_model/catboost_goods.cbm")
CatBoost_health.save_model("../model/boosting_model/catboost_health.cbm")
CatBoost_etc.save_model("../model/boosting_model/catboost_etc.cbm")
CatBoost_furn.save_model("../model/boosting_model/catboost_furn.cbm")
CatBoost_bed.save_model("../model/boosting_model/catboost_bed.cbm")

In [None]:
# Write the LightGBM / XGBoost per-group models to ../model/boosting_model/.
model_save()

In [None]:
# Create default-constructed regressors under the same names, to be filled by load_model
# in the next cell. This rebinds the names, so any fitted in-memory models previously held
# under them are no longer reachable through these names.
CatBoost_clothes = CatBoostRegressor()
CatBoost_inner = CatBoostRegressor()
CatBoost_kitchen = CatBoostRegressor()
CatBoost_food = CatBoostRegressor()
CatBoost_beauty = CatBoostRegressor()
CatBoost_elec = CatBoostRegressor()
CatBoost_goods = CatBoostRegressor()
CatBoost_health = CatBoostRegressor()
CatBoost_etc = CatBoostRegressor()
CatBoost_furn = CatBoostRegressor()
CatBoost_bed = CatBoostRegressor()

In [None]:
# Load each per-group CatBoost model from the .cbm file written by the save cell.
CatBoost_clothes.load_model("../model/boosting_model/catboost_clothes.cbm")
CatBoost_inner.load_model("../model/boosting_model/catboost_inner.cbm")
CatBoost_kitchen.load_model("../model/boosting_model/catboost_kitchen.cbm")
CatBoost_food.load_model("../model/boosting_model/catboost_food.cbm")
CatBoost_beauty.load_model("../model/boosting_model/catboost_beauty.cbm")
CatBoost_elec.load_model("../model/boosting_model/catboost_elec.cbm")
CatBoost_goods.load_model("../model/boosting_model/catboost_goods.cbm")
CatBoost_health.load_model("../model/boosting_model/catboost_health.cbm")
CatBoost_etc.load_model("../model/boosting_model/catboost_etc.cbm")
CatBoost_furn.load_model("../model/boosting_model/catboost_furn.cbm")
CatBoost_bed.load_model("../model/boosting_model/catboost_bed.cbm")

In [None]:
# model_load() only *returns* the models (its assignments are local to the function),
# so the result must be unpacked to rebind the notebook-level names.
(lgbm1, lgbm2, lgbm3, lgbm4, lgbm5, lgbm6, lgbm7, lgbm8, lgbm9, lgbm10, lgbm11,
 xgb1, xgb2, xgb3, xgb4, xgb5, xgb6, xgb7, xgb8, xgb9, xgb10, xgb11) = model_load()

### **앙상블을 위한 데이터 다시 불러오기**

In [None]:
# Label encoding
# Rebuild the modelling frame from performance_data, dropping columns that are not used as features.
test_data = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# test_data['상품코드'] = test_data['상품코드'].map(int)
# Integer-encode every categorical column (values are cast to str first). In this cell the
# product-group column '상품군' is encoded as well. A new LabelEncoder is fitted per column and
# not kept, so the mappings cannot be inverted afterwards.
for feat in ['상품명','상품코드','상품군','마더코드','prime_time','중분류','요일','season','남여','muil','브랜드','season_prod','소분류']:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)



In [None]:
# Map each feature's positional index (as a string key) to its column name,
# with the target column '취급액' excluded from the numbering.
original_colnames = list(test_data.drop('취급액',axis=1).columns)
new_colnames = list(range(len(original_colnames)))
colnames_dic2 = {str(position): name for position, name in zip(new_colnames, original_colnames)}
    

In [None]:
# Positional indices of the features to drop for the all-groups model; they are resolved
# to column names through colnames_dic2, so they depend on the current column order of test_data.
# NOTE(review): the origin of these indices is not shown here (presumably a feature-selection run) — confirm.
all_del_colmns = [33,10,37,35,16,18,9,14,15,21,24,17]
all_del_colmns = [colnames_dic2[str(i)] for i in all_del_colmns]

In [None]:
# Distinct product-group values in order of first appearance, and a value -> position lookup.
prod_group = test_data["상품군"].unique()
prod_group_dct = {v:k for k, v in enumerate(prod_group)}

In [None]:
## Split so that MAPE over the whole set can be computed later.

# Rows whose target is the sentinel -1 are the rows to predict; the rest is labelled data.
predict_data = test_data[test_data.취급액 == -1].reset_index(drop=True)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

# Stratify the hold-out split by product group (fixed seed).
grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

X_train = X_train.drop(all_del_colmns,axis=1)
X_test = X_test.drop(all_del_colmns,axis=1)

scaler = StandardScaler()

# The scaler is fitted on the training part only; wrapping the arrays in a new DataFrame
# discards the original column names and row index.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

In [None]:
# Single LightGBM model trained on all product groups together (gamma objective, fixed seed).
# NOTE(review): min_child_samples and min_data_in_leaf are listed as aliases of one parameter
# in the LightGBM docs but are given different values here — confirm which one takes effect.
# NOTE(review): 'sub_sample' is not spelled like the documented 'subsample' — confirm it is honoured.
lgbm_all = LGBMRegressor(learning_rate=0.1641933472071217, max_depth=15,
              min_child_samples=57, min_data_in_leaf=208, min_split_gain=0.001,
              n_estimators=1300, num_leaves=118, objective='gamma',
              random_state=0, sub_sample=0.7)

# Fit on the scaled training part and predict the scaled hold-out part.
pred_all_lgbm = lgbm_all.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
mape(y_test,pred_all_lgbm)

In [None]:
# Single XGBoost model trained on all product groups together (tweedie objective, fixed seed).
# NOTE(review): both eta and learning_rate (=None) are passed; the XGBoost docs list eta as an
# alias of learning_rate — confirm the eta value is the one actually used.
xgb_all = XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.4876619535400296,
             eta=0.05669375986990678, gamma=0.007357900012337478, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=7,
             max_leaves=96, min_child_weight=136.74405777597332, 
             monotone_constraints=None, n_estimators=1963, n_jobs=None,
             num_parallel_tree=None, objective='reg:tweedie', random_state=0,
             reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
             subsample=0.7165695744381803, tree_method=None,
             validate_parameters=None, verbosity=None)
# Fit on the scaled training part and predict the scaled hold-out part.
pred_all_xgb = xgb_all.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
mape(y_test,pred_all_xgb)

In [None]:
# All-groups CatBoost model, fitted and evaluated on the *_cat split.
# NOTE(review): CatBoost, X_train_cat, y_train_cat and X_test_cat are not defined in this
# section; presumably they come from earlier cells — verify X_test_cat has the same rows as y_test.
pred_cat = CatBoost.fit(X_train_cat, y_train_cat).predict(X_test_cat)

In [None]:
mape(y_test,pred_cat)

In [None]:
### LGBM
# MAPE of every product group, printed and collected in group order.
mape_list = []
for true, pred in zip(trues, predictions_lgb):
    mape_res = mape(true, pred)
    mape_list.append(mape_res)
    print(f"mape: {mape_res}" )

In [None]:
### XGB
# MAPE of every product group, printed and collected in group order.
mape_list = []
for true, pred in zip(trues, predictions_xgb):
    mape_res = mape(true, pred)
    mape_list.append(mape_res)
    print(f"mape: {mape_res}" )

In [None]:
### CAT
# MAPE of every product group, printed and collected in group order.
mape_list = []
for true, pred in zip(trues, predictions_cat):
    mape_res = mape(true, pred)
    mape_list.append(mape_res)
    print(f"mape: {mape_res}" )

In [None]:
# Trial blend for the clothes group only: 0.7 LightGBM + 0.15 XGBoost + 0.15 CatBoost.
mape(y_test_clothes, (pred1_lgb*0.7+pred1_xgb*0.15+pred1_cat*0.15))

## **Ensemble 1**

#### Model 1

In [None]:
# Per-group blend of the LightGBM and XGBoost hold-out predictions (CatBoost is not used).
# Groups 3, 7, 9 and 11 use the geometric mean of the two models; the others use a
# weighted arithmetic mean with hand-set weights.
# The geometric mean yields NaN wherever the product of the two predictions is negative.
pred1 = pred1_lgb*0.70 + pred1_xgb*0.30
pred2 = pred2_lgb*0.70 + pred2_xgb*0.30
pred3 = (pred3_lgb*pred3_xgb)**(1/2)
pred4 = pred4_lgb*0.65 + pred4_xgb*0.35
pred5 = pred5_lgb*0.75 + pred5_xgb*0.25
pred6 = pred6_lgb*0.85 + pred6_xgb*0.15
pred7 = (pred7_lgb*pred7_xgb)**(1/2)
pred8 = pred8_lgb*0.90 + pred8_xgb*0.10
pred9 = (pred9_lgb*pred9_xgb)**(1/2)
pred10 = pred10_lgb*0.95 + pred10_xgb*0.05
pred11 = (pred11_lgb*pred11_xgb)**(1/2)

pred1 : 21.15152335098328  
pred2 : 25.516611998657673  
pred3 : 26.872051596337464  
pred4 : 15.58536517238385  
pred5 : 18.13144847247924  
pred6 : 40.28634995228615  
pred7 : 30.777249491483833  
pred8 : 19.49473606475452  
pred9 : 36.547539947626134  
pred10 : 41.59788324834989  
pred11 : 23.410398012582057  

In [None]:
predictions = [pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11] # per-group blended predictions, in group order
# Buffer for the blended predictions, one slot per row of y_test.
# Allocated as float explicitly: np.zeros_like(y_test) would inherit the target's dtype and
# silently truncate float predictions if the target column is integer-typed.
y_pred = np.zeros(len(y_test), dtype=float)

In [None]:
# Put the per-group blended predictions back into the row layout of y_test
for i, grp in enumerate(prod_group):
    rows_in_grp = X_test["상품군"] == grp
    y_pred[rows_in_grp] = predictions[i]

In [None]:
# Before per-product-group ensembling: one global 0.80 / 0.20 LightGBM / XGBoost blend
mape(y_test, (y_pred_lgbm*0.80 + y_pred_xgb*0.20))

In [None]:
# After per-product-group ensembling: blend weights chosen separately for each group
mape(y_test, y_pred)

#### Model 2

In [None]:
# Model 2: blend of the three all-groups models (0.1 LightGBM, 0.8 XGBoost, 0.1 CatBoost).
mape(y_test,(pred_all_lgbm*0.1 +pred_all_xgb*0.8+ pred_cat*0.1))

In [None]:
# 0.75 * Model 1 (per-group blend) + 0.25 * Model 2 (all-groups blend).
mape(y_test, (y_pred*0.75 + (pred_all_lgbm*0.1 +pred_all_xgb*0.8+ pred_cat*0.1)*0.25))

## **Ensemble 2**

In [None]:
# Load an externally produced prediction vector; it is scored against y_test and blended below.
# NOTE(review): presumably another model's hold-out predictions in the same row order as y_test — confirm.
# pickle.load executes arbitrary code from the file, so only load files from a trusted source.
with open('../data/2020-09-27_3105.pickle', 'rb') as f:
    yujin = pickle.load(f)

In [None]:
mape(y_test,yujin)

In [None]:
# Three-way blend: 0.6 * Model 1 + 0.1 * Model 2 + 0.3 * the loaded external predictions.
mape(y_test, (y_pred*0.6 + (pred_all_lgbm*0.1 +pred_all_xgb*0.8+ pred_cat*0.1)*0.1 + yujin*0.3))

# **테스트 데이터 예측!!**

In [None]:
# Label encoding
# Rebuild the modelling frame from performance_data, dropping columns that are not used as features.
test_data = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# Same column selection without any encoding applied in this cell.
test_data_cat = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# test_data['상품코드'] = test_data['상품코드'].map(int)
# Integer-encode the categorical columns of test_data (values are cast to str first).
# '상품군' is not in this list, so the product-group column keeps its original values here.
for feat in ['상품명','상품코드','마더코드','prime_time','중분류','요일','season','남여','muil','브랜드','season_prod','소분류']:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)




In [None]:
# Distinct product-group values in order of first appearance, and a value -> position lookup.
prod_group = test_data["상품군"].unique()
prod_group_dct = {v:k for k, v in enumerate(prod_group)}

In [None]:
# Rows whose target is the sentinel -1 are the rows to predict; all other rows are used for training.
predict_data = test_data[test_data.취급액 == -1].reset_index(drop=True)
train_set = test_data[test_data['취급액'] != -1]

X_train = train_set.drop(["취급액"], axis = 1)
y_train = train_set["취급액"]
X_test = predict_data.drop(["취급액"], axis = 1)
# y_test holds only the -1 placeholders here; it is used for slicing and sizing, not for scoring.
y_test = predict_data["취급액"]

grp_idx = train_set['상품군'].map(prod_group_dct)

### **Model 1**

In [None]:
def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    """Restrict a train/test split to the rows of a single product group.

    Args:
        X_train, X_test: feature frames containing a "상품군" column.
        y_train, y_test: targets indexed like the corresponding feature frame.
        prod_group: indexable collection of product-group values.
        grp_index: position in prod_group of the group to keep.

    Returns:
        (new_X_train, new_X_test, new_y_train, new_y_test): the rows whose "상품군"
        equals prod_group[grp_index]; the "상품군" column is removed from both feature frames.
    """
    wanted_group = prod_group[grp_index]
    in_train = X_train["상품군"] == wanted_group
    in_test = X_test["상품군"] == wanted_group
    return (
        X_train[in_train].drop(columns="상품군"),
        X_test[in_test].drop(columns="상품군"),
        y_train[in_train],
        y_test[in_test],
    )

In [None]:
# One train/test slice per product group, selected by position 0..10 in prod_group.
# NOTE(review): the clothes..bed names assume prod_group lists the groups in exactly this order — confirm.
X_train_clothes, X_test_clothes, y_train_clothes, y_test_clothes = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 0)
X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 1)
X_train_kitchen, X_test_kitchen, y_train_kitchen, y_test_kitchen = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 2)
X_train_food, X_test_food, y_train_food, y_test_food = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 3)
X_train_beauty, X_test_beauty, y_train_beauty, y_test_beauty = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 4)
X_train_elec, X_test_elec, y_train_elec, y_test_elec = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 5)
X_train_goods, X_test_goods, y_train_goods, y_test_goods = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 6)
X_train_health, X_test_health, y_train_health, y_test_health = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 7)
X_train_etc, X_test_etc, y_train_etc, y_test_etc = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 8)
X_train_furn, X_test_furn, y_train_furn, y_test_furn = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 9)
X_train_bed, X_test_bed, y_train_bed, y_test_bed = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 10)

In [None]:
# Row labels of each group's rows within the frame of rows to predict
# (captured before the per-group column drops and scaling below).
idx_clothes = X_test_clothes.index
idx_inner = X_test_inner.index
idx_kitchen = X_test_kitchen.index
idx_food = X_test_food.index
idx_beauty = X_test_beauty.index
idx_elec = X_test_elec.index
idx_goods = X_test_goods.index
idx_health = X_test_health.index
idx_etc = X_test_etc.index
idx_furn = X_test_furn.index
idx_bed = X_test_bed.index

In [None]:
# Drop each group's own list of excluded features from its training frame.
# The *_del_columns lists are not defined in this section; they must already exist in the kernel.
# This cell rebinds its own inputs, so running it a second time raises KeyError
# (the columns are already gone).
X_train_clothes = X_train_clothes.drop(clothes_del_columns,axis=1)
X_train_inner = X_train_inner.drop(inner_del_columns,axis=1)
X_train_kitchen = X_train_kitchen.drop(kitchen_del_columns,axis=1)
X_train_food = X_train_food.drop(food_del_columns,axis=1)
X_train_beauty = X_train_beauty.drop(beauty_del_columns,axis=1)
X_train_elec = X_train_elec.drop(elec_del_columns,axis=1)
X_train_goods = X_train_goods.drop(goods_del_columns,axis=1)
X_train_health = X_train_health.drop(health_del_columns,axis=1)
X_train_etc = X_train_etc.drop(etc_del_columns,axis=1)
X_train_furn = X_train_furn.drop(furn_del_columns,axis=1)
X_train_bed = X_train_bed.drop(bed_del_columns,axis=1)


# Apply the same per-group drops to the frames to predict so the columns match the training frames.
X_test_clothes = X_test_clothes.drop(clothes_del_columns,axis=1)
X_test_inner = X_test_inner.drop(inner_del_columns,axis=1)
X_test_kitchen = X_test_kitchen.drop(kitchen_del_columns,axis=1)
X_test_food = X_test_food.drop(food_del_columns,axis=1)
X_test_beauty = X_test_beauty.drop(beauty_del_columns,axis=1)
X_test_elec = X_test_elec.drop(elec_del_columns,axis=1)
X_test_goods = X_test_goods.drop(goods_del_columns,axis=1)
X_test_health = X_test_health.drop(health_del_columns,axis=1)
X_test_etc = X_test_etc.drop(etc_del_columns,axis=1)
X_test_furn = X_test_furn.drop(furn_del_columns,axis=1)
X_test_bed = X_test_bed.drop(bed_del_columns,axis=1)

In [None]:
# scaling
# (StandardScaler is already imported in the notebook's first cell; this import is redundant.)
from sklearn.preprocessing import StandardScaler

# A single scaler object is re-fitted for every group. Each group's test frame is transformed
# immediately after that group's fit, so the line order below matters; after the cell finishes,
# `scaler` only holds the statistics of the last group (bed).
scaler = StandardScaler()

X_train_clothes_scaled = pd.DataFrame(scaler.fit_transform(X_train_clothes))
X_test_clothes_scaled = pd.DataFrame(scaler.transform(X_test_clothes))
X_train_inner_scaled = pd.DataFrame(scaler.fit_transform(X_train_inner))
X_test_inner_scaled = pd.DataFrame(scaler.transform(X_test_inner))
X_train_kitchen_scaled = pd.DataFrame(scaler.fit_transform(X_train_kitchen))
X_test_kitchen_scaled = pd.DataFrame(scaler.transform(X_test_kitchen))
X_train_food_scaled = pd.DataFrame(scaler.fit_transform(X_train_food))
X_test_food_scaled = pd.DataFrame(scaler.transform(X_test_food))
X_train_beauty_scaled = pd.DataFrame(scaler.fit_transform(X_train_beauty))
X_test_beauty_scaled = pd.DataFrame(scaler.transform(X_test_beauty))
X_train_elec_scaled = pd.DataFrame(scaler.fit_transform(X_train_elec))
X_test_elec_scaled = pd.DataFrame(scaler.transform(X_test_elec))
X_train_goods_scaled = pd.DataFrame(scaler.fit_transform(X_train_goods))
X_test_goods_scaled = pd.DataFrame(scaler.transform(X_test_goods))
X_train_health_scaled = pd.DataFrame(scaler.fit_transform(X_train_health))
X_test_health_scaled = pd.DataFrame(scaler.transform(X_test_health))
X_train_etc_scaled = pd.DataFrame(scaler.fit_transform(X_train_etc))
X_test_etc_scaled = pd.DataFrame(scaler.transform(X_test_etc))
X_train_furn_scaled = pd.DataFrame(scaler.fit_transform(X_train_furn))
X_test_furn_scaled = pd.DataFrame(scaler.transform(X_test_furn))
X_train_bed_scaled = pd.DataFrame(scaler.fit_transform(X_train_bed))
X_test_bed_scaled = pd.DataFrame(scaler.transform(X_test_bed))

In [None]:
# model fitting
# Refit each per-group LightGBM model on all labelled rows of its group and predict that
# group's rows of the evaluation set. lgbm1..lgbm11 must already exist in the kernel.

pred1_lgb_test = lgbm1.fit(X_train_clothes_scaled, y_train_clothes).predict(X_test_clothes_scaled)
pred2_lgb_test = lgbm2.fit(X_train_inner_scaled, y_train_inner).predict(X_test_inner_scaled)
pred3_lgb_test = lgbm3.fit(X_train_kitchen_scaled, y_train_kitchen).predict(X_test_kitchen_scaled)
pred4_lgb_test = lgbm4.fit(X_train_food_scaled, y_train_food).predict(X_test_food_scaled)
pred5_lgb_test = lgbm5.fit(X_train_beauty_scaled, y_train_beauty).predict(X_test_beauty_scaled)
pred6_lgb_test = lgbm6.fit(X_train_elec_scaled, y_train_elec).predict(X_test_elec_scaled)
pred7_lgb_test = lgbm7.fit(X_train_goods_scaled, y_train_goods).predict(X_test_goods_scaled)
pred8_lgb_test = lgbm8.fit(X_train_health_scaled, y_train_health).predict(X_test_health_scaled)
pred9_lgb_test = lgbm9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)
pred10_lgb_test = lgbm10.fit(X_train_furn_scaled, y_train_furn).predict(X_test_furn_scaled)
pred11_lgb_test = lgbm11.fit(X_train_bed_scaled, y_train_bed).predict(X_test_bed_scaled)

In [None]:
# model fitting
# Refit each per-group XGBoost model on all labelled rows of its group and predict that
# group's rows of the evaluation set. xgb1..xgb11 must already exist in the kernel.

pred1_xgb_test = xgb1.fit(X_train_clothes_scaled, y_train_clothes).predict(X_test_clothes_scaled)
pred2_xgb_test = xgb2.fit(X_train_inner_scaled, y_train_inner).predict(X_test_inner_scaled)
pred3_xgb_test = xgb3.fit(X_train_kitchen_scaled, y_train_kitchen).predict(X_test_kitchen_scaled)
pred4_xgb_test = xgb4.fit(X_train_food_scaled, y_train_food).predict(X_test_food_scaled)
pred5_xgb_test = xgb5.fit(X_train_beauty_scaled, y_train_beauty).predict(X_test_beauty_scaled)
pred6_xgb_test = xgb6.fit(X_train_elec_scaled, y_train_elec).predict(X_test_elec_scaled)
pred7_xgb_test = xgb7.fit(X_train_goods_scaled, y_train_goods).predict(X_test_goods_scaled)
pred8_xgb_test = xgb8.fit(X_train_health_scaled, y_train_health).predict(X_test_health_scaled)
pred9_xgb_test = xgb9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)
pred10_xgb_test = xgb10.fit(X_train_furn_scaled, y_train_furn).predict(X_test_furn_scaled)
pred11_xgb_test = xgb11.fit(X_train_bed_scaled, y_train_bed).predict(X_test_bed_scaled)

In [None]:
# Per-group CatBoost fits on the *_cat frames.
# NOTE(review): none of the X_train_*_cat / y_train_*_cat / X_test_*_cat frames is rebuilt in this
# test-prediction section; if they still hold the earlier validation split, these predictions are
# not for the evaluation rows — verify before using them.
# The final blend in this section has every CatBoost term commented out.
pred1_cat_test = CatBoost_clothes.fit(X_train_clothes_cat, y_train_clothes_cat).predict(X_test_clothes_cat)
pred2_cat_test = CatBoost_inner.fit(X_train_inner_cat, y_train_inner_cat).predict(X_test_inner_cat)
pred3_cat_test = CatBoost_kitchen.fit(X_train_kitchen_cat, y_train_kitchen_cat).predict(X_test_kitchen_cat)
pred4_cat_test = CatBoost_food.fit(X_train_food_cat, y_train_food_cat).predict(X_test_food_cat)
pred5_cat_test = CatBoost_beauty.fit(X_train_beauty_cat, y_train_beauty_cat).predict(X_test_beauty_cat)
pred6_cat_test = CatBoost_elec.fit(X_train_elec_cat, y_train_elec_cat).predict(X_test_elec_cat)
pred7_cat_test = CatBoost_goods.fit(X_train_goods_cat, y_train_goods_cat).predict(X_test_goods_cat)
pred8_cat_test = CatBoost_health.fit(X_train_health_cat, y_train_health_cat).predict(X_test_health_cat)
pred9_cat_test = CatBoost_etc.fit(X_train_etc_cat, y_train_etc_cat).predict(X_test_etc_cat)
pred10_cat_test = CatBoost_furn.fit(X_train_furn_cat, y_train_furn_cat).predict(X_test_furn_cat)
pred11_cat_test = CatBoost_bed.fit(X_train_bed_cat, y_train_bed_cat).predict(X_test_bed_cat)


In [None]:
# Evaluation-set predictions per group and model family, in the fixed order
# clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed.
# These rebind the list names used earlier for the hold-out predictions.
predictions_lgb = [pred1_lgb_test, pred2_lgb_test, pred3_lgb_test, pred4_lgb_test, pred5_lgb_test, pred6_lgb_test, pred7_lgb_test, pred8_lgb_test, pred9_lgb_test, pred10_lgb_test, pred11_lgb_test] # results after selection
predictions_xgb = [pred1_xgb_test, pred2_xgb_test, pred3_xgb_test, pred4_xgb_test, pred5_xgb_test, pred6_xgb_test, pred7_xgb_test, pred8_xgb_test, pred9_xgb_test, pred10_xgb_test, pred11_xgb_test] 
predictions_cat = [pred1_cat_test, pred2_cat_test, pred3_cat_test, pred4_cat_test, pred5_cat_test, pred6_cat_test, pred7_cat_test, pred8_cat_test, pred9_cat_test, pred10_cat_test, pred11_cat_test]


In [None]:
# Per-group blend for the evaluation set, using the same weights / geometric means as the
# validation blend in "Ensemble 1" / "Model 1". The CatBoost terms are commented out (weight 0.0).
# The geometric mean yields NaN wherever the product of the two predictions is negative.
pred1 = pred1_lgb_test*0.70 + pred1_xgb_test*0.30 #+ pred1_cat_test*0.0
pred2 = pred2_lgb_test*0.70 + pred2_xgb_test*0.30 #+ pred2_cat_test*0.0
pred3 = (pred3_lgb_test*pred3_xgb_test)**(1/2) #+ pred3_cat_test*0.0
pred4 = pred4_lgb_test*0.65 + pred4_xgb_test*0.35# + pred4_cat_test*0.0
pred5 = pred5_lgb_test*0.75 + pred5_xgb_test*0.25 #+ pred5_cat_test*0.0
pred6 = pred6_lgb_test*0.85 + pred6_xgb_test*0.15 #+ pred6_cat_test*0.0
pred7 = (pred7_lgb_test*pred7_xgb_test)**(1/2) #+ pred7_cat_test*0.0
pred8 = pred8_lgb_test*0.90 + pred8_xgb_test*0.10 #+ pred8_cat_test*0.0
pred9 = (pred9_lgb_test*pred9_xgb_test)**(1/2) #+ pred9_cat_test*0.0
pred10 = pred10_lgb_test*0.95 + pred10_xgb_test*0.05 #+ pred10_cat_test*0.0
pred11 = (pred11_lgb_test*pred11_xgb_test)**(1/2) #+ pred11_cat_test*0.0

In [None]:
# Second name for the same DataFrame object (an alias, not a copy).
predict_data_ = predict_data

In [None]:
predictions = [pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11] # per-group blended predictions, in group order
# Buffer for Model 1's evaluation-set predictions, one slot per row to predict.
# Allocated as float explicitly: np.zeros_like(y_test) would inherit the dtype of the target
# column (here only -1 placeholders) and silently truncate float predictions if it is integer-typed.
y_pred_model1 = np.zeros(len(y_test), dtype=float)

In [None]:
# Put the per-group blended predictions back into the row layout of the rows to predict
for i, grp in enumerate(prod_group):
    rows_in_grp = predict_data["상품군"] == grp
    y_pred_model1[rows_in_grp] = predictions[i]

In [None]:
y_pred_model1

### **Model 2**

In [None]:
# Label encoding
# Rebuild the modelling frame from performance_data, dropping columns that are not used as features.
test_data = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# test_data['상품코드'] = test_data['상품코드'].map(int)
# Integer-encode every categorical column (values are cast to str first). For the all-groups
# model the product-group column '상품군' is encoded as well.
for feat in ['상품명','상품코드','상품군','마더코드','prime_time','중분류','요일','season','남여','muil','브랜드','season_prod','소분류']:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)



In [None]:
# Map each feature's positional index (as a string key) to its column name,
# with the target column '취급액' excluded from the numbering.
original_colnames = list(test_data.drop('취급액',axis=1).columns)
new_colnames = list(range(len(original_colnames)))
colnames_dic2 = {str(position): name for position, name in zip(new_colnames, original_colnames)}
    

In [None]:
# Positional indices of the features to drop for the all-groups model; they are resolved
# to column names through colnames_dic2, so they depend on the current column order of test_data.
# NOTE(review): the origin of these indices is not shown here (presumably a feature-selection run) — confirm.
all_del_colmns = [33,10,37,35,16,18,9,14,15,21,24,17]
all_del_colmns = [colnames_dic2[str(i)] for i in all_del_colmns]

In [None]:
# Distinct (now label-encoded) product-group values in order of first appearance,
# and a value -> position lookup.
prod_group = test_data["상품군"].unique()
prod_group_dct = {v:k for k, v in enumerate(prod_group)}

In [None]:
# Rows whose target is the sentinel -1 are the rows to predict; all other rows are used for training.
predict_data = test_data[test_data.취급액 == -1].reset_index(drop=True)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y_train = train_set["취급액"]
# y_test holds only the -1 placeholders here.
y_test = predict_data["취급액"]
grp_idx = train_set['상품군'].map(prod_group_dct)

predict_data = predict_data.drop('취급액',axis=1)

X_train = X.drop(all_del_colmns,axis=1)
X_test = predict_data.drop(all_del_colmns,axis=1)

scaler = StandardScaler()

# The scaler is fitted on the training rows only; wrapping the arrays in a new DataFrame
# discards the original column names and row index.
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

In [None]:
# All-groups LightGBM model, same hyperparameters as in the validation section,
# now fitted on every labelled row.
# NOTE(review): min_child_samples and min_data_in_leaf are listed as aliases of one parameter
# in the LightGBM docs but are given different values here — confirm which one takes effect.
# NOTE(review): 'sub_sample' is not spelled like the documented 'subsample' — confirm it is honoured.
lgbm_all = LGBMRegressor(learning_rate=0.1641933472071217, max_depth=15,
              min_child_samples=57, min_data_in_leaf=208, min_split_gain=0.001,
              n_estimators=1300, num_leaves=118, objective='gamma',
              random_state=0, sub_sample=0.7)

pred_all_lgbm = lgbm_all.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# All-groups XGBoost model, same hyperparameters as in the validation section,
# now fitted on every labelled row.
# NOTE(review): both eta and learning_rate (=None) are passed; the XGBoost docs list eta as an
# alias of learning_rate — confirm the eta value is the one actually used.
xgb_all = XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=0.4876619535400296,
             eta=0.05669375986990678, gamma=0.007357900012337478, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=7,
             max_leaves=96, min_child_weight=136.74405777597332, 
             monotone_constraints=None, n_estimators=1963, n_jobs=None,
             num_parallel_tree=None, objective='reg:tweedie', random_state=0,
             reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
             subsample=0.7165695744381803, tree_method=None,
             validate_parameters=None, verbosity=None)
pred_all_xgb = xgb_all.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# All-groups CatBoost model: trained with MAE loss, MAPE as the evaluation metric,
# with the listed columns treated as categorical features.
# NOTE(review): 'season' appears twice in cat_features — confirm the duplicate is harmless.
CatBoost = CatBoostRegressor(n_estimators = 3000,
                             loss_function = 'MAE',
                             eval_metric = 'MAPE',
                             depth = 7,
                             cat_features = ['상품군','상품코드','마더코드','상품명','소분류','prime_time','중분류','요일','season','남여','muil','season_prod','season','브랜드'],                             
                             verbose=0,
                             random_seed = 0,
                             task_type='CPU')
# NOTE(review): train_set_cat and predict_data_cat are not built in this section; presumably they
# come from earlier cells — verify they are the full labelled rows and the rows to predict.
pred_all_cat = CatBoost.fit(train_set_cat.drop('취급액',axis=1), train_set_cat['취급액']).predict(predict_data_cat.drop('취급액',axis=1))

In [None]:
# Model 2: blend of the three all-groups models (0.1 LightGBM, 0.8 XGBoost, 0.1 CatBoost),
# the same weights as in the validation section.
y_pred_model2 = (pred_all_lgbm*0.1 +pred_all_xgb*0.8+ pred_all_cat*0.1)

### **Model 3**

In [None]:
# Model 3: predictions loaded from a pickle file produced outside this notebook.
# NOTE(review): this is the same file that is scored against the validation y_test in the
# "Ensemble 2" section; confirm it holds evaluation-set predictions in the same row order as
# y_pred_model1 before blending.
# pickle.load executes arbitrary code from the file, so only load files from a trusted source.
with open('../data/2020-09-27_3105.pickle', 'rb') as f:
    y_pred_model3 = pickle.load(f)

### **Ensemble**

In [None]:
# Final submission target: 0.6 * Model 1 + 0.1 * Model 2 + 0.3 * Model 3
# (the same weights as the three-way validation blend in "Ensemble 2").
final_pred = (y_pred_model1*0.6 + y_pred_model2*0.1 + y_pred_model3*0.3)

## **제출**

In [None]:
# Load the official evaluation sheet (column names are taken from the second row of the sheet).
# This rebinds `test_data`, which until now held the label-encoded modelling frame.
test_data = pd.read_excel('../2020빅콘테스트 문제데이터(데이터분석분야-챔피언리그)_update_200818/02_평가데이터/2020 빅콘테스트 데이터분석분야-챔피언리그_2020년 6월 판매실적예측데이터(평가데이터).xlsx',header=1)

In [None]:
# Remove the empty target column so the merge below can supply the predicted one.
# errors='ignore' keeps the cell idempotent: a second run no longer raises KeyError.
test_data = test_data.drop(columns='취급액', errors='ignore')

In [None]:
# Rows to predict (target sentinel -1), in the same order the predictions were produced.
submission_data = performance_data[performance_data.취급액==-1].reset_index(drop=True)
# NOTE(review): this expects final_pred to still be the blended prediction vector; after the merge
# cell below has run, final_pred is a DataFrame, so this cell cannot be re-run on its own.
submission_data['취급액'] = final_pred
submission_data = submission_data[['방송일시','마더코드','상품코드','상품명','취급액']]
# The code columns were converted to str at the top of the notebook; convert them back to int
# for the merge with the evaluation sheet.
submission_data['마더코드'] = submission_data['마더코드'].map(int)
submission_data['상품코드'] = submission_data['상품코드'].map(int)
submission_data.info()

In [None]:
# Attach the predictions to the evaluation sheet by broadcast time, codes and product name.
# A left merge keeps every evaluation row; rows without a match get NaN in '취급액'.
# NOTE(review): if the four key columns are not unique in submission_data, rows are duplicated — verify.
# This rebinds final_pred from the prediction vector to the merged DataFrame.
final_pred = pd.merge(test_data,submission_data,on=['방송일시','마더코드','상품코드','상품명'],how='left')
final_pred

In [None]:
# Write the merged answer sheet without the DataFrame index.
final_pred.to_excel('../submission/평가데이터답안.xlsx',index=False)

# **편성표 작업시에만!**

In [None]:
# Frame for the scheduling task: same column drops as the modelling frame, except that
# '방송일시' is kept; restricted to the rows to predict (target sentinel -1).
a = performance_data.drop(['판매량', 'holiday', '대비', 'date', 'mean_rating',
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)
a = a[a.취급액 == -1].reset_index(drop=True)

In [None]:
# Fill in the predicted amounts. `a` and submission_data are both built from
# performance_data[취급액 == -1].reset_index(drop=True), so they share rows and order.
# The merged final_pred frame follows the Excel sheet's rows instead, so an index-aligned
# assignment from it can attach values to the wrong rows whenever the sheet differs
# from `a` in row count or order.
a['취급액'] = submission_data['취급액']

In [None]:
# Export the scheduling frame; the DataFrame index is written as the first CSV column.
a.to_csv('../data/hungarian.csv')