In [1739]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import datetime
import locale                                         
import pickle
from tqdm import tqdm
import warnings
import math

warnings.filterwarnings("ignore")
locale.setlocale(locale.LC_ALL, 'ko_KR.UTF-8') 

'ko_KR.UTF-8'

In [2039]:
with open('../data/merged_data_concat_0926.pickle', 'rb') as f:
    performance_data = pickle.load(f)

In [2040]:
from datetime import datetime


today = datetime.today().strftime("%m%d")

In [2041]:
train = performance_data[performance_data.취급액 != -1]
test = performance_data[performance_data.취급액 == -1]

In [2042]:
print(len(set(test.마더코드.unique())))
print(len(set(train.마더코드.unique())))
print(len(set(test.마더코드.unique()) - set(train.마더코드.unique())))

201
687
116


In [2043]:
print(len(set(test.상품코드.unique())))
print(len(set(train.상품코드.unique())))
print(len(set(test.상품코드.unique()) - set(train.상품코드.unique())))

389
2031
365


In [2044]:
print(len(set(test.상품명.unique())))
print(len(set(train.상품명.unique())))
print(len(set(test.상품명.unique()) - set(train.상품명.unique())))

349
1682
306


In [2045]:
print(len(set(test.중분류.unique())))
print(len(set(train.중분류.unique())))
print(len(set(test.중분류.unique()) - set(train.중분류.unique())))

57
89
0


In [2046]:
print(len(set(test.소분류.unique())))
print(len(set(train.소분류.unique())))
print(len(set(test.소분류.unique()) - set(train.소분류.unique())))

129
325
0


In [2047]:
# 라벨 인코딩
test_data = performance_data.drop(['방송일시','판매량', 'holiday', '대비', 'date', 'mean_rating', 
                                   '배당수익률(%)', '주가자산비율', '고가지수', '저가지수', '거래량(천주)', '거래대금(백만원)','상장시가총액(백만원)'],axis=1)

# test_data['상품코드'] = test_data['상품코드'].map(int)
for feat in ['상품명','상품코드','마더코드','prime_time','중분류','요일','season','남여','muil','브랜드','season_prod','소분류']:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)


## Data Preparation for Modeling

In [2048]:
prod_group = test_data["상품군"].unique()
prod_group_dct = {v:k for k, v in enumerate(prod_group)}

In [2049]:
## 추후 전체 set에 대한 mape를 구하기 위해서 split을 해줌.

train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

In [2050]:
test_len = [len(X_test[X_test["상품군"] == prod_group[i]]) for i in range(len(prod_group))]
print(test_len)

[1083, 978, 1643, 971, 326, 1291, 692, 196, 924, 575, 166]


In [2051]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    new_X_train = X_train[X_train["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_X_test = X_test[X_test["상품군"] == prod_group[grp_index]].drop("상품군", axis=1)
    new_y_train = y_train[X_train["상품군"] == prod_group[grp_index]]
    new_y_test = y_test[X_test["상품군"] == prod_group[grp_index]]
    return new_X_train, new_X_test, new_y_train, new_y_test

In [2053]:
# K-Fold용 데이터 만들기
def make_grp_data(X,idx):
    X_data = X[X["상품군"] == prod_group[idx]].drop(["상품군"], axis=1)
#     y_data = y[X["상품군"] == prod_group[idx]]['취급액']
    return X_data

clothes= make_grp_data(train_set,0)
inner = make_grp_data(train_set,1)
kitchen = make_grp_data(train_set,2)
food = make_grp_data(train_set,3)
beauty = make_grp_data(train_set,4)
elec = make_grp_data(train_set,5)
goods = make_grp_data(train_set,6)
health = make_grp_data(train_set,7)
etc = make_grp_data(train_set,8)
furn = make_grp_data(train_set,9)
bed = make_grp_data(train_set,10)

In [2054]:
X_train_clothes, X_test_clothes, y_train_clothes, y_test_clothes = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 0)
X_train_inner, X_test_inner, y_train_inner, y_test_inner = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 1)
X_train_kitchen, X_test_kitchen, y_train_kitchen, y_test_kitchen = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 2)
X_train_food, X_test_food, y_train_food, y_test_food = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 3)
X_train_beauty, X_test_beauty, y_train_beauty, y_test_beauty = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 4)
X_train_elec, X_test_elec, y_train_elec, y_test_elec = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 5)
X_train_goods, X_test_goods, y_train_goods, y_test_goods = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 6)
X_train_health, X_test_health, y_train_health, y_test_health = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 7)
X_train_etc, X_test_etc, y_train_etc, y_test_etc = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 8)
X_train_furn, X_test_furn, y_train_furn, y_test_furn = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 9)
X_train_bed, X_test_bed, y_train_bed, y_test_bed = train_test_grp(X_train, X_test, y_train, y_test, prod_group, 10)

In [2066]:
test_data['상품군'] = lbe.fit_transform(test_data['상품군'].astype(str).values)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

# grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

In [2073]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

X_train_clothes_scaled = pd.DataFrame(scaler.fit_transform(X_train_clothes))
X_test_clothes_scaled = pd.DataFrame(scaler.transform(X_test_clothes))
X_train_inner_scaled = pd.DataFrame(scaler.fit_transform(X_train_inner))
X_test_inner_scaled = pd.DataFrame(scaler.transform(X_test_inner))
X_train_kitchen_scaled = pd.DataFrame(scaler.fit_transform(X_train_kitchen))
X_test_kitchen_scaled = pd.DataFrame(scaler.transform(X_test_kitchen))
X_train_food_scaled = pd.DataFrame(scaler.fit_transform(X_train_food))
X_test_food_scaled = pd.DataFrame(scaler.transform(X_test_food))
X_train_beauty_scaled = pd.DataFrame(scaler.fit_transform(X_train_beauty))
X_test_beauty_scaled = pd.DataFrame(scaler.transform(X_test_beauty))
X_train_elec_scaled = pd.DataFrame(scaler.fit_transform(X_train_elec))
X_test_elec_scaled = pd.DataFrame(scaler.transform(X_test_elec))
X_train_goods_scaled = pd.DataFrame(scaler.fit_transform(X_train_goods))
X_test_goods_scaled = pd.DataFrame(scaler.transform(X_test_goods))
X_train_health_scaled = pd.DataFrame(scaler.fit_transform(X_train_health))
X_test_health_scaled = pd.DataFrame(scaler.transform(X_test_health))
X_train_etc_scaled = pd.DataFrame(scaler.fit_transform(X_train_etc))
X_test_etc_scaled = pd.DataFrame(scaler.transform(X_test_etc))
X_train_furn_scaled = pd.DataFrame(scaler.fit_transform(X_train_furn))
X_test_furn_scaled = pd.DataFrame(scaler.transform(X_test_furn))
X_train_bed_scaled = pd.DataFrame(scaler.fit_transform(X_train_bed))
X_test_bed_scaled = pd.DataFrame(scaler.transform(X_test_bed))

In [2057]:
original_colnames = list(X_train_clothes.columns)
new_colnames = list(X_train_clothes_scaled.columns)
colnames_dic = {}
for i in range(len(original_colnames)):
    colnames_dic[str(new_colnames[i])] = original_colnames[i]

## Modeling


In [2058]:
def mape(y_true, y_pred):
    return 100*np.mean(np.abs(y_pred - y_true) / y_true)

best_tot_mape = 100

In [1760]:
lgbm1 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 23,verbose = 0, n_jobs = -1, objective = 'gamma')
lgbm2 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm3 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm4 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm5 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm6 = LGBMRegressor(n_estimators = 500,  num_leaves = 2048, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')
lgbm7 = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm8 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm9 = LGBMRegressor(n_estimators = 1200, random_state = 0, boosting_type = 'dart', max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm10 = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', num_leaves = 127, random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape', learning_rate = 0.09)
lgbm11 = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')

- loss에 variation을 줘보자  
1) regression: mse loss  
2) regression_l1: mae loss (group별로는 줄어드는데 오히려 전체에 대해서 mape를 계산하면 전체 mape는 높아짐)    
3) fair:  
4) huber:  
5) poisson regression: 분포 모양이 어느 정도 비슷함. 근데 중요한 건 얜 y가 discrete임을 가정함. 뭐 얘로 해도 ㄱㅊ하게 예측하는 게 좀 있음.  
6) quantile regression: 얘도 거지  
7) mape: 거지같이 나옴(가전, 가구는 얘가 제일 나음)    
8) gamma: ㄱㅊㄱㅊ 모양이 얘랑 비슷한 게 많음  
9) tweedie: 얘는 0에 뭉치는 경향이 있다는 게 가장 뚜렷한 특징. BEST   

In [1761]:
# model fitting

pred1 = lgbm1.fit(X_train_clothes_scaled, y_train_clothes).predict(X_test_clothes_scaled)
pred2 = lgbm2.fit(X_train_inner_scaled, y_train_inner).predict(X_test_inner_scaled)
pred3 = lgbm3.fit(X_train_kitchen_scaled, y_train_kitchen).predict(X_test_kitchen_scaled)
pred4 = lgbm4.fit(X_train_food_scaled, y_train_food).predict(X_test_food_scaled)
pred5 = lgbm5.fit(X_train_beauty_scaled, y_train_beauty).predict(X_test_beauty_scaled)
pred6 = lgbm6.fit(X_train_elec_scaled, y_train_elec).predict(X_test_elec_scaled)
pred7 = lgbm7.fit(X_train_goods_scaled, y_train_goods).predict(X_test_goods_scaled)
pred8 = lgbm8.fit(X_train_health_scaled, y_train_health).predict(X_test_health_scaled)
pred9 = lgbm9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)
pred10 = lgbm10.fit(X_train_furn_scaled, y_train_furn).predict(X_test_furn_scaled)
pred11 = lgbm11.fit(X_train_bed_scaled, y_train_bed).predict(X_test_bed_scaled)

### Feature selection

In [983]:
from sklearn.feature_selection import SelectFromModel
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import KFold

In [1059]:
def color_negative_red(val):
    """
    Takes a scalar and returns a string with
    the css property `'color: red'` for negative
    strings, black otherwise.
    """
    if val < 0 :
        color = 'red' 
    elif val == 0 :
        color = 'blue'
    else :
        color ='black'
    return 'color: %s' % color

In [2059]:
original_colnames = list(X_train.columns)
new_colnames = list(X_train_scaled.columns)
colnames_dic3 = {}
for i in range(len(original_colnames)):
    colnames_dic3[str(new_colnames[i])] = original_colnames[i]

In [2060]:
colnames_dic2 = {v: k for k, v in colnames_dic.items()}
colnames_dic4 = {v: k for k, v in colnames_dic3.items()}

#### **전체**

In [2075]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = train_set.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [2076]:
train_df = train_set.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled2, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic4[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('취급액_mean_quantile_10_소분류', '31')",0.416434,0.005842,0.41306,0.012359,0.429603,0.011723,0.297371,0.007333,0.335895,0.006736
"('group', '25')",0.341797,0.001632,0.34145,0.008312,0.307713,0.006938,0.357235,0.003333,0.209047,0.006218
"('muil', '27')",0.161769,0.009521,0.144184,0.006748,0.154131,0.003814,0.118248,0.002445,0.079478,0.002684
"('hour', '7')",0.115133,0.004116,0.107096,0.003103,0.101669,0.003652,0.124497,0.005172,0.069485,0.005761
"('holiday_yn', '11')",0.04428,0.004107,0.040832,0.002283,0.041298,0.001368,0.063159,0.003679,0.030132,0.001868
"('취급액_mean_log_quantile_10_소분류', '32')",0.033125,0.002357,0.008063,0.003782,0.063967,0.002134,0.140608,0.00431,0.0477,0.002877
"('노출(분)', '0')",0.017218,0.001487,0.022939,0.000789,0.071093,0.004054,0.10418,0.004219,0.008472,0.002067
"('판매단가', '5')",0.008914,0.001555,0.010759,0.001462,0.035411,0.001919,0.045157,0.001963,0.002128,0.001285
"('minute', '8')",0.008829,0.000898,0.028156,0.001592,0.009737,0.000977,0.01153,0.000701,0.003319,0.001926
"('남여', '26')",0.006069,0.000673,0.009083,0.000957,0.00101,0.000583,0.007647,0.001334,0.002066,0.000568


In [2091]:
mape(y_test,pred_all)

32.628086320811974

In [2103]:
del_columns = [33,10,37,35,16,18,9,14,15,21,24,17]
lgbm = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 15, verbose = 0, n_jobs = -1, objective = 'gamma')
pred_all = lgbm.fit(X_train_scaled.drop(del_columns,axis=1), y_train).predict(X_test_scaled.drop(del_columns,axis=1))
mape(y_test,pred_all)

32.19832486338997

In [2104]:
pred_all

array([47189656.0120872 , 33063393.51890229, 14110151.05453782, ...,
       38774106.66910242,  8767237.6304738 ,  4013822.78929766])

#### **의류**

In [1063]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = clothes.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1064]:
train_df = clothes.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 23,verbose = 0, n_jobs = -1, objective = 'gamma')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.276371,0.01457,0.375388,0.016899,0.347991,0.034991,0.196703,0.0227,0.166445,0.011237
"('취급액_mean_quantile_10_소분류', '30')",0.04184,0.008625,0.015506,0.004844,0.020587,0.001899,0.133431,0.020396,0.050429,0.007421
"('노출(분)', '0')",0.031411,0.003112,0.012482,0.002849,-0.019604,0.003179,0.106613,0.008485,0.031234,0.007957
"('판매단가', '4')",0.019318,0.00222,0.0,0.0,0.00905,0.002539,0.023991,0.003564,0.035989,0.002015
"('hour', '6')",0.011214,0.001557,0.037781,0.013382,0.014325,0.000799,0.105947,0.013966,0.047086,0.007918
"('남여', '25')",0.001534,0.000155,0.002772,0.001163,0.012521,0.001729,0.045653,0.003522,0.004315,0.001848
"('마더코드', '1')",0.0011,0.001155,0.0,0.0,0.002179,0.000861,0.031363,0.004028,0.020591,0.00309
"('취급액_mean_log_quantile_10_소분류', '31')",0.000665,0.00054,0.0,0.0,0.000646,0.000205,0.025193,0.006844,0.033855,0.006503
"('season_prod', '27')",0.00064,0.000258,0.003769,0.002284,0.0,0.0,-0.001963,0.002264,0.001474,0.000406
"('취급액_mean_quantile_10', '28')",9.7e-05,0.000193,-0.002204,0.001336,0.0,0.0,0.005818,0.001144,0.002118,0.001624


In [1062]:
mape(y_test_clothes,pred1)

22.908054492898227

In [1699]:
del_columns = [37,34,33,32,26,36,20,12,13,16,18]
lgbm1 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 23,verbose = 0, n_jobs = -1, objective = 'gamma')
pred1_new = lgbm1.fit(X_train_clothes_scaled.drop(del_columns,axis=1), y_train_clothes).predict(X_test_clothes_scaled.drop(del_columns,axis=1))
mape(y_test_clothes,pred1_new)

22.1579455684485

37,34,33,32,26,36,20,12,13,16,18 번 피쳐 제거

#### **속옷**

In [1085]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = inner.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1273]:
train_df = inner.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.326718,0.028337,0.165628,0.014097,0.193707,0.018518,0.293643,0.015157,0.318996,0.027433
"('취급액_mean_quantile_10_소분류', '30')",0.260931,0.049648,0.120511,0.012081,0.145542,0.021183,0.496887,0.015147,0.281597,0.008096
"('hour', '6')",0.166114,0.022394,0.066939,0.003287,0.087368,0.021095,0.143471,0.018143,0.075411,0.014231
"('노출(분)', '0')",0.144636,0.017265,0.080582,0.003959,0.077612,0.010034,0.093122,0.01171,0.019391,0.005136
"('holiday_yn', '10')",0.043676,0.003749,0.005823,0.001899,0.025372,0.002522,0.038399,0.006764,0.039279,0.00529
"('상품코드', '2')",0.043478,0.006789,0.006576,0.004786,0.005771,0.007854,0.061973,0.008715,-0.002188,0.011321
"('상품명', '3')",0.021817,0.002847,-0.007174,0.0026,0.014941,0.005575,0.010937,0.004047,-0.013534,0.006112
"('뉴스시청률', '19')",0.008231,0.001073,-5e-05,0.0001,-0.000462,0.003845,-7.1e-05,0.000988,-0.000264,0.003223
"('현재지수', '15')",0.004421,0.002587,0.001558,0.000454,-0.004002,0.00145,0.004337,0.001025,-0.001217,0.002367
"('prime_time', '20')",0.00382,0.000464,0.000392,0.000291,0.00262,0.001304,0.003478,0.000784,-0.003839,0.002552


In [1087]:
mape(y_test_inner,pred2)

27.23922408357301

In [1700]:
del_columns = [34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16]
lgbm2 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred2_new = lgbm2.fit(X_train_inner_scaled.drop(del_columns,axis=1), y_train_inner).predict(X_test_inner_scaled.drop(del_columns,axis=1))
mape(y_test_inner,pred2_new)

26.5763001623147

34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16 번 피쳐 제거

#### **주방**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [1111]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = kitchen.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1287]:
train_df = kitchen.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.374827,0.019967,0.405457,0.038139,0.395459,0.017773,0.444484,0.011905,0.202331,0.017277
"('muil', '26')",0.258481,0.019674,0.240186,0.010608,0.248851,0.012569,0.214347,0.019014,0.169255,0.011702
"('취급액_mean_quantile_10_소분류', '30')",0.079402,0.007044,0.107991,0.005844,0.254936,0.020906,0.24972,0.022139,0.173084,0.007813
"('prod_size', '37')",0.074782,0.00556,0.057801,0.007509,0.043674,0.006386,0.099401,0.006392,0.068011,0.00153
"('취급액_mean_log_quantile_10_소분류', '31')",0.065943,0.00253,0.104681,0.004318,0.06825,0.007435,0.037788,0.006288,0.054014,0.005666
"('노출(분)', '0')",0.037813,0.004058,0.006171,0.001656,0.002259,0.000651,0.010697,0.001212,7.2e-05,0.000223
"('판매단가', '4')",0.036238,0.001412,-0.00268,0.003554,0.034028,0.003548,0.025211,0.003519,-0.000802,0.000593
"('hour', '6')",0.035013,0.006167,0.018224,0.005116,0.029162,0.010045,0.074876,0.004708,0.012661,0.006611
"('holiday_yn', '10')",0.028734,0.005071,0.029176,0.005639,0.043784,0.003145,0.071891,0.013035,0.020678,0.003743
"('브랜드', '23')",0.016182,0.001443,0.001129,0.001268,0.037935,0.004095,0.001671,0.001771,0.004682,0.000743


In [1113]:
mape(y_test_kitchen,pred3)

28.349131759163082

In [1701]:
del_columns = [25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11]
lgbm3 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred3_new = lgbm3.fit(X_train_kitchen_scaled.drop(del_columns,axis=1), y_train_kitchen).predict(X_test_kitchen_scaled.drop(del_columns,axis=1))
mape(y_test_kitchen,pred3_new)

27.49715254148381

25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11 번 피쳐 제거

#### **농수축**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [1158]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = food.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1295]:
train_df = food.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.607353,0.031886,0.393667,0.01543,0.535524,0.028954,0.355617,0.038591,0.243063,0.027262
"('holiday_yn', '10')",0.11972,0.013488,0.082915,0.009305,0.054017,0.003168,0.038279,0.012578,0.049832,0.007629
"('hour', '6')",0.067822,0.004056,0.015539,0.004402,0.051682,0.009339,0.070259,0.006517,0.033259,0.011002
"('취급액_mean_quantile_10_소분류', '30')",0.045573,0.011925,0.159437,0.011609,0.094587,0.021474,0.210226,0.008203,0.117099,0.006885
"('마더코드', '1')",0.037705,0.005656,0.006359,0.010018,0.021113,0.003275,0.007585,0.002362,0.002161,0.00304
"('취급액_mean_log_quantile_10_소분류', '31')",0.036095,0.007208,0.024146,0.009536,0.014164,0.008099,0.008659,0.005497,0.00812,0.001121
"('소분류', '21')",0.022842,0.004735,0.010223,0.006663,0.007278,0.006362,0.005561,0.002287,0.010084,0.004297
"('요일', '5')",0.009231,0.001392,0.005293,0.001178,-0.000436,0.000475,-0.002235,0.000661,-0.000155,0.000192
"('중분류', '22')",0.008736,0.001651,-0.009375,0.001551,0.002496,0.000466,0.000967,0.000967,0.006159,0.001305
"('기온', '11')",0.005024,0.001793,-0.017683,0.002891,0.002192,0.000184,-0.002254,0.003556,0.001068,0.000583


In [1189]:
mape(y_test_food,pred4)

16.778379175821048

In [1702]:
del_columns = [25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14]
lgbm4 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred4_new = lgbm4.fit(X_train_food_scaled.drop(del_columns,axis=1), y_train_food).predict(X_test_food_scaled.drop(del_columns,axis=1))
mape(y_test_food,pred4_new)

15.874558341118911

25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14 번 피쳐 제거

#### **이미용**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [1240]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = beauty.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1306]:
train_df = beauty.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.447065,0.018799,0.483773,0.062756,0.392567,0.104074,0.307616,0.03444,0.153575,0.013738
"('취급액_mean_quantile_10_소분류', '30')",0.307361,0.037664,0.064492,0.018424,0.29534,0.052516,0.127172,0.035605,0.008143,0.00308
"('hour', '6')",0.206791,0.052763,-0.01448,0.044064,0.156885,0.022992,0.225676,0.025153,0.052534,0.007518
"('holiday_yn', '10')",0.070098,0.008834,0.087672,0.01959,0.084002,0.009209,0.030105,0.003219,0.05167,0.012635
"('minute', '7')",0.026171,0.005183,0.008106,0.007817,-0.010527,0.002779,0.01658,0.006089,-0.001662,0.008358
"('노출(분)', '0')",0.018265,0.005889,0.082415,0.018874,0.015676,0.004273,0.002102,0.000846,0.000119,0.000103
"('상품코드', '2')",0.017669,0.003157,-0.000118,0.004947,-0.00269,0.002369,-0.002232,0.002896,0.006559,0.003
"('weekend', '35')",0.017212,0.009253,0.002056,0.002724,0.000642,0.000212,0.016636,0.001485,0.016926,0.00381
"('판매단가', '4')",0.014199,0.003438,0.008252,0.006233,0.003971,0.009319,0.002352,0.001045,-0.0006,0.000601
"('현재지수', '15')",0.012846,0.006307,0.010113,0.009258,0.002309,0.003966,-0.001416,0.000358,-0.001258,0.000166


In [1242]:
mape(y_test_beauty,pred5)

19.276008278963715

In [1703]:
del_columns = [11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15]
lgbm5 = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 4, verbose = 0, n_jobs = -1, objective = 'gamma')
pred5_new = lgbm5.fit(X_train_beauty_scaled.drop(del_columns,axis=1), y_train_beauty).predict(X_test_beauty_scaled.drop(del_columns,axis=1))
mape(y_test_beauty,pred5_new)

19.217135630635212

11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15 번 피쳐 제거

#### **가전**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [1259]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = elec.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1437]:
train_df = elec.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1000,  num_leaves = 1024, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.321372,0.014756,0.247007,0.012184,0.306901,0.014652,0.311484,0.026208,0.188447,0.011823
"('취급액_mean_quantile_10_소분류', '30')",0.074314,0.010573,0.181977,0.007647,0.077798,0.005848,0.15641,0.011648,0.095301,0.009472
"('취급액_mean_quantile_10', '28')",0.051787,0.003569,0.005796,0.004452,0.042063,0.005647,0.063958,0.002142,0.020816,0.002418
"('취급액_mean_log_quantile_10_소분류', '31')",0.038332,0.005909,0.001379,0.002052,0.107314,0.0156,0.130669,0.012157,0.08426,0.005649
"('판매단가', '4')",0.031242,0.005581,-0.014063,0.004426,-0.006427,0.00645,0.028066,0.002235,0.032879,0.004449
"('hour', '6')",0.014361,0.004391,0.007819,0.00198,0.009606,0.002227,-0.00466,0.005731,0.014075,0.006843
"('취급액_mean_log_quantile_10', '29')",0.006876,0.001238,-0.004169,0.001851,0.002753,0.000701,0.010818,0.00166,0.000958,0.000216
"('상품명', '3')",0.006272,0.001976,0.001751,0.003019,0.027622,0.002257,0.011833,0.002135,-0.003569,0.001256
"('prod_size', '37')",0.006221,0.002406,-0.004236,0.001716,0.000541,0.001364,0.004835,0.000631,0.006593,0.001127
"('상품코드', '2')",0.004431,0.002613,0.002459,0.002898,0.000565,0.005144,0.002471,0.005984,-0.006208,0.001967


In [1391]:
mape(y_test_elec,pred6)

40.292005336436155

In [1704]:
del_columns = [8,10,11,12,13,14,17,18,19,25,32,33,34,35,36]
lgbm6 = LGBMRegressor(n_estimators = 1000,  num_leaves = 1024, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')
pred6_new = lgbm6.fit(X_train_elec_scaled.drop(del_columns,axis=1), y_train_elec).predict(X_test_elec_scaled.drop(del_columns,axis=1))
mape(y_test_elec,pred6_new)

40.07481282197277

8,10,11,12,13,14,17,18,19,25,32,33,34,35,36 번 피쳐 제거

#### **생활용품**

goods, health, etc, furn, bed

In [1453]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = goods.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1455]:
train_df = goods.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.294106,0.022206,0.295461,0.067807,0.345591,0.026226,0.340238,0.02746,0.274075,0.016754
"('취급액_mean_quantile_10_소분류', '30')",0.261636,0.021781,0.228432,0.031918,0.24609,0.025213,0.354097,0.026729,0.171395,0.00423
"('muil', '26')",0.106524,0.011477,-0.022068,0.00808,0.202953,0.02118,0.124033,0.015452,0.123877,0.014964
"('취급액_mean_log_quantile_10_소분류', '31')",0.084858,0.006904,-0.003413,0.005894,0.034108,0.015047,0.100272,0.004219,0.168509,0.02354
"('판매단가', '4')",0.074476,0.01779,0.128385,0.012379,0.005194,0.019495,0.074805,0.005596,0.103823,0.006071
"('hour', '6')",0.023316,0.006977,0.035992,0.009433,0.029301,0.010987,0.060567,0.011439,0.04876,0.00993
"('holiday_yn', '10')",0.007707,0.000794,0.016648,0.006741,0.002687,0.002408,0.011072,0.003976,0.00652,0.004524
"('기온', '11')",0.007109,0.001338,0.001223,0.00129,0.005298,0.003806,-0.002508,0.002076,0.001501,0.001722
"('브랜드', '23')",0.005867,0.000644,-0.010221,0.002718,0.007366,0.005151,0.010279,0.001929,0.005666,0.002037
"('시가지수', '18')",0.002286,0.002289,0.010053,0.000391,0.000702,0.002594,0.002867,0.00237,0.001191,0.002383


In [1454]:
mape(y_test_goods,pred7)

32.253925686479846

In [1705]:
del_columns = [13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37]
lgbm7 = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred7_new = lgbm7.fit(X_train_goods_scaled.drop(del_columns,axis=1), y_train_goods).predict(X_test_goods_scaled.drop(del_columns,axis=1))
mape(y_test_goods,pred7_new)

31.623747738429696

13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37번 피쳐 제거

#### **건강기능**

health, etc, furn, bed

In [1477]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = health.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1479]:
train_df = health.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.38375,0.030914,0.14571,0.029573,0.367037,0.063903,0.331786,0.015437,0.065173,0.019676
"('hour', '6')",0.194404,0.021735,0.124099,0.021465,0.063022,0.031746,0.041609,0.016846,0.024945,0.017684
"('뉴스시청률', '19')",0.094257,0.01505,0.004623,0.00844,-0.003288,0.026918,0.049446,0.010093,0.009957,0.002129
"('판매단가', '4')",0.089724,0.040978,0.214975,0.048275,0.117424,0.050523,0.282402,0.045935,0.032682,0.0154
"('취급액_mean_quantile_10_소분류', '30')",0.054221,0.016855,0.046875,0.019835,0.167321,0.047651,0.05785,0.024477,0.0,0.0
"('취급액_mean_quantile_10', '28')",0.026835,0.044579,0.00153,0.000354,0.001877,0.001207,0.00052,0.000337,0.0,0.0
"('취급액_mean_log_quantile_10_소분류', '31')",0.023525,0.009765,0.0,0.0,0.002418,0.00119,0.004395,0.00397,0.0,0.0
"('상품명', '3')",0.01841,0.00604,0.001451,0.006338,0.086241,0.031219,-0.014562,0.008822,-0.0011,0.000616
"('등락률(%)', '16')",0.016512,0.011712,0.007238,0.001639,0.00214,0.019303,-0.000263,0.001942,-0.001226,0.00197
"('시가지수', '18')",0.013528,0.003924,-0.008985,0.001489,-0.006195,0.005147,-0.003568,0.006958,0.0,0.0


In [1478]:
mape(y_test_health,pred8)

22.809002341045336

In [1706]:
del_columns = [8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37]
lgbm8 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred8_new = lgbm8.fit(X_train_health_scaled.drop(del_columns,axis=1), y_train_health).predict(X_test_health_scaled.drop(del_columns,axis=1))
mape(y_test_health,pred8_new)

21.80460073752754

8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37 번 피쳐 제거

#### **잡화**

etc, furn, bed

In [1503]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = etc.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1505]:
train_df = etc.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1200, random_state = 0, boosting_type = 'dart', max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('취급액_mean_quantile_10_소분류', '30')",0.215179,0.022663,0.295772,0.015282,0.234546,0.021165,0.256806,0.013744,0.090342,0.012997
"('group', '24')",0.202019,0.013631,0.397685,0.038786,0.242371,0.013572,0.27325,0.025354,0.239855,0.013149
"('hour', '6')",0.090065,0.003078,0.061671,0.030357,0.030543,0.006843,0.03904,0.009213,0.000476,0.011241
"('판매단가', '4')",0.046301,0.005869,0.110946,0.023374,0.005392,0.003843,0.062199,0.008078,0.101153,0.013948
"('holiday_yn', '10')",0.036329,0.005769,0.048681,0.006628,0.000904,0.006209,0.022797,0.007681,0.039453,0.005662
"('취급액_mean_log_quantile_10_소분류', '31')",0.032983,0.002213,0.022772,0.001353,0.018645,0.005479,0.004703,0.000843,0.006745,0.002537
"('마더코드', '1')",0.028077,0.002377,0.03669,0.006656,0.019063,0.004441,0.040557,0.005249,-0.001742,0.010426
"('muil', '26')",0.018483,0.004898,0.004583,0.000577,0.000924,0.000832,0.0,0.0,0.0,0.0
"('노출(분)', '0')",0.016144,0.004316,0.004008,0.001874,-0.000603,0.008011,0.031906,0.007857,0.031324,0.003758
"('minute', '7')",0.015617,0.002142,0.012255,0.005339,0.012231,0.007153,0.009293,0.003256,0.052311,0.009569


In [1504]:
mape(y_test_etc,pred9)

38.806570945642285

In [1707]:
del_columns = [9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37]
lgbm9 = LGBMRegressor(n_estimators = 1200, random_state = 0, boosting_type = 'dart', max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred9_new = lgbm9.fit(X_train_etc_scaled.drop(del_columns,axis=1), y_train_etc).predict(X_test_etc_scaled.drop(del_columns,axis=1))
mape(y_test_etc,pred9_new)

38.29687137332689

9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37 번 피쳐 제거

#### **가구**

furn, bed

In [1543]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = furn.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1545]:
train_df = furn.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', num_leaves = 127, random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape', learning_rate = 0.09)
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('group', '24')",0.494081,0.020419,0.620577,0.028261,0.625803,0.049703,0.528729,0.043524,0.457526,0.026396
"('상품명', '3')",0.194064,0.016833,0.054425,0.009916,0.158188,0.023968,0.210693,0.036426,0.099755,0.010591
"('취급액_mean_quantile_10_소분류', '30')",0.090682,0.022957,0.129208,0.022775,0.179533,0.014258,0.179931,0.028146,0.026707,0.003093
"('마더코드', '1')",0.056151,0.006478,0.033127,0.00235,0.00896,0.001485,0.019509,0.007172,0.011675,0.004611
"('브랜드', '23')",0.015982,0.004435,0.072041,0.007616,0.020613,0.002675,0.005485,0.003726,-0.000719,0.000971
"('판매단가', '4')",0.015377,0.003051,0.011752,0.003715,0.035839,0.007172,0.007964,0.003354,0.018466,0.004494
"('등락률(%)', '16')",0.006498,0.002393,0.007072,0.002633,0.007276,0.00231,-0.000429,0.003321,0.002295,0.001737
"('주가이익비율', '17')",0.005208,0.002212,0.002298,0.002861,0.002427,0.000841,0.001175,0.001343,-0.002107,0.002353
"('취급액_mean_log_quantile_10_소분류', '31')",0.004633,0.001842,0.00959,0.001479,0.01652,0.003383,0.020439,0.002683,0.033602,0.005434
"('소분류', '21')",0.004494,0.001838,0.002951,0.002468,-0.000505,0.001059,-0.001409,0.001915,0.012476,0.002646


In [1544]:
mape(y_test_furn,pred10)

42.18782227547758

In [1708]:
del_columns = [8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36]
lgbm10 = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', num_leaves = 127, random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape', learning_rate = 0.09)
pred10_new = lgbm10.fit(X_train_furn_scaled.drop(del_columns,axis=1), y_train_furn).predict(X_test_furn_scaled.drop(del_columns,axis=1))
mape(y_test_furn,pred10_new)

41.68921261024886

8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36 번 피쳐 제거

#### **침구**

bed

In [1599]:
SEED = 0
CV = KFold(n_splits=5)
FEATURES = bed.drop(["취급액"], axis=1).columns.tolist()
TARGET_COL = "취급액"

In [1601]:
train_df = bed.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
    
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[valid_idx, FEATURES]))
    
    clf.fit(X_train_scaled, 
            train_df.loc[train_idx, TARGET_COL], 
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled, 
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled, 
                               train_df.loc[valid_idx, TARGET_COL])
    if fold == 0:
        df = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    else:
        df = pd.merge(df,eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50),on='feature')
df.columns = ['feature','weight_1','std_1','weight_2','std_2','weight_3','std_3','weight_4','std_4','weight_5','std_5']
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


Unnamed: 0,weight_1,std_1,weight_2,std_2,weight_3,std_3,weight_4,std_4,weight_5,std_5
"('hour', '6')",0.302945,0.036697,0.113479,0.070707,0.009362,0.00527,0.072152,0.054589,0.214505,0.08023
"('group', '24')",0.242329,0.048033,0.616862,0.093318,0.264715,0.045709,0.671407,0.100087,0.460288,0.065229
"('취급액_mean_log_quantile_10_소분류', '31')",0.103325,0.024092,0.103105,0.018625,0.016624,0.006155,0.116739,0.053489,0.146894,0.037279
"('누적강수량', '12')",0.010774,0.006063,0.016481,0.02326,0.0,0.0,-0.01302,0.003059,-0.011047,0.0021
"('기온', '11')",0.00715,0.002166,-0.003753,0.004316,0.005104,0.002756,-0.112444,0.061276,0.019664,0.016183
"('소분류', '21')",0.00447,0.000407,-0.000279,0.000108,-0.001886,0.00063,0.118918,0.018795,0.037965,0.028658
"('풍속', '13')",0.00439,0.007941,-0.000335,0.00536,-0.000596,0.001588,0.03389,0.012229,0.028423,0.016268
"('판매단가', '4')",0.00266,0.000894,0.00048,0.001008,0.0,0.0,-0.066034,0.008298,0.069889,0.027442
"('취급액_mean_quantile_10_소분류', '30')",0.002642,0.001468,-0.002658,0.002484,0.038422,0.020007,-0.027312,0.010967,0.004877,0.004049
"('뉴스시청률', '19')",0.002382,0.00022,0.015567,0.006383,0.002098,0.002942,-0.027557,0.005368,0.001475,0.007115


In [1600]:
mape(y_test_bed,pred11)

26.618213143407623

In [1709]:
del_columns = [8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36]
lgbm11 = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
pred11_new = lgbm11.fit(X_train_bed_scaled.drop(del_columns,axis=1), y_train_bed).predict(X_test_bed_scaled.drop(del_columns,axis=1))
mape(y_test_bed,pred11_new)

25.20877367938055

8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36 번 피쳐 제거

In [1721]:
# 각 상품군별 삭제할 피쳐
colnames_dic
clothes_del_columns = [37,34,33,32,26,36,20,12,13,16,18]
inner_del_columns = [34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16]
kitchen_del_columns = [25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11]
food_del_columns = [25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14]
beauty_del_columns = [11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15]
elec_del_columns = [8,10,11,12,13,14,17,18,19,25,32,33,34,35,36]
goods_del_columns = [13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37]
health_del_columns = [8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37]
etc_del_columns = [9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37]
furn_del_columns = [8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36]
bed_del_columns = [8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36]

## Model Evaluation


In [1859]:
lgbm3 = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, learning_rate = 0.07, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
lgbm6 = LGBMRegressor(n_estimators = 500,  num_leaves = 1024, boosting_type = "dart", random_state = 0, max_depth = 12, verbose = 0, n_jobs = -1, objective = 'mape')
lgbm7 = LGBMRegressor(n_estimators = 600,random_state = 0, max_depth = 9, verbose = 0,  n_jobs = -1, objective = 'tweedie')

In [1946]:



lgbm9 = LGBMRegressor(n_estimators = 1200, random_state = 0, boosting_type = 'dart', max_depth = 17, learning_rate = 0.15, verbose = 0, n_jobs = -1, objective = 'tweedie')


pred9 = lgbm9.fit(X_train_etc_scaled, y_train_etc).predict(X_test_etc_scaled)

In [1947]:
predictions = [pred1, pred2, pred3, pred4, pred5, pred6, pred7, pred8, pred9, pred10, pred11]
# predictions = [pred1_new, pred2_new, pred3_new, pred4_new, pred5_new, pred6_new, pred7_new, pred8_new, pred9_new, pred10_new, pred11_new] # selection한 결과물
trues = [y_test_clothes, y_test_inner, y_test_kitchen, y_test_food, y_test_beauty, y_test_elec, y_test_goods, y_test_health, y_test_etc, y_test_furn, y_test_bed]

In [633]:
# 피쳐추가이전
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

mape: 23.64668171057669
mape: 27.896492101713054
mape: 28.734353776071586
mape: 16.992263416968026
mape: 19.712953901596315
mape: 40.247451198285084
mape: 31.97547595666234
mape: 23.42317770172483
mape: 38.46505299119441
mape: 43.76484970096859
mape: 26.825635634992658


In [1713]:
# final_selected
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    print(f"mape: {mape_res}" )
    mape_list.append(mape_res)

mape: 22.1579455684485
mape: 26.5763001623147
mape: 27.49715254148381
mape: 15.874558341118911
mape: 19.217135630635212
mape: 40.07481282197277
mape: 31.623747738429696
mape: 21.80460073752754
mape: 38.29687137332689
mape: 41.68921261024886
mape: 25.20877367938055


### 기존 format으로 변경 후 mape 계산

In [1949]:
y_pred = np.zeros_like(y_test)

In [1950]:
# y_test format으로 다시 넣어주기
for i in range(len(prod_group)):
      y_pred[X_test["상품군"] == prod_group[i]] = predictions[i]

In [616]:
print(best_tot_mape)
# 수정전

30.17134701161952


In [1734]:
print(best_tot_mape)
# 최종

29.200866247957723
