In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import datetime
import locale                                         
import pickle
from tqdm import tqdm
import warnings
import math

# Silence every warning for the rest of the session (this also hides deprecation
# notices emitted by the modelling libraries).
warnings.filterwarnings("ignore")
# Switch all locale categories to Korean/UTF-8; setlocale raises locale.Error
# when that locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'ko_KR.UTF-8') 

In [None]:
# Load the merged dataset from a relative path.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so this
# path must only ever point at a trusted, locally produced artifact.
with open('../data/merged_data_concat_0926.pickle', 'rb') as f:
    performance_data = pickle.load(f)

In [None]:
# Importing the class here rebinds the name `datetime` to the class for the rest of
# the notebook.
from datetime import datetime

# Four-character month/day stamp of the run date, e.g. "0926".
today = format(datetime.today(), "%m%d")

In [None]:
# Rows whose target (취급액) carries the sentinel -1 are the unlabeled test rows;
# everything else is training data.
is_unlabeled = performance_data["취급액"].eq(-1)
train = performance_data[~is_unlabeled]
test = performance_data[is_unlabeled]

In [None]:
# Distinct 마더코드 values in test, in train, and those seen only in test.
test_values = set(test["마더코드"].unique())
train_values = set(train["마더코드"].unique())
for count in (len(test_values), len(train_values), len(test_values - train_values)):
    print(count)

In [None]:
# Distinct 상품코드 values in test, in train, and those seen only in test.
test_values = set(test["상품코드"].unique())
train_values = set(train["상품코드"].unique())
for count in (len(test_values), len(train_values), len(test_values - train_values)):
    print(count)

In [None]:
# Distinct 상품명 values in test, in train, and those seen only in test.
test_values = set(test["상품명"].unique())
train_values = set(train["상품명"].unique())
for count in (len(test_values), len(train_values), len(test_values - train_values)):
    print(count)

In [None]:
# Distinct 중분류 values in test, in train, and those seen only in test.
test_values = set(test["중분류"].unique())
train_values = set(train["중분류"].unique())
for count in (len(test_values), len(train_values), len(test_values - train_values)):
    print(count)

In [None]:
# Distinct 소분류 values in test, in train, and those seen only in test.
test_values = set(test["소분류"].unique())
train_values = set(train["소분류"].unique())
for count in (len(test_values), len(train_values), len(test_values - train_values)):
    print(count)

In [None]:
# Label encoding.
# Despite its name, `test_data` holds every row of performance_data (labeled and
# unlabeled); only the columns listed below are removed.
unused_columns = [
    '방송일시', '판매량', 'holiday', '대비', 'date', 'mean_rating',
    '배당수익률(%)', '주가자산비율', '고가지수', '저가지수',
    '거래량(천주)', '거래대금(백만원)', '상장시가총액(백만원)',
]
test_data = performance_data.drop(columns=unused_columns)

# Each categorical column is cast to str and replaced by integer codes. The encoder
# is fitted on all rows at once, so train and test share one code table.
categorical_columns = [
    '상품명', '상품코드', '마더코드', 'prime_time', '중분류', '요일',
    'season', '남여', 'muil', '브랜드', 'season_prod', '소분류',
]
for feat in categorical_columns:
    lbe = LabelEncoder()
    test_data[feat] = lbe.fit_transform(test_data[feat].astype(str).values)


## Data Preparation for Modeling

In [None]:
# Product-group labels in order of first appearance, plus a label -> position lookup.
prod_group = test_data["상품군"].unique()
prod_group_dct = dict(zip(prod_group, range(len(prod_group))))

In [None]:
## Split once up front so that MAPE over the whole set can be computed later.

# Labeled rows only (the target sentinel -1 marks unlabeled rows).
train_set = test_data.loc[test_data['취급액'] != -1]

y = train_set["취급액"]
X = train_set.drop(columns="취급액")

# Stratify on the product-group position so each group is represented in both parts.
grp_idx = train_set['상품군'].map(prod_group_dct)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=grp_idx, random_state=0
)

In [None]:
# Number of held-out rows per product group, listed in prod_group order.
held_out_groups = X_test["상품군"]
test_len = [int((held_out_groups == group).sum()) for group in prod_group]
print(test_len)

In [None]:
## train test split된 걸 상품군 별로 나눔

def train_test_grp(X_train, X_test, y_train, y_test, prod_group, grp_index):
    """Restrict an existing train/test split to a single product group.

    Parameters
    ----------
    X_train, X_test : DataFrame
        Feature frames containing a "상품군" column.
    y_train, y_test : Series
        Targets aligned with the corresponding feature frame.
    prod_group : sequence
        Product-group labels; ``prod_group[grp_index]`` is the group to keep.
    grp_index : int
        Position of the wanted group inside ``prod_group``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` for that group only, with the
        "상품군" column removed from both feature frames.
    """
    wanted = prod_group[grp_index]
    in_train = X_train["상품군"] == wanted
    in_test = X_test["상품군"] == wanted
    return (
        X_train[in_train].drop("상품군", axis=1),
        X_test[in_test].drop("상품군", axis=1),
        y_train[in_train],
        y_test[in_test],
    )

In [None]:
# K-Fold용 데이터 만들기
def make_grp_data(X, idx, groups=None):
    """Return the rows of ``X`` belonging to one product group, without "상품군".

    Parameters
    ----------
    X : DataFrame
        Frame containing a "상품군" column (the target column, if present, is kept).
    idx : int
        Position of the wanted group inside ``groups``.
    groups : sequence, optional
        Product-group labels. When omitted, the notebook-level ``prod_group`` is
        used, which is what existing two-argument calls rely on.

    Returns
    -------
    DataFrame
        The matching rows with the "상품군" column dropped.
    """
    if groups is None:
        groups = prod_group
    return X[X["상품군"] == groups[idx]].drop(["상품군"], axis=1)

# One labeled frame per product group; names follow positions 0..10 of prod_group.
(clothes, inner, kitchen, food, beauty, elec,
 goods, health, etc, furn, bed) = (
    make_grp_data(train_set, position) for position in range(11)
)

In [None]:
# Slice the shared train/test split into one (X_train, X_test, y_train, y_test)
# quadruple per product group, in prod_group order.
group_splits = [
    train_test_grp(X_train, X_test, y_train, y_test, prod_group, position)
    for position in range(11)
]

X_train_clothes, X_test_clothes, y_train_clothes, y_test_clothes = group_splits[0]
X_train_inner, X_test_inner, y_train_inner, y_test_inner = group_splits[1]
X_train_kitchen, X_test_kitchen, y_train_kitchen, y_test_kitchen = group_splits[2]
X_train_food, X_test_food, y_train_food, y_test_food = group_splits[3]
X_train_beauty, X_test_beauty, y_train_beauty, y_test_beauty = group_splits[4]
X_train_elec, X_test_elec, y_train_elec, y_test_elec = group_splits[5]
X_train_goods, X_test_goods, y_train_goods, y_test_goods = group_splits[6]
X_train_health, X_test_health, y_train_health, y_test_health = group_splits[7]
X_train_etc, X_test_etc, y_train_etc, y_test_etc = group_splits[8]
X_train_furn, X_test_furn, y_train_furn, y_test_furn = group_splits[9]
X_train_bed, X_test_bed, y_train_bed, y_test_bed = group_splits[10]

In [None]:
# Encode the product-group column itself, then rebuild the whole-dataset split.
# A fresh encoder is created here rather than reusing whatever object the earlier
# encoding loop happened to leave in `lbe`; the name is still bound afterwards.
lbe = LabelEncoder()
test_data['상품군'] = lbe.fit_transform(test_data['상품군'].astype(str).values)
train_set = test_data[test_data['취급액'] != -1]

X = train_set.drop(["취급액"], axis = 1)
y = train_set["취급액"]

# `grp_idx` is intentionally the one computed before the column was encoded: with the
# same stratification labels and random_state the rows fall into the same split as
# the per-group frames built earlier.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify = grp_idx)

In [None]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()


def _fit_scale(train_part, test_part):
    """Refit the shared `scaler` on train_part and scale both parts with it.

    Returns two DataFrames with default integer column labels. The shared scaler is
    left fitted on the most recent train_part.
    """
    scaled_train = pd.DataFrame(scaler.fit_transform(train_part))
    scaled_test = pd.DataFrame(scaler.transform(test_part))
    return scaled_train, scaled_test


X_train_scaled, X_test_scaled = _fit_scale(X_train, X_test)

X_train_clothes_scaled, X_test_clothes_scaled = _fit_scale(X_train_clothes, X_test_clothes)
X_train_inner_scaled, X_test_inner_scaled = _fit_scale(X_train_inner, X_test_inner)
X_train_kitchen_scaled, X_test_kitchen_scaled = _fit_scale(X_train_kitchen, X_test_kitchen)
X_train_food_scaled, X_test_food_scaled = _fit_scale(X_train_food, X_test_food)
X_train_beauty_scaled, X_test_beauty_scaled = _fit_scale(X_train_beauty, X_test_beauty)
X_train_elec_scaled, X_test_elec_scaled = _fit_scale(X_train_elec, X_test_elec)
X_train_goods_scaled, X_test_goods_scaled = _fit_scale(X_train_goods, X_test_goods)
X_train_health_scaled, X_test_health_scaled = _fit_scale(X_train_health, X_test_health)
X_train_etc_scaled, X_test_etc_scaled = _fit_scale(X_train_etc, X_test_etc)
X_train_furn_scaled, X_test_furn_scaled = _fit_scale(X_train_furn, X_test_furn)
X_train_bed_scaled, X_test_bed_scaled = _fit_scale(X_train_bed, X_test_bed)

In [None]:
# Map the positional labels of the scaled per-group frames (as strings) back to the
# original feature names.
original_colnames = list(X_train_clothes.columns)
new_colnames = list(X_train_clothes_scaled.columns)
colnames_dic = dict(zip(map(str, new_colnames), original_colnames))

## Modeling


In [None]:
def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values; must not contain zeros.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    float
        ``100 * mean(|y_pred - y_true| / |y_true|)``.
    """
    # Work on plain arrays so the comparison is positional: pandas objects with
    # different indexes would otherwise be aligned by label before subtracting.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    # The denominator is taken in absolute value so negative targets cannot turn an
    # error into a negative contribution.
    return 100 * np.mean(np.abs(predicted - actual) / np.abs(actual))

# Starting value (in percent) for tracking the best overall MAPE.
best_tot_mape = 100

In [None]:
# One LightGBM regressor per product group (lgbm1..lgbm11). Settings common to all
# of them are collected once; each line lists only what differs.
shared_params = dict(random_state=0, max_depth=11, verbose=0, n_jobs=-1)

lgbm1 = LGBMRegressor(n_estimators=1500, num_leaves=23, objective='gamma', **shared_params)
lgbm2 = LGBMRegressor(n_estimators=2500, learning_rate=0.09, num_leaves=23, objective='tweedie', **shared_params)
lgbm3 = LGBMRegressor(n_estimators=2500, learning_rate=0.09, num_leaves=23, objective='tweedie', **shared_params)
lgbm4 = LGBMRegressor(n_estimators=1500, num_leaves=20, objective='tweedie', **shared_params)
lgbm5 = LGBMRegressor(n_estimators=1500, objective='tweedie', **shared_params)
lgbm6 = LGBMRegressor(n_estimators=500, num_leaves=2048, boosting_type="dart", objective='mape', **shared_params)
lgbm7 = LGBMRegressor(n_estimators=1200, objective='tweedie', **shared_params)
lgbm8 = LGBMRegressor(n_estimators=1500, objective='tweedie', **shared_params)
lgbm9 = LGBMRegressor(n_estimators=1200, boosting_type='dart', objective='tweedie', **shared_params)
lgbm10 = LGBMRegressor(n_estimators=500, boosting_type='dart', num_leaves=127, objective='mape', learning_rate=0.09, **shared_params)
lgbm11 = LGBMRegressor(n_estimators=500, num_leaves=20, objective='tweedie', **shared_params)

- loss에 variation을 줘보자  
1) regression: mse loss  
2) regression_l1: mae loss (group별로는 줄어드는데 오히려 전체에 대해서 mape를 계산하면 전체 mape는 높아짐)    
3) fair:  
4) huber:  
5) poisson regression: 분포 모양이 어느 정도 비슷함. 근데 중요한 건 얜 y가 discrete임을 가정함. 뭐 얘로 해도 ㄱㅊ하게 예측하는 게 좀 있음.  
6) quantile regression: 얘도 거지  
7) mape: 거지같이 나옴(가전, 가구는 얘가 제일 나음)    
8) gamma: ㄱㅊㄱㅊ 모양이 얘랑 비슷한 게 많음  
9) tweedie: 얘는 0에 뭉치는 경향이 있다는 게 가장 뚜렷한 특징. BEST   

In [None]:
# model fitting
# Each entry pairs a group model with its scaled training rows, training target and
# scaled held-out rows; the models are fitted in this order.
fit_jobs = [
    (lgbm1, X_train_clothes_scaled, y_train_clothes, X_test_clothes_scaled),
    (lgbm2, X_train_inner_scaled, y_train_inner, X_test_inner_scaled),
    (lgbm3, X_train_kitchen_scaled, y_train_kitchen, X_test_kitchen_scaled),
    (lgbm4, X_train_food_scaled, y_train_food, X_test_food_scaled),
    (lgbm5, X_train_beauty_scaled, y_train_beauty, X_test_beauty_scaled),
    (lgbm6, X_train_elec_scaled, y_train_elec, X_test_elec_scaled),
    (lgbm7, X_train_goods_scaled, y_train_goods, X_test_goods_scaled),
    (lgbm8, X_train_health_scaled, y_train_health, X_test_health_scaled),
    (lgbm9, X_train_etc_scaled, y_train_etc, X_test_etc_scaled),
    (lgbm10, X_train_furn_scaled, y_train_furn, X_test_furn_scaled),
    (lgbm11, X_train_bed_scaled, y_train_bed, X_test_bed_scaled),
]

(pred1, pred2, pred3, pred4, pred5, pred6,
 pred7, pred8, pred9, pred10, pred11) = [
    model.fit(train_X, train_y).predict(test_X)
    for model, train_X, train_y, test_X in fit_jobs
]

### Feature selection

In [None]:
from sklearn.feature_selection import SelectFromModel
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import KFold

In [None]:
def color_negative_red(val):
    """
    Map a scalar to a CSS colour declaration for table styling:
    `'color: red'` for negative values, `'color: blue'` for exactly zero,
    and `'color: black'` for everything else.
    """
    if val < 0:
        return 'color: %s' % 'red'
    if val == 0:
        return 'color: %s' % 'blue'
    return 'color: %s' % 'black'

In [None]:
# Same lookup as for the per-group frames, but for the whole-dataset frames:
# positional label of the scaled frame (as a string) -> original feature name.
original_colnames = list(X_train.columns)
new_colnames = list(X_train_scaled.columns)
colnames_dic3 = dict(zip(map(str, new_colnames), original_colnames))

In [None]:
# Inverse lookups: original feature name -> positional label (string).
colnames_dic2 = dict(zip(colnames_dic.values(), colnames_dic.keys()))
colnames_dic4 = dict(zip(colnames_dic3.values(), colnames_dic3.keys()))

#### **전체**

In [None]:
# Cross-validation settings for the permutation-importance run on the full training set.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = train_set.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance of every feature, estimated fold by fold on the full
# training set, then shown side by side in one styled table.
train_df = train_set.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # The scaler is fitted on the training fold only; the validation fold is scaled
    # with those same statistics so both folds share one feature space.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic4[x])
# Explicit two-level row label (feature name, positional label) instead of relying
# on how `.index` coerces a two-column DataFrame.
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE with every feature. `pred_all` was previously only assigned in the
# following cell, so this cell failed on a fresh kernel; it is computed here with the
# same model settings that the following reduced-feature cell uses.
pred_all = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 15, verbose = 0, n_jobs = -1, objective = 'gamma').fit(X_train_scaled, y_train).predict(X_test_scaled)
mape(y_test,pred_all)

In [None]:
# Refit on the full training split without the columns chosen for removal
# (positional labels of the scaled frame) and score on the held-out split.
del_columns = [33,10,37,35,16,18,9,14,15,21,24,17]
lgbm = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 15, verbose = 0, n_jobs = -1, objective = 'gamma')
reduced_train = X_train_scaled.drop(columns=del_columns)
reduced_test = X_test_scaled.drop(columns=del_columns)
pred_all = lgbm.fit(reduced_train, y_train).predict(reduced_test)
mape(y_test,pred_all)

In [None]:
# Display the whole-dataset predictions produced by the previous cell.
pred_all

#### **의류**

In [None]:
# Cross-validation settings for the permutation-importance run on the clothes group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = clothes.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the clothes group, shown in one styled table.
train_df = clothes.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 23,verbose = 0, n_jobs = -1, objective = 'gamma')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature clothes predictions (pred1) on the held-out rows.
mape(y_test_clothes,pred1)

In [None]:
# Refit the clothes model without the columns chosen for removal (positional labels
# of the scaled frame) and score on the held-out rows.
del_columns = [37,34,33,32,26,36,20,12,13,16,18]
lgbm1 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 23,verbose = 0, n_jobs = -1, objective = 'gamma')
reduced_train = X_train_clothes_scaled.drop(columns=del_columns)
reduced_test = X_test_clothes_scaled.drop(columns=del_columns)
pred1_new = lgbm1.fit(reduced_train, y_train_clothes).predict(reduced_test)
mape(y_test_clothes,pred1_new)

37,34,33,32,26,36,20,12,13,16,18 번 피쳐 제거

#### **속옷**

In [None]:
# Cross-validation settings for the permutation-importance run on the underwear group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = inner.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the underwear group, shown in one styled table.
train_df = inner.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature underwear predictions (pred2) on the held-out rows.
mape(y_test_inner,pred2)

In [None]:
# Refit the underwear model without the columns chosen for removal (positional labels
# of the scaled frame; 28 is listed twice) and score on the held-out rows.
del_columns = [34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16]
lgbm2 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
reduced_train = X_train_inner_scaled.drop(columns=del_columns)
reduced_test = X_test_inner_scaled.drop(columns=del_columns)
pred2_new = lgbm2.fit(reduced_train, y_train_inner).predict(reduced_test)
mape(y_test_inner,pred2_new)

34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16 번 피쳐 제거

#### **주방**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the kitchen group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = kitchen.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the kitchen group, shown in one styled table.
train_df = kitchen.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature kitchen predictions (pred3) on the held-out rows.
mape(y_test_kitchen,pred3)

In [None]:
# Refit the kitchen model without the columns chosen for removal (positional labels
# of the scaled frame) and score on the held-out rows.
del_columns = [25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11]
lgbm3 = LGBMRegressor(n_estimators = 2500,random_state = 0, max_depth = 11, learning_rate = 0.09, num_leaves = 23, verbose = 0, n_jobs = -1, objective = 'tweedie')
reduced_train = X_train_kitchen_scaled.drop(columns=del_columns)
reduced_test = X_test_kitchen_scaled.drop(columns=del_columns)
pred3_new = lgbm3.fit(reduced_train, y_train_kitchen).predict(reduced_test)
mape(y_test_kitchen,pred3_new)

25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11 번 피쳐 제거

#### **농수축**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the food group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = food.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the food group, shown in one styled table.
train_df = food.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature food predictions (pred4) on the held-out rows.
mape(y_test_food,pred4)

In [None]:
# Refit the food model without the columns chosen for removal (positional labels
# of the scaled frame) and score on the held-out rows.
del_columns = [25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14]
lgbm4 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')
reduced_train = X_train_food_scaled.drop(columns=del_columns)
reduced_test = X_test_food_scaled.drop(columns=del_columns)
pred4_new = lgbm4.fit(reduced_train, y_train_food).predict(reduced_test)
mape(y_test_food,pred4_new)

25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14 번 피쳐 제거

#### **이미용**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the beauty group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = beauty.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the beauty group, shown in one styled table.
train_df = beauty.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature beauty predictions (pred5) on the held-out rows.
mape(y_test_beauty,pred5)

In [None]:
# Refit the beauty model without the columns chosen for removal (positional labels
# of the scaled frame) and score on the held-out rows. Note that the model settings
# here (500 trees, depth 4, gamma objective) differ from the earlier lgbm5.
del_columns = [11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15]
lgbm5 = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 4, verbose = 0, n_jobs = -1, objective = 'gamma')
reduced_train = X_train_beauty_scaled.drop(columns=del_columns)
reduced_test = X_test_beauty_scaled.drop(columns=del_columns)
pred5_new = lgbm5.fit(reduced_train, y_train_beauty).predict(reduced_test)
mape(y_test_beauty,pred5_new)

11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15 번 피쳐 제거

#### **가전**

clothes, inner, kitchen, food, beauty, elec, goods, health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the electronics group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = elec.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the electronics group, shown in one styled table.
train_df = elec.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1000,  num_leaves = 1024, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature electronics predictions (pred6) on the held-out rows.
mape(y_test_elec,pred6)

In [None]:
# Refit the electronics model without the columns chosen for removal (positional
# labels of the scaled frame) and score on the held-out rows.
del_columns = [8,10,11,12,13,14,17,18,19,25,32,33,34,35,36]
lgbm6 = LGBMRegressor(n_estimators = 1000,  num_leaves = 1024, boosting_type = "dart", random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape')
reduced_train = X_train_elec_scaled.drop(columns=del_columns)
reduced_test = X_test_elec_scaled.drop(columns=del_columns)
pred6_new = lgbm6.fit(reduced_train, y_train_elec).predict(reduced_test)
mape(y_test_elec,pred6_new)

8,10,11,12,13,14,17,18,19,25,32,33,34,35,36 번 피쳐 제거

#### **생활용품**

goods, health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the household-goods group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = goods.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the household-goods group, shown in one styled table.
train_df = goods.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature household-goods predictions (pred7) on the held-out rows.
mape(y_test_goods,pred7)

In [None]:
# Refit the household-goods model without the columns chosen for removal (positional
# labels of the scaled frame) and score on the held-out rows.
del_columns = [13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37]
lgbm7 = LGBMRegressor(n_estimators = 1200,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
reduced_train = X_train_goods_scaled.drop(columns=del_columns)
reduced_test = X_test_goods_scaled.drop(columns=del_columns)
pred7_new = lgbm7.fit(reduced_train, y_train_goods).predict(reduced_test)
mape(y_test_goods,pred7_new)

13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37번 피쳐 제거

#### **건강기능**

health, etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the health group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = health.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the health group, shown in one styled table.
train_df = health.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# Baseline MAPE of the all-feature health predictions (pred8) on the held-out rows.
mape(y_test_health,pred8)

In [None]:
# Refit the health model without the columns chosen for removal (positional labels
# of the scaled frame) and score on the held-out rows.
del_columns = [8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37]
lgbm8 = LGBMRegressor(n_estimators = 1500,random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')
reduced_train = X_train_health_scaled.drop(columns=del_columns)
reduced_test = X_test_health_scaled.drop(columns=del_columns)
pred8_new = lgbm8.fit(reduced_train, y_train_health).predict(reduced_test)
mape(y_test_health,pred8_new)

8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37 번 피쳐 제거

#### **잡화**

etc, furn, bed

In [None]:
# Cross-validation settings for the permutation-importance run on the accessories (etc) group.
TARGET_COL = "취급액"
SEED = 0
CV = KFold(n_splits=5)  # KFold default: folds are not shuffled
FEATURES = etc.columns.drop(TARGET_COL).tolist()

In [None]:
# Permutation importance per fold for the accessories (etc) group, shown in one styled table.
train_df = etc.reset_index(drop=True).copy()
fold_tables = []
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 1200, random_state = 0, boosting_type = 'dart', max_depth = 11, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Scaler fitted on the training fold only and reused for the validation fold.
    # The fold frame is named X_train_scaled2 (as in the whole-dataset loop) so the
    # notebook-level X_train_scaled is not overwritten.
    scaler = StandardScaler()
    X_train_scaled2 = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # NOTE(review): `verbose` / `early_stopping_rounds` as fit() keywords belong to the
    # older LightGBM interface; confirm the installed version still accepts them.
    clf.fit(X_train_scaled2,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Name this fold's columns before merging: merging frames that all carry the
    # same `weight` / `std` names ends in clashing suffixed columns.
    fold_table = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_table.columns = ['feature', 'weight_%d' % (fold + 1), 'std_%d' % (fold + 1)]
    fold_tables.append(fold_table)

# Inner-join the per-fold tables on the feature name.
df = fold_tables[0]
for fold_table in fold_tables[1:]:
    df = pd.merge(df, fold_table, on='feature')
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
# Explicit two-level row label (feature name, positional label).
df.index = pd.MultiIndex.from_frame(df[['feature','feature_num']])
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# MAPE of `pred9` against y_test_etc, displayed for comparison with the
# reduced-feature refit in the next cell.
# NOTE(review): `pred9` is also assigned further down, in the "Model Evaluation"
# section; confirm it is defined before this cell on a fresh Restart & Run All.
mape(y_test_etc,pred9)

In [None]:
# Refit the etc-group model without the columns listed in `del_columns`
# and display the resulting MAPE.
del_columns = [9, 11, 13, 14, 15, 16, 17, 18, 19, 20, 22, 23, 25, 27, 32, 33, 34, 35, 36, 37]
lgbm9 = LGBMRegressor(
    objective='tweedie',
    boosting_type='dart',
    n_estimators=1200,
    max_depth=11,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
# Train and test matrices must lose exactly the same columns.
lgbm9.fit(X_train_etc_scaled.drop(columns=del_columns), y_train_etc)
pred9_new = lgbm9.predict(X_test_etc_scaled.drop(columns=del_columns))
mape(y_test_etc, pred9_new)

9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37 번 피쳐 제거

#### **가구**

furn, bed

In [None]:
# Settings shared by the `furn` permutation-importance cell below.
TARGET_COL = "취급액"
SEED = 0
# 5-fold split with scikit-learn's defaults (no shuffling is requested here).
CV = KFold(n_splits=5)
# Every column of `furn` except the target is treated as a feature.
FEATURES = furn.drop(columns=[TARGET_COL]).columns.tolist()

In [None]:
# Permutation-importance study for the `furn` product group.
# For each CV fold: scale the features, fit a LightGBM regressor on the training
# fold, compute permutation importance on the validation fold, and collect the
# per-fold weight/std columns side by side in `df` (one row per feature).
train_df = furn.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 500, boosting_type = 'dart', num_leaves = 127, random_state = 0, max_depth = 11, verbose = 0, n_jobs = -1, objective = 'mape', learning_rate = 0.09)

    # Fit the scaler on the training fold only and reuse its statistics for the
    # validation fold. Re-fitting on the validation fold (the previous behaviour)
    # scales the two folds with different means/variances, so the model would be
    # evaluated on inputs that are inconsistent with what it was trained on.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # The validation fold doubles as the early-stopping evaluation set.
    clf.fit(X_train_scaled,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Give this fold's columns their final names *before* merging, so repeated
    # merges never have to disambiguate identically named columns (which makes
    # pandas emit duplicate `_x`/`_y` suffixes and, in recent versions, raise).
    # Assumes the frame is laid out as (feature, weight, std), exactly as the
    # original post-loop column assignment did.
    fold_weights = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_weights.columns = ['feature', f'weight_{fold + 1}', f'std_{fold + 1}']
    if fold == 0:
        df = fold_weights
    else:
        df = pd.merge(df, fold_weights, on='feature')

# Attach the numeric feature id looked up in `colnames_dic2` and use
# (feature, feature_num) as the row labels of the styled table.
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
# `color_negative_red` is applied cell-wise to the weight/std columns.
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# MAPE of `pred10` against y_test_furn, displayed for comparison with the
# reduced-feature refit in the next cell.
# NOTE(review): `pred10` is not assigned anywhere in this part of the notebook;
# confirm it is defined before this cell on a fresh Restart & Run All.
mape(y_test_furn,pred10)

In [None]:
# Refit the furn-group model without the columns listed in `del_columns`
# and display the resulting MAPE.
del_columns = [8, 10, 11, 12, 13, 14, 15, 18, 19, 23, 25, 27, 32, 33, 34, 35, 36]
lgbm10 = LGBMRegressor(
    objective='mape',
    boosting_type='dart',
    n_estimators=500,
    learning_rate=0.09,
    num_leaves=127,
    max_depth=11,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
# Train and test matrices must lose exactly the same columns.
lgbm10.fit(X_train_furn_scaled.drop(columns=del_columns), y_train_furn)
pred10_new = lgbm10.predict(X_test_furn_scaled.drop(columns=del_columns))
mape(y_test_furn, pred10_new)

8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36 번 피쳐 제거

#### **침구**

bed

In [None]:
# Settings shared by the `bed` permutation-importance cell below.
TARGET_COL = "취급액"
SEED = 0
# 5-fold split with scikit-learn's defaults (no shuffling is requested here).
CV = KFold(n_splits=5)
# Every column of `bed` except the target is treated as a feature.
FEATURES = bed.drop(columns=[TARGET_COL]).columns.tolist()

In [None]:
# Permutation-importance study for the `bed` product group.
# For each CV fold: scale the features, fit a LightGBM regressor on the training
# fold, compute permutation importance on the validation fold, and collect the
# per-fold weight/std columns side by side in `df` (one row per feature).
train_df = bed.reset_index(drop=True).copy()
for fold, (train_idx, valid_idx) in enumerate(CV.split(train_df, train_df[TARGET_COL])):
    clf = LGBMRegressor(n_estimators = 500,random_state = 0, max_depth = 11, num_leaves = 20, verbose = 0, n_jobs = -1, objective = 'tweedie')

    # Fit the scaler on the training fold only and reuse its statistics for the
    # validation fold. Re-fitting on the validation fold (the previous behaviour)
    # scales the two folds with different means/variances, so the model would be
    # evaluated on inputs that are inconsistent with what it was trained on.
    scaler = StandardScaler()
    X_train_scaled = pd.DataFrame(scaler.fit_transform(train_df.loc[train_idx, FEATURES]))
    X_val_scaled = pd.DataFrame(scaler.transform(train_df.loc[valid_idx, FEATURES]))

    # The validation fold doubles as the early-stopping evaluation set.
    clf.fit(X_train_scaled,
            train_df.loc[train_idx, TARGET_COL],
            verbose = 0,
            early_stopping_rounds=1000,
            eval_set=[(X_val_scaled,
                       train_df.loc[valid_idx, TARGET_COL])])
    permutation_importance = PermutationImportance(clf, random_state=SEED)
    permutation_importance.fit(X_val_scaled,
                               train_df.loc[valid_idx, TARGET_COL])

    # Give this fold's columns their final names *before* merging, so repeated
    # merges never have to disambiguate identically named columns (which makes
    # pandas emit duplicate `_x`/`_y` suffixes and, in recent versions, raise).
    # Assumes the frame is laid out as (feature, weight, std), exactly as the
    # original post-loop column assignment did.
    fold_weights = eli5.explain_weights_df(permutation_importance, feature_names = FEATURES,top=50)
    fold_weights.columns = ['feature', f'weight_{fold + 1}', f'std_{fold + 1}']
    if fold == 0:
        df = fold_weights
    else:
        df = pd.merge(df, fold_weights, on='feature')

# Attach the numeric feature id looked up in `colnames_dic2` and use
# (feature, feature_num) as the row labels of the styled table.
df['feature_num'] = df.feature.apply(lambda x: colnames_dic2[x])
df.index = df[['feature','feature_num']]
# `color_negative_red` is applied cell-wise to the weight/std columns.
s = df.drop(["feature",'feature_num'], axis=1).style.applymap(color_negative_red)
s


In [None]:
# MAPE of `pred11` against y_test_bed, displayed for comparison with the
# reduced-feature refit in the next cell.
# NOTE(review): `pred11` is not assigned anywhere in this part of the notebook;
# confirm it is defined before this cell on a fresh Restart & Run All.
mape(y_test_bed,pred11)

In [None]:
# Refit the bed-group model without the columns listed in `del_columns`
# and display the resulting MAPE.
del_columns = [8, 13, 14, 15, 16, 17, 19, 20, 22, 23, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36]
lgbm11 = LGBMRegressor(
    objective='tweedie',
    n_estimators=500,
    num_leaves=20,
    max_depth=11,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
# Train and test matrices must lose exactly the same columns.
lgbm11.fit(X_train_bed_scaled.drop(columns=del_columns), y_train_bed)
pred11_new = lgbm11.predict(X_test_bed_scaled.drop(columns=del_columns))
mape(y_test_bed, pred11_new)

8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36 번 피쳐 제거

In [None]:
# Features to drop for each product group (one list of column numbers per group).
# The health/etc/furn/bed lists repeat the `del_columns` values used in the
# per-group refit cells above.
# NOTE(review): the bare `colnames_dic` expression below is not the last statement
# of the cell, so it displays nothing; it only fails if the name is undefined.
colnames_dic
clothes_del_columns = [37,34,33,32,26,36,20,12,13,16,18]
# NOTE(review): 28 appears twice in the next list — confirm whether a different
# column number was intended.
inner_del_columns = [34,33,32,26,36,37,28,17,18,9,27,12,13,25,22,8,28,14,19,15,16]
kitchen_del_columns = [25,32,34,36,33,27,9,13,8,17,15,18,16,12,28,14,11]
food_del_columns = [25,36,26,9,33,34,32,37,27,17,16,15,28,29,13,20,14]
beauty_del_columns = [11,16,14,18,13,22,34,33,32,29,26,36,37,8,12,19,17,15]
elec_del_columns = [8,10,11,12,13,14,17,18,19,25,32,33,34,35,36]
goods_del_columns = [13,14,15,16,17,19,20,21,22,23,25,29,32,33,34,37]
health_del_columns = [8,11,13,14,15,16,17,18,20,25,26,27,32,33,34,36,37]
etc_del_columns = [9,11,13,14,15,16,17,18,19,20,22,23,25,27,32,33,34,35,36,37]
furn_del_columns = [8,10,11,12,13,14,15,18,19,23,25,27,32,33,34,35,36]
bed_del_columns = [8,13,14,15,16,17,19,20,22,23,25,26,27,28,29,31,32,33,34,35,36]

## Model Evaluation


In [None]:
# Unfitted LightGBM regressors lgbm3, lgbm6 and lgbm7; only the constructors run here.
lgbm3 = LGBMRegressor(
    objective='tweedie',
    n_estimators=1200,
    learning_rate=0.07,
    num_leaves=23,
    max_depth=11,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
lgbm6 = LGBMRegressor(
    objective='mape',
    boosting_type="dart",
    n_estimators=500,
    num_leaves=1024,
    max_depth=12,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
lgbm7 = LGBMRegressor(
    objective='tweedie',
    n_estimators=600,
    max_depth=9,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)

In [None]:



# Rebuild lgbm9 with max_depth=17 and learning_rate=0.15, fit it on the full
# etc feature set, and store its test predictions in pred9.
lgbm9 = LGBMRegressor(
    objective='tweedie',
    boosting_type='dart',
    n_estimators=1200,
    learning_rate=0.15,
    max_depth=17,
    random_state=0,
    n_jobs=-1,
    verbose=0,
)
lgbm9.fit(X_train_etc_scaled, y_train_etc)
pred9 = lgbm9.predict(X_test_etc_scaled)

In [None]:
# Per-group predictions and their matching ground-truth targets, kept in the
# same order so they can be zipped together.
predictions = [
    pred1, pred2, pred3, pred4, pred5, pred6,
    pred7, pred8, pred9, pred10, pred11,
]
# Alternative: the predictions obtained after feature selection.
# predictions = [pred1_new, pred2_new, pred3_new, pred4_new, pred5_new, pred6_new, pred7_new, pred8_new, pred9_new, pred10_new, pred11_new]
trues = [
    y_test_clothes, y_test_inner, y_test_kitchen, y_test_food,
    y_test_beauty, y_test_elec, y_test_goods, y_test_health,
    y_test_etc, y_test_furn, y_test_bed,
]

In [None]:
# Before the extra features were added: print and collect the MAPE of every
# product group, in the order of `predictions` / `trues`.
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    mape_list.append(mape_res)
    print(f"mape: {mape_res}")

In [None]:
# final_selected: print and collect the MAPE of every product group, in the
# order of `predictions` / `trues`.
mape_list = []
for pred, true in zip(predictions, trues):
    mape_res = mape(true, pred)
    mape_list.append(mape_res)
    print(f"mape: {mape_res}")

### 기존 format으로 변경 후 mape 계산

In [None]:
# Buffer with y_test's shape that the per-group predictions are written back into.
# dtype=float is forced so that float predictions are not silently truncated
# when y_test happens to have an integer dtype.
y_pred = np.zeros_like(y_test, dtype=float)

In [None]:
# Put each group's predictions back into the y_test layout: rows of X_test
# whose "상품군" equals the i-th product group receive the i-th prediction array.
for i in range(len(prod_group)):
    group_mask = X_test["상품군"] == prod_group[i]
    y_pred[group_mask] = predictions[i]

In [None]:
print(best_tot_mape)
# before modification

In [None]:
print(best_tot_mape)
# final