**'SUBMISSION MODEL' 이라는 이름의 상위 폴더를 지정**

In [None]:
# Mount Google Drive at /gdrive so the project folders are reachable as files;
# force_remount=True requests a fresh mount even if one is already active.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [None]:
# Root project folder on the mounted drive; the trailing slash lets
# sub-paths be appended directly.
dir_ = '/gdrive/My Drive/SUBMISSION MODEL/'

**다른 디렉토리를 설정**

In [None]:
# Sub-folders of the project root: raw inputs, preprocessed pickles,
# logs and saved models.
raw_data_dir = f'{dir_}2. data/'
processed_data_dir = f'{dir_}2. data/processed/'
log_dir = f'{dir_}4. logs/'
model_dir = f'{dir_}5. models/'

**Nonrecursive Model by store**

In [None]:
# Validation splits to run; only the 'private' split is active here.
cvs = ['private']
# STORES: store identifiers, generated as <state>_<n> from the number of
# stores per state.
STORES = [
    f'{state}_{number}'
    for state, store_count in (('CA', 4), ('TX', 3), ('WI', 3))
    for number in range(1, store_count + 1)
]

In [None]:
from  datetime import datetime, timedelta
import gc
import numpy as np, pandas as pd
import lightgbm as lgb

import os, sys, gc, time, warnings, pickle, psutil, random

# Suppress all Python warnings from here on to keep the cell output readable.
warnings.filterwarnings('ignore')

**Memory 사용 줄이기**

In [None]:
# Helper that shrinks a DataFrame's memory footprint.
# df : DataFrame
def reduce_mem_usage(df, verbose=False):
    """Downcast numeric columns to the smallest dtype that holds their range.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    inspected; every other column (including int8 and unsigned ints) is left
    untouched. Integer columns go to the first of int8/int16/int32/int64
    whose limits contain the column's min and max (limits inclusive). Float
    columns go to float16, then float32, otherwise float64; the choice is
    made on value range only, so precision can be lost in the downcast.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When True, print the resulting size and the percentage saved.

    Returns:
        The same DataFrame object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                # Inclusive bounds: a value equal to a dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Out of float32 range, or min/max is NaN (comparisons fail).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Lower bound on the day index 'd' for rows kept in the working frame.
FIRST_DAY = 710
# Columns excluded from the model inputs (seven in total). item_id, dept_id
# and cat_id are deliberately NOT listed, so they remain model features.
remove_feature = ['id',
                  'state_id',
                  'store_id',
                  'date', 'wm_yr_wk', 'd', 'sales']

# Candidate categorical columns: five id columns plus four event columns.
cat_var = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"]
# Drop the removed columns while preserving the declared order; a set
# difference would yield an order that changes between interpreter runs.
cat_var = [col for col in cat_var if col not in remove_feature]

In [None]:
# Columns selected from grid_part_2.pkl.
grid2_colnm = [
    'sell_price',
    'price_max', 'price_min', 'price_std', 'price_mean', 'price_norm',
    'price_nunique', 'item_nunique',
    'price_momentum', 'price_momentum_m', 'price_momentum_y',
]

# Columns selected from grid_part_3.pkl.
grid3_colnm = [
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
    'snap_CA', 'snap_TX', 'snap_WI',
    'tm_d', 'tm_w', 'tm_m', 'tm_y', 'tm_wm', 'tm_dw', 'tm_w_end',
]

# Lag columns for lags 28 through 42 (15 columns), followed by rolling
# mean/std columns for windows of 7, 14, 30, 60 and 180.
lag_colnm = (
    [f'sales_lag_{lag}' for lag in range(28, 43)]
    + [f'rolling_{stat}_{window}'
       for window in (7, 14, 30, 60, 180)
       for stat in ('mean', 'std')]
)

# Mean/std encoding columns for the two grouping keys.
mean_enc_colnm = [
    f'enc_{group}_{stat}'
    for group in ('store_id_dept_id', 'item_id_state_id')
    for stat in ('mean', 'std')
]

In [None]:
## Make grid
#################################################################################
def prepare_data(store):
    """Build the feature frame for a single store from the preprocessed pickles.

    The three grid parts are concatenated column-wise, then restricted to rows
    whose 'store_id' equals `store` and whose 'd' is at least FIRST_DAY. The
    lag/rolling columns and the mean-encoding columns are then attached for
    the surviving index labels, and the result is downcast with
    reduce_mem_usage.

    Args:
        store: Value matched against the 'store_id' column.

    Returns:
        The assembled, memory-reduced DataFrame for that store.
    """
    # Load the three grid parts written by the preprocessing step.
    part_1 = pd.read_pickle(processed_data_dir + "grid_part_1.pkl")
    part_2 = pd.read_pickle(processed_data_dir + "grid_part_2.pkl")[grid2_colnm]
    part_3 = pd.read_pickle(processed_data_dir + "grid_part_3.pkl")[grid3_colnm]

    # Join them side by side, then release the separate parts.
    grid_df = pd.concat([part_1, part_2, part_3], axis=1)
    del part_1, part_2, part_3
    gc.collect()

    # Keep only this store's rows from FIRST_DAY onwards.
    grid_df = grid_df[grid_df['store_id'] == store]
    grid_df = grid_df[grid_df['d'] >= FIRST_DAY]

    # Attach the lag features first, then the mean encodings; each table is
    # cut down to the index labels still present in grid_df before joining.
    extra_sources = (
        ("lags_df_28.pkl", lag_colnm),
        ("mean_encoding_df.pkl", mean_enc_colnm),
    )
    for file_name, wanted_cols in extra_sources:
        extra = pd.read_pickle(processed_data_dir + file_name)[wanted_cols]
        extra = extra[extra.index.isin(grid_df.index)]
        grid_df = pd.concat([grid_df, extra], axis=1)
        # Free the intermediate table before loading the next one.
        del extra
        gc.collect()

    # Downcast numeric columns to minimise the frame's memory usage.
    return reduce_mem_usage(grid_df)

In [None]:
# Day-index boundaries [first, second] for each named split.
validation = dict(
    cv1=[1551, 1610],
    cv2=[1829, 1857],
    cv3=[1857, 1885],
    cv4=[1885, 1913],
    public=[1913, 1941],
    private=[1941, 1969],
)

In [None]:
########################### Model params
#################################################################################
# LightGBM training parameters shared by every per-store model.
lgb_params = {
    # Objective and evaluation metric.
    'boosting_type': 'gbdt',
    'objective': 'tweedie',
    'tweedie_variance_power': 1.1,
    'metric': 'rmse',
    # Row and feature subsampling.
    'subsample': 0.5,
    'subsample_freq': 1,
    'feature_fraction': 0.5,
    # Learning rate, tree size and number of boosting rounds.
    'learning_rate': 0.015,
    'num_leaves': 2**8 - 1,
    'min_data_in_leaf': 2**8 - 1,
    'max_bin': 100,
    'n_estimators': 3000,
    # Miscellaneous.
    'boost_from_average': False,
    'verbose': -1,
    'seed': 1995,
}

In [None]:
########################### Train Models
#################################################################################
# For every split in `cvs`, train one LightGBM model per store and pickle it
# into model_dir as 'non_recur_model_<store>.bin'.

rmsse_bycv = dict()

# With cvs = ['private'] the only split is validation['private'] = [1941, 1969].
for cv in cvs:
    print('cv : day', validation[cv])
    pred_list = []
    # Loop over every store listed in STORES.
    for store in STORES:
        # Announce which store is being processed.
        print(store, 'start')
        # Build the feature frame for this store.
        grid_df = prepare_data(store)
        # Model inputs are all columns that are not listed in remove_feature.
        model_var = grid_df.columns[~grid_df.columns.isin(remove_feature)]
        # Split on the day index 'd':
        #   FIRST_DAY <= d <= validation[cv][0]           -> training rows
        #   validation[cv][0] < d <= validation[cv][1]    -> validation rows
        tr_mask = (grid_df['d'] <= validation[cv][0]) & (grid_df['d'] >= FIRST_DAY)
        vl_mask = (grid_df['d'] > validation[cv][0]) & (grid_df['d'] <= validation[cv][1])

        # lgb.Dataset(data=features, label=target)
        train_data = lgb.Dataset(grid_df[tr_mask][model_var],
                                 label=grid_df[tr_mask]['sales'])

        valid_data = lgb.Dataset(grid_df[vl_mask][model_var],
                                 label=grid_df[vl_mask]['sales'])

        # Train and keep the booster in m_lgb; metrics are logged every 100 rounds.
        # NOTE(review): the recorded run logs "valid_0's rmse: nan" for the
        # 'private' split - presumably 'sales' is unknown for those days; confirm.
        m_lgb = lgb.train(lgb_params, train_data, valid_sets = [valid_data, train_data], verbose_eval=100)
        # Show the 25 features with the highest importance.
        display(pd.DataFrame({'name':m_lgb.feature_name(),
                              'imp':m_lgb.feature_importance()}).sort_values('imp',ascending=False).head(25))

        # Output path of the pickled model for this store.
        model_name = model_dir+'non_recur_model_'+store+'.bin'
        # Write through a context manager so the file handle is always closed.
        with open(model_name, 'wb') as model_file:
            pickle.dump(m_lgb, model_file)

        # Drop the large per-store objects and actually run the collector
        # (the call parentheses were missing, so nothing was collected).
        del grid_df, train_data, valid_data, m_lgb, tr_mask, vl_mask
        gc.collect()

cv : day [1941, 1969]
CA_1 start
[100]	training's rmse: 2.72962	valid_0's rmse: nan
[200]	training's rmse: 2.45857	valid_0's rmse: nan
[300]	training's rmse: 2.37685	valid_0's rmse: nan
[400]	training's rmse: 2.3261	valid_0's rmse: nan
[500]	training's rmse: 2.28967	valid_0's rmse: nan
[600]	training's rmse: 2.25763	valid_0's rmse: nan
[700]	training's rmse: 2.23189	valid_0's rmse: nan
[800]	training's rmse: 2.20968	valid_0's rmse: nan
[900]	training's rmse: 2.18903	valid_0's rmse: nan
[1000]	training's rmse: 2.17042	valid_0's rmse: nan
[1100]	training's rmse: 2.15307	valid_0's rmse: nan
[1200]	training's rmse: 2.13658	valid_0's rmse: nan
[1300]	training's rmse: 2.12263	valid_0's rmse: nan
[1400]	training's rmse: 2.10834	valid_0's rmse: nan
[1500]	training's rmse: 2.0959	valid_0's rmse: nan
[1600]	training's rmse: 2.08276	valid_0's rmse: nan
[1700]	training's rmse: 2.07092	valid_0's rmse: nan
[1800]	training's rmse: 2.05981	valid_0's rmse: nan
[1900]	training's rmse: 2.04803	valid_0's 

Unnamed: 0,name,imp
0,item_id,181894
15,event_name_1,39191
23,tm_w,36684
52,rolling_mean_180,22452
13,price_momentum_m,21894
53,rolling_std_180,19862
50,rolling_mean_60,18764
51,rolling_std_60,17747
22,tm_d,17196
49,rolling_std_30,16062


CA_2 start
[100]	training's rmse: 2.10673	valid_0's rmse: nan
[200]	training's rmse: 1.96481	valid_0's rmse: nan
[300]	training's rmse: 1.91438	valid_0's rmse: nan
[400]	training's rmse: 1.8834	valid_0's rmse: nan
[500]	training's rmse: 1.85993	valid_0's rmse: nan
[600]	training's rmse: 1.84068	valid_0's rmse: nan
[700]	training's rmse: 1.82501	valid_0's rmse: nan
[800]	training's rmse: 1.81118	valid_0's rmse: nan
[900]	training's rmse: 1.79768	valid_0's rmse: nan
[1000]	training's rmse: 1.78469	valid_0's rmse: nan
[1100]	training's rmse: 1.77345	valid_0's rmse: nan
[1200]	training's rmse: 1.76244	valid_0's rmse: nan
[1300]	training's rmse: 1.75234	valid_0's rmse: nan
[1400]	training's rmse: 1.74226	valid_0's rmse: nan
[1500]	training's rmse: 1.73368	valid_0's rmse: nan
[1600]	training's rmse: 1.72468	valid_0's rmse: nan
[1700]	training's rmse: 1.71589	valid_0's rmse: nan
[1800]	training's rmse: 1.70789	valid_0's rmse: nan
[1900]	training's rmse: 1.69988	valid_0's rmse: nan
[2000]	trai

Unnamed: 0,name,imp
0,item_id,181061
15,event_name_1,43235
23,tm_w,33292
52,rolling_mean_180,22948
53,rolling_std_180,20628
13,price_momentum_m,20060
50,rolling_mean_60,19577
22,tm_d,18884
51,rolling_std_60,18119
45,rolling_std_7,17252


CA_3 start
[100]	training's rmse: 3.91922	valid_0's rmse: nan
[200]	training's rmse: 3.3962	valid_0's rmse: nan
[300]	training's rmse: 3.22926	valid_0's rmse: nan
[400]	training's rmse: 3.14072	valid_0's rmse: nan
[500]	training's rmse: 3.08212	valid_0's rmse: nan
[600]	training's rmse: 3.03514	valid_0's rmse: nan
[700]	training's rmse: 2.99762	valid_0's rmse: nan
[800]	training's rmse: 2.96278	valid_0's rmse: nan
[900]	training's rmse: 2.93159	valid_0's rmse: nan
[1000]	training's rmse: 2.90362	valid_0's rmse: nan
[1100]	training's rmse: 2.87776	valid_0's rmse: nan
[1200]	training's rmse: 2.85355	valid_0's rmse: nan
[1300]	training's rmse: 2.832	valid_0's rmse: nan
[1400]	training's rmse: 2.80897	valid_0's rmse: nan
[1500]	training's rmse: 2.78915	valid_0's rmse: nan
[1600]	training's rmse: 2.76927	valid_0's rmse: nan
[1700]	training's rmse: 2.75115	valid_0's rmse: nan
[1800]	training's rmse: 2.73444	valid_0's rmse: nan
[1900]	training's rmse: 2.71851	valid_0's rmse: nan
[2000]	traini

Unnamed: 0,name,imp
0,item_id,181857
15,event_name_1,38067
23,tm_w,33154
52,rolling_mean_180,21346
13,price_momentum_m,20009
53,rolling_std_180,19358
50,rolling_mean_60,18215
22,tm_d,17730
51,rolling_std_60,17633
14,price_momentum_y,16270


CA_4 start
[100]	training's rmse: 1.56647	valid_0's rmse: nan
[200]	training's rmse: 1.48984	valid_0's rmse: nan
[300]	training's rmse: 1.46445	valid_0's rmse: nan
[400]	training's rmse: 1.4466	valid_0's rmse: nan
[500]	training's rmse: 1.43174	valid_0's rmse: nan
[600]	training's rmse: 1.41928	valid_0's rmse: nan
[700]	training's rmse: 1.408	valid_0's rmse: nan
[800]	training's rmse: 1.39681	valid_0's rmse: nan
[900]	training's rmse: 1.38733	valid_0's rmse: nan
[1000]	training's rmse: 1.37941	valid_0's rmse: nan
[1100]	training's rmse: 1.37114	valid_0's rmse: nan
[1200]	training's rmse: 1.36357	valid_0's rmse: nan
[1300]	training's rmse: 1.35706	valid_0's rmse: nan
[1400]	training's rmse: 1.35059	valid_0's rmse: nan
[1500]	training's rmse: 1.34439	valid_0's rmse: nan
[1600]	training's rmse: 1.33824	valid_0's rmse: nan
[1700]	training's rmse: 1.33299	valid_0's rmse: nan
[1800]	training's rmse: 1.32752	valid_0's rmse: nan
[1900]	training's rmse: 1.32236	valid_0's rmse: nan
[2000]	traini

Unnamed: 0,name,imp
0,item_id,179667
15,event_name_1,46405
23,tm_w,34361
52,rolling_mean_180,23512
53,rolling_std_180,20926
13,price_momentum_m,20351
50,rolling_mean_60,19257
22,tm_d,18282
51,rolling_std_60,18163
45,rolling_std_7,17522


TX_1 start
[100]	training's rmse: 2.25723	valid_0's rmse: nan
[200]	training's rmse: 2.05956	valid_0's rmse: nan
[300]	training's rmse: 1.99312	valid_0's rmse: nan
[400]	training's rmse: 1.9477	valid_0's rmse: nan
[500]	training's rmse: 1.90839	valid_0's rmse: nan
[600]	training's rmse: 1.87651	valid_0's rmse: nan
[700]	training's rmse: 1.85008	valid_0's rmse: nan
[800]	training's rmse: 1.8267	valid_0's rmse: nan
[900]	training's rmse: 1.80671	valid_0's rmse: nan
[1000]	training's rmse: 1.79035	valid_0's rmse: nan
[1100]	training's rmse: 1.77459	valid_0's rmse: nan
[1200]	training's rmse: 1.75934	valid_0's rmse: nan
[1300]	training's rmse: 1.74566	valid_0's rmse: nan
[1400]	training's rmse: 1.73357	valid_0's rmse: nan
[1500]	training's rmse: 1.72195	valid_0's rmse: nan
[1600]	training's rmse: 1.71054	valid_0's rmse: nan
[1700]	training's rmse: 1.69992	valid_0's rmse: nan
[1800]	training's rmse: 1.69019	valid_0's rmse: nan
[1900]	training's rmse: 1.68006	valid_0's rmse: nan
[2000]	train

Unnamed: 0,name,imp
0,item_id,176762
15,event_name_1,40979
23,tm_w,35509
52,rolling_mean_180,23766
13,price_momentum_m,22545
53,rolling_std_180,21291
50,rolling_mean_60,19821
51,rolling_std_60,19086
22,tm_d,18353
49,rolling_std_30,17772


TX_2 start
[100]	training's rmse: 2.77263	valid_0's rmse: nan
[200]	training's rmse: 2.45683	valid_0's rmse: nan
[300]	training's rmse: 2.37185	valid_0's rmse: nan
[400]	training's rmse: 2.3164	valid_0's rmse: nan
[500]	training's rmse: 2.26892	valid_0's rmse: nan
[600]	training's rmse: 2.22665	valid_0's rmse: nan
[700]	training's rmse: 2.19006	valid_0's rmse: nan
[800]	training's rmse: 2.15772	valid_0's rmse: nan
[900]	training's rmse: 2.12889	valid_0's rmse: nan
[1000]	training's rmse: 2.10515	valid_0's rmse: nan
[1100]	training's rmse: 2.08114	valid_0's rmse: nan
[1200]	training's rmse: 2.05813	valid_0's rmse: nan
[1300]	training's rmse: 2.03759	valid_0's rmse: nan
[1400]	training's rmse: 2.01876	valid_0's rmse: nan
[1500]	training's rmse: 2.00165	valid_0's rmse: nan
[1600]	training's rmse: 1.98463	valid_0's rmse: nan
[1700]	training's rmse: 1.96885	valid_0's rmse: nan
[1800]	training's rmse: 1.9541	valid_0's rmse: nan
[1900]	training's rmse: 1.93877	valid_0's rmse: nan
[2000]	train

Unnamed: 0,name,imp
0,item_id,184650
15,event_name_1,40507
23,tm_w,36756
52,rolling_mean_180,22568
13,price_momentum_m,22292
53,rolling_std_180,20287
50,rolling_mean_60,19083
22,tm_d,18231
51,rolling_std_60,18035
49,rolling_std_30,16468


TX_3 start
[100]	training's rmse: 2.43652	valid_0's rmse: nan
[200]	training's rmse: 2.17071	valid_0's rmse: nan
[300]	training's rmse: 2.09349	valid_0's rmse: nan
[400]	training's rmse: 2.04673	valid_0's rmse: nan
[500]	training's rmse: 2.00978	valid_0's rmse: nan
[600]	training's rmse: 1.97982	valid_0's rmse: nan
[700]	training's rmse: 1.95453	valid_0's rmse: nan
[800]	training's rmse: 1.9316	valid_0's rmse: nan
[900]	training's rmse: 1.91181	valid_0's rmse: nan
[1000]	training's rmse: 1.89308	valid_0's rmse: nan
[1100]	training's rmse: 1.87608	valid_0's rmse: nan
[1200]	training's rmse: 1.8604	valid_0's rmse: nan
[1300]	training's rmse: 1.84639	valid_0's rmse: nan
