# 0. 개요

- 모든 시간대를 대상으로 하는 것이 아닌, 예측해야하는 시간대의 target이 비슷한 데이터를 바탕으로
- 2016-04-25 ~ 2016-05-22 : d_1914 ~ d_1941 (public)
- 2016-05-23 ~ 2016-06-19 : d_1942 ~ d_1969 (private)
- 모델을 제작

https://www.kaggle.com/yassinealouini/trends-per-store  
{'CA_1': 1.4515535682921805,  
 'CA_2': 1.2478469436321578,  
 'CA_3': 2.0503897391977617,  
 'CA_4': 0.8015278327204106,  
 'TX_1': 1.0426264577891047,  
 'TX_2': 1.2509886678837656,  
 'TX_3': 1.2319171250395966,  
 'WI_1': 1.1729008570725148,  
 'WI_2': 1.416137702642638,  
 'WI_3': 1.1452668162125874}  

# 1. 라이브러리 로드 및 데이터 불러오기

In [None]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import metrics
from sklearn.model_selection import TimeSeriesSplit, KFold

import matplotlib.pyplot as plt
import seaborn as sns
import pickle 
import time
import datetime
import os

from scipy.sparse import csr_matrix

from mypackage import *

# Notebook display settings: show every column and up to 400 rows of a DataFrame.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 400)

### <div style="color:red">파일명 체크!!!</div>
- all_df: 기초작업 수준. 
- all_df4: lag 데이터 엄청 많이 만들어 놓은 것.
- all_df5: 'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd', 'sales', 'date', 'wm_yr_wk', 'weekday', 'month', 'year', 'event_name_1', 'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'is_event', 'day', 'week', 'sell_price', 'revenue', 'sales_rolling_mean_t7', 'rolling_mean_t7', 'rolling_std_t7', 'sales_rolling_mean_t28', 'rolling_mean_t28', 'rolling_std_t28', 'sales_rolling_mean_t56', 'rolling_mean_t56', 'rolling_std_t56', 'sales_rolling_mean_t112', 'rolling_mean_t112', 'rolling_std_t112', 'sales_rolling_mean_t168', 'rolling_mean_t168', 'rolling_std_t168', 'lag_t28', 'lag_t29', 'lag_t30', 'lag_t31', 'lag_t32', 'lag_t33', 'lag_t34', 'revenue_lag_t28', 'revenue_lag_t29', 'revenue_lag_t30', 'revenue_lag_t31', 'revenue_lag_t32', 'revenue_lag_t33', 'revenue_lag_t34'

In [None]:
# Load the base frame from its pickle. Which pickle is loaded decides which
# columns exist downstream, so double-check the file name before running.
# NOTE(review): pickle.load can execute arbitrary code -- only load files
# produced by this project.
print('data loading')
with open('inputs/all_df.pickle', 'rb') as f:
    all_df = pickle.load(f)
print('data loaded')

In [None]:
# Week-of-month bucket: days 1-7 -> 1, 8-14 -> 2, ..., 29-31 -> 5
# (assumes `day` holds the day of the month -- TODO confirm).
# True division is required: with floor division the value is already an
# integer, np.ceil does nothing, and days 1-6 end up in bucket 0.
all_df['weekofmonth'] = np.ceil(all_df['day'] / 7).astype('int8')

In [None]:
# Keep only rows whose `week` value is 13 through 20 (in every year); the
# model is trained on this seasonal window rather than on the full history.
target_weeks = list(range(13, 21))

# .copy() gives all_df2 its own data, so the column assignments made on it
# later do not trigger SettingWithCopyWarning against all_df.
all_df2 = all_df[all_df['week'].isin(target_weeks)].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the `id` column with integer codes. The fitted encoder is kept in
# `le_id` so the codes can be mapped back to the original ids later.
le_id = LabelEncoder()
all_df2['id'] = le_id.fit_transform(all_df2['id'])

In [None]:
# Inspect the current column names (and their order) before renaming them.
all_df2.columns

In [None]:
# Positional rename: the n-th name below replaces the n-th existing column,
# so this list must match the frame's current column order and count exactly.
# NOTE(review): a reordered source pickle would be mislabelled silently.
renamed_columns = [
    # identifiers, target and calendar
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd',
    'sales', 'date', 'wm_yr_wk', 'weekday', 'month', 'year',
    'event_name_1', 'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI',
    'is_event', 'day', 'week', 'sell_price',
    # sales lags
    'lag_t28', 'lag_t29', 'lag_t30',
    'lag_t24', 'lag_t25', 'lag_t26', 'lag_t27',
    # rolling sales statistics
    'rolling_mean_t7', 'rolling_std_t7',
    'rolling_mean_t30', 'rolling_std_t30',
    'rolling_mean_t90', 'rolling_mean_t180',
    'rolling_mean_t60', 'rolling_std_t60',
    # price features
    'lag_price_t1', 'price_change_t1',
    'rolling_price_max_t365', 'price_change_t365',
    'rolling_price_std_t7', 'rolling_price_std_t30',
    # added in this notebook
    'weekofmonth',
]
all_df2.columns = renamed_columns

In [None]:
# Mean and standard deviation of sell_price per (id, year, week), joined back
# onto every row of that group. Note that the merges return a frame with a
# fresh 0..n-1 index.
price_group_keys = ['id', 'year', 'week']

item_id_gb_sell_price_mean = (
    all_df2.groupby(price_group_keys)['sell_price']
    .mean()
    .rename('item_id_gb_sell_price_mean')
)
all_df2 = all_df2.merge(item_id_gb_sell_price_mean, on=price_group_keys, how='left')

item_id_gb_sell_price_std = (
    all_df2.groupby(price_group_keys)['sell_price']
    .std()
    .rename('item_id_gb_sell_price_std')
)
all_df2 = all_df2.merge(item_id_gb_sell_price_std, on=price_group_keys, how='left')

In [None]:
# Columns excluded from the model inputs: the target, raw keys and dates,
# and a handful of lag/price columns.
drop_cols = [
    'd', 'sales', 'store_id',
    'snap_CA', 'snap_TX', 'snap_WI',
    'state_id', 'date', 'wm_yr_wk', 'is_event',
    'lag_t24', 'lag_t25', 'lag_t26', 'lag_t27',
    'lag_price_t1', 'rolling_price_max_t365',
]

# Everything that is left is a model feature (Index.drop raises if a name in
# drop_cols is not a column).
features = all_df2.columns.drop(drop_cols)

In [None]:
# Inspect the final feature list.
features

In [None]:
# Train one LightGBM regressor per store with 2-fold cross-validation.
# For every store the fold-averaged predictions for the test period are
# appended to `stores`, and the matching all_df2 row labels to `test_index`.
KFOLD_SEED = 42  # fixed seed so the shuffled folds (and the scores) are reproducible

stores = []
test_index = []
for i in range(10):  # assumes store_id is encoded as 0..9 -- TODO confirm
    all_df3 = all_df2[all_df2.store_id == i]

    # Training rows: everything up to and including 2016-04-24.
    train_set = all_df3[all_df3['date'] <= '2016-04-24']
    train_set_X = train_set[features]
    train_set_y = train_set['sales']

    # Test set: rows after 2016-04-24.
    test = all_df3[all_df3['date'] > '2016-04-24']
    test_set = test[features]
    test_index.append(test.index)

    # NOTE(review): the folds are shuffled, so each validation fold is drawn
    # from the same dates as its training fold; early stopping is therefore
    # not measured on future data. TimeSeriesSplit may be a better fit.
    n_fold = 2
    folds = KFold(n_splits=n_fold, shuffle=True, random_state=KFOLD_SEED)
    splits = folds.split(train_set_X, train_set_y)

    y_preds = np.zeros(test_set.shape[0])

    for fold_n, (train_index, valid_index) in enumerate(splits):
        print('Fold:', fold_n + 1)

        X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[valid_index]
        y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[valid_index]

        # NOTE(review): LightGBM only performs bagging when subsample_freq > 0,
        # so subsample=0.8 has no effect with the default frequency -- confirm
        # whether row subsampling was intended.
        lgb = LGBMRegressor(
            objective='regression',
            boosting_type='gbdt',
            num_leaves=2048,
            colsample_bytree=0.8,
            subsample=0.8,
            n_estimators=600,  ## important!!!!
            learning_rate=0.05,
            n_jobs=-1,
            reg_lambda=0.1,
            device='gpu'
        )
        # NOTE(review): newer LightGBM releases take early stopping and logging
        # through `callbacks=` instead of these fit() keywords -- check the
        # installed version before upgrading.
        lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=20, verbose=True)

        # Predict: each fold contributes 1/n_fold of the final prediction.
        y_preds += lgb.predict(test_set, num_iteration=lgb.best_iteration_) / n_fold

        # Free memory.
        del X_train, X_valid, y_train, y_valid

    stores.append(y_preds)
    # Feature importances of the last fold's model only.
    print(pd.DataFrame(lgb.feature_importances_, index=features))


0번 - 14.8375 5.29964  
1번 - 6.80630 3.22881  
2번 - 30.5934 8.55259  
3번 - 3.67819 1.82227  
4번 - 10.6207 4.35351  
5번 - 15.6138 4.49576  
6번 - 12.2176 3.38841  
7번 - 5.10856 2.16503  
8번 - 12.8646 5.08373  
9번 - 13.4699 4.28131  

2번이 문제임

In [None]:
# Plot the daily mean sales of one store over the training period.
plt.figure(figsize=(12, 8))
i = 3
all_df3 = all_df2[all_df2.store_id == i]
# Same cutoff (inclusive) as the training split. The filtered frame is also
# bound to all_df3_train, the name this plot and the inspection cells below
# use but which was never assigned.
all_df3 = all_df3[all_df3.date <= '2016-04-24']
all_df3_train = all_df3

daily_mean_sales = all_df3_train.groupby('date')['sales'].mean()
sns.lineplot(x=daily_mean_sales.index, y=daily_mean_sales)

In [None]:
# Daily mean sales of the frame held in all_df3_train.
all_df3_train.groupby('date')['sales'].mean().plot()

In [None]:
# Daily mean sales of all_df4_train.
# NOTE(review): all_df4_train is not assigned anywhere in the visible
# notebook -- this cell raises NameError unless it is defined elsewhere.
all_df4_train.groupby('date')['sales'].mean().plot()

In [None]:
# Daily mean sales of all_df5_train.
# NOTE(review): all_df5_train is not assigned anywhere in the visible
# notebook -- this cell raises NameError unless it is defined elsewhere.
all_df5_train.groupby('date')['sales'].mean().plot()

In [None]:
# Distribution of sales per week-of-month bucket, outliers hidden.
plt.figure(figsize=(12, 8))
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# and the keyword form also works on older releases.
sns.boxplot(x=all_df3_train.weekofmonth, y=all_df3_train.sales, showfliers=False)

In [None]:
# Mean sales for each value of the snap_WI flag.
all_df3_train.groupby('snap_WI')['sales'].mean()

In [None]:
# Number of rows per snap_WI value.
all_df3_train.snap_WI.value_counts()

In [None]:
# Sanity check: list any rows with negative sales.
all_df3_train[all_df3_train.sales < 0]

In [None]:
# Rows whose weekday value is 4.
all_df3_train[all_df3_train.weekday == 4]

In [None]:
# One frame per store for store_id 1 through 6, unpacked in order.
all_df_1, all_df_2, all_df_3, all_df_4, all_df_5, all_df_6 = (
    all_df2[all_df2.store_id == store_no] for store_no in range(1, 7)
)

In [None]:
# Mean sales per department, shown separately for each of the six store frames.
dept_means_per_store = [
    store_frame.groupby('dept_id')['sales'].mean()
    for store_frame in (all_df_1, all_df_2, all_df_3, all_df_4, all_df_5, all_df_6)
]
display(*dept_means_per_store)

In [None]:
# For each store frame, print how many dept_id == 2 rows sold more than 50 units.
store_frames = [all_df_1, all_df_2, all_df_3, all_df_4, all_df_5, all_df_6]
for store_frame in store_frames:
    is_dept_2 = store_frame.dept_id == 2
    is_high_sales = store_frame.sales > 50
    print(len(store_frame[is_dept_2 & is_high_sales]))

In [None]:
# Rows of item_id 1361 with more than 100 units sold.
all_df2[(all_df2.item_id == 1361) & (all_df2.sales > 100)]

In [None]:
# Retrain the store flagged as problematic (store_id == 2) with much stronger
# L2 regularisation (reg_lambda=10 instead of the 0.1 used in the per-store
# loop) and overwrite only that store's slot in `stores` / `test_index`.
# Resetting both lists here would throw away the other nine stores'
# predictions before the submission is assembled.
# Requires the per-store loop to have run first so both lists hold 10 entries.
target_store = 2
KFOLD_SEED = 42  # fixed seed so the shuffled folds (and the scores) are reproducible

all_df3 = all_df2[all_df2.store_id == target_store]

# Training rows: everything up to and including 2016-04-24.
train_set = all_df3[all_df3['date'] <= '2016-04-24']
train_set_X = train_set[features]
train_set_y = train_set['sales']

# Test set: rows after 2016-04-24.
test = all_df3[all_df3['date'] > '2016-04-24']
test_set = test[features]

n_fold = 2
folds = KFold(n_splits=n_fold, shuffle=True, random_state=KFOLD_SEED)
splits = folds.split(train_set_X, train_set_y)

y_preds = np.zeros(test_set.shape[0])

for fold_n, (train_index, valid_index) in enumerate(splits):
    print('Fold:', fold_n + 1)

    X_train, X_valid = train_set_X.iloc[train_index], train_set_X.iloc[valid_index]
    y_train, y_valid = train_set_y.iloc[train_index], train_set_y.iloc[valid_index]

    # NOTE(review): LightGBM only performs bagging when subsample_freq > 0,
    # so subsample=0.8 has no effect with the default frequency -- confirm
    # whether row subsampling was intended.
    lgb = LGBMRegressor(
        objective='regression',
        boosting_type='gbdt',
        num_leaves=2048,
        colsample_bytree=0.8,
        subsample=0.8,
        n_estimators=600,  ## important!!!!
        learning_rate=0.05,
        n_jobs=-1,
        reg_lambda=10,
        device='gpu'
    )
    # NOTE(review): newer LightGBM releases take early stopping and logging
    # through `callbacks=` instead of these fit() keywords -- check the
    # installed version before upgrading.
    lgb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=20, verbose=True)

    # Predict: each fold contributes 1/n_fold of the final prediction.
    y_preds += lgb.predict(test_set, num_iteration=lgb.best_iteration_) / n_fold

    # Free memory.
    del X_train, X_valid, y_train, y_valid

# Replace this store's entries in place; the other stores stay untouched.
stores[target_store] = y_preds
test_index[target_store] = test.index
# Feature importances of the last fold's model only.
print(pd.DataFrame(lgb.feature_importances_, index=features))


In [None]:
# Inspect the first prediction array collected in `stores`.
stores[0]

In [None]:
# Row labels of all_df2 for each store_id 0..9 (all dates, not only the test period).
stores_index = [all_df2[all_df2.store_id == i].index for i in range(10)]

In [None]:
# Inspect the row labels of the first store.
stores_index[0]

In [None]:
# Flatten the per-store prediction arrays into a single list, store by store.
s_v = [value for store_preds in stores for value in store_preds]

In [None]:
# Flatten the per-store test row labels into one list, in the same store order
# as the flattened predictions, so each label lines up with its value.
# The previous loop appended every whole Index object ten times over
# (`for i in range(10)` around `for j in test_index`) instead of the labels
# inside each Index, so the result never matched the predictions in length.
s_i = [row_label for store_rows in test_index for row_label in store_rows]

In [None]:
# One-column frame of predicted sales. s_i supplies the row labels and must
# contain exactly one label per value in s_v.
predict = pd.DataFrame(s_v, index=s_i, columns=['sales'])

In [None]:
# Test-period rows (after 2016-04-24) without the `sales` column, ready to
# receive the predicted sales.
# The source frame is all_df2; `all_df22` is not defined in this notebook.
# drop() returns a new frame, which avoids deleting a column from a slice.
all_df2_test = all_df2[all_df2.date > '2016-04-24'].drop(columns='sales')

In [None]:
# Attach predicted sales by row label; rows without a prediction get NaN.
all_df2_test = pd.merge(all_df2_test, predict, left_index=True, right_index=True, how='left')

In [None]:
# Build the submission file: pivot the predictions to one row per id with
# columns F1..F28, append the untouched evaluation rows of the sample
# submission, scale every forecast column and write the CSV.
sub = pd.read_csv('inputs/sample_submission.csv')

# .copy() so rewriting `id` below works on an independent frame rather than
# on a slice of all_df2_test.
predictions = all_df2_test[['id', 'date', 'sales']].copy()
# Map the integer codes back to the original ids.
predictions['id'] = list(le_id.inverse_transform(predictions['id']))

# One row per id, one column per date. The rename below requires exactly 28
# distinct dates and raises on a length mismatch otherwise.
predictions = pd.pivot(predictions, index='id', columns='date', values='sales').reset_index()
forecast_cols = ['F' + str(day) for day in range(1, 29)]
predictions.columns = ['id'] + forecast_cols

# Evaluation rows are taken as-is from the sample submission.
evaluation = sub[sub['id'].str.contains('evaluation', regex=False)]

# Inner merge keeps the sample submission's row order for the predicted ids.
validation = sub[['id']].merge(predictions, on='id')
final = pd.concat([validation, evaluation])

# NOTE(review): 1.0315 is a global multiplier applied to every forecast;
# its derivation is not documented in this notebook.
FORECAST_SCALE = 1.0315
final[forecast_cols] *= FORECAST_SCALE
final.to_csv('submissions/submission.csv', index=False)

In [None]:
# Inspect the frame that was written to the submission file.
final

https://www.kaggle.com/c/m5-forecasting-accuracy/submit

In [None]:
time.sleep(2)
os.chdir("submissions")
!kaggle competitions submit -c m5-forecasting-accuracy -f submission.csv -m lgb
os.chdir("../")

# 모델 파라미터 및 피처 기록 및 모델 저장하기

In [None]:
# Record the feature list, parameters and scores for this run.
# NOTE(review): `params`, `eval_results` and `mean_score` are not assigned in
# the visible notebook, and write_params_features is presumably provided by
# `from mypackage import *` -- confirm before running.
write_params_features(features, params, eval_results, mean_score)

In [None]:
# Persist the feature importances.
# NOTE(review): `feature_importances` is not assigned in the visible notebook,
# and save_feature_importance is presumably provided by
# `from mypackage import *` -- confirm before running.
save_feature_importance(feature_importances)

In [None]:
## TODO: compare against the 1st-place notebook and check whether any categorical features are missing