## 読み込み

In [1]:
import datetime
import gc
import os

import lightgbm as lgb  # used by the feature-importance cells further down
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

# Use the fully-qualified option key: the bare 'max_columns' pattern matches
# more than one option on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', 100)

# The competition data directory is expected under ./walmart/input/
BASE_DIR = './walmart/input/walmart-recruiting-store-sales-forecasting/'
# On Kaggle, use the following instead:
# BASE_DIR = './kaggle/input/walmart-recruiting-store-sales-forecasting/'

# List the available files, reusing BASE_DIR so the path is defined only once.
print(os.listdir(BASE_DIR))

['features.csv.zip', '.DS_Store', 'sampleSubmission.csv', 'test.csv', 'sampleSubmission.csv.zip', 'train.csv', 'features.csv', 'test.csv.zip', 'train.csv.zip', 'stores.csv']


In [19]:
# Load the raw competition tables. os.path.join keeps the paths valid even if
# BASE_DIR is later set without a trailing separator.
train = pd.read_csv(os.path.join(BASE_DIR, 'train.csv.zip'))
test = pd.read_csv(os.path.join(BASE_DIR, 'test.csv.zip'))
stores = pd.read_csv(os.path.join(BASE_DIR, 'stores.csv'))
features = pd.read_csv(os.path.join(BASE_DIR, 'features.csv.zip'))
submission = pd.read_csv(os.path.join(BASE_DIR, 'sampleSubmission.csv.zip'))

In [20]:
# Join the store metadata and the weekly features onto the sales rows.
def merge_store_features(df):
    """Left-join ``stores`` and ``features`` onto ``df``.

    Args:
        df: Frame with ``Store``, ``Date`` and ``IsHoliday`` columns
            (``train`` or ``test``).

    Returns:
        A new DataFrame with the same rows as ``df`` plus the columns of the
        module-level ``stores`` and ``features`` tables.

    Raises:
        pandas.errors.MergeError: If a join key is duplicated on the right-hand
            side, which would otherwise silently multiply the sales rows.
    """
    with_stores = df.merge(stores, on='Store', how='left', validate='many_to_one')
    return with_stores.merge(
        features,
        on=['Store', 'Date', 'IsHoliday'],
        how='left',
        validate='many_to_one',
    )


train = merge_store_features(train)
test = merge_store_features(test)

train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,False,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,46039.49,True,A,151315,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,41595.55,False,A,151315,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,19403.54,False,A,151315,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,21827.9,False,A,151315,46.5,2.625,,,,,,211.350143,8.106


## 特徴量エンジニアリング
### 特徴量作成

### Date

In [21]:
def datetime_function(df):
    """Parse ``Date`` and add calendar-part columns, modifying ``df`` in place.

    Args:
        df: DataFrame with a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns:
        The same DataFrame object, with ``Date`` converted to datetime and new
        ``year``, ``month`` and ``day`` columns.
    """
    parsed = pd.to_datetime(df['Date'])
    df['Date'] = parsed
    for part in ('year', 'month', 'day'):
        df[part] = getattr(parsed.dt, part)
    return df
    
# datetime_function writes the new columns into the frame it is given and
# returns that same frame.
train, test = datetime_function(train), datetime_function(test)

In [22]:
def Holiday_type(df):
    """Add a ``HolidayType`` code based on the month of each holiday row.

    Rows where ``IsHoliday`` is true receive 1 (February), 2 (September),
    3 (November) or 4 (December); every other row receives 0.
    NOTE(review): presumably these months correspond to the four holidays
    flagged in the competition data — confirm against the data description.

    Args:
        df: DataFrame with ``IsHoliday`` and ``month`` columns; modified in
            place.

    Returns:
        The same DataFrame object with the ``HolidayType`` column added.
    """
    month_to_code = {2: 1, 9: 2, 11: 3, 12: 4}
    is_holiday = df['IsHoliday'] == True

    df['HolidayType'] = 0
    for month, code in month_to_code.items():
        df.loc[is_holiday & (df['month'] == month), 'HolidayType'] = code
    return df

# Holiday_type needs the `month` column, so this must run after datetime_function.
train, test = Holiday_type(train), Holiday_type(test)

In [23]:
def label_encoder(train, test):
    """Integer-encode ``Type`` and ``IsHoliday`` in both frames.

    A separate ``LabelEncoder`` is fitted on the train column and then reused
    on the matching test column, so both frames share one mapping per column.
    Both frames are modified in place.

    Args:
        train: Training DataFrame with ``Type`` and ``IsHoliday`` columns.
        test: Test DataFrame with the same two columns.

    Returns:
        The ``(train, test)`` pair with the two columns replaced by their codes.
    """
    for column in ('Type', 'IsHoliday'):
        encoder = LabelEncoder()
        train[column] = encoder.fit_transform(train[column])
        test[column] = encoder.transform(test[column])
    return train, test

# Encoders are fitted on train and reused on test so both share one mapping.
train, test = label_encoder(train, test)

In [24]:
def holiday_relation(df):
    """Flag holiday rows and the rows exactly one week before/after a holiday.

    Adds a float ``HolidayRelation`` column to ``df`` in place:

    * ``0``   - the row itself is a holiday row,
    * ``-1``  - the row's date is 7 days before a holiday date in ``df``,
    * ``1``   - the row's date is 7 days after a holiday date in ``df``,
    * ``NaN`` - none of the above.

    Later assignments win, so the precedence is after > before > holiday.
    Holiday dates are taken only from ``df`` itself.

    Args:
        df: DataFrame with a datetime ``Date`` column and an ``IsHoliday``
            column (boolean or 0/1).

    Returns:
        The same DataFrame object with ``HolidayRelation`` added.
    """
    one_week = datetime.timedelta(days=7)
    is_holiday = df['IsHoliday'] == True
    # Each holiday date repeats once per store/department; only the distinct
    # dates are needed for the membership tests below.
    holiday_dates = df.loc[is_holiday, 'Date'].drop_duplicates()

    week_before = df['Date'].isin(holiday_dates - one_week)
    week_after = df['Date'].isin(holiday_dates + one_week)

    # Create the column explicitly so it always exists and starts from a clean
    # state, even when the frame holds no holiday rows or the cell is re-run.
    df['HolidayRelation'] = np.nan
    df.loc[is_holiday, 'HolidayRelation'] = 0
    df.loc[week_before, 'HolidayRelation'] = -1
    df.loc[week_after, 'HolidayRelation'] = 1
    return df

# Each frame is processed on its own: holiday_relation only looks at the dates
# inside the frame it is given.
train, test = holiday_relation(train), holiday_relation(test)

In [25]:
# Tie Store and Dept together explicitly as a single feature.
def store_dept_relation(train, test):
    """Add a combined Store/Dept key and an integer code for it.

    ``StoreDept`` is the string ``"<Store>_<Dept>"``. ``StoreDeptCategory`` is
    the position of that key among the unique keys of ``train``, in order of
    first appearance. Keys that occur only in ``test`` are absent from the
    mapping and therefore become NaN. Both frames are modified in place.

    Args:
        train: Training DataFrame with ``Store`` and ``Dept`` columns.
        test: Test DataFrame with the same two columns.

    Returns:
        The ``(train, test)`` pair with both new columns added.
    """
    for frame in (train, test):
        frame['StoreDept'] = [
            str(store) + '_' + str(dept)
            for store, dept in zip(frame['Store'], frame['Dept'])
        ]

    known_keys = list(train['StoreDept'].unique())
    code_by_key = dict(zip(known_keys, np.arange(len(known_keys))))

    for frame in (train, test):
        frame['StoreDeptCategory'] = frame['StoreDept'].map(code_by_key)
    return train, test

# Category codes are built from train only; Store/Dept pairs that appear only
# in test are missing from the mapping and come out as NaN.
train, test = store_dept_relation(train, test)

In [26]:
# Cyclical (sin/cos) feature encoding.
def encode(df, col, period=None):
    """Add ``<col>_cos`` and ``<col>_sin`` columns that place ``col`` on a circle.

    Args:
        df: DataFrame containing ``col``; modified in place.
        col: Name of the numeric column to encode.
        period: Length of one full cycle (for example 12 for months). When
            omitted, the largest value present in ``df[col]`` is used, which
            keeps the original behaviour but makes the encoding depend on the
            data: a frame that lacks the largest value of the cycle is encoded
            on a different scale. Pass the true period when it is known.

    Returns:
        The same DataFrame object with the two new columns.
    """
    if period is None:
        period = df[col].max()
    angle = 2 * np.pi * df[col] / period
    df[col + '_cos'] = np.cos(angle)
    df[col + '_sin'] = np.sin(angle)
    return df
# Encode the month on a circle in both frames; each call derives the cycle
# length from the largest month present in the frame it receives.
train, test = encode(train, 'month'), encode(test, 'month')

In [27]:
# Preview the first rows to check the columns added by the cells above.
train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,year,month,day,HolidayType,HolidayRelation,StoreDept,StoreDeptCategory,month_cos,month_sin
0,1,1,2010-02-05,24924.5,0,0,151315,42.31,2.572,,,,,,211.096358,8.106,2010,2,5,0,-1.0,1_1,0,0.5,0.866025
1,1,1,2010-02-12,46039.49,1,0,151315,38.51,2.548,,,,,,211.24217,8.106,2010,2,12,1,0.0,1_1,0,0.5,0.866025
2,1,1,2010-02-19,41595.55,0,0,151315,39.93,2.514,,,,,,211.289143,8.106,2010,2,19,0,1.0,1_1,0,0.5,0.866025
3,1,1,2010-02-26,19403.54,0,0,151315,46.63,2.561,,,,,,211.319643,8.106,2010,2,26,0,,1_1,0,0.5,0.866025
4,1,1,2010-03-05,21827.9,0,0,151315,46.5,2.625,,,,,,211.350143,8.106,2010,3,5,0,,1_1,0,6.123234000000001e-17,1.0


### 特徴量選択

In [28]:
# Date, year, month and day are considered replaceable by month_cos / month_sin
# and are therefore removed; StoreDept is dropped in the same step.
DROPPED_COLUMNS = ['Date', 'StoreDept', 'year', 'month', 'day']
train = train.drop(columns=DROPPED_COLUMNS)
test = test.drop(columns=DROPPED_COLUMNS)

In [29]:
# Separate the target from the feature matrix.
TARGET_COLUMN = 'Weekly_Sales'
y_train = train[TARGET_COLUMN]
train = train.drop(columns=[TARGET_COLUMN])

In [6]:
# Build a quick baseline model whose only purpose is to inspect feature importance.

# NOTE(review): LightGBM documents that negative values in a categorical
# feature are treated as missing, so HolidayRelation == -1 would not be
# distinguished from NaN here - consider shifting the codes to be non-negative.
categorical_features = ['Store', 'Dept', 'IsHoliday', 'Type', 'HolidayType', 'HolidayRelation', 'StoreDeptCategory']

params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1300,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "reg_lambda": 2,
    "metric": "rmse",
}

data = lgb.Dataset(train, label=y_train, categorical_feature=categorical_features, free_raw_data=False)
# No `verbose_eval`: with no validation set there is nothing to log, and
# LightGBM >= 4.0 no longer accepts that argument.
model1 = lgb.train(params, train_set=data, num_boost_round=500)

In [None]:
# Visualise the feature importance of the baseline model.
lgb.plot_importance(model1)

In [None]:
# Null importance: check whether any feature behaves like noise.
# Shuffle the target so it no longer lines up with the features.
# random_state pins the permutation so the check is reproducible on a fresh
# run; reset_index gives the shuffled values a fresh positional index.
y_nullimportance = y_train.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Train the same model against the shuffled target.
data = lgb.Dataset(train, label=y_nullimportance, categorical_feature=categorical_features, free_raw_data=False)
# No `verbose_eval`: with no validation set there is nothing to log, and
# LightGBM >= 4.0 no longer accepts that argument.
model2 = lgb.train(params, train_set=data, num_boost_round=500)

In [None]:
# Visualise the feature importance obtained when training on the shuffled target.
lgb.plot_importance(model2)

In [None]:
# Build a DataFrame holding each feature with its importance and its null importance.
# Based on this table, "Temperature", 'CPI', 'Fuel_Price' and 'Unemployment'
# were judged likely to act as noise during training and are slated for removal.

def _importance_table(model):
    """Return one row per ``train`` column with ``model``'s importance, highest first."""
    table = pd.DataFrame({
        'feature': train.columns,
        'importance': model.feature_importance(),
    })
    return table.sort_values('importance', ascending=False)


importance_df1 = _importance_table(model1)
importance_df2 = _importance_table(model2)

# Both tables share the column name `importance`, so the merge relies on
# pandas' default suffixes: `importance_x` (real target) and `importance_y`
# (shuffled target).
importance_df = importance_df1.merge(importance_df2, on='feature', how='left')
importance_df.head(10)