In [None]:
import pandas as pd
# Use the fully qualified option key: on newer pandas releases the bare
# 'max_columns' pattern also matches 'styler.render.max_columns' and
# set_option raises OptionError for ambiguous patterns.
pd.set_option('display.max_columns', 100)
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
import datetime
import gc
import os

# Directory prefix (relative to the working directory) that is prepended to
# every input file name passed to pd.read_csv in this notebook.
BASE_DIR = './walmart/input/walmart-recruiting-store-sales-forecasting/'

In [None]:
# Read the five input tables from BASE_DIR; pandas decompresses the
# single-file .zip archives transparently.
train, test, stores, features, submission = (
    pd.read_csv(BASE_DIR + file_name)
    for file_name in (
        'train.csv.zip',
        'test.csv.zip',
        'stores.csv',
        'features.csv.zip',
        'sampleSubmission.csv.zip',
    )
)

In [None]:
# Enrich both splits with the store table (keyed on Store) and then with the
# features table (keyed on Store, Date and IsHoliday). Left joins are used so
# rows without a match on the right-hand side are kept.
train = (
    train
    .merge(stores, how='left', on='Store')
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
)

test = (
    test
    .merge(stores, how='left', on='Store')
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
)

In [None]:
def datetime_function(df):
    """Parse ``Date`` and derive calendar columns from it.

    The ``Date`` column is converted with ``pd.to_datetime`` and the columns
    ``year``, ``month`` and ``day`` are added. The frame is modified in place
    and the same object is returned.
    """
    df['Date'] = pd.to_datetime(df['Date'])
    date_parts = df['Date'].dt
    for part in ('year', 'month', 'day'):
        df[part] = getattr(date_parts, part)
    return df
    
# Parse Date and add the year/month/day columns on both splits.
train = datetime_function(train)
test = datetime_function(test)

In [None]:
def Holiday_type(df):
    """Add a ``HolidayType`` code derived from ``IsHoliday`` and ``month``.

    Rows that are not holidays, or holidays in any other month, get 0.
    Holiday rows in February, September, November and December get 1, 2, 3
    and 4 respectively. The frame is modified in place and returned.
    """
    # month -> holiday code for rows flagged as holidays
    month_to_code = {2: 1, 9: 2, 11: 3, 12: 4}

    df['HolidayType'] = 0
    holiday_rows = df.IsHoliday == True
    for month, code in month_to_code.items():
        df.loc[holiday_rows & (df.month == month), 'HolidayType'] = code
    return df

# Add the HolidayType column to both splits (requires the month column).
train = Holiday_type(train)
test = Holiday_type(test)

In [None]:
def label_encoder(train, test):
    """Integer-encode the ``Type`` and ``IsHoliday`` columns of both frames.

    For each column a separate ``LabelEncoder`` is fitted on the training
    values and then applied to the test values, so both frames share one
    mapping. Both frames are modified in place and returned as a tuple.
    """
    for column in ('Type', 'IsHoliday'):
        encoder = LabelEncoder()
        train[column] = encoder.fit_transform(train[column])
        test[column] = encoder.transform(test[column])
    return train, test

# Encode Type and IsHoliday as integers, with encoders fitted on train.
train, test = label_encoder(train, test)

In [None]:
def holiday_relation(df):
    """Add ``HolidayRelation``: position of each row relative to a holiday.

    Values written to the column:
      * 0   -- the row itself is flagged as a holiday,
      * -1  -- the row's ``Date`` is 7 days before some holiday date,
      * 1   -- the row's ``Date`` is 7 days after some holiday date,
      * NaN -- none of the above.
    When several apply, "after" wins over "before", which wins over the
    holiday itself. The frame is modified in place and returned.

    NOTE(review): this column is later listed as a LightGBM categorical
    feature; LightGBM documents that negative categorical values are handled
    as missing, so -1 may be indistinguishable from NaN there -- confirm.
    """
    one_week = datetime.timedelta(days=7)

    is_holiday = df['IsHoliday'] == True
    holiday_dates = df.loc[is_holiday, 'Date']

    # Dates are matched frame-wide, not per store.
    week_before = df['Date'].isin((holiday_dates - one_week).tolist())
    week_after = df['Date'].isin((holiday_dates + one_week).tolist())

    # np.select takes the first matching condition, so list them from the
    # highest precedence to the lowest.
    df['HolidayRelation'] = np.select(
        [week_after, week_before, is_holiday],
        [1, -1, 0],
        default=np.nan,
    )
    return df

# Add HolidayRelation to each split; holiday dates are looked up within the
# same frame that is passed in.
train = holiday_relation(train)
test = holiday_relation(test)

In [None]:
# Explicitly tie Store and Dept together as a single feature.
def store_dept_relation(train, test):
    """Add a combined store/department key and its integer code.

    ``StoreDept`` is the string ``"<Store>_<Dept>"``. ``StoreDeptCategory``
    numbers the distinct keys of ``train`` in order of first appearance,
    starting at 0; test keys that never occur in ``train`` map to NaN.
    Both frames are modified in place and returned as a tuple.
    """
    for frame in (train, test):
        frame['StoreDept'] = frame['Store'].astype(str) + '_' + frame['Dept'].astype(str)

    # Codes are defined by the training keys only.
    code_by_key = {key: code for code, key in enumerate(train['StoreDept'].unique())}

    for frame in (train, test):
        frame['StoreDeptCategory'] = frame['StoreDept'].map(code_by_key)
    return train, test

# Add StoreDept / StoreDeptCategory; the codes come from the training keys.
train, test = store_dept_relation(train, test)

In [None]:
# Cyclical features
def encode(df, col, max_val=None):
    """Add sine/cosine encodings of a cyclical column.

    Writes ``<col>_cos`` and ``<col>_sin`` computed from the angle
    ``2 * pi * df[col] / period``. The frame is modified in place and
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``col``.
    col : str
        Name of the column to encode.
    max_val : number, optional
        Length of the cycle (e.g. 12 for months). When omitted, the largest
        value observed in ``df[col]`` is used, as before. Pass it explicitly
        to get identical encodings on frames whose observed ranges differ.
    """
    period = df[col].max() if max_val is None else max_val
    angle = 2 * np.pi * df[col] / period
    df[col + '_cos'] = np.cos(angle)
    df[col + '_sin'] = np.sin(angle)
    return df
# NOTE(review): encode() takes the cycle length from each frame's own
# maximum month, so this assumes train and test both contain month 12 -- confirm.
train = encode(train, 'month')
test = encode(test, 'month')

In [None]:
# Feature selection 1: based on feature importance.
train, test = (
    frame.drop(columns=['Temperature', 'CPI', 'Fuel_Price', 'Unemployment'])
    for frame in (train, test)
)

In [None]:
# CV strategy: split the training rows by date at 2011-11-01.
# idx1 marks rows on or after the cutoff, idx2 the rows before it.
idx1 = train.Date.ge('2011-11-01')
idx2 = train.Date.lt('2011-11-01')

In [None]:
# Feature selection 2: remove the raw date, its parts and the string key.
train, test = (
    frame.drop(columns=['Date', 'StoreDept', 'year', 'month', 'day'])
    for frame in (train, test)
)

In [None]:
# Separate the regression target from the feature matrix.
y_train = train['Weekly_Sales']
train = train.drop(columns='Weekly_Sales')

# Columns handed to LightGBM as categorical features.
# NOTE(review): HolidayRelation holds -1 for some rows; LightGBM documents that
# negative categorical values are handled as missing -- confirm this is intended.
categorical_features = [
    'Store',
    'Dept',
    'IsHoliday',
    'Type',
    'HolidayType',
    'HolidayRelation',
    'StoreDeptCategory',
]

# Shared booster parameters for the two fold models.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1300,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "reg_lambda": 2,
    "metric": "rmse",
}

# Fold 1 = rows before the cutoff (idx2), fold 2 = rows on/after it (idx1);
# X / y are the full training set.
X1, y1 = train[idx2], y_train[idx2]
X2, y2 = train[idx1], y_train[idx1]
X, y = train, y_train

d1, d2, d = (
    lgb.Dataset(
        features_part,
        label=target_part,
        categorical_feature=categorical_features,
        free_raw_data=False,
    )
    for features_part, target_part in ((X1, y1), (X2, y2), (X, y))
)

# Each watchlist lists the training fold first and the other fold second.
watchlist1, watchlist2 = [d1, d2], [d2, d1]

In [None]:
# Recent LightGBM releases no longer accept the ``verbose_eval`` and
# ``early_stopping_rounds`` keyword arguments of ``lgb.train``; logging and
# early stopping are configured through callbacks instead, which older
# releases accept as well.
def _fold_callbacks():
    """Return fresh callbacks: stop after 200 stale rounds, log every 200 rounds."""
    # ``log_evaluation`` was named ``print_evaluation`` in older releases.
    log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
    # Build new callback objects per training run: the early-stopping
    # callback keeps internal state between iterations.
    return [lgb.early_stopping(stopping_rounds=200), log_evaluation(period=200)]


print("Building model with first")
model1 = lgb.train(params, train_set=d1, num_boost_round=1000, valid_sets=watchlist1, callbacks=_fold_callbacks())
print("Building model with second")
model2 = lgb.train(params, train_set=d2, num_boost_round=1000, valid_sets=watchlist2, callbacks=_fold_callbacks())

In [None]:
# Final model trained on the full training set.
# The round count is given once, through ``num_boost_round``. Previously
# ``num_iterations: 700`` in params conflicted with ``num_boost_round=1000``;
# LightGBM resolves that conflict in favour of the params entry, so 700 is
# the number of rounds that was actually used.
params = {
    "objective": "regression",
    "boosting": "gbdt",
    "num_leaves": 1300,
    "learning_rate": 0.05,
    "feature_fraction": 0.8,
    "reg_lambda": 2,
    "metric": "rmse",
}

# No validation set is passed, so there is nothing to log or early-stop on;
# the ``verbose_eval`` keyword (removed in recent LightGBM releases) is dropped.
model3 = lgb.train(params, train_set=d, num_boost_round=700)

In [None]:
# Drop the fold frames, labels, LightGBM datasets and watchlists now that the
# three models are trained, then ask the garbage collector to reclaim them.
del X1, y1, d1
del X2, y2, d2
del X, y, d
del watchlist1, watchlist2
gc.collect()

In [None]:
# Feature importance of model1 (trained on the rows before the date cutoff).
lgb.plot_importance(model1)
plt.show()

In [None]:
# Feature importance of model2 (trained on the rows on/after the date cutoff).
lgb.plot_importance(model2)
plt.show()

In [None]:
# Feature importance of model3 (trained on the full training set).
lgb.plot_importance(model3)
plt.show()

In [None]:
# Predict on the test data with each of the three models.
pred1, pred2, pred3 = (
    booster.predict(test, num_iteration=booster.best_iteration)
    for booster in (model1, model2, model3)
)

# Blend weights chosen from the CV results.
param = {'model1': 0, 'model2': 0.8, 'model3': 0.2}
pred = (
    pred1 * param['model1']
    + pred2 * param['model2']
    + pred3 * param['model3']
)

In [None]:
# Put the blended predictions into the submission frame and save it as CSV
# without the index column.
submission = submission.assign(Weekly_Sales=pred)
submission.to_csv('submission.csv', index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()