In [27]:
import pandas as pd
import numpy as np

In [28]:
train = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/train.csv')
test = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/test.csv')
holidays = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv')
oil = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/oil.csv')
stores = pd.read_csv('/kaggle/input/store-sales-time-series-forecasting/stores.csv')

In [29]:
oil.isnull().sum()

date           0
dcoilwtico    43
dtype: int64

In [30]:
print(oil.head(2))
oil.tail(2)

         date  dcoilwtico
0  2013-01-01         NaN
1  2013-01-02       93.14


Unnamed: 0,date,dcoilwtico
1216,2017-08-30,45.96
1217,2017-08-31,47.26


only oil dataframe has some null values, lets deal with them with interpolation

but the first row is also a nan so for this i will use bfill

In [31]:
oil = oil.interpolate()
oil = oil.fillna(method='bfill')

convert all date columns to datetime

In [32]:
train.date = pd.to_datetime(train.date)
test.date = pd.to_datetime(test.date)
holidays.date = pd.to_datetime(holidays.date)
oil.date = pd.to_datetime(oil.date)

---

coming up with features

In [33]:
print(train.columns)
print(test.columns)

Index(['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion'], dtype='object')
Index(['id', 'date', 'store_nbr', 'family', 'onpromotion'], dtype='object')


besides what is already in our train and test dataframe, i will add some new features

- more specific date information
- oil price
- store type
- is it holiday or event
- take into consideration that salaries are paid on 15th and last day of the month
- 16 april 2016 earthquake

In [34]:
def CreateFeatures(df):
    
    df = df.copy()
    df['dayofmonth'] = df.date.dt.day
    df['month'] = df.date.dt.month
    df['year'] = df.date.dt.year
    df['quarter'] = df.date.dt.quarter
    df['salary_recently'] = df['date'].apply(lambda x: 1 if x.day in [1, 2, 15, 16, 17] else 0)
    df = pd.merge(df, oil, how='left', on='date')
    df['store_type'] = df['store_nbr'].apply(lambda x: 1 if x in range(44, 53) else (2 if x in [9, 11, 18, 20, 21, 31, 34, 39] else (3 if x in [10, 19, 22, 30, 32, 33, 35, 40, 54] or x in range(12, 18) else (4 if x in range(1,9) or x in range(23, 28) or x in [37,38,41,42,53] else 5))))
    df['holiday'] = df['date'].apply(lambda x: 1 if x in holidays.date and holidays['type'] in ['Holiday', 'Transfer', 'Additional'] and holidays['transferred'] != 'True' and holidays['locale'] == 'National' else 0)
    df['event'] = df['date'].apply(lambda x: 1 if x in holidays.date and holidays['type'] in ['Event'] and holidays['locale'] == 'National' else 0)
    
    return df

In [35]:
train = CreateFeatures(train)
test = CreateFeatures(test)

---

data preprocessing and model building

In [36]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [37]:
le = LabelEncoder()
train['family'] = le.fit_transform(train['family'])
test['family'] = le.fit_transform(test['family'])

In [38]:
features = ['family', 'onpromotion', 'dayofmonth', 'month', 'year', 'quarter', 'salary_recently', 'dcoilwtico', 'store_type', 'holiday', 'event']
target = 'sales'

X = train[features]
y = train[target]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.25, random_state=42)

In [39]:
model = xgb.XGBRegressor(base_score=0.5, booster='gbtree',
                        n_estimators=500,
                        early_stopping_rounds=50,
                        objective='reg:linear',
                        max_depth=3,
                        learning_rate=0.01)

model.fit(X_train,y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        verbose=100)  

[0]	validation_0-rmse:1151.55008	validation_1-rmse:1162.64970
[100]	validation_0-rmse:894.80530	validation_1-rmse:906.16998
[200]	validation_0-rmse:787.54270	validation_1-rmse:799.14504
[300]	validation_0-rmse:721.66895	validation_1-rmse:731.81801
[400]	validation_0-rmse:684.20149	validation_1-rmse:693.22146
[499]	validation_0-rmse:661.56399	validation_1-rmse:669.98195


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=50, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.01, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=500, n_jobs=0,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=0, reg_alpha=0, ...)

In [40]:
X_test = test[features]
test['preds'] = model.predict(X_test)