In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category = UserWarning)

In [4]:

'''
Loading the data
'''

folder_path = "/kaggle/input/store-sales-time-series-forecasting/"

holiday_event_df = pd.read_csv(os.path.join(folder_path,"holidays_events.csv"),
                               dtype={'type': 'category',
                                      'locale': 'category',
                                      'locale_name': 'category',
                                      'description': 'category',
                                      'transferred': 'bool'},
                               parse_dates=['date'],
                               infer_datetime_format=True)

holiday_event_df = holiday_event_df.set_index('date').to_period('D')

holiday_event_df_duplicates = holiday_event_df[holiday_event_df.index.duplicated(keep=False)] #Find duplicate values in holidays

holiday_event_df_without_duplicates = holiday_event_df[~holiday_event_df.index.duplicated(keep='first')] #Handle duplicate values in holidays

oil_df = pd.read_csv(os.path.join(folder_path,"oil.csv"),
                     parse_dates=['date'],
                     infer_datetime_format=True)

oil_df = oil_df.set_index('date').to_period('D')

oil_df = oil_df.interpolate() #Handle missing values in oil prices

oil_df.iloc[0] = oil_df.iloc[1] #Handle missing values in oil prices

oil_df.rename(columns={"dcoilwtico": "oil_price"}, inplace = True)

stores_df = pd.read_csv(os.path.join(folder_path,"stores.csv"))

transaction_df = pd.read_csv(os.path.join(folder_path,"transactions.csv"),
                             parse_dates=['date'],
                             infer_datetime_format=True)

transaction_df['date'] = transaction_df['date'].dt.to_period('D')
transaction_df = transaction_df.set_index(['date', 'store_nbr']).sort_index()

train_df = pd.read_csv(os.path.join(folder_path,"train.csv"),
                                        usecols=['store_nbr', 'family', 'date','sales', 'onpromotion'],
                                        dtype={'store_nbr': 'category',
                                               'family': 'category',
                                               'sales': 'float'},
                                        parse_dates=['date'],
                                        infer_datetime_format=True)

train_df['date'] = train_df.date.dt.to_period('D')
train_df = train_df.set_index('date').sort_index()

competition_test_df = pd.read_csv(os.path.join(folder_path,"test.csv"),
                                  usecols=['id','store_nbr', 'family', 'date', 'onpromotion'],
                                  dtype={'store_nbr': 'category',
                                         'family': 'category',
                                         'onpromotion': 'uint32'},
                                  parse_dates=['date'],
                                  infer_datetime_format=True)

competition_test_df['date'] = competition_test_df.date.dt.to_period('D')
competition_test_df = competition_test_df.set_index('date').sort_index()

In [5]:
'''
Building a neural network to predict sales by store and family.
'''

X = train_df.loc['2016':'2017'].copy()

X['day'] = X.index.day
X['week'] = X.index.dayofweek

X = X.join(oil_df, on='date')
X['oil_price'] = X['oil_price'].interpolate()

X['NewYear'] = (X.index.dayofyear == 1)
X['holiday'] = X.index.to_series().isin(holiday_event_df.index)

onehot_encoder = OneHotEncoder()

encoded_week = onehot_encoder.fit_transform(X['week'].values.reshape(-1, 1))

encoded_week_df = pd.DataFrame(encoded_week.toarray(), columns=onehot_encoder.get_feature_names_out(['week']), index=X.index)

encoded_family = onehot_encoder.fit_transform(X['family'].values.reshape(-1, 1))

encoded_family_df = pd.DataFrame(encoded_family.toarray(), columns=onehot_encoder.get_feature_names_out(['family']), index=X.index)

encoded_store_nbr = onehot_encoder.fit_transform(X['store_nbr'].values.reshape(-1, 1))

encoded_store_nbr_df = pd.DataFrame(encoded_store_nbr.toarray(), columns=onehot_encoder.get_feature_names_out(['store_nbr']), index=X.index)

columns_to_be_scaled = ['onpromotion', 'oil_price', 'day']

scaler = StandardScaler()

scaled_data = scaler.fit_transform(X.loc[:,columns_to_be_scaled])

scaled_data_df = pd.DataFrame(scaled_data, columns=[i + "_scaled" for i in columns_to_be_scaled], index=X.index)

X = pd.concat([X, scaled_data_df, encoded_week_df, encoded_family_df, encoded_store_nbr_df], axis=1)

y = X[['sales']].copy()

X.drop(columns=columns_to_be_scaled + ['sales','week','family', 'store_nbr'], inplace = True)

X[['NewYear','holiday']] = X[['NewYear','holiday']].astype(int)

y_train, y_test = y[:"2017-06-01"], y["2017-06-02":]
X_train, X_test = X.loc[:"2017-06-01"], X.loc["2017-06-02":]

model = XGBRegressor(n_estimators = 1000,
                     learning_rate = 0.01,
                     max_depth = 5,
                     objective = "reg:squarederror")

model.fit(X_train,y_train)

y_pred = pd.DataFrame(model.predict(X_test), index=X_test.index, columns=['sales'])

print("\nTest R2 of the xgboost model: {}\n".format(r2_score(y_test, y_pred)))


Test R2 of the xgboost model: 0.9112251158933391



In [6]:
'''
Predicting sales for the competition dataset
'''

X_competition = competition_test_df.loc['2016':'2017'].copy()

X_competition['day'] = X_competition.index.day
X_competition['week'] = X_competition.index.dayofweek

X_competition = X_competition.join(oil_df, on='date')
X_competition['oil_price'] = X_competition['oil_price'].interpolate()

X_competition['NewYear'] = (X_competition.index.dayofyear == 1)
X_competition['holiday'] = X_competition.index.to_series().isin(holiday_event_df.index)

onehot_encoder = OneHotEncoder()

encoded_week = onehot_encoder.fit_transform(X_competition['week'].values.reshape(-1, 1))

encoded_week_df = pd.DataFrame(encoded_week.toarray(), columns=onehot_encoder.get_feature_names_out(['week']), index=X_competition.index)

encoded_family = onehot_encoder.fit_transform(X_competition['family'].values.reshape(-1, 1))

encoded_family_df = pd.DataFrame(encoded_family.toarray(), columns=onehot_encoder.get_feature_names_out(['family']), index=X_competition.index)

encoded_store_nbr = onehot_encoder.fit_transform(X_competition['store_nbr'].values.reshape(-1, 1))

encoded_store_nbr_df = pd.DataFrame(encoded_store_nbr.toarray(), columns=onehot_encoder.get_feature_names_out(['store_nbr']), index=X_competition.index)

columns_to_be_scaled = ['onpromotion', 'oil_price', 'day']

scaler = StandardScaler()

scaled_data = scaler.fit_transform(X_competition.loc[:,columns_to_be_scaled])

scaled_data_df = pd.DataFrame(scaled_data, columns=[i + "_scaled" for i in columns_to_be_scaled], index=X_competition.index)

X_competition = pd.concat([X_competition, scaled_data_df, encoded_week_df, encoded_family_df, encoded_store_nbr_df], axis=1)

X_competition.drop(columns=columns_to_be_scaled + ['week','family', 'store_nbr'], inplace = True)

X_competition[['NewYear','holiday']] = X_competition[['NewYear','holiday']].astype(int)

X_competition = X_competition[X.columns]

y_submit = competition_test_df[['id']].copy()
y_submit['sales'] = model.predict(X_competition)
y_submit.to_csv('submission.csv', index=False)

print("\nBelow are the predictions for the competition data:")
print(y_submit)


Below are the predictions for the competition data:
                 id        sales
date                            
2017-08-16  3000888    17.605537
2017-08-16  3000889    17.605537
2017-08-16  3000890    74.670456
2017-08-16  3000891  2688.778076
2017-08-16  3000892    17.605537
...             ...          ...
2017-08-31  3029395   238.426483
2017-08-31  3029396    22.865728
2017-08-31  3029397  1411.258301
2017-08-31  3029398   255.928650
2017-08-31  3029399    22.865728

[28512 rows x 2 columns]
