In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from tensorflow.keras import layers, models, optimizers, regularizers
from tensorflow.keras.losses import Huber
from tensorflow.keras.utils import set_random_seed

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category = UserWarning)

In [2]:
'''
Loading the data
'''

# Root folder of the Kaggle competition input files; every loader below reads from it.
folder_path = "/kaggle/input/store-sales-time-series-forecasting/"

# Holidays/events calendar.
# `infer_datetime_format` is deliberately not passed: it is deprecated since
# pandas 2.0, where `parse_dates` already infers one consistent format.
holiday_event_df = pd.read_csv(
    os.path.join(folder_path, "holidays_events.csv"),
    dtype={'type': 'category',
           'locale': 'category',
           'locale_name': 'category',
           'description': 'category',
           'transferred': 'bool'},
    parse_dates=['date'],
)

# Index by daily period so the calendar can be matched against the other
# period-indexed frames.
holiday_event_df = holiday_event_df.set_index('date').to_period('D')

# Every row whose date occurs more than once in the calendar (all occurrences kept).
holiday_event_df_duplicates = holiday_event_df[holiday_event_df.index.duplicated(keep=False)]

# One row per date: only the first entry of each duplicated date is kept.
holiday_event_df_without_duplicates = holiday_event_df[~holiday_event_df.index.duplicated(keep='first')]

# Daily oil price, indexed by daily period.
#   * `infer_datetime_format` is not passed (deprecated since pandas 2.0).
#   * `interpolate()` fills interior gaps linearly but leaves leading NaNs,
#     so `bfill()` copies the first valid quote backwards. This replaces the
#     former `iloc[0] = iloc[1]` patch, which only worked when exactly one
#     leading row was missing and the second row was valid.
#   * The price column is renamed from "dcoilwtico" to "oil_price".
oil_df = (
    pd.read_csv(os.path.join(folder_path, "oil.csv"), parse_dates=['date'])
    .set_index('date')
    .to_period('D')
    .interpolate()
    .bfill()
    .rename(columns={"dcoilwtico": "oil_price"})
)

# Store metadata, loaded as-is.
stores_df = pd.read_csv(os.path.join(folder_path, "stores.csv"))

# Transactions table, indexed by (daily period, store number) and sorted.
# `infer_datetime_format` is not passed (deprecated since pandas 2.0).
transaction_df = pd.read_csv(
    os.path.join(folder_path, "transactions.csv"),
    parse_dates=['date'],
)

transaction_df['date'] = transaction_df['date'].dt.to_period('D')
transaction_df = transaction_df.set_index(['date', 'store_nbr']).sort_index()

# Training data: one row per (date, store, family) with sales and promotion count.
# `infer_datetime_format` is not passed (deprecated since pandas 2.0).
train_df = pd.read_csv(
    os.path.join(folder_path, "train.csv"),
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={'store_nbr': 'category',
           'family': 'category',
           'sales': 'float'},
    parse_dates=['date'],
)

# Index by daily period and sort chronologically so date-based slicing works.
train_df['date'] = train_df.date.dt.to_period('D')
train_df = train_df.set_index('date').sort_index()

# Competition test data: same layout as the training data, with `id` instead of `sales`.
# `infer_datetime_format` is not passed (deprecated since pandas 2.0).
competition_test_df = pd.read_csv(
    os.path.join(folder_path, "test.csv"),
    usecols=['id', 'store_nbr', 'family', 'date', 'onpromotion'],
    dtype={'store_nbr': 'category',
           'family': 'category',
           'onpromotion': 'uint32'},
    parse_dates=['date'],
)

# Index by daily period and sort chronologically, mirroring the training frame.
competition_test_df['date'] = competition_test_df.date.dt.to_period('D')
competition_test_df = competition_test_df.set_index('date').sort_index()

In [3]:
'''
Building a neural network to predict sales by store and family.
'''


def _add_calendar_features(frame):
    """Return a copy of ``frame`` enriched with calendar, oil and holiday features.

    ``frame`` must be indexed by a daily PeriodIndex named ``date``. The
    returned copy gains these columns:

    * ``day``       - day of the month
    * ``week``      - day of the week (0 = Monday)
    * ``oil_price`` - joined from ``oil_df``; gaps are interpolated along the rows
    * ``NewYear``   - True on the first day of the year
    * ``holiday``   - True when the date appears in ``holiday_event_df``
    """
    enriched = frame.copy()

    enriched['day'] = enriched.index.day
    enriched['week'] = enriched.index.dayofweek

    enriched = enriched.join(oil_df, on='date')
    # Dates missing from oil_df come back as NaN after the join. Interpolate
    # them, then back-fill so leading rows without a quote cannot leave NaNs
    # in the features.
    enriched['oil_price'] = enriched['oil_price'].interpolate().bfill()

    enriched['NewYear'] = (enriched.index.dayofyear == 1)
    # NOTE(review): every calendar entry counts as a holiday here, including
    # rows flagged `transferred` - confirm this is the intended definition.
    enriched['holiday'] = enriched.index.to_series().isin(holiday_event_df.index)

    return enriched


# The same feature pipeline is applied to the training window and the competition data.
X = _add_calendar_features(train_df.loc['2016':'2017'])
y = X[['sales']].copy()

X_competition = _add_calendar_features(competition_test_df.loc['2016':'2017'])

# Chronological hold-out: rows up to 2017-06-01 train, later rows evaluate.
y_train, y_test = y[:"2017-06-01"], y["2017-06-02":]
X_train, X_test = X.loc[:"2017-06-01"], X.loc["2017-06-02":]

scaler = StandardScaler()

columns_to_be_scaled = ['onpromotion', 'oil_price', 'day']


def _append_scaled_columns(frame, fit=False):
    """Return ``frame`` with a ``<name>_scaled`` column added for each scaled column.

    When ``fit`` is True the shared ``scaler`` is fitted on ``frame`` first;
    otherwise the statistics already stored in ``scaler`` are reused. The
    original unscaled columns are kept in the returned frame.
    """
    source = frame.loc[:, columns_to_be_scaled]
    values = scaler.fit_transform(source) if fit else scaler.transform(source)
    scaled_df = pd.DataFrame(
        values,
        columns=[name + "_scaled" for name in columns_to_be_scaled],
        index=frame.index,
    )
    return pd.concat([frame, scaled_df], axis=1)


# Fit on the training split only, then reuse the fitted statistics for the
# hold-out and competition data.
X_train = _append_scaled_columns(X_train, fit=True)
X_test = _append_scaled_columns(X_test)
X_competition = _append_scaled_columns(X_competition)

columns_to_be_encoded = ['week', 'family', 'store_nbr']

# Collect the encoded pieces per dataset and concatenate once at the end,
# instead of re-copying the whole frame on every loop iteration.
train_parts = [X_train]
test_parts = [X_test]
competition_parts = [X_competition]

for column in columns_to_be_encoded:

    # One encoder per column, fitted on the training split only.
    onehot_encoder = OneHotEncoder()
    onehot_encoder.fit(X_train[column].values.reshape(-1, 1))
    encoded_names = onehot_encoder.get_feature_names_out([column])

    for frame, parts in ((X_train, train_parts),
                         (X_test, test_parts),
                         (X_competition, competition_parts)):
        encoded = onehot_encoder.transform(frame[column].values.reshape(-1, 1))
        parts.append(pd.DataFrame(encoded.toarray(), columns=encoded_names, index=frame.index))

# Raw inputs that are now represented by their scaled / one-hot versions.
raw_columns = columns_to_be_scaled + columns_to_be_encoded
# The boolean flags are cast to integers for the network.
flag_dtypes = {'NewYear': int, 'holiday': int}

X_train = pd.concat(train_parts, axis=1).drop(columns=raw_columns + ['sales']).astype(flag_dtypes)

X_test = pd.concat(test_parts, axis=1).drop(columns=raw_columns + ['sales']).astype(flag_dtypes)

X_competition = pd.concat(competition_parts, axis=1).drop(columns=raw_columns).astype(flag_dtypes)

# The competition frame still carries non-feature columns (e.g. `id`).
# Select exactly the training columns, in training order, so all three
# matrices share one layout.
X_competition = X_competition[X_train.columns]

assert list(X_test.columns) == list(X_train.columns), "train/test feature columns differ"

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling are reproducible between runs.
SEED = 42
set_random_seed(SEED)

L2_STRENGTH = 0.001
DROPOUT_RATE = 0.3

model = models.Sequential()

# Explicit Input layer; passing `input_shape=` to the first Dense layer is
# deprecated in Keras 3.
model.add(layers.Input(shape=(X_train.shape[1],)))

# Hidden stack: each entry is (units, use L2 kernel regularisation).
# Every Dense layer here is followed by batch normalisation and dropout.
for units, use_l2 in [(256, False), (128, True), (128, True), (128, True), (64, True)]:
    model.add(layers.Dense(units,
                           activation='relu',
                           kernel_regularizer=regularizers.l2(L2_STRENGTH) if use_l2 else None))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(DROPOUT_RATE))

model.add(layers.Dense(32, activation='relu'))

# Single linear unit: the regression output (predicted sales).
model.add(layers.Dense(1, activation='linear'))

optimizer = optimizers.Adam(learning_rate=0.001)

huber_loss = Huber(delta=1.0)

model.compile(optimizer=optimizer, loss=huber_loss, metrics=['mae'])

# `validation_split` makes Keras hold out the last fraction of the provided samples.
model_params = {'epochs' : 20, 'batch_size' : 64, 'validation_split' : 0.1}

model.fit(X_train, y_train, **model_params)

# Evaluate on the chronological hold-out split.
y_pred = pd.DataFrame(model.predict(X_test).flatten(), index=X_test.index, columns=['sales'])

print("\nTest R2 of the neural network model: {}\n".format(r2_score(y_test, y_pred)))

Epoch 1/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 6ms/step - loss: 201.4458 - mae: 201.1383 - val_loss: 106.3776 - val_mae: 105.3495
Epoch 2/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 6ms/step - loss: 115.8918 - mae: 114.6557 - val_loss: 98.9592 - val_mae: 97.1898
Epoch 3/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 6ms/step - loss: 108.6243 - mae: 106.7017 - val_loss: 102.3998 - val_mae: 100.1070
Epoch 4/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 6ms/step - loss: 102.9283 - mae: 100.5422 - val_loss: 95.0710 - val_mae: 92.4249
Epoch 5/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 6ms/step - loss: 100.4157 - mae: 97.7003 - val_loss: 96.9193 - val_mae: 94.0115
Epoch 6/20
[1m12956/12956[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 6ms/step - loss: 98.3644 - mae: 95.4100 - val_loss: 93.8389 - val_mae: 90.7272
Epoch 7/20
[1m12956/

In [4]:
'''
Predicting sales for the competition dataset
'''
# Feed the model exactly the training features, in training order.
X_competition = X_competition[X_train.columns]

y_submit = competition_test_df[['id']].copy()

# Predictions are assigned by position, so both frames must have the same number of rows.
assert len(X_competition) == len(y_submit), "competition features and ids are misaligned"

# The output layer is linear and can produce negative values. Sales cannot be
# negative and the competition's RMSLE metric is undefined for them, so clip at zero.
y_submit['sales'] = np.clip(model.predict(X_competition).flatten(), 0, None)
y_submit.to_csv('submission.csv', index=False)

print("\nBelow are the predictions for the competition data:")
print(y_submit)

[1m891/891[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step

Below are the predictions for the competition data:
                 id        sales
date                            
2017-08-16  3000888     4.243020
2017-08-16  3000889     0.675208
2017-08-16  3000890     4.340920
2017-08-16  3000891  2206.938232
2017-08-16  3000892     0.844535
...             ...          ...
2017-08-31  3029395   305.319824
2017-08-31  3029396    71.075508
2017-08-31  3029397  1066.058838
2017-08-31  3029398     2.239564
2017-08-31  3029399     7.420100

[28512 rows x 2 columns]
