In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
import warnings
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer
from scipy.stats import randint, uniform
from xgboost import XGBRegressor


warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category = UserWarning)

In [2]:

'''
Loading the data
'''

folder_path = "/kaggle/input/store-sales-time-series-forecasting/"

holiday_event_df = pd.read_csv(os.path.join(folder_path,"holidays_events.csv"),
                               dtype={'type': 'category',
                                      'locale': 'category',
                                      'locale_name': 'category',
                                      'description': 'category',
                                      'transferred': 'bool'},
                               parse_dates=['date'],
                               infer_datetime_format=True)

holiday_event_df = holiday_event_df.set_index('date').to_period('D')

holiday_event_df_duplicates = holiday_event_df[holiday_event_df.index.duplicated(keep=False)] #Find duplicate values in holidays

holiday_event_df_without_duplicates = holiday_event_df[~holiday_event_df.index.duplicated(keep='first')] #Handle duplicate values in holidays

oil_df = pd.read_csv(os.path.join(folder_path,"oil.csv"),
                     parse_dates=['date'],
                     infer_datetime_format=True)

oil_df = oil_df.set_index('date').to_period('D')

oil_df = oil_df.interpolate() #Handle missing values in oil prices

oil_df.iloc[0] = oil_df.iloc[1] #Handle missing values in oil prices

oil_df.rename(columns={"dcoilwtico": "oil_price"}, inplace = True)

stores_df = pd.read_csv(os.path.join(folder_path,"stores.csv"))

transaction_df = pd.read_csv(os.path.join(folder_path,"transactions.csv"),
                             parse_dates=['date'],
                             infer_datetime_format=True)

transaction_df['date'] = transaction_df['date'].dt.to_period('D')
transaction_df = transaction_df.set_index(['date', 'store_nbr']).sort_index()

train_df = pd.read_csv(os.path.join(folder_path,"train.csv"),
                                        usecols=['store_nbr', 'family', 'date','sales', 'onpromotion'],
                                        dtype={'store_nbr': 'category',
                                               'family': 'category',
                                               'sales': 'float'},
                                        parse_dates=['date'],
                                        infer_datetime_format=True)

train_df['date'] = train_df.date.dt.to_period('D')
train_df = train_df.set_index('date').sort_index()

competition_test_df = pd.read_csv(os.path.join(folder_path,"test.csv"),
                                  usecols=['id','store_nbr', 'family', 'date', 'onpromotion'],
                                  dtype={'store_nbr': 'category',
                                         'family': 'category',
                                         'onpromotion': 'uint32'},
                                  parse_dates=['date'],
                                  infer_datetime_format=True)

competition_test_df['date'] = competition_test_df.date.dt.to_period('D')
competition_test_df = competition_test_df.set_index('date').sort_index()

In [3]:
'''
Building a neural network to predict sales by store and family.
'''

def rmsle(y_true, y_pred):
    # Convert to numpy arrays
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()
    
    # Clip negative values to 0
    y_true = np.clip(y_true, 0, None)
    y_pred = np.clip(y_pred, 0, None)
    
    # Compute RMSLE
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true))**2))

rmsle_scorer = make_scorer(rmsle, greater_is_better=False)

X = train_df.loc['2016':].copy()

X['day'] = X.index.day
X['week'] = X.index.dayofweek

X = X.join(oil_df, on='date')
X['oil_price'] = X['oil_price'].interpolate()

X['NewYear'] = (X.index.dayofyear == 1)
X['holiday'] = X.index.to_series().isin(holiday_event_df.index)

onehot_encoder = OneHotEncoder()

encoded_week = onehot_encoder.fit_transform(X['week'].values.reshape(-1, 1))

encoded_week_df = pd.DataFrame(encoded_week.toarray(), columns=onehot_encoder.get_feature_names_out(['week']), index=X.index)

encoded_family = onehot_encoder.fit_transform(X['family'].values.reshape(-1, 1))

encoded_family_df = pd.DataFrame(encoded_family.toarray(), columns=onehot_encoder.get_feature_names_out(['family']), index=X.index)

encoded_store_nbr = onehot_encoder.fit_transform(X['store_nbr'].values.reshape(-1, 1))

encoded_store_nbr_df = pd.DataFrame(encoded_store_nbr.toarray(), columns=onehot_encoder.get_feature_names_out(['store_nbr']), index=X.index)

X = pd.concat([X, encoded_week_df, encoded_family_df, encoded_store_nbr_df], axis=1)

y = X[['sales']].copy()

X.drop(columns= ['sales','week','family', 'store_nbr'], inplace = True)

X[['NewYear','holiday']] = X[['NewYear','holiday']].astype(int)

X_train = X.loc['2016':'2016-10']
y_train = y.loc['2016':'2016-10']

X_val = X.loc['2016-11':'2016-12']
y_val = y.loc['2016-11':'2016-12']

X_test = X.loc['2017']
y_test = y.loc['2017']

param_dist = {
    'n_estimators': randint(100, 1000),
    'learning_rate': uniform(0.01, 0.1),
    'max_depth': randint(3, 8),
    'subsample': uniform(0.6, 0.4),
    'reg_lambda': randint(2, 6),
    'colsample_bytree': uniform(0.6, 0.4)
}

model = XGBRegressor(objective="reg:squaredlogerror", random_state=42)

X_tune = pd.concat([X_train, X_val])
y_tune = pd.concat([y_train, y_val])

tscv = TimeSeriesSplit(n_splits=5)

random_search = RandomizedSearchCV(estimator=model,
                                    param_distributions=param_dist,
                                    n_iter=30,
                                    cv=tscv,
                                    scoring=rmsle_scorer,
                                    verbose=1,
                                    n_jobs=-1,
                                    random_state=42)

random_search.fit(X_tune, y_tune.values.ravel())

best_params = random_search.best_params_

print("\nBest Parameters:", best_params)

best_model = XGBRegressor(**random_search.best_params_,
                          objective='reg:squaredlogerror',
                          early_stopping_rounds=7,
                          random_state=42)

best_model.fit(X_train, y_train.values.ravel(), 
               eval_set=[(X_val, y_val.values.ravel())],
               verbose=False)

feature_importance = best_model.feature_importances_

feature_names = X_train.columns

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nFeature importance:")
print(importance_df.to_string())

y_pred_train = pd.DataFrame(best_model.predict(X_train), index=X_train.index, columns=['sales'])
y_pred_val = pd.DataFrame(best_model.predict(X_val), index=X_val.index, columns=['sales'])
y_pred_test = pd.DataFrame(best_model.predict(X_test), index=X_test.index, columns=['sales'])

y_pred_train_clipped = np.clip(y_pred_train, 0, None)
y_pred_val_clipped   = np.clip(y_pred_val, 0, None)
y_pred_test_clipped  = np.clip(y_pred_test, 0, None)

print("\nTrain RMSLE:", rmsle(y_train, y_pred_train_clipped))
print("Validation RMSLE:", rmsle(y_val, y_pred_val_clipped))
print("Test RMSLE:", rmsle(y_test, y_pred_test_clipped))

Fitting 5 folds for each of 30 candidates, totalling 150 fits

Best Parameters: {'colsample_bytree': 0.9630265895704372, 'learning_rate': 0.034929222914887495, 'max_depth': 7, 'n_estimators': 754, 'reg_lambda': 4, 'subsample': 0.6812244898939077}

Feature importance:
                              Feature  Importance
0                         onpromotion    0.260962
36                       family_MEATS    0.044519
40                     family_POULTRY    0.044093
22                        family_EGGS    0.041851
92                       store_nbr_52    0.039903
29             family_HOME APPLIANCES    0.035524
26                    family_HARDWARE    0.032861
41              family_PREPARED FOODS    0.032751
42                     family_PRODUCE    0.029901
30                   family_HOME CARE    0.028258
23                family_FROZEN FOODS    0.027419
16                       family_BOOKS    0.026816
34            family_LIQUOR,WINE,BEER    0.026376
21                        family

In [4]:
'''
Predicting sales for the competition dataset
'''

X_competition = competition_test_df.copy()

X_competition['day'] = X_competition.index.day
X_competition['week'] = X_competition.index.dayofweek

X_competition = X_competition.join(oil_df, on='date')
X_competition['oil_price'] = X_competition['oil_price'].interpolate()

X_competition['NewYear'] = (X_competition.index.dayofyear == 1)
X_competition['holiday'] = X_competition.index.to_series().isin(holiday_event_df.index)

onehot_encoder = OneHotEncoder()

encoded_week = onehot_encoder.fit_transform(X_competition['week'].values.reshape(-1, 1))

encoded_week_df = pd.DataFrame(encoded_week.toarray(), columns=onehot_encoder.get_feature_names_out(['week']), index=X_competition.index)

encoded_family = onehot_encoder.fit_transform(X_competition['family'].values.reshape(-1, 1))

encoded_family_df = pd.DataFrame(encoded_family.toarray(), columns=onehot_encoder.get_feature_names_out(['family']), index=X_competition.index)

encoded_store_nbr = onehot_encoder.fit_transform(X_competition['store_nbr'].values.reshape(-1, 1))

encoded_store_nbr_df = pd.DataFrame(encoded_store_nbr.toarray(), columns=onehot_encoder.get_feature_names_out(['store_nbr']), index=X_competition.index)

X_competition = pd.concat([X_competition, encoded_week_df, encoded_family_df, encoded_store_nbr_df], axis=1)

X_competition.drop(columns=['week','family', 'store_nbr'], inplace = True)

X_competition[['NewYear','holiday']] = X_competition[['NewYear','holiday']].astype(int)

X_competition = X_competition[X.columns]

y_submit = competition_test_df[['id']].copy()
y_submit['sales'] = best_model.predict(X_competition)
y_submit['sales'] = np.clip(y_submit['sales'], 0, None)
y_submit.to_csv('submission.csv', index=False)

print("\nBelow are the predictions for the competition data:")
print(y_submit)


Below are the predictions for the competition data:
                 id       sales
date                           
2017-08-16  3000888    4.419261
2017-08-16  3000889    0.087824
2017-08-16  3000890  133.629883
2017-08-16  3000891  539.027100
2017-08-16  3000892    0.000000
...             ...         ...
2017-08-31  3029395  133.837479
2017-08-31  3029396   43.366795
2017-08-31  3029397  197.878082
2017-08-31  3029398  412.779602
2017-08-31  3029399    5.356805

[28512 rows x 2 columns]
