# Exercise II - Forecasting Models

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

## 1. Dataset

### 1.1 Dataset Description
The data set you will analyze in all exercises was provided by Walmart and consists of unit sales data of 3049 products sold in the U.S. over 1941 days (from 2011-01-29 to 2016-04-24), organized in the form of grouped time series. Similar to the Capstone project, the goal of this series of exercises is to accurately predict the daily unit sales of retail stores across locations and product categories for the next 28 days, as well as to accurately estimate the uncertainty distribution of the predicted value.

#### Sales
The dataset is a grouped time series because there is a hierarchy of different levels of aggregation, and depending on the level of aggregation used to examine the data, it is possible to obtain a different set of time series with different levels of granularity. The disaggregated data can be grouped based on either location (store and state) or product-related information (department and category):
- 3 product categories (*Hobby*, *Food* and *Household*), subdivided into 7 product departments, and a total of 3049 different products in these departments
- 3 states (California (*CA*), Texas (*TX*), and Wisconsin (*WI*)), in which 10 stores are located

Based on these groupings, several meaningful hierarchies and hence time series can be constructed from the data at different levels of aggregation. If each product is considered separately for each store, we have 30,490 (3,049 products × 10 stores) different time series.

![image.png](attachment:image.png)

The 'sales_train' table contains the following columns:
- `id`: Unique id of a product in a certain store, combining all hierarchy information (store, state, department, and category) for the specific product. 
- `item_id`: Unique id of a specific product. 
- `dept_id`: Department of a specific product. 
- `cat_id`: Product category of a specific product. 
- `store_id`: Store in which the specific product was sold. 
- `state_id`: State in which the specific product was sold. 
- `d_1` to `d_1941`: For each of the 1941 days, how many times the product has been sold on that day.

In [None]:
# Load the raw tables from parquet files in the relative data directory:
# two sales tables (the "evaluation" and "validation" files), the sell prices
# and the calendar. The rest of the notebook works with `sales_train`
# (evaluation file), `sell_prices` and `calendar`.
sales_train = pd.read_parquet('../Daten/Parquet/sales_train_evaluation.parquet')
sales_train_validation = pd.read_parquet('../Daten/Parquet/sales_train_validation.parquet')
sell_prices = pd.read_parquet('../Daten/Parquet/sell_prices.parquet')
calendar = pd.read_parquet('../Daten/Parquet/calendar.parquet')

In [None]:
# Preview the first rows of the sales table loaded from the evaluation file.
sales_train.head()

In [None]:
# Preview the first rows of the sales table loaded from the validation file.
sales_train_validation.head()

#### Exogenous/explanatory data
In addition to the sales data, additional explanatory data are available that provide additional context and help to interpret the patterns found.   

**'prices'**
- `store_id`: ID of the store.
- `item_id`: Product ID.
- `wm_yr_wk`: Identifier of the week (the same key appears in the calendar table, which covers the days d_1 to d_1969).
- `sell_price`: Selling prices of the products, provided on a week–store level (average across seven days). If not available, this means that the product was not sold during the week examined.

In [None]:
# Preview the first rows of the price table.
sell_prices.head()

**'calendar'**
- `date`: Date (YYYY-MM-DD)
- `wm_yr_wk`: Date identifier
- `weekday`: Weekday
- `wday`: Saturday = 1, Sunday = 2 … Friday = 7
- `month`: Month
- `year`: Year
- `d`: Date identifier (d_1 to d_1969)
- `event_name_1`: Special events, days and holidays (e.g. Super Bowl, Valentine’s Day, and Orthodox Easter).
- `event_type_1`: Type of event (Cultural, Religious, Sporting, National).
- `event_name_2`: Name of the second event, if two events occur on the same day.
- `event_type_2`: Type of the second event, if two events occur on the same day.
- `snap_CA`: SNAP activities that serve as promotions. This is a binary variable (0 or 1) indicating whether the stores of *CA* allow SNAP purchases on the date examined, 1 indicates that SNAP purchases are allowed.
- `snap_TX`: SNAP activities for *TX* (Texas).
- `snap_WI`: SNAP activities for *WI* (Wisconsin).

SNAP is a social assistance program in the USA that provides money for food purchases for people in need using a debit card. Each `snap_*` field is 1 if SNAP card purchases are allowed in the respective state on that day (e.g. `snap_CA` for California).


In [None]:
# Preview the first rows of the calendar table.
calendar.head()

In [None]:
# Reshape the sales table from wide (one column per day) to long format
# (one row per id and day): the day label goes into `d`, the count into `sales`.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
day_columns = [name for name in sales_train.columns if name.startswith('d_')]

sales_train_T = sales_train.melt(
    id_vars=id_columns,
    value_vars=day_columns,
    var_name='d',
    value_name='sales',
)
sales_train_T.head()

In [None]:
# Attach the calendar columns to every sales row via the day identifier `d`.
# A left join keeps every row of the long sales table.
df_sales_calendar = sales_train_T.merge(calendar, on='d', how='left')
df_sales_calendar.head()

In [None]:
# Add the price columns by store, item and week (`wm_yr_wk`).
# Left join: sales rows without a matching price entry are kept and get NaN
# in the columns coming from `sell_prices`.
df_complete = df_sales_calendar.merge(sell_prices, on=['store_id', 'item_id', 'wm_yr_wk'], how='left')
df_complete.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the merged table.
df_complete.info()

In [None]:
# Summary statistics of the numeric columns of the merged table.
df_complete.describe()

In [None]:
# Restrict the analysis to FOODS products sold in California.
# .copy() turns the filtered selection into an independent frame: later cells
# add columns to `df_complete`, which on a plain boolean-mask selection
# triggers pandas' SettingWithCopyWarning.
df_complete = df_complete[(df_complete['state_id'] == 'CA') & (df_complete['cat_id'] == 'FOODS')].copy()

### 1.2 Seasonality

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [None]:
# Daily total: sum of the sales of all remaining series per calendar date.
grouped_sales_total = df_complete[['date', 'sales']].groupby('date').sum()
grouped_sales_total.index = pd.to_datetime(grouped_sales_total.index)

# Weekly series: mean of the daily totals within each week.
weekly_sales = grouped_sales_total['sales'].resample('W').mean()

# Daily totals on top, weekly means below.
fig, (ax_daily, ax_weekly) = plt.subplots(2, figsize=(16, 9))
grouped_sales_total['sales'].plot(title='Total Sales per Day', ax=ax_daily)
weekly_sales.plot(title='Mean Sales per Week', ax=ax_weekly)

plt.tight_layout()
plt.show()


In [None]:
def plot_components(result):
  """Plot a time series decomposition as vertically stacked plotly subplots.

  One subplot is drawn for each of the observed series, the trend, the
  seasonal component and the residuals; all subplots share the x-axis.

  Args:
    result: Decomposition result exposing `observed`, `trend`, `seasonal` and
      `resid` as pandas Series (e.g. the return value of `seasonal_decompose`).

  Returns:
    None. The figure is displayed via `fig.show()`.
  """
  df = pd.concat([result.observed, result.trend, result.seasonal, result.resid], axis=1)
  # Normalise column names: an unnamed observed series ends up as column 0
  # after the concat; 'season'/'observed' are mapped to the display names too.
  df = df.rename(columns={0:'Original Data', 'season':'seasonal','observed':'Original Data'})
  components = df.columns
  rows = len(components)
  fig = make_subplots(rows=rows, cols=1, shared_xaxes=True, subplot_titles=list(components))

  # One line trace per component, each in its own row
  for i, col in enumerate(components):
    fig.add_trace(go.Scatter(x=df.index, y=df[col], mode='lines', name=col), row=i+1, col=1)

  # Update layout
  fig.update_layout(
      title='Time Series Decomposition',
      height=1200,
      width=1200
  )
  # Put the axis title on the bottom subplot. `xaxis_title` inside
  # update_layout would label the x-axis of the first (top) subplot instead.
  fig.update_xaxes(title_text='Time', row=rows, col=1)

  fig.show()

In [None]:
# Length of one seasonal cycle in observations; the series is daily, so 365
# corresponds to one year.
period = 365 #setting the period for decomposition
# Apply seasonal_decompose (additive model) to the daily total sales.
# NOTE(review): extrapolate_trend=1 asks statsmodels to extrapolate the trend at
# the series ends from only the closest points - confirm this is intended
# (statsmodels also accepts extrapolate_trend='freq').
result_sd = seasonal_decompose(grouped_sales_total['sales'], model='additive', period=period, extrapolate_trend=1)

# Plot the observed series and its trend, seasonal and residual components
plot_components(result_sd)

### 1.3 Autocorrelation

In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# ACF and PACF of the daily total sales for the first 50 lags
daily_values = grouped_sales_total['sales'].tolist()

fig, (ax_acf, ax_pacf) = plt.subplots(1, 2, figsize=(16, 3), dpi=100)
plot_acf(daily_values, lags=50, ax=ax_acf, title='Autocorrelation (ACF)')
plot_pacf(daily_values, lags=50, ax=ax_pacf, title='Partial Autocorrelation (PACF)')
plt.show()

We can see that autocorrelation is present. In particular, values correlate strongly with the values of the same weekday in previous weeks (lags 7, 14, 21, ...).

In [None]:
# ACF and PACF plots of the weekly mean sales (first 50 lags, i.e. 50 weeks).
# The series is passed as a plain list, so the plots use integer lags only.
fig, axes = plt.subplots(1,2,figsize=(16,3), dpi= 100)
plot_acf(weekly_sales.tolist(), lags=50, ax=axes[0])
plot_pacf(weekly_sales.tolist(), lags=50, ax=axes[1])
plt.show()


Weekly sales have weaker autocorrelation. This implies that the weekday seasonality has a strong impact on the daily data.

### 1.4 Stationarity

In [None]:
from statsmodels.tsa.stattools import adfuller

In [None]:
# Augmented Dickey-Fuller test on the weekly series.
# Null hypothesis: the series has a unit root (is non-stationary);
# a p-value below 0.05 is treated as evidence of stationarity.
result = adfuller(weekly_sales)
adf_statistic, p_value = result[0], result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", p_value)

verdict = "The series is stationary." if p_value < 0.05 else "The series is not stationary."
print(verdict)

In [None]:
# Log transformation of the weekly series, followed by the same ADF test
weekly_sales_log = np.log(weekly_sales)

weekly_sales_log.plot(figsize=(20, 5), title='Mean Log Sales per Week')
plt.show()

result = adfuller(weekly_sales_log)
adf_statistic, p_value = result[0], result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", p_value)

# A p-value below 0.05 is treated as evidence of stationarity
verdict = "The series is stationary." if p_value < 0.05 else "The series is not stationary."
print(verdict)

In [None]:
# Differencing
# Each value is replaced by its change relative to the previous week.
# First-order differencing removes a linear trend; the first element has no
# predecessor and is dropped.
weekly_sales_diff = weekly_sales.diff().dropna()

weekly_sales_diff.plot(figsize=(20, 5), title='Differenced Mean Sales per Week')
plt.show()

result = adfuller(weekly_sales_diff)
adf_statistic, p_value = result[0], result[1]
print("ADF Statistic:", adf_statistic)
print("p-value:", p_value)

# A p-value below 0.05 is treated as evidence of stationarity
verdict = "The series is stationary." if p_value < 0.05 else "The series is not stationary."
print(verdict)

In [None]:
# ACF and PACF of the differenced weekly series.
# The original cell plotted the undifferenced `weekly_sales` again, which only
# repeated the earlier weekly ACF/PACF plot.
fig, axes = plt.subplots(1,2,figsize=(16,3), dpi= 100)
plot_acf(weekly_sales_diff, lags=50, ax=axes[0], title='Autocorrelation (differenced weekly sales)')
plot_pacf(weekly_sales_diff, lags=50, ax=axes[1], title='Partial Autocorrelation (differenced weekly sales)')
plt.show()

## 2. Forecasting Models

### 2.1 ARIMA


In [None]:
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.metrics import root_mean_squared_error

In [None]:
# Fit an ARIMA model with order (p=3, d=2, q=3) on the weekly series.
# The last 28 observations (28 weeks, since `weekly_sales` is weekly) are
# held out for comparison with the forecast.
arima_model = ARIMA(weekly_sales[:-28], order=(3, 2, 3))
arima_model_fit = arima_model.fit()

In [None]:
# Forecast the 28 held-out observations. `weekly_sales` is a weekly series, so
# this horizon spans 28 weeks (not the 28 days d_1914 - d_1941).
forecast = arima_model_fit.forecast(steps=28)
# Attach the hold-out index positionally. Passing a Series with a different
# index to pd.Series(..., index=...) would reindex and silently yield NaN.
forecast = pd.Series(np.asarray(forecast), index=weekly_sales.index[-28:])

plt.figure(figsize=(20, 5))
plt.plot(weekly_sales, label='Original')
plt.plot(forecast, label='Forecast')
plt.legend()
plt.show()

In [None]:
# Let auto_arima search for a non-seasonal ARIMA order on the same training
# span (all but the last 28 weekly observations); trace=True prints the
# candidate models that are tried.
auto_arima_model = auto_arima(weekly_sales[:-28], seasonal=False, trace=True)
print(auto_arima_model.summary())

In [None]:
# Forecast the 28 held-out weekly observations with the selected model.
forecast = auto_arima_model.predict(n_periods=28)
# Attach the hold-out index positionally. If predict() returns a Series with
# its own index, pd.Series(..., index=...) would reindex it and silently
# produce NaN wherever the labels do not match.
forecast = pd.Series(np.asarray(forecast), index=weekly_sales.index[-28:])

plt.figure(figsize=(20, 5))
plt.plot(weekly_sales, label='Original')
plt.plot(forecast, label='Forecast')
plt.legend()
plt.show()

### 2.2 Prophet


In [None]:
from prophet import Prophet

In [None]:
# Prophet expects a frame with the columns `ds` (timestamp) and `y` (value);
# here: the daily total sales.
data = pd.DataFrame({'ds': grouped_sales_total['sales'].index, 'y': grouped_sales_total['sales'].values})

In [None]:
# Fit Prophet with default settings on everything except the last 28 days,
# which are held out for evaluation.
model = Prophet()
model.fit(data.iloc[:-28])

In [None]:
# Build the prediction frame with 28 additional periods after the training
# data, i.e. the held-out days.
future = model.make_future_dataframe(periods=28)
future.tail()

In [None]:
# Predict for every row of `future`; `yhat` is the point forecast and
# `yhat_lower` / `yhat_upper` bound its uncertainty interval.
forecast = model.predict(future)
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

In [None]:
# Compare the last 28 days: held-out actuals vs. the Prophet forecast with its
# uncertainty interval.
actual_tail = data.iloc[-28:]
forecast_tail = forecast.iloc[-28:]

plt.figure(figsize=(20, 5))
plt.plot(actual_tail['ds'], actual_tail['y'], 'o', label='Original')
plt.plot(forecast_tail['ds'], forecast_tail['yhat'], label='Forecast')
plt.fill_between(forecast_tail['ds'], forecast_tail['yhat_lower'], forecast_tail['yhat_upper'], color='gray', alpha=0.2)
plt.legend()
# The original ended with an argument-less plt.plot(), which draws nothing;
# plt.show() renders the figure.
plt.show()

In [None]:
# Plot the components of the fitted Prophet model for the forecast frame.
fig = model.plot_components(forecast)

In [None]:
# Daily sales per store and category in long format (one row per
# store / category / date). Despite its name, the frame is keyed by store.
# observed=True: if the key columns are categorical (they are converted to
# `category` further down, which matters when this cell is re-run), only
# combinations present in the data are returned instead of the full product
# of all categories.
grouped_state_cat = (
    df_complete[['store_id', 'cat_id', 'date', 'sales']]
    .groupby(['store_id', 'cat_id', 'date'], observed=True)
    .sum()
    .reset_index()
)

In [None]:
# One Prophet model per (store, category) combination.
unique_combinations = grouped_state_cat[['store_id', 'cat_id']].drop_duplicates()

# Calculate the grid size for subplots
n_combinations = len(unique_combinations)
n_cols = 3  # Number of columns in the grid
n_rows = (n_combinations + n_cols - 1) // n_cols  # Ceiling division: rows needed

# Create subplots
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), sharex=True, sharey=True)
axs = axs.flatten()

result_columns = ['store_id', 'cat_id', 'ds', 'y_true', 'yhat', 'yhat_lower', 'yhat_upper']
# Collect the per-store forecasts and concatenate once after the loop.
# Growing a frame with pd.concat inside the loop is quadratic, and starting
# from an empty frame emits a pandas FutureWarning and can degrade dtypes.
result_frames = []

for i, (store, cat) in enumerate(unique_combinations.values):
    filtered_data = grouped_state_cat[(grouped_state_cat['store_id'] == store) & (grouped_state_cat['cat_id'] == cat)]

    # Local names so the `data` / `forecast` of the single-model section above
    # are not overwritten.
    store_data = pd.DataFrame({'ds': filtered_data['date'], 'y': filtered_data['sales']})

    # Train on everything except the last 28 days
    prophet_model = Prophet()
    prophet_model.fit(store_data.iloc[:-28])

    future = prophet_model.make_future_dataframe(periods=28)
    store_forecast = prophet_model.predict(future)

    store_forecast['store_id'] = store
    store_forecast['cat_id'] = cat
    # Positional assignment: requires the forecast frame to have exactly one
    # row per row of `filtered_data` (raises otherwise).
    store_forecast['y_true'] = filtered_data['sales'].values

    # Keep only the 28 held-out days for evaluation and plotting
    forecast_tail = store_forecast.iloc[-28:]
    result_frames.append(forecast_tail[result_columns])

    ax = axs[i]
    ax.plot(forecast_tail['ds'], forecast_tail['yhat'], label='yhat', marker='o')
    ax.plot(forecast_tail['ds'], forecast_tail['y_true'], label='y_true', marker='x')
    # The loop variables hold a store id and a category id
    ax.set_title(f"Store: {store}, Category: {cat}", fontsize=10)
    ax.set_xlabel("date", fontsize=9)
    ax.set_ylabel("Values", fontsize=9)
    ax.legend(fontsize=8)
    ax.grid(True)

# Held-out forecasts of all stores; reused later for the comparison with XGBoost
results = pd.concat(result_frames, axis=0) if result_frames else pd.DataFrame(columns=result_columns)

# Remove any unused subplots
for j in range(n_combinations, len(axs)):
    fig.delaxes(axs[j])

plt.tight_layout()
plt.show()

### 2.3 XGBoost

In [None]:
import xgboost as xgb

#### 2.3.1 Feature Engineering

In [None]:
# Transform categorical features to the pandas `category` dtype
# Alternatively use one-hot encoding or label encoding
# A single astype() with a column -> dtype mapping returns a new frame and
# avoids assigning column by column into a filtered selection of the frame.
categorical_columns = ['dept_id', 'cat_id', 'store_id', 'state_id', 'item_id']
df_complete = df_complete.astype({col: 'category' for col in categorical_columns})

In [None]:
# Lag features
# For each series (`id`), the sales value from `lag` rows earlier.
# Assumes the rows of each id are in chronological order - TODO confirm.
lags = [1, 2, 7, 14, 21, 28, 35, 365]

for lag in lags:
    lagged_sales = df_complete.groupby(['id'])['sales'].shift(lag)

    # The first `lag` rows of every series have no predecessor -> fill with 0
    df_complete[f'lag_{lag}'] = lagged_sales.fillna(0)

In [None]:
# Rolling mean features
# Average sales of each series (`id`) over the `window` rows preceding the
# current row.
rolling_mean_windows = [7, 14, 28]

for window in rolling_mean_windows:
    # shift(1) excludes the current row from its own window. Without it the
    # feature contains the current day's sales, i.e. the prediction target
    # itself (target leakage).
    rolling_mean = df_complete.groupby(['id'])['sales'].transform(
        lambda x: x.shift(1).rolling(window=window).mean()
    )

    # Rows without a complete window are NaN -> fill with 0
    df_complete[f'rolling_mean_{window}'] = rolling_mean.fillna(0)

Some features are cyclic (e.g. month, weekday). This leads to large jumps in values (e.g. December to January is a difference of 11, although the months are adjacent). Some models do not handle this behavior well, so we need to encode these features. One possible way of doing this is one-hot encoding. Another approach is to use basic mathematical functions that represent the values in a vector space. Here we use sine/cosine encoding.

In [None]:
# Raw cyclic features over time: month for the whole calendar, weekday for
# the first 42 calendar rows only.
fig, (ax_month, ax_wday) = plt.subplots(1, 2, figsize=(20, 5))

calendar.plot(x='date', y='month', ax=ax_month, title='Month')
calendar.iloc[:42].plot(x='date', y='wday', ax=ax_wday, title='Weekday')

plt.show()

In [None]:
# Sin/Cosine Encoding
def encode(values, max_val):
    return np.sin(2 * np.pi * values/max_val), np.cos(2 * np.pi * values/max_val)

In [None]:
# Encode month with a cycle length of 12 and weekday with a cycle length of 7;
# each feature is stored as a sine and a cosine column.
df_complete['month_sin'], df_complete['month_cos'] = encode(df_complete['month'], 12)
df_complete['wday_sin'], df_complete['wday_cos'] = encode(df_complete['wday'], 7)

In [None]:
# Visualise the encoding using the rows of a single example series
example_rows = df_complete[df_complete['id'] == 'FOODS_1_001_CA_1_evaluation']

fig, (ax_month, ax_wday) = plt.subplots(1, 2, figsize=(10, 5))
example_rows.plot(x='month_sin', y='month_cos', kind='scatter', ax=ax_month, title='Month Sin/Cos')
example_rows.plot(x='wday_sin', y='wday_cos', kind='scatter', ax=ax_wday, title='Weekday Sin/Cos')

plt.tight_layout()
plt.show()

Now the distance between adjacent values is equal, including across the wrap-around (e.g. December to January).

In [None]:
# Model input columns, grouped by origin. The order of the final list is the
# column order of the training matrix.
price_and_snap_features = ['sell_price', 'snap_CA', 'snap_TX', 'snap_WI']
calendar_features = ['month_sin', 'month_cos', 'wday_sin', 'wday_cos', 'year', 'd_int']
hierarchy_features = ['dept_id', 'cat_id', 'store_id', 'state_id', 'item_id']
lag_features = ['lag_1', 'lag_2', 'lag_7', 'lag_14', 'lag_21', 'lag_28', 'lag_35', 'lag_365']
rolling_features = ['rolling_mean_7', 'rolling_mean_14', 'rolling_mean_28']

features = (
    price_and_snap_features
    + calendar_features
    + hierarchy_features
    + lag_features
    + rolling_features
)

#### 2.3.2 Model Training

In [None]:
# Days up to and including `last_train_day` are used for training,
# all later days form the test period.
last_train_day = 1913

# Numeric day index taken from the day label (e.g. 'd_1913' -> 1913)
df_complete['d_int'] = df_complete['d'].str.replace('d_', '').astype(int)

is_train = df_complete['d_int'] <= last_train_day
train = df_complete[is_train]
test = df_complete[~is_train]

X_train, y_train = train[features], train['sales']
X_test, y_test = test[features], test['sales']

In [None]:
# Gradient-boosted regression model with default hyperparameters.
# early_stopping_rounds=10: training stops once the monitored evaluation metric
#   has not improved for 10 consecutive rounds (requires an eval_set in fit()).
# enable_categorical=True: lets XGBoost consume pandas `category` columns directly.
xgb_model = xgb.XGBRegressor(
    early_stopping_rounds=10,
    enable_categorical=True,
)

In [None]:
# Train the model and report the evaluation metric on both sets after each round.
# NOTE(review): per the XGBoost documentation the last eval_set entry drives
# early stopping - here that is the test set, so the test data influence the
# number of boosting rounds. Consider a separate validation split for an
# unbiased test score.
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=True
)

In [None]:
# Predict the sales for every row of the test period.
y_pred = xgb_model.predict(X_test)

In [None]:
# Attach the rounded predictions and the true sales to the test frame.
# assign() returns a new frame, so no columns are written into a selection of
# `test` (avoids SettingWithCopyWarning).
# NOTE(review): X_test now carries two non-feature columns; presumably the
# split cell must be re-run before calling predict() on X_test again - verify.
X_test = X_test.assign(y_pred=y_pred.round(), y_true=y_test)

#### 2.3.3 Model Evaluation

In [None]:
# Sum predictions and true sales per store, category and day.
# observed=True: `store_id` and `cat_id` are categorical at this point; without
# it pandas would emit a row for every combination of categories, including
# combinations that do not occur in the data.
grouped_X_test = (
    X_test[['y_pred', 'y_true', 'store_id', 'cat_id', 'd_int']]
    .groupby(['store_id', 'cat_id', 'd_int'], observed=True)
    .sum()
    .reset_index()
)


In [None]:
# Get unique combinations of store_id and cat_id for subplot creation
unique_combinations = grouped_X_test[['store_id', 'cat_id']].drop_duplicates()

# Calculate the grid size for subplots
n_combinations = len(unique_combinations)
n_cols = 3  # Number of columns in the grid
n_rows = (n_combinations + n_cols - 1) // n_cols  # Ceiling division: rows needed

# Create subplots
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), sharex=True, sharey=True)
axs = axs.flatten()

# Plot each store_id and cat_id combination
for i, (store, cat) in enumerate(unique_combinations.values):
    filtered_data = grouped_X_test[(grouped_X_test['store_id'] == store) &
                                   (grouped_X_test['cat_id'] == cat)]

    # Prophet forecasts of the same store / category from the `results` frame
    prophet_prediction = results[(results['store_id'] == store) & (results['cat_id'] == cat)]

    ax = axs[i]
    ax.plot(filtered_data['d_int'], filtered_data['y_true'], label='y_true', marker='x')
    ax.plot(filtered_data['d_int'], filtered_data['y_pred'], label='XGBoost Prediction', marker='o')
    # The Prophet rows carry a different index; they are matched to the days by
    # position, so pass the plain values.
    # Assumes both frames are ordered by day - TODO confirm.
    ax.plot(filtered_data['d_int'], prophet_prediction['yhat'].values, label='Prophet Forecast', marker='o')
    # The loop variables hold a store id and a category id
    ax.set_title(f"Store: {store}, Category: {cat}", fontsize=10)
    ax.set_xlabel("d_int", fontsize=9)
    ax.set_ylabel("Values", fontsize=9)
    ax.legend(fontsize=8)
    ax.grid(True)

# Remove any unused subplots
for j in range(n_combinations, len(axs)):
    fig.delaxes(axs[j])

plt.tight_layout()
plt.show()

## Tasks

### Task 1: Compare the RMSE of the Prophet models with the XGBoost model overall and on store level

### Task 2: Identify the 5 features most used for splits and the 5 most influential features of the XGBoost model