# E-commerce Sales Forecasting Analysis

This notebook focuses on building and evaluating sales forecasting models:
1. Data Preparation for Time Series Analysis
2. Time Series Decomposition
3. Model Building
    - Simple Moving Average
    - Exponential Smoothing
    - SARIMA Model
4. Model Evaluation
5. Future Sales Prediction
6. Saving Forecast Results and Metrics

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import sys
sys.path.append('..')

from src.model_utils import ModelUtils
from src.config import PROCESSED_DATA_DIR

%matplotlib inline
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 and removed in
# later releases; the same style sheet now ships as 'seaborn-v0_8'. Prefer the
# new name and fall back to the old one so the notebook runs on either version.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

## 1. Data Preparation

In [None]:
# Load cleaned data
df = pd.read_csv(PROCESSED_DATA_DIR / 'cleaned_sales_data.csv')
df['order_date'] = pd.to_datetime(df['order_date'])

# Create daily sales time series.
# Timestamps are normalised to midnight before grouping so that rows carrying a
# time-of-day component are still summed per calendar day; without this the
# daily reindex below would only match rows at one exact time-of-day and
# zero-fill the rest. For date-only input this is a no-op.
daily_sales = (
    df.groupby(df['order_date'].dt.normalize())['price']
    .sum()
    .to_frame()
)

# Fill missing dates with zero sales (the resulting index has daily frequency)
idx = pd.date_range(daily_sales.index.min(), daily_sales.index.max(), freq='D')
daily_sales = daily_sales.reindex(idx, fill_value=0)

print("Time series information:")
print(f"Start date: {daily_sales.index.min()}")
print(f"End date: {daily_sales.index.max()}")
print(f"Total days: {len(daily_sales)}")

## 2. Time Series Decomposition

In [None]:
# Decompose the daily series into trend, seasonal and residual components,
# using a 7-observation (i.e. weekly, for daily data) seasonal period.
decomposition = seasonal_decompose(daily_sales['price'], period=7)

# One panel per component, stacked top to bottom
panels = [
    ('Original Time Series', daily_sales['price']),
    ('Trend', decomposition.trend),
    ('Seasonal', decomposition.seasonal),
    ('Residual', decomposition.resid),
]

fig, axes = plt.subplots(nrows=len(panels), ncols=1, figsize=(15, 12))
for ax, (panel_title, component) in zip(axes, panels):
    ax.plot(daily_sales.index, component)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

## 3. Model Building

In [None]:
# Chronological 80/20 split: the first 80% of days are used for fitting,
# the remaining (most recent) days are held out for evaluation.
train_size = int(len(daily_sales) * 0.8)
train_data, test_data = daily_sales.iloc[:train_size], daily_sales.iloc[train_size:]

print(f"Training set size: {len(train_data)} days")
print(f"Test set size: {len(test_data)} days")

### 3.1 Simple Moving Average

In [None]:
def calculate_moving_average(data, window, shift=0):
    """Return the trailing moving average of ``data``.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Values to smooth, in chronological order.
    window : int
        Number of consecutive observations averaged per output value.
    shift : int, default 0
        Number of periods to lag the result by. With the default ``0`` the
        value at position ``t`` averages observations ``t-window+1 .. t`` and
        therefore includes the observation at ``t`` itself (in-sample
        smoothing). Pass ``shift=1`` to obtain a one-step-ahead prediction
        that only uses observations strictly before ``t``.

    Returns
    -------
    Same type as ``data``; the first ``window - 1 + shift`` entries are NaN
    because a full window is not yet available.
    """
    smoothed = data.rolling(window=window).mean()
    # shift=0 returns the rolling mean untouched, preserving the original behaviour
    return smoothed.shift(shift) if shift else smoothed

# 7-day moving average over the training series
ma_window = 7
ma_predictions = calculate_moving_average(train_data['price'], ma_window)

# Overlay the smoothed curve on the raw training data
fig, ax = plt.subplots(figsize=(15, 6))
for values, series_label in (
    (train_data['price'], 'Actual'),
    (ma_predictions, '7-day MA'),
):
    ax.plot(train_data.index, values, label=series_label)
ax.set_title('Simple Moving Average Forecast')
ax.legend()
plt.show()

### 3.2 Exponential Smoothing

In [None]:
# Holt-Winters with additive trend and additive seasonality (period 7),
# fitted on the training series only
hw_model = ExponentialSmoothing(
    train_data['price'], trend='add', seasonal='add', seasonal_periods=7
)
hw_results = hw_model.fit()

# Forecast one value per held-out day
n_test_days = len(test_data)
hw_predictions = hw_results.forecast(n_test_days)

# Training history, held-out actuals and the forecast on one axis
fig, ax = plt.subplots(figsize=(15, 6))
for x_values, y_values, series_label in (
    (train_data.index, train_data['price'], 'Training Data'),
    (test_data.index, test_data['price'], 'Actual'),
    (test_data.index, hw_predictions, 'Holt-Winters'),
):
    ax.plot(x_values, y_values, label=series_label)
ax.set_title('Holt-Winters Exponential Smoothing Forecast')
ax.legend()
plt.show()

### 3.3 SARIMA Model

In [None]:
# SARIMA(1,1,1)x(1,1,1) with a seasonal period of 7, fitted on the training series only
sarima_order = (1, 1, 1)
sarima_seasonal_order = (1, 1, 1, 7)
sarima_model = SARIMAX(
    train_data['price'],
    order=sarima_order,
    seasonal_order=sarima_seasonal_order,
)
sarima_results = sarima_model.fit()

# Forecast one value per held-out day
n_test_days = len(test_data)
sarima_predictions = sarima_results.forecast(n_test_days)

# Training history, held-out actuals and the forecast on one axis
fig, ax = plt.subplots(figsize=(15, 6))
for x_values, y_values, series_label in (
    (train_data.index, train_data['price'], 'Training Data'),
    (test_data.index, test_data['price'], 'Actual'),
    (test_data.index, sarima_predictions, 'SARIMA'),
):
    ax.plot(x_values, y_values, label=series_label)
ax.set_title('SARIMA Model Forecast')
ax.legend()
plt.show()

## 4. Model Evaluation

In [None]:
def calculate_metrics(actual, predicted):
    """Calculate MAE, RMSE and MAPE for a forecast.

    Parameters
    ----------
    actual : array-like
        Observed values.
    predicted : array-like
        Forecast values, in the same order and of the same length as ``actual``.

    Returns
    -------
    tuple of (mae, rmse, mape)
        ``mape`` is expressed in percent and is computed only over observations
        whose actual value is non-zero; it is ``nan`` when every actual value
        is zero.
    """
    # Work on positional arrays so that all three metrics pair the values the
    # same way; subtracting pandas Series would align on index labels instead
    # and silently yield NaN when the labels differ.
    actual_values = np.asarray(actual, dtype=float)
    predicted_values = np.asarray(predicted, dtype=float)

    mae = mean_absolute_error(actual_values, predicted_values)
    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))

    # Percentage error is undefined where the actual value is 0 (the daily
    # series is zero-filled for days without sales), so those days are excluded
    # rather than letting a division by zero turn the whole metric into inf/NaN.
    nonzero = actual_values != 0
    if nonzero.any():
        relative_error = (
            (actual_values[nonzero] - predicted_values[nonzero]) / actual_values[nonzero]
        )
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = np.nan
    return mae, rmse, mape

# Score both forecasts against the held-out test series
actual_sales = test_data['price']
hw_mae, hw_rmse, hw_mape = calculate_metrics(actual_sales, hw_predictions)
sarima_mae, sarima_rmse, sarima_mape = calculate_metrics(actual_sales, sarima_predictions)

# Comparison table: one row per model
metrics_df = pd.DataFrame(
    [
        ('Holt-Winters', hw_mae, hw_rmse, hw_mape),
        ('SARIMA', sarima_mae, sarima_rmse, sarima_mape),
    ],
    columns=['Model', 'MAE', 'RMSE', 'MAPE'],
)

print("Model Performance Comparison:")
print(metrics_df.to_string(index=False))

## 5. Future Sales Prediction

In [None]:
# Forecast horizon in days
future_periods = 30

# Refit Holt-Winters (additive trend, additive seasonality, period 7) on the
# full history rather than just the training split.
# NOTE(review): the model choice is hard-coded here, not derived from the
# evaluation metrics — confirm Holt-Winters is the intended pick.
final_model = ExponentialSmoothing(
    daily_sales['price'], trend='add', seasonal='add', seasonal_periods=7
)
final_results = final_model.fit()
future_predictions = final_results.forecast(future_periods)

# Calendar dates of the forecast: the `future_periods` days after the last observed day
last_date = daily_sales.index[-1]
first_forecast_day = last_date + pd.Timedelta(days=1)
future_dates = pd.date_range(start=first_forecast_day, periods=future_periods)

# Historical series followed by the dashed forecast
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(daily_sales.index, daily_sales['price'], label='Historical Data')
ax.plot(future_dates, future_predictions, label='Forecast', linestyle='--')
ax.set_title('30-Day Sales Forecast')
ax.legend()
plt.show()

# Tabulate the forecast (one row per forecast day)
forecast_columns = {'Date': future_dates, 'Predicted_Sales': future_predictions}
forecast_df = pd.DataFrame(forecast_columns)

# Headline numbers: total, daily mean and the date of the highest predicted value
print("\nForecast Summary:")
print(f"Total Predicted Sales: ${forecast_df['Predicted_Sales'].sum():,.2f}")
print(f"Average Daily Sales: ${forecast_df['Predicted_Sales'].mean():,.2f}")
print(f"Peak Sales Day: {forecast_df.loc[forecast_df['Predicted_Sales'].idxmax(), 'Date'].strftime('%Y-%m-%d')}")

## 6. Save Forecast and Model Metrics

In [None]:
# Write the forecast table and the model comparison table as CSV files
# (without the index column) into the processed-data directory.
outputs = {
    'sales_forecast.csv': forecast_df,
    'model_metrics.csv': metrics_df,
}
for filename, frame in outputs.items():
    frame.to_csv(PROCESSED_DATA_DIR / filename, index=False)

print("Forecast results and model metrics saved successfully.")