# Sales Forecasting Test Notebook

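This notebook demonstrates a time-series analysis workflow (synthetic data generation, exploratory plots, seasonal decomposition, a train/test split, and ARIMA forecasting) used to test the context retrieval persona with temporal data.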

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Build a reproducible synthetic daily e-commerce sales series.
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
n_days = len(date_range)
day_index = np.arange(n_days)  # 0, 1, ..., n_days - 1

# Deterministic components: a linear ramp from 1000 to 2000 plus two
# sinusoids with periods of 7 days (weekly) and 365.25 days (yearly).
trend = np.linspace(1000, 2000, n_days)
weekly_pattern = 200 * np.sin(2 * np.pi * day_index / 7)
yearly_pattern = 300 * np.sin(2 * np.pi * day_index / 365.25)

# Gaussian noise (mean 0, std 100); the fixed seed makes the draw repeatable.
np.random.seed(42)
noise = np.random.normal(0, 100, n_days)

# The observed series is the plain sum of all components.
sales = trend + weekly_pattern + yearly_pattern + noise

# Assemble the frame (sales floored at zero) and index it by date.
ts_data = pd.DataFrame({
    'date': date_range,
    'daily_sales': np.maximum(sales, 0),
}).set_index('date')

print(f"Time series shape: {ts_data.shape}")
print(f"Date range: {ts_data.index.min()} to {ts_data.index.max()}")
ts_data.head()

In [None]:
# Exploratory overview: four panels laid out on a single 2x2 figure.
overview_fig, overview_axes = plt.subplots(2, 2, figsize=(15, 8))
(ax_daily, ax_monthly), (ax_hist, ax_weekday) = overview_axes

# Top-left: the raw daily series.
ax_daily.plot(ts_data.index, ts_data['daily_sales'])
ax_daily.set_title('Daily Sales Over Time')
ax_daily.set_ylabel('Sales ($)')

# Top-right: daily sales summed per calendar month.
# NOTE(review): newer pandas releases deprecate the 'M' alias in favour of
# 'ME' - confirm the pinned pandas version before switching.
monthly_sales = ts_data.resample('M').sum()
ax_monthly.plot(monthly_sales.index, monthly_sales['daily_sales'])
ax_monthly.set_title('Monthly Sales')
ax_monthly.set_ylabel('Monthly Sales ($)')

# Bottom-left: histogram of daily sales values.
ts_data['daily_sales'].hist(bins=50, ax=ax_hist)
ax_hist.set_title('Distribution of Daily Sales')
ax_hist.set_xlabel('Sales ($)')

# Bottom-right: mean sales per weekday. The groupby keys are the index's
# dayofweek integers, which the labels below map in Monday-first order.
weekly_avg = ts_data.groupby(ts_data.index.dayofweek)['daily_sales'].mean()
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
ax_weekday.bar(days, weekly_avg)
ax_weekday.set_title('Average Sales by Day of Week')
ax_weekday.set_ylabel('Average Sales ($)')

overview_fig.tight_layout()
plt.show()

# Numeric summary to accompany the plots.
print("Daily sales statistics:")
print(ts_data['daily_sales'].describe())

In [None]:
# Time series decomposition
print("Performing time series decomposition...")

# Additive decomposition (observed = trend + seasonal + residual) using a
# 365-observation seasonal period, i.e. one year of daily data.
decomposition = seasonal_decompose(ts_data['daily_sales'], model='additive', period=365)

# Plot the observed series and its three components, one per row.
fig, axes = plt.subplots(4, 1, figsize=(15, 12))

decomposition.observed.plot(ax=axes[0], title='Original Time Series')
decomposition.trend.plot(ax=axes[1], title='Trend Component')
decomposition.seasonal.plot(ax=axes[2], title='Seasonal Component')
decomposition.resid.plot(ax=axes[3], title='Residual Component')

plt.tight_layout()
plt.show()

# Strength of trend / seasonality:
#     strength = max(0, 1 - Var(resid) / Var(component + resid))
# The floor at 0 keeps the measure in [0, 1]: without it, a component that
# is weaker than the residual would be reported as a negative "strength".
# np.maximum (rather than the builtin max) lets a NaN result propagate
# instead of being silently reported as 0.
# pandas' .var() skips NaN by default, so any NaN values in the trend or
# residual series are ignored rather than poisoning the ratio.
resid_var = decomposition.resid.var()
trend_strength = np.maximum(
    0.0, 1 - resid_var / (decomposition.trend + decomposition.resid).var()
)
seasonal_strength = np.maximum(
    0.0, 1 - resid_var / (decomposition.seasonal + decomposition.resid).var()
)

print(f"Trend strength: {trend_strength:.3f}")
print(f"Seasonal strength: {seasonal_strength:.3f}")

In [None]:
# Chronological hold-out split: the earliest 80% of rows train the model and
# the remaining rows are kept for evaluation (no shuffling, order preserved).
train_size = int(len(ts_data) * 0.8)
train_data = ts_data.iloc[:train_size]
test_data = ts_data.iloc[train_size:]

print(f"Training period: {train_data.index.min()} to {train_data.index.max()}")
print(f"Test period: {test_data.index.min()} to {test_data.index.max()}")
print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

# Show both segments on one axis, with a dashed marker at the last training date.
split_fig, split_ax = plt.subplots(figsize=(15, 6))
split_ax.plot(train_data.index, train_data['daily_sales'], label='Training', color='blue')
split_ax.plot(test_data.index, test_data['daily_sales'], label='Test', color='red')
split_ax.axvline(
    x=train_data.index[-1],
    color='black',
    linestyle='--',
    alpha=0.7,
    label='Train/Test Split',
)
split_ax.set_title('Train/Test Split Visualization')
split_ax.set_ylabel('Daily Sales ($)')
split_ax.legend()
plt.show()

In [None]:
# Fit ARIMA model for forecasting
print("Fitting ARIMA model...")

# Fixed order (p, d, q) = (1, 1, 1); it is not tuned and could be improved
# with proper order selection.
model = ARIMA(train_data['daily_sales'], order=(1, 1, 1))
fitted_model = model.fit()

# Forecast the whole hold-out horizon once and take both the point forecast
# and its confidence bounds from the same result object, instead of
# computing the forecast twice.
forecast_steps = len(test_data)
forecast_result = fitted_model.get_forecast(steps=forecast_steps)
forecast = forecast_result.predicted_mean
forecast_ci = forecast_result.conf_int()

print("Model summary:")
print(fitted_model.summary())

# Calculate forecast errors. All three metrics compare actuals and forecasts
# position by position, so they stay consistent with each other even if the
# forecast's index labels differ from the test index.
actual = test_data['daily_sales'].to_numpy()
predicted = np.asarray(forecast)

mae = mean_absolute_error(actual, predicted)
rmse = np.sqrt(mean_squared_error(actual, predicted))

# MAPE is undefined where the actual value is 0, so those observations are
# left out of the average rather than turning the whole metric into inf.
nonzero = actual != 0
if nonzero.any():
    mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
else:
    mape = float('nan')

print("\nForecast Performance:")
print(f"MAE: ${mae:.2f}")
print(f"RMSE: ${rmse:.2f}")
print(f"MAPE: {mape:.2f}%")

In [None]:
# Forecast vs. actuals on one axis, with recent history for context.
forecast_fig, forecast_ax = plt.subplots(figsize=(15, 8))

# Only the final 100 training observations are drawn.
recent_train = train_data.iloc[-100:]
forecast_ax.plot(
    recent_train.index,
    recent_train['daily_sales'],
    label='Historical (last 100 days)',
    color='blue',
    alpha=0.7,
)

# Held-out actual values.
forecast_ax.plot(
    test_data.index,
    test_data['daily_sales'],
    label='Actual',
    color='green',
    linewidth=2,
)

# Model point forecasts over the same dates.
forecast_ax.plot(
    test_data.index,
    forecast,
    label='ARIMA Forecast',
    color='red',
    linewidth=2,
)

# Shaded band between the first and second columns of forecast_ci
# (lower and upper bound respectively).
lower_bound = forecast_ci.iloc[:, 0]
upper_bound = forecast_ci.iloc[:, 1]
forecast_ax.fill_between(
    test_data.index,
    lower_bound,
    upper_bound,
    color='red',
    alpha=0.2,
    label='95% Confidence Interval',
)

# Dashed marker at the last training date, where forecasting begins.
forecast_ax.axvline(
    x=train_data.index[-1],
    color='black',
    linestyle='--',
    alpha=0.7,
    label='Forecast Start',
)
forecast_ax.set_title('Time Series Forecasting Results')
forecast_ax.set_ylabel('Daily Sales ($)')
forecast_ax.legend()
forecast_ax.grid(True, alpha=0.3)
plt.show()

# Residual analysis: actual minus forecast (pandas aligns the two on index).
residuals = test_data['daily_sales'] - forecast

residual_fig, (ax_over_time, ax_distribution) = plt.subplots(1, 2, figsize=(12, 4))

# Left: residuals across the test period.
ax_over_time.plot(test_data.index, residuals)
ax_over_time.set_title('Forecast Residuals')
ax_over_time.set_ylabel('Residual')

# Right: histogram of the residual values.
ax_distribution.hist(residuals, bins=30, alpha=0.7)
ax_distribution.set_title('Distribution of Residuals')
ax_distribution.set_xlabel('Residual')

residual_fig.tight_layout()
plt.show()