In [1]:
# Centralizing data processing.
import pandas as pd
import os
def prepare_sales_data(file_path):
    """Read a wide sales CSV and return it in long (one row per product/date) form.

    The CSV must contain the identifier columns 'Product Code' and
    'Product Name'. Every other column header is treated as a date label and
    its cells as the sales value for that date.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Product Code', 'Product Name', 'Date' and 'Sales', where
        'Date' has been parsed with ``pd.to_datetime``. Rows keep the order
        produced by ``melt`` (all products for the first date column, then
        all products for the next one, and so on).
    """
    id_columns = ['Product Code', 'Product Name']
    wide_sales = pd.read_csv(file_path)

    # Unpivot: every former date column becomes a (Date, Sales) pair per product.
    long_sales = wide_sales.melt(id_vars=id_columns, var_name='Date', value_name='Sales')

    # The former column headers are plain labels; parse them into timestamps.
    return long_sales.assign(Date=lambda frame: pd.to_datetime(frame['Date']))


In [2]:
# Build the relative path data/sales_data.csv and load it in long format.
# `file_path` and `df` are module-level names that later cells refer to.
file_path = os.path.join("data", "sales_data.csv")
df = prepare_sales_data(file_path) 
# Plain-text dump of the melted frame (pandas elides the middle rows of long frames).
print(df)
 
                         

      Product Code       Product Name       Date  Sales
0            FG001    Finished Good A 2023-01-01      0
1            FG002    Finished Good B 2023-01-01      0
2            FG003    Finished Good C 2023-01-01      5
3            FG004    Finished Good D 2023-01-01      2
4            FG005    Finished Good E 2023-01-01      0
...            ...                ...        ...    ...
16957        FG029   Finished Good D1 2024-05-28     11
16958        FG030   Finished Good F1 2024-05-28     11
16959        FG031  Finished Good LG1 2024-05-28      0
16960        FG032   Finished Good H1 2024-05-28     28
16961        FG033   Finished Good J1 2024-05-28      0

[16962 rows x 4 columns]


# Function to create date and period features
def period_features(melted_sales_data):
    """Add calendar and lagged rolling-window features to long-format sales data.

    The frame is modified in place (new columns are assigned onto the object
    that was passed in) and also returned, so existing callers that ignore
    the return value keep working.

    Parameters
    ----------
    melted_sales_data : pandas.DataFrame
        Must contain 'Product Code', 'Product Name', a datetime-typed 'Date'
        column (the ``.dt`` accessor is used) and a numeric 'Sales' column.
        Rows of each product are expected to already be in chronological
        order; no sorting is done here.

    Returns
    -------
    pandas.DataFrame
        The same frame with these extra columns:
        'weekday', 'month', 'year', 'dayofyear', 'weekofyear', 'quarter',
        'rolling_mean_7', 'rolling_std_7', 'rolling_mean_30', 'rolling_std_30'.
        The rolling columns are NaN until a product has a full window of
        prior observations.
    """
    dates = melted_sales_data['Date'].dt

    # Calendar features derived from the date alone.
    melted_sales_data['weekday'] = dates.weekday
    melted_sales_data['month'] = dates.month
    melted_sales_data['year'] = dates.year
    melted_sales_data['dayofyear'] = dates.dayofyear
    melted_sales_data['weekofyear'] = dates.isocalendar().week
    melted_sales_data['quarter'] = dates.quarter

    # Rolling statistics must be computed per product. Chaining
    # groupby(...).shift(1).rolling(...) does NOT do that: shift() returns a
    # plain Series, so the rolling window would slide over the whole frame and
    # mix rows of different products. Doing shift + rolling inside transform()
    # keeps each window within a single product's history.
    sales_by_product = melted_sales_data.groupby(['Product Code', 'Product Name'])['Sales']

    for window in (7, 30):
        # shift(1) excludes the current row, so each feature only uses sales
        # from rows strictly before the one being described.
        melted_sales_data[f'rolling_mean_{window}'] = sales_by_product.transform(
            lambda sales, w=window: sales.shift(1).rolling(window=w).mean()
        )
        melted_sales_data[f'rolling_std_{window}'] = sales_by_product.transform(
            lambda sales, w=window: sales.shift(1).rolling(window=w).std()
        )

    return melted_sales_data

In [6]:
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_squared_error
from math import sqrt

def holt_winters_cross_validation(sales_data, n_splits=5, seasonal_periods=12):
    """Estimate Holt-Winters forecast error per product with expanding-window folds.

    Parameters
    ----------
    sales_data : pandas.DataFrame, str, path-like or None
        Long-format sales with columns 'Product Code', 'Product Name', 'Date'
        and 'Sales' (the shape returned by ``prepare_sales_data``). A path is
        loaded through ``prepare_sales_data``; ``None`` falls back to
        ``data/sales_data.csv``.
    n_splits : int, default 5
        Number of folds. Each product's history is cut into chunks of
        ``len(history) // n_splits`` rows; fold ``i`` trains on the first
        ``i + 1`` chunks and is scored on the chunk that follows.
    seasonal_periods : int, default 12
        Season length handed to ``ExponentialSmoothing``.

    Returns
    -------
    float
        Mean over products of each product's mean fold RMSE. Products for
        which no fold could be evaluated are left out; NaN is returned when
        no product could be evaluated at all.
    """
    # Use the data the caller supplied instead of always re-reading the CSV.
    if isinstance(sales_data, pd.DataFrame):
        df = sales_data
    elif sales_data is None:
        df = prepare_sales_data(os.path.join("data", "sales_data.csv"))
    else:
        df = prepare_sales_data(sales_data)

    all_errors = []

    # Group on the actual (code, name) pairs. Zipping the two independent
    # unique() arrays pairs them by position and silently misaligns as soon as
    # the code -> name mapping is not one-to-one.
    for _, product_rows in df.groupby(['Product Code', 'Product Name'], sort=False):
        product_data = product_rows[['Date', 'Sales']].copy()
        product_data['Date'] = pd.to_datetime(product_data['Date'])  # no-op if already datetime
        # Chronological order matters because the folds below are positional.
        product_data = product_data.set_index('Date').sort_index(kind='mergesort')
        product_data.index = product_data.index.to_period('D')  # daily PeriodIndex for statsmodels
        sales = product_data['Sales']

        split_size = len(sales) // n_splits
        product_errors = []

        for i in range(n_splits):
            train_end = (i + 1) * split_size
            train_sales = sales.iloc[:train_end]
            test_sales = sales.iloc[train_end:train_end + split_size]

            # NOTE: on the last fold only the remainder rows (len % n_splits)
            # are left to test on, so that fold is either tiny or empty.
            if len(test_sales) == 0:
                continue

            model = ExponentialSmoothing(
                train_sales,
                trend='add',
                seasonal='add',
                seasonal_periods=seasonal_periods,
            ).fit()
            forecast = model.forecast(len(test_sales))

            product_errors.append(sqrt(mean_squared_error(test_sales, forecast)))

        # Skip products without any scored fold; np.mean([]) would yield NaN
        # (plus a RuntimeWarning) and turn the overall average into NaN.
        if product_errors:
            all_errors.append(np.mean(product_errors))

    if not all_errors:
        return np.nan
    return np.mean(all_errors)

# Example usage: run the cross-validation and report the single averaged RMSE.
# `df` is the long-format frame produced by prepare_sales_data earlier in the notebook.
average_rmse = holt_winters_cross_validation(df)
print("Average RMSE across all products:", average_rmse)


Average RMSE across all products: 10.52620142078916


In [11]:
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

def generate_forecast_for_products(sales_data, forecast_days=365, seasonal_periods=12):
    """Fit one Holt-Winters model per product and forecast the following days.

    Parameters
    ----------
    sales_data : pandas.DataFrame, str, path-like or None
        Long-format sales with columns 'Product Code', 'Product Name', 'Date'
        and 'Sales' (the shape returned by ``prepare_sales_data``). A path is
        loaded through ``prepare_sales_data``; ``None`` falls back to
        ``data/sales_data.csv``.
    forecast_days : int, default 365
        Number of daily steps to forecast after each product's last date.
    seasonal_periods : int, default 12
        Season length handed to ``ExponentialSmoothing``.

    Returns
    -------
    pandas.DataFrame
        One row per product: 'Product Code', 'Product Name', then one column
        per forecast date (header formatted as 'YYYY-MM-DD') holding the
        forecast rounded to a whole number and floored at zero.
    """
    # Use the data the caller supplied instead of always re-reading the CSV.
    if isinstance(sales_data, pd.DataFrame):
        df = sales_data
    elif sales_data is None:
        df = prepare_sales_data(os.path.join("data", "sales_data.csv"))
    else:
        df = prepare_sales_data(sales_data)

    forecasts = []

    # Group on the actual (code, name) pairs. Zipping the two independent
    # unique() arrays pairs them by position and silently misaligns as soon as
    # the code -> name mapping is not one-to-one. sort=False keeps products in
    # order of first appearance.
    grouped = df.groupby(['Product Code', 'Product Name'], sort=False)
    for (product_code, product_name), product_rows in grouped:
        product_data = product_rows[['Date', 'Sales']].copy()
        product_data['Date'] = pd.to_datetime(product_data['Date'])  # no-op if already datetime
        # The model treats rows as consecutive observations, so order by date.
        product_data = product_data.set_index('Date').sort_index(kind='mergesort')
        product_data.index = product_data.index.to_period('D')  # daily PeriodIndex for statsmodels

        model = ExponentialSmoothing(
            product_data['Sales'],
            trend='add',
            seasonal='add',
            seasonal_periods=seasonal_periods,
        ).fit()

        # Floor at zero first, then round: same numbers as round-then-clip,
        # but a small negative forecast cannot end up as a negative zero.
        forecast = model.forecast(forecast_days).clip(lower=0).round(0)

        # Daily labels starting the day after the last observed period.
        forecast_dates = pd.period_range(
            start=product_data.index[-1] + 1, periods=forecast_days, freq='D'
        ).strftime('%Y-%m-%d')

        forecast_dict = {
            'Product Code': product_code,
            'Product Name': product_name,
        }
        forecast_dict.update(zip(forecast_dates, forecast))
        forecasts.append(forecast_dict)

    # One wide row per product.
    return pd.DataFrame(forecasts)

# Example usage: build the wide per-product forecast table and dump it as text.
# `df` is the long-format frame produced by prepare_sales_data earlier in the notebook.
forecasts_df = generate_forecast_for_products(df)
print(forecasts_df)


   Product Code       Product Name  2024-05-29  2024-05-30  2024-05-31  \
0         FG001    Finished Good A         4.0         5.0         6.0   
1         FG002    Finished Good B         4.0         5.0         7.0   
2         FG003    Finished Good C        10.0        12.0        10.0   
3         FG004    Finished Good D         2.0         2.0         2.0   
4         FG005    Finished Good E         3.0         2.0         3.0   
5         FG006    Finished Good F         8.0         9.0         9.0   
6         FG007    Finished Good G         6.0         5.0         5.0   
7         FG008    Finished Good H         7.0         7.0         8.0   
8         FG009    Finished Good I        14.0        14.0        17.0   
9         FG010    Finished Good J        37.0        39.0        36.0   
10        FG011    Finished Good K        25.0        25.0        24.0   
11        FG012    Finished Good L        45.0        46.0        46.0   
12        FG013    Finished Good M    

In [None]:
# `forecasts` only exists as a local inside generate_forecast_for_products;
# the module-level result of that call is `forecasts_df`.
print(forecasts_df)