In [19]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from catboost import CatBoostRegressor

# Function to load and prepare data based on aggregation type
def load_and_prepare_data(df, aggregation_type='Monthly'):
    df.drop(columns=['Sku'], inplace=True, errors='ignore')  # Drop SKU column if exists
    df['Created'] = pd.to_datetime(df['Created'])

    if aggregation_type == 'Monthly':
        df['YearMonth'] = df['Created'].dt.to_period('M')
        aggregated_data = df.groupby('YearMonth')['Quantity'].sum().reset_index()
        aggregated_data['Month'] = aggregated_data['YearMonth'].dt.month
        aggregated_data['Year'] = aggregated_data['YearMonth'].dt.year

    elif aggregation_type == 'Weekly':
        df['YearWeek'] = df['Created'].dt.to_period('W')
        aggregated_data = df.groupby('YearWeek')['Quantity'].sum().reset_index()
        aggregated_data['Week'] = aggregated_data['YearWeek'].dt.week
        aggregated_data['Year'] = aggregated_data['YearWeek'].dt.year

    elif aggregation_type == 'Daily':
        df['Day'] = df['Created'].dt.date
        aggregated_data = df.groupby('Day')['Quantity'].sum().reset_index()
        aggregated_data['Year'] = pd.to_datetime(aggregated_data['Day']).dt.year

    elif aggregation_type == 'Quarterly':
        df['Quarter'] = df['Created'].dt.to_period('Q')
        aggregated_data = df.groupby('Quarter')['Quantity'].sum().reset_index()
        aggregated_data['Year'] = aggregated_data['Quarter'].dt.year
        aggregated_data['Quarter'] = aggregated_data['Quarter'].dt.quarter

    elif aggregation_type == 'Yearly':
        df['Year'] = df['Created'].dt.year
        aggregated_data = df.groupby('Year')['Quantity'].sum().reset_index()

    aggregated_data['sales_diff'] = aggregated_data['Quantity'].diff().fillna(0)
    return aggregated_data


def find_optimal_lag(data, max_possible_lag=12, threshold=0.2):
    """
    Finds the optimal lag for supervised learning using PACF.
    
    Parameters:
        data (pd.DataFrame): The aggregated time series dataset.
        max_possible_lag (int): The maximum lag we can consider.
        threshold (float): The cutoff value for selecting the best lag from PACF.
    
    Returns:
        int: Optimal lag value
    """
    # Ensure we have enough data to compute lags
    num_months = data.shape[0]
    if num_months < 2:
        raise ValueError("Not enough data points to compute lags.")

    # Compute Partial Autocorrelation Function (PACF)
    pacf_values = pacf(data['sales_diff'].dropna(), nlags=min(max_possible_lag, num_months - 1))

    # Find the first lag where PACF drops below the threshold
    optimal_lag = np.where(np.abs(pacf_values) < threshold)[0]
    
    if len(optimal_lag) > 0:
        best_lag = min(optimal_lag)  # Select the first significant lag
    else:
        best_lag = min(max_possible_lag, num_months - 1)  # Default to max possible lag

    print(f"ðŸ“Œ Optimal Lag Selected: {best_lag}")
    return best_lag


# Load and aggregate data
m_df = load_and_prepare_data(df, aggregation_type)

# Build supervised dataset with optimal lag
supervised_data = build_supervised(m_df)

print(f"Final dataset shape after transformation: {supervised_data.shape}")




# Split data into training and testing sets
def train_test_split_data(data):
    data = data.drop(columns=['Quantity', 'YearMonth'], errors='ignore')
    train, test = data[:-12].values, data[-12:].values
    return train, test

# Scale data
def scale_data(train, test):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaler.fit(train)
    train_scaled = scaler.transform(train)
    test_scaled = scaler.transform(test)
    
    X_train, y_train = train_scaled[:, 1:], train_scaled[:, 0:1].ravel()
    X_test, y_test = test_scaled[:, 1:], test_scaled[:, 0:1].ravel()
    
    return X_train, y_train, X_test, y_test, scaler

# Rescale predictions
def rescale_predictions(y_pred, X_test, scaler):
    y_pred = y_pred.reshape(y_pred.shape[0], 1, 1)
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    
    pred_test_set = [np.concatenate([y_pred[i], X_test[i]], axis=1) for i in range(len(y_pred))]
    pred_test_set = np.array(pred_test_set).reshape(len(y_pred), -1)
    
    return scaler.inverse_transform(pred_test_set)

# Create a dataframe for predictions
def create_prediction_df(unscaled_predictions, original_df):
    results = []
    sales_dates = list(original_df[-13:].YearMonth)
    actual_sales = list(original_df[-13:].Quantity)

    for i in range(len(unscaled_predictions)):
        results.append({'date': sales_dates[i+1], 'pred_value': int(unscaled_predictions[i][0] + actual_sales[i])})
    
    return pd.DataFrame(results)

# Model evaluation
def evaluate_model(pred_df, original_df):
    rmse = np.sqrt(mean_squared_error(original_df.Quantity[-12:], pred_df.pred_value[-12:]))
    mae = mean_absolute_error(original_df.Quantity[-12:], pred_df.pred_value[-12:])
    r2 = r2_score(original_df.Quantity[-12:], pred_df.pred_value[-12:])
    
    print(f"RMSE: {rmse}\nMAE: {mae}\nRÂ² Score: {r2}")
    return {'RMSE': rmse, 'MAE': mae, 'RÂ²': r2}

# Plot results
def plot_results(results_df, original_df, model_name):
    plt.figure(figsize=(10,5))
    plt.plot(original_df.YearMonth.astype(str), original_df.Quantity, marker='o', linestyle='-', label='Actual', color='blue')
    plt.plot(results_df.date.astype(str), results_df.pred_value, marker='o', linestyle='--', label='Predicted', color='red')
    
    plt.xlabel("Date")
    plt.ylabel("Sales")
    plt.title(f"{model_name} Sales Forecasting Prediction")
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()

# Run a regression model
def train_and_predict(train_data, test_data, model, model_name):
    X_train, y_train, X_test, y_test, scaler = scale_data(train_data, test_data)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    unscaled_predictions = rescale_predictions(predictions, X_test, scaler)
    pred_df = create_prediction_df(unscaled_predictions, m_df)
    
    scores = evaluate_model(pred_df, m_df)
    plot_results(pred_df, m_df, model_name)

    return scores, pred_df

# Load dataset (Modify the file path as needed)
file_path = 'data/SKU2/0043121-X_2.csv'  # Change this to your dataset
df = pd.read_csv(file_path)

# Choose aggregation type
aggregation_type = "Monthly"

# Process and transform data
m_df = load_and_prepare_data(df, aggregation_type)
supervised_data = build_supervised(m_df)
train, test = train_test_split_data(supervised_data)

# Select a model and train
selected_model = RandomForestRegressor()
model_name = "RandomForest"

# Train and make predictions
results, forecast_df = train_and_predict(train, test, selected_model, model_name)

# Display results
print("\nFinal Predictions:")
print(forecast_df)


Final dataset shape after transformation: (0, 17)


  df['Created'] = pd.to_datetime(df['Created'])


ValueError: Found array with 0 sample(s) (shape=(0, 15)) while a minimum of 1 is required by MinMaxScaler.

In [14]:
import os

file_path = 'data/SKU2/0043121-X_2.csv'

if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: The file {file_path} does not exist. Please check the path.")

In [16]:
print(f"Shape of dataset before scaling: {train.shape}, {test.shape}")

Shape of dataset before scaling: (0, 15), (0, 15)


In [17]:
# Check if the CSV file loads correctly
print(f"Original Data Shape: {df.shape}")
print(df.head())

# Check after aggregation
m_df = load_and_prepare_data(df, aggregation_type)
print(f"After Aggregation: {m_df.shape}")
print(m_df.head())

# Check after supervised learning transformation
supervised_data = build_supervised(m_df)
print(f"After Supervised Learning Transformation: {supervised_data.shape}")
print(supervised_data.head())

# Check train/test split
train, test = train_test_split_data(supervised_data)
print(f"Train Shape: {train.shape}, Test Shape: {test.shape}")

Original Data Shape: (8293, 4)
   Unnamed: 0             Created  Quantity YearMonth
0           0 2023-10-16 13:59:07        24   2023-10
1           1 2023-10-18 14:09:15        12   2023-10
2           2 2023-10-19 11:25:47        12   2023-10
3           3 2023-10-19 11:30:16        12   2023-10
4           4 2023-10-19 11:48:47        24   2023-10
After Aggregation: (10, 5)
  YearMonth  Quantity  Month  Year  sales_diff
0   2023-10      5784     10  2023         0.0
1   2023-11     28824     11  2023     23040.0
2   2023-12     32004     12  2023      3180.0
3   2024-01      1380      1  2024    -30624.0
4   2024-09      3348      9  2024      1968.0
After Supervised Learning Transformation: (0, 17)
Empty DataFrame
Columns: [YearMonth, Quantity, Month, Year, sales_diff, lag_1, lag_2, lag_3, lag_4, lag_5, lag_6, lag_7, lag_8, lag_9, lag_10, lag_11, lag_12]
Index: []
Train Shape: (0, 15), Test Shape: (0, 15)
