## **Model Evaluation** - Products with data 

In [1]:
import mlflow
import pandas as pd

# Step 1: Load experiment by name
experiment_name = "Model_For_Products_WITH_data-ARIMA"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # get_experiment_by_name returns None for an unknown name; fail with a clear
    # message instead of an AttributeError on `.experiment_id` below.
    raise ValueError(f"MLflow experiment '{experiment_name}' was not found.")

# Step 2: Get all runs for the experiment
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Step 3: Select relevant metrics columns
metric_columns = [
    "metrics.mae",
    "metrics.rmse",
    "metrics.r2",
    "metrics.accuracy",
]
missing_columns = [name for name in metric_columns if name not in runs_df.columns]
if missing_columns:
    # Happens when the experiment has no runs, or no run logged these metrics.
    raise KeyError(
        f"Runs of experiment '{experiment_name}' have no columns: {missing_columns}"
    )

# Complete-case selection: a run is kept only if it logged all four metrics,
# so every mean below is computed over the same set of runs.
metrics_df = runs_df[metric_columns].dropna()
print(f"Runs with all metrics logged: {len(metrics_df)} of {len(runs_df)}")

# Step 4: Calculate mean of each metric
mean_metrics = metrics_df.mean()

# Step 5: Display results
print("Mean metrics across all runs:")
print(mean_metrics)


StatementMeta(, 8784d349-fb52-4044-b81f-ad91d493f478, 3, Finished, Available, Finished)

Mean metrics across all runs:
metrics.mae         0.533544
metrics.rmse        0.574898
metrics.r2          0.000000
metrics.accuracy    0.729411
dtype: float64


In [20]:
### Section 6: Conclusion and Summary
# Closing summary of the best-model comparison.
# NOTE: this cell reads `df_best_model`, `best_model_counts` and `col`, which are
# created by the model-comparison cell (Sections 1-5); that cell must run first.

print("\n--- Analysis Complete ---")
print("The analysis has identified the best model for each store-product combination based on Mean Absolute Error (MAE).")
print("The distribution of the best models is displayed above, showing which model performed best most frequently.")

# Optional: express every model's win count as a share of all combinations.
total_combinations = df_best_model.count()
if total_combinations <= 0:
    print("\nNo combinations found to analyze.")
else:
    share_of_total = (col("count") / total_combinations) * 100
    best_model_percentages = best_model_counts.withColumn("percentage", share_of_total)
    print("\nBest Model Distribution (with percentages):")
    best_model_percentages.show(truncate=False)


### Section 7: Contextualizing ARIMA MAE (and example for others)
# This section helps put the MAE values into perspective by comparing them to the
# typical range of the 'weekly_quantities_sold' data.

import math

# `F` is used below, but no cell visible in this notebook imports it; importing it
# here keeps the cell runnable on a fresh kernel.
from pyspark.sql import functions as F

print("\n--- Contextualizing MAE Values with Source Data Statistics ---")

# --- Analysis for ARIMA model's source data ---
# This table was identified from the Model_For_Products_WITH_data-ARIMA (2).ipynb notebook.
arima_features_table = "Machine_Learning.features.weekly_features_combos_with_data"

# Hand-copied ARIMA averages (presumably from an earlier MLflow search_runs result).
# They are only a fallback: when the MLflow cell has been run in this kernel, the
# live `mean_metrics` Series is used instead so the numbers cannot drift apart.
_ARIMA_SNAPSHOT = {
    "metrics.mae": 0.534194,
    "metrics.r2": 0.000000,
    "metrics.accuracy": 0.730183,
}

try:
    _live_arima_means = mean_metrics
except NameError:
    # The MLflow cell was not executed in this kernel.
    _live_arima_means = None


def _arima_average(metric_key):
    """Return the live MLflow mean for ``metric_key``, else the recorded snapshot.

    The snapshot is used when ``mean_metrics`` is not defined, does not contain
    the key, or holds a non-numeric / NaN value for it.
    """
    if _live_arima_means is not None:
        try:
            live_value = float(_live_arima_means[metric_key])
        except (KeyError, TypeError, ValueError):
            live_value = math.nan
        if not math.isnan(live_value):
            return live_value
    return _ARIMA_SNAPSHOT[metric_key]


def _fmt_stat(value):
    """Format a statistic with 4 decimals, or 'n/a' when Spark returned null."""
    return "n/a" if value is None else f"{value:.4f}"


arima_avg_mae = _arima_average("metrics.mae")
arima_avg_r2 = _arima_average("metrics.r2")
arima_avg_accuracy = _arima_average("metrics.accuracy")

try:
    df_arima_features = spark.table(arima_features_table)
    print(f"\nDescriptive statistics for 'weekly_quantities_sold' from ARIMA model's source table ({arima_features_table}):")
    df_arima_features.select("weekly_quantities_sold").summary().show()

    arima_sales_stats = df_arima_features.select(
        F.mean("weekly_quantities_sold").alias("mean_sales"),
        F.stddev("weekly_quantities_sold").alias("stddev_sales"),
        F.min("weekly_quantities_sold").alias("min_sales"),
        F.max("weekly_quantities_sold").alias("max_sales")
    ).collect()[0]
except Exception as e:
    # Best effort: report the failure and skip the contextualization.
    print(f"Error accessing or processing data from {arima_features_table}: {e}")
else:
    mean_sales_arima = arima_sales_stats["mean_sales"]
    stddev_sales_arima = arima_sales_stats["stddev_sales"]
    min_sales_arima = arima_sales_stats["min_sales"]
    max_sales_arima = arima_sales_stats["max_sales"]

    # The aggregates are null when the table has no non-null values, so they are
    # formatted through _fmt_stat instead of a bare `:.4f`.
    print(f"\n--- Contextualizing ARIMA's Mean MAE ({arima_avg_mae:.6f}) ---")
    print(f"  Mean 'weekly_quantities_sold': {_fmt_stat(mean_sales_arima)}")
    print(f"  Standard Deviation 'weekly_quantities_sold': {_fmt_stat(stddev_sales_arima)}")
    print(f"  Min 'weekly_quantities_sold': {_fmt_stat(min_sales_arima)}")
    print(f"  Max 'weekly_quantities_sold': {_fmt_stat(max_sales_arima)}")

    if mean_sales_arima is not None and float(mean_sales_arima) != 0:
        mae_as_percent_of_mean = (arima_avg_mae / float(mean_sales_arima)) * 100
        print(f"  MAE as % of Mean Sales: {mae_as_percent_of_mean:.2f}%")
    else:
        print("  Cannot calculate MAE as % of Mean Sales (mean sales is zero or null).")

    if stddev_sales_arima is not None and float(stddev_sales_arima) != 0:
        mae_as_percent_of_stddev = (arima_avg_mae / float(stddev_sales_arima)) * 100
        print(f"  MAE as % of Standard Deviation: {mae_as_percent_of_stddev:.2f}%")
    else:
        print("  Cannot calculate MAE as % of Standard Deviation (stddev sales is zero or null).")

    # --- Add critical observations for ARIMA's R2 and Accuracy ---
    print(f"\n--- Additional ARIMA Metrics (Overall Average) ---")
    print(f"  Average R2: {arima_avg_r2:.6f}")
    print(f"  Average Accuracy: {arima_avg_accuracy:.6f}")
    print(f"  (Note: The average Accuracy of {arima_avg_accuracy:.6f} for ARIMA (1 - MAPE) implies a very high percentage error, indicating potential issues with the MAPE calculation, especially for time series with zero or near-zero actuals.)")

StatementMeta(, 6e10dd3b-1cf4-4a85-8432-fdf3468ca7fe, 22, Finished, Available, Finished)


--- Analysis Complete ---
The analysis has identified the best model for each store-product combination based on Mean Absolute Error (MAE).
The distribution of the best models is displayed above, showing which model performed best most frequently.

Best Model Distribution (with percentages):
+-----------------+-----+------------------+
|best_model       |count|percentage        |
+-----------------+-----+------------------+
|XGBoost          |5049 |40.49891714125291 |
|Baseline Mean    |3864 |30.993823694553623|
|Linear Regression|3554 |28.50725916419347 |
+-----------------+-----+------------------+


--- Contextualizing MAE Values with Source Data Statistics ---

Descriptive statistics for 'weekly_quantities_sold' from ARIMA model's source table (Machine_Learning.features.weekly_features_combos_with_data):
+-------+----------------------+
|summary|weekly_quantities_sold|
+-------+----------------------+
|  count|               2836449|
|   mean|              4.381446|
| stddev|    1

## **Model Evaluation** - Products with no data 

In [18]:
### Section 1: Setup and Configuration

from pyspark.sql import SparkSession
# `F` is referenced by later cells (e.g. `F.mean(...)`) but no visible cell imports it.
from pyspark.sql import functions as F
# NOTE(review): importing `min` by bare name shadows Python's built-in `min` for every
# later cell in this kernel. `min` and `count` are not used in this cell; the line is
# kept unchanged in case other cells rely on it -- prefer `F.min` / `F.count`.
from pyspark.sql.functions import col, lit, min, count, when
from pyspark.sql.functions import least
from pyspark.sql.window import Window


baseline_table = "Machine_Learning.results.results_baseline_mean"
linear_regression_table = "Machine_Learning.results.results_linear_regression_products_w_no_data"
xgboost_table = "Machine_Learning.results.results_xgboost_products_w_no_data"

print(f"Table paths defined:\n"
      f"  Baseline: {baseline_table}\n"
      f"  Linear Regression: {linear_regression_table}\n"
      f"  XGBoost: {xgboost_table}")

### Section 2: Load Data
# This section loads store_key, product_key, mae, r2 and accuracy from each results table.

print("\n--- Loading Data ---")


def _load_model_results(table_name, display_name, suffix):
    """Load one model's per-combination metrics, suffixing the metric columns.

    Args:
        table_name: Fully qualified name of the results table.
        display_name: Human-readable model name used in the log messages.
        suffix: Appended to the metric columns (``mae_<suffix>``, ``r2_<suffix>``,
            ``accuracy_<suffix>``) so the three models can be joined side by side.

    Returns:
        A DataFrame with store_key, product_key and the three suffixed metric
        columns. If the table cannot be read, the error is printed and an empty
        DataFrame with the same column names is returned, so the comparison
        still runs without this model.
    """
    try:
        df_results = spark.table(table_name).select(
            col("store_key"),
            col("product_key"),
            col("mae").alias(f"mae_{suffix}"),
            col("r2").alias(f"r2_{suffix}"),
            col("accuracy").alias(f"accuracy_{suffix}"),
        )
        print(f"Loaded {df_results.count()} rows from {display_name} table.")
        print(f"Sample from df_{suffix}:")
        df_results.show(5)
    except Exception as e:
        print(f"Error loading {display_name} table '{table_name}': {e}")
        # The fallback schema must list the same columns as the select above.
        df_results = spark.createDataFrame(
            [],
            f"store_key:int, product_key:int, mae_{suffix}:double, "
            f"r2_{suffix}:double, accuracy_{suffix}:double",
        )
    return df_results


df_baseline = _load_model_results(baseline_table, "Baseline", "baseline")
df_linear_reg = _load_model_results(linear_regression_table, "Linear Regression", "linear_reg")
df_xgboost = _load_model_results(xgboost_table, "XGBoost", "xgboost")


### Section 3: Join Data
# This section joins the loaded dataframes on `store_key` and `product_key` to consolidate MAE values.

print("\n--- Joining Data ---")

# Full outer joins keep every store/product combination that appears in any table.
df_joined_raw = df_baseline.join(df_linear_reg, ["store_key", "product_key"], "fullouter") \
                           .join(df_xgboost, ["store_key", "product_key"], "fullouter")

print(f"Initial joined dataframe has {df_joined_raw.count()} rows.")

# A model without a result for a combination gets MAE = +inf so that it can never
# be picked as the best model for that combination.
df_joined = df_joined_raw.fillna(float('inf'), subset=["mae_baseline", "mae_linear_reg", "mae_xgboost"])


### Section 4: Determine the Best Model for Each Combination
# This section identifies the model with the lowest MAE for every unique store-product pair.

print("\n--- Determining Best Model per Combination ---")

# least() works row-wise, so no window over (store_key, product_key) is needed.
df_with_min_mae = df_joined.withColumn(
    "min_mae",
    least(col("mae_baseline"), col("mae_linear_reg"), col("mae_xgboost")),
)

# The first matching `when` wins, so ties are resolved in the order
# Baseline Mean -> Linear Regression -> XGBoost.
# If min_mae is +inf, no model has a usable MAE for the combination (all three were
# missing and filled above). Without the first guard such rows would be counted as
# "Baseline Mean" because inf == inf.
df_best_model = df_with_min_mae.withColumn(
    "best_model",
    when(col("min_mae") == float('inf'), lit("Undetermined_Error"))
    .when(col("mae_baseline") == col("min_mae"), lit("Baseline Mean"))
    .when(col("mae_linear_reg") == col("min_mae"), lit("Linear Regression"))
    .when(col("mae_xgboost") == col("min_mae"), lit("XGBoost"))
    .otherwise(lit("Undetermined_Error"))
)

print("Sample of determined best model for each combination:")
df_best_model.show(10)

### Section 5: Count Best Model Occurrences
# This section counts how many times each model was selected as the best across all combinations.

print("\n--- Counting Best Model Occurrences ---")

# Count occurrences of each best model
best_model_counts = df_best_model.groupBy("best_model").count().orderBy(col("count").desc())

print("Final count of best model selections:")
best_model_counts.show(truncate=False)

### Section 6: Conclusion and Summary
# This section provides a summary of the analysis.

print("\n--- Analysis Complete ---")
print("The analysis has identified the best model for each store-product combination based on Mean Absolute Error (MAE).")
print("The distribution of the best models is displayed above, showing which model performed best most frequently.")

# Optional: Calculate percentages
total_combinations = df_best_model.count()
if total_combinations > 0:
    best_model_percentages = best_model_counts.withColumn("percentage", (col("count") / total_combinations) * 100)
    print("\nBest Model Distribution (with percentages):")
    best_model_percentages.show(truncate=False)
else:
    print("\nNo combinations found to analyze.")


StatementMeta(, 6e10dd3b-1cf4-4a85-8432-fdf3468ca7fe, 20, Finished, Available, Finished)

Table paths defined:
  Baseline: Machine_Learning.results.results_baseline_mean
  Linear Regression: Machine_Learning.results.results_linear_regression_products_w_no_data
  XGBoost: Machine_Learning.results.results_xgboost_products_w_no_data

--- Loading Data ---
Loaded 12467 rows from Baseline table.
Sample from df_baseline:
+---------+-----------+--------------------+-----------+------------------+
|store_key|product_key|        mae_baseline|r2_baseline| accuracy_baseline|
+---------+-----------+--------------------+-----------+------------------+
|        1|         18|0.022805429864253046|        0.0|0.9928058580869864|
|        1|         38| 0.07632107023411372|        0.0|0.9510762370294142|
|        1|         44|0.002266187050359747|        0.0|0.9978209739900387|
|        1|         61|0.023174061433446802|        0.0|0.9905024338387514|
|        1|         68|  0.2788829787234044|        0.0|0.8605585106382978|
+---------+-----------+--------------------+-----------+--------

In [12]:
import numpy as np

StatementMeta(, 6e10dd3b-1cf4-4a85-8432-fdf3468ca7fe, 14, Finished, Available, Finished)

In [15]:
# --- Section 7: Average MAE for Each Model When Selected as Best ---
# This section calculates the average MAE for each model, considering only
# the combinations where that specific model was chosen as the best, and then
# relates each average to the scale of 'weekly_quantities_sold' in the source data.
# Requires `df_best_model`, `col`, `spark` and `np` from earlier cells.

# `F` is used below, but no cell visible in this notebook imports it.
from pyspark.sql import functions as F


def _avg_mae_when_best(model_label, mae_column):
    """Report and return how often ``model_label`` won and its mean MAE on those rows.

    Args:
        model_label: Value of the ``best_model`` column to filter on.
        mae_column: Name of that model's MAE column in ``df_best_model``. It is
            only evaluated when the model won at least once.

    Returns:
        ``(win_count, avg_mae)``; ``avg_mae`` is ``np.nan`` when the model never
        won or when Spark returned a null mean.
    """
    wins = df_best_model.filter(col("best_model") == model_label)
    win_count = wins.count()
    if win_count > 0:
        avg_mae = wins.select(F.mean(mae_column)).collect()[0][0]
        if avg_mae is None:
            # All MAE values were null; keep the value formattable.
            avg_mae = np.nan
        print(f"Average MAE for {model_label} (when selected as best, {win_count} times): {avg_mae:.6f}")
    else:
        avg_mae = np.nan
        print(f"{model_label} was never selected as the best model.")
    return win_count, avg_mae


def _load_sales_stats(features_table, model_description):
    """Return ``(features_df, stats_row)`` for 'weekly_quantities_sold' in a table.

    ``stats_row`` has the fields mean_sales, stddev_sales, min_sales and max_sales.
    On any failure the error is printed and ``(None, None)`` is returned.
    """
    try:
        features_df = spark.table(features_table)
        stats_row = features_df.select(
            F.mean("weekly_quantities_sold").alias("mean_sales"),
            F.stddev("weekly_quantities_sold").alias("stddev_sales"),
            F.min("weekly_quantities_sold").alias("min_sales"),
            F.max("weekly_quantities_sold").alias("max_sales")
        ).collect()[0]
    except Exception as e:
        print(f"Error getting stats for {model_description} source table '{features_table}': {e}")
        return None, None
    return features_df, stats_row


def _print_mae_context(model_name, avg_mae, mean_sales, stddev_sales):
    """Print ``avg_mae`` as a percentage of the mean and standard deviation of sales.

    Nothing is printed when ``avg_mae`` is missing/NaN; each percentage line is
    skipped when its denominator is null or zero.
    """
    if avg_mae is None or np.isnan(avg_mae):
        return
    print(f"\n--- Contextualizing {model_name}'s MAE (when best): {avg_mae:.6f} ---")
    if mean_sales is not None and float(mean_sales) != 0:
        print(f"  MAE as % of Mean Sales: {(avg_mae / float(mean_sales) * 100):.2f}%")
    if stddev_sales is not None and float(stddev_sales) != 0:
        print(f"  MAE as % of Standard Deviation: {(avg_mae / float(stddev_sales) * 100):.2f}%")


print("\n--- Average MAE for Each Model When Selected as Best ---")

baseline_best_count, avg_mae_baseline_when_best = _avg_mae_when_best("Baseline Mean", "mae_baseline")
lr_best_count, avg_mae_lr_when_best = _avg_mae_when_best("Linear Regression", "mae_linear_reg")
xgb_best_count, avg_mae_xgb_when_best = _avg_mae_when_best("XGBoost", "mae_xgboost")
# NOTE(review): the comparison cell visible in this notebook never assigns "ARIMA" as
# best_model and has no `mae_arima` column, so this normally reports "never selected".
arima_best_count, avg_mae_arima_when_best = _avg_mae_when_best("ARIMA", "mae_arima")


print("\n--- Contextualizing These Specific MAEs with Source Data Statistics ---")

# --- First, get the descriptive statistics for the relevant source tables ---

# For ARIMA (weekly_features_combos_with_data)
arima_features_table = "Machine_Learning.features.weekly_features_combos_with_data"
df_arima_features, arima_sales_stats = _load_sales_stats(arima_features_table, "ARIMA")
if arima_sales_stats is not None:
    mean_sales_arima = arima_sales_stats["mean_sales"]
    stddev_sales_arima = arima_sales_stats["stddev_sales"]
else:
    mean_sales_arima, stddev_sales_arima = None, None

# For Baseline, Linear Regression, XGBoost (weekly_features_combos_with_no_data)
other_models_features_table = "Machine_Learning.features.weekly_features_combos_with_no_data"
df_other_models_features, other_sales_stats = _load_sales_stats(
    other_models_features_table, "LR/XGBoost/Baseline"
)
if other_sales_stats is not None:
    mean_sales_other = other_sales_stats["mean_sales"]
    stddev_sales_other = other_sales_stats["stddev_sales"]
else:
    mean_sales_other, stddev_sales_other = None, None


# --- Now contextualize each relevant MAE ---
_print_mae_context("Baseline Mean", avg_mae_baseline_when_best, mean_sales_other, stddev_sales_other)
_print_mae_context("Linear Regression", avg_mae_lr_when_best, mean_sales_other, stddev_sales_other)
_print_mae_context("XGBoost", avg_mae_xgb_when_best, mean_sales_other, stddev_sales_other)
# The original ARIMA branch tested the undefined name `stddev_arima_sales` and would
# raise NameError when reached; the shared helper uses `stddev_sales_arima` throughout.
_print_mae_context("ARIMA", avg_mae_arima_when_best, mean_sales_arima, stddev_sales_arima)

StatementMeta(, 6e10dd3b-1cf4-4a85-8432-fdf3468ca7fe, 17, Finished, Available, Finished)


--- Average MAE for Each Model When Selected as Best ---
Average MAE for Baseline Mean (when selected as best, 3864 times): 0.012532
Average MAE for Linear Regression (when selected as best, 3554 times): 0.195571
Average MAE for XGBoost (when selected as best, 5049 times): 0.061120
ARIMA was never selected as the best model.

--- Contextualizing These Specific MAEs with Source Data Statistics ---

--- Contextualizing Baseline Mean's MAE (when best): 0.012532 ---
  MAE as % of Mean Sales: 0.36%
  MAE as % of Standard Deviation: 0.10%

--- Contextualizing Linear Regression's MAE (when best): 0.195571 ---
  MAE as % of Mean Sales: 5.67%
  MAE as % of Standard Deviation: 1.55%

--- Contextualizing XGBoost's MAE (when best): 0.061120 ---
  MAE as % of Mean Sales: 1.77%
  MAE as % of Standard Deviation: 0.48%


In [19]:
# Average R2 and accuracy of each model over the combinations where it was selected
# as best. Requires `df_best_model`, `col` and `np` from earlier cells.
# NOTE(review): the recorded output of this cell shows average accuracies around -8e5
# to -3e5, so the mean is dominated by extreme per-combination values; treat these
# averages with caution (a median may be more informative).

# `F` is used below, but no cell visible in this notebook imports it.
from pyspark.sql import functions as F


def _avg_r2_and_accuracy_when_best(best_df, model_label, r2_column, accuracy_column):
    """Print and return the mean R2 and accuracy over ``best_df``.

    Args:
        best_df: Rows of ``df_best_model`` where ``model_label`` was selected as best.
        model_label: Model name used in the printed messages.
        r2_column: Name of the model's R2 column.
        accuracy_column: Name of the model's accuracy column.

    Returns:
        ``(avg_r2, avg_accuracy)``. Both are ``np.nan`` when ``best_df`` is empty;
        a null mean returned by Spark is also mapped to ``np.nan``.
    """
    # One aggregation yields the row count and both means in a single Spark job.
    row_count, avg_r2, avg_accuracy = best_df.agg(
        F.count(F.lit(1)),
        F.mean(r2_column),
        F.mean(accuracy_column),
    ).first()

    if row_count > 0:
        avg_r2 = np.nan if avg_r2 is None else avg_r2
        avg_accuracy = np.nan if avg_accuracy is None else avg_accuracy
        print(f"Average R2 for {model_label} (when best): {avg_r2:.6f}")
        print(f"Average Accuracy for {model_label} (when best): {avg_accuracy:.6f}")
        return avg_r2, avg_accuracy

    print(f"{model_label} was never selected as the best model, no R2/Accuracy to report.")
    return np.nan, np.nan


print("\n--- Average R2 and Accuracy for Each Model When Selected as Best ---")

# --- Baseline Mean ---
baseline_best_df = df_best_model.filter(col("best_model") == "Baseline Mean")
avg_r2_baseline_when_best, avg_acc_baseline_when_best = _avg_r2_and_accuracy_when_best(
    baseline_best_df, "Baseline Mean", "r2_baseline", "accuracy_baseline"
)

# --- Linear Regression ---
lr_best_df = df_best_model.filter(col("best_model") == "Linear Regression")
avg_r2_lr_when_best, avg_acc_lr_when_best = _avg_r2_and_accuracy_when_best(
    lr_best_df, "Linear Regression", "r2_linear_reg", "accuracy_linear_reg"
)

# --- XGBoost ---
xgb_best_df = df_best_model.filter(col("best_model") == "XGBoost")
avg_r2_xgb_when_best, avg_acc_xgb_when_best = _avg_r2_and_accuracy_when_best(
    xgb_best_df, "XGBoost", "r2_xgboost", "accuracy_xgboost"
)

StatementMeta(, 6e10dd3b-1cf4-4a85-8432-fdf3468ca7fe, 21, Finished, Available, Finished)


--- Average R2 and Accuracy for Each Model When Selected as Best ---
Average R2 for Baseline Mean (when best): 0.113645
Average Accuracy for Baseline Mean (when best): -821259.943450
Average R2 for Linear Regression (when best): 0.033250
Average Accuracy for Linear Regression (when best): -783925.996209
Average R2 for XGBoost (when best): -0.001791
Average Accuracy for XGBoost (when best): -324990.710337
