### 0. Setup

In [2]:
# ════════════════════════════════════════════════════════════════════════════
# WEEKLY FEATURES  –  source: fact_sales_weekly_combos_with_data
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import DoubleType

# ---------------------------------------------------------------------------
# 0.  Load fact + dimensions
# ---------------------------------------------------------------------------
# Weekly sales fact table for the product/store combos that have data.
weekly_final = spark.table("Machine_Learning.imputed_data.fact_sales_weekly_combos_with_data")

# Source column -> name expected by the feature formulas further down.
# withColumnRenamed is a no-op when the source column is absent, so a table
# that already exposes "week_start" passes through unchanged.
fact_renames = {
    "quantities_sold_week": "weekly_quantities_sold",
    "avg_price_week":       "weekly_avg_price",
    "avg_stock_week":       "weekly_total_stock_quantity",
    "week_start_date":      "week_start",      # join key shared with d_date
}
fact = weekly_final
for source_name, feature_name in fact_renames.items():
    fact = fact.withColumnRenamed(source_name, feature_name)

# ─ dim_date: calendar attributes, keyed by week_start after the alias below.
# NOTE(review): nothing here enforces one row per week_start. is_weekend and
# is_holiday look like day-level flags; if dim_date is a daily table, joining
# on week_start would fan out every fact row — confirm the table's grain.
d_date = spark.table("Gold_Data.dim_date.dim_date").select(
    F.col("week_start_date").alias("week_start"),
    "is_weekend", "is_holiday", "month",
    "num_holidays_in_week", "has_holiday_in_week",
    "num_weekend_days_in_week",
)

# ─ dim_store: only the store key and its size are carried forward.
d_store = spark.table("Gold_Data.dim_stores.dim_stores").select("store_key", "store_size")


StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 3, Finished, Available, Finished)

### **1. Join Fact with Dimensions**

In [3]:
# Attach calendar and store attributes to every fact row.
# Both joins are inner (Spark's default), so fact rows with no matching
# week_start in d_date or no matching store_key in d_store are dropped.
fact_with_calendar = fact.join(d_date, on="week_start", how="inner")
df = fact_with_calendar.join(d_store, on="store_key", how="inner")

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 4, Finished, Available, Finished)

### **2. Feature Engineering**

In [4]:
# ---------------------------------------------------------------------------
# 2.  Feature engineering
# ---------------------------------------------------------------------------
# One time series per (product, store), ordered chronologically.
# NOTE(review): lag/rolling features are row-based, so "1 week back" only
# holds if each series has exactly one row per consecutive week — presumably
# guaranteed by the imputed source table; confirm.
w = Window.partitionBy("product_key", "store_key").orderBy("week_start")

# Frame for the rolling mean: the four rows *before* the current one.
# weekly_quantities_sold is the prediction target, so the current row must be
# excluded; a (-3, 0) frame would leak the target into rolling_mean_4 and,
# through the division below, into inv_days_cover.
w_prev_4 = w.rowsBetween(-4, -1)

# Same literal the formulas have always used, hoisted so it is written once.
TWO_PI = 2*3.14159265359

df = (df
      # lags & rolling mean — all strictly historical
      .withColumn("lag_1"         , F.lag("weekly_quantities_sold", 1).over(w))
      .withColumn("lag_4"         , F.lag("weekly_quantities_sold", 4).over(w))
      # null on the first row of each series (empty frame), like lag_1
      .withColumn("rolling_mean_4", F.avg("weekly_quantities_sold").over(w_prev_4))
      # derived numeric
      .withColumn("log_price"     , F.log("weekly_avg_price"))
      # stock divided by recent average sales; 0 when there were no recent
      # sales, null when rolling_mean_4 itself is null (no history yet)
      .withColumn("inv_days_cover",
                  F.when(F.col("rolling_mean_4") == 0, 0)
                   .otherwise(F.col("weekly_total_stock_quantity") /
                              F.col("rolling_mean_4")))
      # promotions unavailable → default 0
      .withColumn("promo_flag"    , F.lit(0))
      # Fourier / seasonality
      # t is the 0-based row index within each series, so the 52-week terms
      # are phased from each series' first row rather than from the calendar.
      .withColumn("t"             , F.row_number().over(w) - 1)
      .withColumn("month_sin"     , F.sin(TWO_PI*F.col("month")/12))
      .withColumn("month_cos"     , F.cos(TWO_PI*F.col("month")/12))
      .withColumn("fourier_sin_52_k1",
                  F.sin(TWO_PI*F.col("t")/52))
      .withColumn("fourier_cos_52_k1",
                  F.cos(TWO_PI*F.col("t")/52))
)

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 5, Finished, Available, Finished)

### **3. Final Selection and Save to Delta Table**

In [5]:
# ---------------------------------------------------------------------------
# 3.  Select final feature set & save
# ---------------------------------------------------------------------------
# The output schema is assembled from three groups; their concatenation order
# is the column order of the saved table.
id_and_target_cols = [
    "product_key", "store_key", "week_start", "weekly_quantities_sold",
]
calendar_and_store_cols = [
    "is_weekend", "is_holiday", "month", "store_size",
    "num_holidays_in_week", "has_holiday_in_week", "num_weekend_days_in_week",
]
engineered_cols = [
    "lag_1", "lag_4", "rolling_mean_4",
    "log_price", "inv_days_cover", "promo_flag",
    "t", "month_sin", "month_cos",
    "fourier_sin_52_k1", "fourier_cos_52_k1",
]
final_cols = id_and_target_cols + calendar_and_store_cols + engineered_cols

# Keep only the listed columns; everything else carried by df is dropped.
df_final = df.select(*final_cols)



StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 6, Finished, Available, Finished)

In [6]:
# Persist the feature table as a Delta table.
# The write replaces the whole table on every run and also lets the schema be
# replaced; consider "merge" for incremental updates in production.
target_table = "Machine_Learning.features.weekly_features_combos_with_data"

delta_writer = (df_final.write
                        .format("delta")
                        .mode("overwrite")
                        .option("overwriteSchema", "true"))
delta_writer.saveAsTable(target_table)

print(f"✅  Weekly features table saved to {target_table}")


StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 7, Finished, Available, Finished)

✅  Weekly features table saved to Machine_Learning.features.weekly_features_combos_with_data


## Now compute and save features for the combos that weren't kept

In [7]:
# ════════════════════════════════════════════════════════════════════════════
# WEEKLY FEATURES  –  source: fact_sales_weekly_combos_with_data
# ════════════════════════════════════════════════════════════════════════════
from pyspark.sql import functions as F, Window
from pyspark.sql.types import DoubleType

# ---------------------------------------------------------------------------
# 0.  Load fact + dimensions
# ---------------------------------------------------------------------------
# Weekly sales fact table for the combos that were not kept.
# NOTE: this rebinds weekly_final and fact, which the first pipeline also
# uses; re-running an earlier cell after this one will pick up this table.
weekly_final = spark.table("Machine_Learning.imputed_data.fact_sales_weekly_not_kept_combos")

# Source column -> name expected by the feature formulas further down.
# withColumnRenamed is a no-op when the source column is absent.
fact_renames = {
    "quantities_sold_week": "weekly_quantities_sold",
    "avg_price_week":       "weekly_avg_price",
    "avg_stock_week":       "weekly_total_stock_quantity",
    "week_start_date":      "week_start",      # join key shared with d_date
}
fact = weekly_final
for source_name, feature_name in fact_renames.items():
    fact = fact.withColumnRenamed(source_name, feature_name)

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 8, Finished, Available, Finished)

In [8]:
# Attach calendar and store attributes to the not-kept fact rows.
# d_date and d_store are the dimension frames built in the setup cell at the
# top of the notebook; they are not reloaded here.
# Both joins are inner (Spark's default), so fact rows with no matching
# week_start or store_key are dropped.
fact_with_calendar = fact.join(d_date, on="week_start", how="inner")
df = fact_with_calendar.join(d_store, on="store_key", how="inner")

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 9, Finished, Available, Finished)

In [9]:
# ---------------------------------------------------------------------------
# 2.  Feature engineering
# ---------------------------------------------------------------------------
# One time series per (product, store), ordered chronologically.
# NOTE(review): lag/rolling features are row-based, so "1 week back" only
# holds if each series has exactly one row per consecutive week — presumably
# guaranteed by the imputed source table; confirm.
w = Window.partitionBy("product_key", "store_key").orderBy("week_start")

# Frame for the rolling mean: the four rows *before* the current one.
# weekly_quantities_sold is the prediction target, so the current row must be
# excluded; a (-3, 0) frame would leak the target into rolling_mean_4 and,
# through the division below, into inv_days_cover.
w_prev_4 = w.rowsBetween(-4, -1)

# Same literal the formulas have always used, hoisted so it is written once.
TWO_PI = 2*3.14159265359

df = (df
      # lags & rolling mean — all strictly historical
      .withColumn("lag_1"         , F.lag("weekly_quantities_sold", 1).over(w))
      .withColumn("lag_4"         , F.lag("weekly_quantities_sold", 4).over(w))
      # null on the first row of each series (empty frame), like lag_1
      .withColumn("rolling_mean_4", F.avg("weekly_quantities_sold").over(w_prev_4))
      # derived numeric
      .withColumn("log_price"     , F.log("weekly_avg_price"))
      # stock divided by recent average sales; 0 when there were no recent
      # sales, null when rolling_mean_4 itself is null (no history yet)
      .withColumn("inv_days_cover",
                  F.when(F.col("rolling_mean_4") == 0, 0)
                   .otherwise(F.col("weekly_total_stock_quantity") /
                              F.col("rolling_mean_4")))
      # promotions unavailable → default 0
      .withColumn("promo_flag"    , F.lit(0))
      # Fourier / seasonality
      # t is the 0-based row index within each series, so the 52-week terms
      # are phased from each series' first row rather than from the calendar.
      .withColumn("t"             , F.row_number().over(w) - 1)
      .withColumn("month_sin"     , F.sin(TWO_PI*F.col("month")/12))
      .withColumn("month_cos"     , F.cos(TWO_PI*F.col("month")/12))
      .withColumn("fourier_sin_52_k1",
                  F.sin(TWO_PI*F.col("t")/52))
      .withColumn("fourier_cos_52_k1",
                  F.cos(TWO_PI*F.col("t")/52))
)

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 10, Finished, Available, Finished)

In [10]:
# ---------------------------------------------------------------------------
# 3.  Select final feature set & save
# ---------------------------------------------------------------------------
# The output schema is assembled from three groups; their concatenation order
# is the column order of the saved table.
id_and_target_cols = [
    "product_key", "store_key", "week_start", "weekly_quantities_sold",
]
calendar_and_store_cols = [
    "is_weekend", "is_holiday", "month", "store_size",
    "num_holidays_in_week", "has_holiday_in_week", "num_weekend_days_in_week",
]
engineered_cols = [
    "lag_1", "lag_4", "rolling_mean_4",
    "log_price", "inv_days_cover", "promo_flag",
    "t", "month_sin", "month_cos",
    "fourier_sin_52_k1", "fourier_cos_52_k1",
]
final_cols = id_and_target_cols + calendar_and_store_cols + engineered_cols

# Keep only the listed columns; everything else carried by df is dropped.
df_final = df.select(*final_cols)

StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 11, Finished, Available, Finished)

In [11]:
# Persist the not-kept feature table as a Delta table.
# The write replaces the whole table on every run and also lets the schema be
# replaced; consider "merge" for incremental updates in production.
target_table = "Machine_Learning.features.weekly_features_combos_with_no_data"

delta_writer = (df_final.write
                        .format("delta")
                        .mode("overwrite")
                        .option("overwriteSchema", "true"))
delta_writer.saveAsTable(target_table)

print(f"✅  Weekly features table saved to {target_table}")


StatementMeta(, 0d8e31de-a219-4197-82bd-8e7b637ccba1, 12, Finished, Available, Finished)

✅  Weekly features table saved to Machine_Learning.features.weekly_features_combos_with_no_data
