In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# 1. LOAD RAW TRANSACTIONS
# ─────────────────────────────────────────────────────────────────────────────
# Read every column and row of the filtered daily sales fact table.
df_sales = spark.table("Machine_Learning.filtered_data.all_year_fact_sales")


StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 4, Finished, Available, Finished)

In [3]:
# ─────────────────────────────────────────────────────────────────────────────
# 2. DAILY HELPERS
# ─────────────────────────────────────────────────────────────────────────────
from pyspark.sql import functions as F

# Per-row helper expressions, named so each derived column reads on its own.
tx_date_expr    = F.to_date(F.col("date_id").cast("string"), "yyyyMMdd")  # date_id parsed as yyyyMMdd text
week_start_expr = F.date_trunc("week", F.col("tx_date"))                  # Monday anchor
sold_flag_expr  = (F.col("quantities_sold") > 0).cast("int")              # 1 when units were sold that day
stock_flag_expr = (F.col("stock_quantity") > 0).cast("int")               # 1 when stock was on hand that day

# Order matters: week_start is derived from the tx_date column added just before it.
df_daily = (
    df_sales
    .withColumn("tx_date", tx_date_expr)
    .withColumn("week_start", week_start_expr)
    .withColumn("sales_gt0", sold_flag_expr)
    .withColumn("stock_gt0", stock_flag_expr)
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 5, Finished, Available, Finished)

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# 3. WEEKLY AGGREGATES
# ─────────────────────────────────────────────────────────────────────────────
# One aggregate per weekly metric, computed for each (store, product, week).
weekly_metrics = [
    F.countDistinct("tx_date").alias("days_in_week"),     # expected to be 7 for a complete week
    F.sum("quantities_sold").alias("sum_sales"),          # units sold over the week
    F.max("stock_quantity").alias("max_stock"),           # highest daily stock level in the week
    F.sum("sales_gt0").alias("days_with_sales"),          # number of days with at least one sale
    F.sum("stock_gt0").alias("days_with_stock"),          # number of days with stock on hand
]

weekly_totals = (
    df_daily
    .groupBy("store_key", "product_key", "week_start")
    .agg(*weekly_metrics)
)

# "unsatisfied" is the weekly peak stock minus the units sold in that week.
df_week = weekly_totals.withColumn(
    "unsatisfied", F.col("max_stock") - F.col("sum_sales")
)


StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 6, Finished, Available, Finished)

In [5]:
# ─────────────────────────────────────────────────────────────────────────────
# 4. CLASSIFY WEEK & SET REPLENISHMENT QTY
# ─────────────────────────────────────────────────────────────────────────────
# Week status: the branches are evaluated top to bottom and the first match wins.
week_status_expr = (
    F.when(F.col("days_with_stock") == 0, "NO_STOCK_ALL_WEEK")   # never had stock
     .when(F.col("days_with_sales") == 0, "STOCK_NO_SALES")      # had stock, sold nothing
     .when(F.col("unsatisfied") > 0, "CALCULABLE_DEMAND")        # stock exceeded sales
     .otherwise("STOCK_OUT")                                      # everything else
)

# Replenishment quantity: depends on the status column, so it is added second.
ship_qty_expr = (
    F.when(F.col("status") == "CALCULABLE_DEMAND", F.col("unsatisfied"))
     .when(F.col("status") == "STOCK_OUT", F.col("max_stock"))
     .otherwise(0)
)

df_week = (
    df_week
    .withColumn("status", week_status_expr)
    .withColumn("ship_qty_next_week", ship_qty_expr)
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 7, Finished, Available, Finished)

In [6]:
# ─────────────────────────────────────────────────────────────────────────────
# 5. SPLIT VIEWS
# ─────────────────────────────────────────────────────────────────────────────
# A week "has demand" when its status is one of these two values.
has_demand = F.col("status").isin("CALCULABLE_DEMAND", "STOCK_OUT")

weeks_with_demand = df_week.where(has_demand)
weeks_no_demand   = df_week.where(~has_demand)


StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 8, Finished, Available, Finished)

In [7]:
# Row count of the demand-week frame; the recorded run of this cell returned 610138.
weeks_with_demand.count()

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 9, Finished, Available, Finished)

610138

In [8]:
# ─────────────────────────────────────────────────────────────────────────────
# 6. FILTER DAILY TRANSACTIONS BY WEEK-LEVEL DEMAND STATUS
# ─────────────────────────────────────────────────────────────────────────────
from pyspark.sql import functions as F

# 6-a  Re-derive the week_start anchor on the raw daily rows (same expression as in §2)
parsed_tx_date = F.to_date(F.col("date_id").cast("string"), "yyyyMMdd")
df_sales_aug = df_sales.withColumn("week_start", F.date_trunc("week", parsed_tx_date))

# 6-b  Join keys shared between the daily rows and the weekly frames
key_cols = ["store_key", "product_key", "week_start"]

# 6-c  Drop every daily row whose (store, product, week) key appears in
#      weeks_no_demand; the left_anti join keeps all remaining rows.
no_demand_keys = weeks_no_demand.select(*key_cols)
df_sales_demand_weeks = df_sales_aug.join(no_demand_keys, on=key_cols, how="left_anti")

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 10, Finished, Available, Finished)

In [9]:
# df_sales_demand_weeks.show(5)


StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 11, Finished, Available, Finished)

In [10]:
# Persist the demand-week daily rows, replacing the table's previous contents.
(
    df_sales_demand_weeks.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.filtered_data.all_year_fact_sales_demand")
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 12, Finished, Available, Finished)

In [11]:
# ─────────────────────────────────────────────────────────────────────────────
# 7. Get opposite set
# ─────────────────────────────────────────────────────────────────────────────
# Daily rows belonging to a no-demand week: the semi join keeps only the rows
# of df_sales_aug whose key is present in weeks_no_demand.
no_demand_week_keys = weeks_no_demand.select(*key_cols)
df_sales_no_demand_weeks = df_sales_aug.join(no_demand_week_keys, on=key_cols, how="semi")


StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 13, Finished, Available, Finished)

In [12]:
# Persist the no-demand daily rows, replacing the table's previous contents.
(
    df_sales_no_demand_weeks.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.filtered_data.all_year_fact_sales_no_demand")
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 14, Finished, Available, Finished)

In [13]:
# df_sales_no_demand_weeks.show(5)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 15, Finished, Available, Finished)

 ─────────────────────────────────────────────────────────────────────────────
### 6. IMPUTE sum_sales ON NO-DEMAND WEEKS  (cumulative / rolling mean)
─────────────────────────────────────────────────────────────────────────────

  • Granularity:       (store_key, product_key)
  • Mean source:       all weeks whose status ∈ {CALCULABLE_DEMAND, STOCK_OUT}
  • Behaviour:         each imputed value is immediately fed into the running
                       mean, so gaps are filled sequentially:
                           week-3 = mean(weeks 1-2)
                           week-4 = mean(weeks 1-3)   ← week-3 now included
  • Overwrites:        df_week.sum_sales   (no other columns recalculated)
  • Output table:      Machine_Learning.filtered_data.all_year_fact_sales_weekly_imputed

 Prerequisites (already defined earlier in this notebook):
   · df_week           (§3)
   · status flags      (§4)
 ─────────────────────────────────────────────────────────────────────────────


In [20]:
from pyspark.sql import functions as F
import pandas as pd

# 6-a  Mark demand weeks and carry sum_sales as double through the pandas step
is_demand_week      = F.col("status").isin("CALCULABLE_DEMAND", "STOCK_OUT")
sum_sales_as_double = F.col("sum_sales").cast("double")

df_week_flagged = (
    df_week
      .withColumn("demand_flag", is_demand_week)
      .withColumn("sum_sales", sum_sales_as_double)
)

# 6-b  Row-wise imputation inside each (store_key, product_key) slice
def rolling_impute(pdf: pd.DataFrame) -> pd.DataFrame:
    """
    Fill 'sum_sales' on no-demand weeks with the running mean of earlier weeks.

    Rows are processed in ascending 'week_start' order.  The running mean is
    built only from:
      * the observed 'sum_sales' of weeks whose 'demand_flag' is truthy, and
      * values this function has already imputed for earlier no-demand weeks.

    A no-demand week that occurs before any usable demand week has nothing to
    be imputed from: its value is left untouched and is NOT fed into the
    running mean, so raw no-demand values never bias later imputations.
    A demand week whose 'sum_sales' is NaN is likewise skipped instead of
    turning the running sum into NaN for the rest of the group.

    Parameters
    ----------
    pdf : pd.DataFrame
        Weekly rows of one group.  Must contain the columns 'week_start',
        'sum_sales' (numeric) and 'demand_flag' (truthy for demand weeks).

    Returns
    -------
    pd.DataFrame
        A new frame sorted by 'week_start' with the same columns; 'sum_sales'
        is float64 with no-demand weeks replaced where a mean was available.
        The input frame is not modified.
    """
    pdf = pdf.sort_values("week_start")

    # Work on plain arrays: much cheaper than iterrows() and independent of
    # whatever index labels the incoming frame carries.
    flags = pdf["demand_flag"].tolist()
    sales = pdf["sum_sales"].to_numpy(dtype="float64", copy=True)

    run_sum, run_cnt = 0.0, 0
    for pos, is_demand in enumerate(flags):
        value = sales[pos]

        if is_demand:
            if pd.isna(value):
                # Missing observation: keep it out of the running mean.
                continue
        else:
            if run_cnt == 0:
                # No history yet: leave the raw value and do not count it.
                continue
            value = run_sum / run_cnt
            sales[pos] = value

        run_sum += value
        run_cnt += 1

    pdf["sum_sales"] = sales
    return pdf

# 6-c  Run rolling_impute once per (store_key, product_key) group; the pandas
#      output is declared with the same schema as the flagged input frame.
imputed_by_pair = (
    df_week_flagged
      .groupBy("store_key", "product_key")
      .applyInPandas(rolling_impute, schema=df_week_flagged.schema)
)

# 6-d  Drop the helper flag and restore DECIMAL(28,2), rounding to 2 decimals first
sum_sales_decimal = F.round(F.col("sum_sales"), 2).cast("decimal(28,2)")

df_week_imputed = (
    imputed_by_pair
      .drop("demand_flag")
      .withColumn("sum_sales", sum_sales_decimal)
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 22, Finished, Available, Finished)

In [21]:
# Preview the first 5 rows of the imputed weekly frame.
df_week_imputed.show(5)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 23, Finished, Available, Finished)

+---------+-----------+-------------------+------------+---------+---------+---------------+---------------+-----------+-----------------+------------------+
|store_key|product_key|         week_start|days_in_week|sum_sales|max_stock|days_with_sales|days_with_stock|unsatisfied|           status|ship_qty_next_week|
+---------+-----------+-------------------+------------+---------+---------+---------------+---------------+-----------+-----------------+------------------+
|        1|          2|2017-01-02 00:00:00|           7|     2.00|        8|              1|              7|       6.00|CALCULABLE_DEMAND|              6.00|
|        1|          2|2017-01-09 00:00:00|           7|     3.00|        5|              3|              7|       2.00|CALCULABLE_DEMAND|              2.00|
|        1|          2|2017-01-16 00:00:00|           7|     2.00|        2|              2|              7|       0.00|        STOCK_OUT|              2.00|
|        1|          2|2017-01-23 00:00:00|         

# 8. AGGREGATE DAILY fact_sales → WEEKLY

In [25]:
# ─────────────────────────────────────────────────────────────────────────────
# 8. BUILD fact_sales_weekly  (year + week_of_year grain)
# ─────────────────────────────────────────────────────────────────────────────
#
# • dim_date contains the canonical calendar:   gold_data.dim_date.dim_date
#   ─ date_id           INT        (yyyymmdd)
#   ─ full_date         TIMESTAMP
#   ─ week_of_year      INT        (1-53)
#   ─ year              INT
#   ─ week_start_date   DATE       (Mon)
#   ─ week_end_date     DATE       (Sun)      ← keep if you need it later
#
# • fact_sales is read from gold_data.fact_sales.fact_sales (see 8-a below)
#   with integer date_id, so we join to dim_date to avoid date_trunc().
# ─────────────────────────────────────────────────────────────────────────────
from pyspark.sql import functions as F

# 8-a  Source tables: daily facts plus the calendar columns needed for the weekly grain
fact_sales = spark.table("gold_data.fact_sales.fact_sales")
dim_date = (
    spark.table("gold_data.dim_date.dim_date")
         .select("date_id", "full_date", "week_of_year", "year")
)

# 8-b  Attach the calendar columns; the left join keeps every fact row
fact_sales_dated = fact_sales.join(dim_date, on="date_id", how="left")

# 8-c  Weekly aggregation (granularity: store × product × year × week_of_year)
weekly_grain = ["store_key", "product_key", "year", "week_of_year"]
weekly_measures = [
    F.sum("quantities_sold").alias("quantities_sold_week"),
    F.sum("revenue").alias("revenue_week"),
    F.avg("price").alias("avg_price_week"),
    F.avg("stock_quantity").alias("avg_stock_week"),
]

fact_sales_weekly = fact_sales_dated.groupBy(*weekly_grain).agg(*weekly_measures)

# ─────────────────────────────────────────────────────────────────────────────
# 9.  MERGE THE IMPUTED VOLUMES
# ─────────────────────────────────────────────────────────────────────────────
#
# • df_week_imputed has week_start (the Monday of each week, a TIMESTAMP
#   produced by date_trunc).  Derive year + week_of_year so the join keys
#   match the weekly table.
# • F.weekofyear() numbers weeks per ISO-8601, so "year" must be the ISO
#   week-numbering year too: the calendar year of the week's Thursday
#   (Monday + 3 days).  Pairing it with F.year(week_start) would label a week
#   starting on e.g. Mon 2018-12-31 as (2018, week 1), colliding with the
#   real first week of 2018 and producing duplicate join keys.
# • NOTE(review): this assumes dim_date.year / dim_date.week_of_year follow
#   the same ISO-8601 convention — confirm against the calendar table.
# ─────────────────────────────────────────────────────────────────────────────
sales_replacement = (
    df_week_imputed
      .withColumn("year",         F.year(F.date_add(F.to_date(F.col("week_start")), 3)))
      .withColumn("week_of_year", F.weekofyear("week_start"))
      .select("store_key", "product_key", "year", "week_of_year",
              F.col("sum_sales").alias("sum_sales_imputed"))
)

weekly_join_keys = ["store_key", "product_key", "year", "week_of_year"]

# Prefer the imputed weekly volume when the join found one, otherwise keep the
# raw weekly sum.  Both sides go through double for the coalesce.
merged_volume = F.coalesce(
    F.col("w.sum_sales_imputed").cast("double"),
    F.col("f.quantities_sold_week").cast("double"),
)

# Back to DECIMAL(28,2), rounding to 2 decimals first.
merged_volume_decimal = F.round(F.col("quantities_sold_week"), 2).cast("decimal(28,2)")

fact_sales_weekly_final = (
    fact_sales_weekly.alias("f")
      .join(sales_replacement.alias("w"), on=weekly_join_keys, how="left")
      .withColumn("quantities_sold_week", merged_volume)
      .withColumn("quantities_sold_week", merged_volume_decimal)
      .drop("sum_sales_imputed")
)




# Optional sanity check: first 10 rows in key order, without truncating values.
(
    fact_sales_weekly_final
    .orderBy("store_key", "product_key", "year", "week_of_year")
    .show(10, truncate=False)
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 27, Finished, Available, Finished)

+---------+-----------+----+------------+--------------------+------------+--------------+------------------+
|store_key|product_key|year|week_of_year|quantities_sold_week|revenue_week|avg_price_week|avg_stock_week    |
+---------+-----------+----+------------+--------------------+------------+--------------+------------------+
|1        |2          |2017|1           |2.00                |12.50       |6.25000000    |7.428571428571429 |
|1        |2          |2017|2           |3.00                |19.25       |6.32142857    |4.571428571428571 |
|1        |2          |2017|3           |2.00                |13.00       |6.50000000    |1.7142857142857142|
|1        |2          |2017|4           |2.33                |0.00        |6.50000000    |1.0               |
|1        |2          |2017|5           |2.33                |0.00        |6.50000000    |1.0               |
|1        |2          |2017|6           |2.33                |0.00        |6.50000000    |1.0               |
|1        

In [26]:
# ─────────────────────────────────────────────────────────────────────────────
# 10.  SAVE  →  Machine_Learning.imputed_data.fact_sales_weekly
# ─────────────────────────────────────────────────────────────────────────────
# Persist the merged weekly table, replacing the table's previous contents.
(
    fact_sales_weekly_final.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.imputed_data.fact_sales_weekly")
)

StatementMeta(, b8f3ff34-1bb7-4fc2-969a-a077cdfda45f, 28, Finished, Available, Finished)