In [1]:
# ─────────────────────────────────────────────────────────────────────────────
# 11.  TRIM LAST MONTH  +  KEEP COMBOS WITH 104 CONSECUTIVE WEEKS
# ─────────────────────────────────────────────────────────────────────────────
from pyspark.sql import functions as F
from pyspark.sql import Window
import datetime as dt
# 11-a ─ Load the weekly fact table and the calendar lookup that supplies week_start_date
weekly = spark.table("Machine_Learning.imputed_data.fact_sales_weekly")

# Reduce dim_date to its unique (year, week_of_year, week_start_date) triples;
# this is intended to leave 1 row per calendar week for the join that follows.
dim_date = (
    spark.table("gold_data.dim_date.dim_date")
    .select("year", "week_of_year", "week_start_date")
    .distinct()
)


StatementMeta(, 57b63e36-6816-4527-8985-b63a30dc16b5, 3, Finished, Available, Finished)

In [2]:
# Attach week_start_date to every weekly row. Left join: rows with no calendar
# match keep a NULL week_start_date and are dropped by the window filter below.
weekly_w_dates = weekly.join(dim_date, ["year", "week_of_year"], "left")

# ─── 11-b  Hard end-date = last Monday in September 2019 (2019-09-30) ──────
# If that Monday isn't present, use 2019-09-23 (the previous one)
N_WEEKS      = 104                                     # required number of consecutive weeks
TARGET_END   = dt.date(2019, 9, 30)
FALLBACK_END = TARGET_END - dt.timedelta(weeks=1)      # 2019-09-23
present    = (weekly_w_dates.filter(F.col("week_start_date") == F.lit(TARGET_END))
                            .limit(1).count()) > 0
end_date   = TARGET_END if present else FALLBACK_END

# start = end − (N_WEEKS − 1) weeks  (N_WEEKS Mondays inclusive)
start_date = end_date - dt.timedelta(weeks=N_WEEKS - 1)

# restrict the data to that exact N_WEEKS-week window
weekly_window = (weekly_w_dates
                 .filter((F.col("week_start_date") >= F.lit(start_date)) &
                         (F.col("week_start_date") <= F.lit(end_date))))

# ─── 11-c  Keep combos that have *exactly* those 104 Mondays (no gaps) ─────
# wk_cnt counts DISTINCT week_start_date values rather than rows: a plain
# count("*") can reach 104 through duplicated rows (e.g. join fan-out or
# repeated fact rows) even when some weeks are missing, which would let a
# combo with gaps slip through.
combo_stats = (weekly_window
               .groupBy("store_key", "product_key")
               .agg(F.countDistinct("week_start_date").alias("wk_cnt"),
                    F.min("week_start_date").alias("min_week"),
                    F.max("week_start_date").alias("max_week")))

# Inside the window there are only N_WEEKS possible Mondays, so (assuming every
# week_start_date is a Monday — TODO confirm against dim_date) wk_cnt >= N_WEEKS
# is equivalent to "all N_WEEKS weeks present".
valid_combo = (combo_stats
               .filter((F.col("wk_cnt") >= N_WEEKS) &
                       (F.col("min_week") == F.lit(start_date)) &
                       (F.col("max_week") == F.lit(end_date)))
               .select("store_key", "product_key"))

# keep every in-window row that belongs to a fully-covered combo
weekly_final = weekly_window.join(valid_combo,
                                  ["store_key", "product_key"],
                                  "inner")



# quick check
weekly_final.groupBy().agg(
    F.countDistinct("store_key", "product_key").alias("kept_combos"),
    F.min("week_start_date").alias("oldest_week"),
    F.max("week_start_date").alias("newest_week")
).show(truncate=False)

StatementMeta(, 57b63e36-6816-4527-8985-b63a30dc16b5, 4, Finished, Available, Finished)

+-----------+-----------+-----------+
|kept_combos|oldest_week|newest_week|
+-----------+-----------+-----------+
|3801       |2017-10-09 |2019-09-30 |
+-----------+-----------+-----------+



In [3]:
# ─── 11-d  WRITE OUT ────────────────────────────────────────────────────────
# Persist the kept-combo rows; "overwrite" replaces any existing table contents.
(
    weekly_final.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.imputed_data.fact_sales_weekly_combos_with_data")
)

StatementMeta(, 57b63e36-6816-4527-8985-b63a30dc16b5, 5, Finished, Available, Finished)

How it works

Attach calendar Mondays with dim_date so every weekly row has
week_start_date.

Force the window to span exactly 104 Mondays: 2017-10-09 → 2019-09-30
(or 2017-10-02 → 2019-09-23 if 30 Sep isn't present).

Count rows per (store, product) and insist on
min = start_date, max = end_date, count = 104.

Save the rows of those surviving combos (the weekly_final DataFrame) to
Machine_Learning.imputed_data.fact_sales_weekly_combos_with_data.

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# 11-c2  KEEP COMBOS THAT DID *NOT* MAKE IT (i.e. the “not_kept” set)
# ─────────────────────────────────────────────────────────────────────────────

# Derive invalid combos as "every combo seen in the window minus valid_combo".
# An anti-join is used instead of re-typing and negating the validity predicate,
# so the kept / not-kept sets cannot drift apart if the rule behind valid_combo
# changes, and a predicate evaluating to NULL cannot drop a combo from both sets.
combo_keys = ["store_key", "product_key"]
invalid_combo = (
    combo_stats
    .select(*combo_keys)
    .join(valid_combo, combo_keys, "left_anti")
)

# now pull all weekly rows for those invalid combos
fact_sales_weekly_not_kept_combos = (
    weekly_window
    .join(invalid_combo, combo_keys, "inner")
)

# quick sanity check
fact_sales_weekly_not_kept_combos.groupBy().agg(
    F.countDistinct("store_key", "product_key").alias("not_kept_combos"),
    F.min("week_start_date").alias("oldest_week"),
    F.max("week_start_date").alias("newest_week")
).show(truncate=False)



StatementMeta(, 57b63e36-6816-4527-8985-b63a30dc16b5, 6, Finished, Available, Finished)

+---------------+-----------+-----------+
|not_kept_combos|oldest_week|newest_week|
+---------------+-----------+-----------+
|12467          |2017-10-09 |2019-09-30 |
+---------------+-----------+-----------+



In [5]:
# ─── WRITE OUT ────────────────────────────────────────────────────────────────
# Persist the rows of the rejected combos; "overwrite" replaces any existing table contents.
(
    fact_sales_weekly_not_kept_combos.write
    .mode("overwrite")
    .saveAsTable("Machine_Learning.imputed_data.fact_sales_weekly_not_kept_combos")
)

StatementMeta(, 57b63e36-6816-4527-8985-b63a30dc16b5, 7, Finished, Available, Finished)