# About this Notebook


In [0]:
# import packages
from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [0]:


# Read the DiD panel table into a Spark DataFrame
did = spark.table("did_panel")

# Sanity check 1: all four (after, close) combinations should appear with a row count
(
    did.select("after", "close")
    .groupBy("after", "close")
    .count()
    .orderBy("after", "close")
    .show()
)

# Sanity check 2: earliest and latest date, to confirm the panel spans the intended evaluation window
did.agg(
    F.min("date").alias("min_date"),
    F.max("date").alias("max_date"),
).show()


# Simple 2x2 DiD

(Treatment Post - Treatment Pre) - (Control Post - Control Pre)

2×2 DiD: a quick baseline estimate that is easy to interpret and doubles as a check that the treatment and period indicators are coded correctly.

Regression DiD with FE: “main” estimate that controls for:

- DMA fixed effects (markets differ in baseline sales)
- Week/date fixed effects (seasonality, macro shocks)

In [0]:
# Simple 2x2 DiD: cell table
# One row per (after, close) combination [Post vs Pre, Treated vs Non Treated],
# holding the mean of sales and the number of rows behind that mean.
cell = did.groupBy("after", "close").agg(
    F.avg("sales").alias("avg_sales"),
    F.count("*").alias("n"),
)

# Render the cells in a stable (after, close) order
display(cell.orderBy("after", "close"))


In [0]:
# Bring the cell means to the driver as {(after, close): avg_sales}.
# `cell` holds one row per (after, close) combination, so this is expected to be tiny.
vals = {(r["after"], r["close"]): r["avg_sales"] for r in cell.collect()}

# Guard: the 2x2 estimate needs a non-null average in every one of the four cells.
# Without this check a missing cell surfaces as a bare KeyError and a null average
# as a TypeError from the subtraction, neither of which says which cell is at fault.
required_cells = [(1, 1), (0, 1), (1, 0), (0, 0)]
missing_cells = [key for key in required_cells if vals.get(key) is None]
if missing_cells:
    raise ValueError(
        "Simple 2x2 DiD needs an average for every (after, close) cell; "
        f"missing or null: {missing_cells}"
    )

# (Treated post - Treated pre) - (Control post - Control pre)
did_simple = (vals[(1, 1)] - vals[(0, 1)]) - (vals[(1, 0)] - vals[(0, 0)])
print("Simple DiD (avg sales):", did_simple)


A negative simple DiD estimate means that average sales in treated DMAs (close=1) fell more (or rose less) than in control DMAs (close=0) from the pre-period to the post-period.

# Pre-trend Plausibility Check

In [0]:
# Pre-trend plausibility check
#
# Keep only the pre-period rows (after == 0, i.e. before the BOPS initiative), then
# compute the mean sales and row count for each date within each group:
# Treated (close = 1) and Non-Treated (close = 0).
pre_trends = (
    did.where(F.col("after") == 0)
    .groupBy("date", "close")
    .agg(
        F.avg("sales").alias("avg_sales"),
        F.count("*").alias("n"),
    )
    .orderBy("date", "close")
)

# Save as a Delta table, overwriting any existing contents, then render the result
(
    pre_trends.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("did_pre_trends")
)
display(pre_trends)


Databricks visualization. Run in Databricks to view.

In [0]:
# Pre-period gap plot: express each group's average sales as an index,
# with the group's first pre-period date set to 100.

pre = spark.table("did_pre_trends")

# One partition per close value, rows ordered by date, so first() picks the earliest date's value
w = Window.partitionBy("close").orderBy("date")

pre_indexed = pre.withColumn(
    "index_100",
    # avg_sales relative to the first pre week of the same group, scaled to 100
    (F.col("avg_sales") / F.first("avg_sales").over(w)) * 100.0,
).orderBy("date", "close")

# Save as a Delta table, overwriting any existing contents, then render the result
(
    pre_indexed.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("did_pre_trends_indexed")
)
display(pre_indexed)



Databricks visualization. Run in Databricks to view.