# Difference-in-Differences Part I: Simulations

*(Empirical-only notebook: implements all code chunks from the slide deck in both R and Python.)*


In [None]:
!pip -q install rpy2
%load_ext rpy2.ipython

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf


In [None]:
%%R
# Install pacman only when it is missing. requireNamespace() tests for
# availability without attaching the package or warning when it is absent;
# attaching is unnecessary because p_load is called through pacman:: below.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# Attach the packages used by the R cells of this notebook
# (p_load installs any that are not yet available before loading them).
pacman::p_load(tidyverse, modelsummary)


## Simulated DD data (R)


In [None]:
%%R
# Simulate a two-period difference-in-differences data set.
# Fix the seed so the draws are reproducible.
set.seed(123)
N <- 5000

# Start with one row per unit (a coin-flip treatment indicator `d` plus two
# placeholder period columns), stack the period columns so every unit appears
# once as "pre" and once as "post", then build the outcome:
#   y.out = 1.5 + 3 * d + 1.5 * t + 6 * d * t + N(0, 1) noise
dd.dat <- tibble(
  d = runif(N) > 0.5,
  time_pre = "pre",
  time_post = "post"
) %>%
  pivot_longer(cols = c(time_pre, time_post), values_to = "time") %>%
  select(d, time) %>%
  mutate(t = time == "post") %>%
  mutate(y.out = 1.5 + 3 * d + 1.5 * t + 6 * d * t + rnorm(2 * N))

head(dd.dat)


## Simulated DD data (Python)


In [None]:
# Simulate a two-period difference-in-differences data set.
# Seeded generator so the draws are reproducible.
rng = np.random.default_rng(123)
N = 5000

# One row per unit: a coin-flip treatment indicator `d` plus two placeholder
# columns that become the "pre" and "post" period rows after reshaping.
base = pd.DataFrame({
    "d": rng.uniform(0, 1, N) > 0.5,
    "time_pre": "pre",
    "time_post": "post"
})

# Reshape wide -> long so each unit appears once per period.
# id_vars=["d"] carries the treatment indicator onto every long row; without
# it melt() returns only the unpivoted columns and selecting "d" below raises
# a KeyError.
dd_dat = (
    base
    .melt(id_vars=["d"], value_vars=["time_pre", "time_post"], value_name="time")
    .loc[:, ["d", "time"]]
)

# Post-period indicator.
dd_dat["t"] = dd_dat["time"] == "post"

# Outcome: y = 1.5 + 3*d + 1.5*t + 6*d*t + N(0, 1) noise.
# The booleans are cast to int so the arithmetic is explicit.
dd_dat["y_out"] = (
    1.5
    + 3 * dd_dat["d"].astype(int)
    + 1.5 * dd_dat["t"].astype(int)
    + 6 * dd_dat["d"].astype(int) * dd_dat["t"].astype(int)
    + rng.normal(0, 1, N * 2)
)

dd_dat.head()


## Mean differences (R)


In [None]:
%%R
# Mean outcome in each treatment-by-period cell, with the logical indicators
# replaced by readable labels after aggregation (so row order still follows
# the logical values, not the label text).
dd.means <- dd.dat %>%
  group_by(d, t) %>%
  summarise(mean_y = mean(y.out), .groups = "drop") %>%
  mutate(
    d = if_else(d, "Treated", "Control"),
    t = if_else(t, "Post", "Pre")
  )

dd.means


## Mean differences (Python)


In [None]:
# Mean outcome in each treatment-by-period cell; the boolean indicators are
# swapped for readable labels once the means have been computed.
dd_means = (
    dd_dat
    .groupby(["d", "t"])["y_out"]
    .mean()
    .reset_index(name="mean_y")
    .assign(
        d=lambda cells: cells["d"].map({True: "Treated", False: "Control"}),
        t=lambda cells: cells["t"].map({True: "Post", False: "Pre"}),
    )
)

dd_means


## Regression estimator (R)


In [None]:
%%R
# DD regression: `d * t` expands to both main effects plus the d:t
# interaction, whose coefficient is the difference-in-differences estimate.
dd.est <- lm(y.out ~ d * t, data = dd.dat)

# Coefficient table only: hide the intercept and all goodness-of-fit rows.
modelsummary(
  dd.est,
  gof_map = NA,
  coef_omit = "Intercept"
)


## Regression estimator (Python)


In [None]:
# Regression copy of the data with the boolean indicators cast to 0/1 so the
# formula terms appear as plain `d`, `t` and `d:t`; dd_dat itself is untouched.
dd_dat_for_reg = dd_dat.astype({"d": int, "t": int})

# OLS with main effects and the interaction; the d:t coefficient is the
# difference-in-differences estimate.
dd_est = smf.ols(formula="y_out ~ d + t + d:t", data=dd_dat_for_reg).fit()
dd_est.summary()
