# Matching and Weighting IRL – Georgia MA Sandbox

This notebook mirrors the slide code and provides R and Python implementations for:

1. Constructing county-level GA Medicare Advantage data  
2. Defining an HHI-based binary treatment  
3. Assessing balance  
4. Matching estimators (exact / nearest neighbor / Mahalanobis / PS)  
5. IPW estimators (simple averages and regression)  
6. Regression-based estimators

The main kernel is Python; R is called via `rpy2` magics (`%%R`).


In [None]:
# Python setup: install core libs (comment out pip lines if already installed)
!pip install rpy2 statsmodels scikit-learn matplotlib pandas numpy --quiet

%load_ext rpy2.ipython


In [None]:
%%R
# R setup ---------------------------------------------------------------
# Install pacman only when it is missing. requireNamespace() checks that the
# package is available without attaching it; pacman is only ever called with
# the pacman:: prefix below, so it does not need to be on the search path.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# Load the packages used by the R cells in this notebook.
pacman::p_load(
  tidyverse, ggplot2, lubridate,
  MatchIt, cobalt, knitr, kableExtra, Matching
)


## 1. Construct county-level GA MA data, define treatment, and build analysis dataset (R)

In [None]:
%%R
# Construct county-level GA MA data ------------------------------------
# Read the CSV and collapse it to one row per county (fips).

ga_ma_2022 <- read_csv("https://raw.githubusercontent.com/imccart/empirical-methods-content/main/data/output/ma-snippets/ga-ma-data-2022.csv") %>%
  group_by(fips) %>%
  mutate(
    # County total is read from the first row of each fips group
    # (assumes avg_enrolled is constant within a county -- TODO confirm).
    total_ma_enrollment = first(avg_enrolled),
    # Row-level share in percent; NA when the county total is zero or missing.
    ma_share = if_else(
      total_ma_enrollment > 0,
      (avg_enrollment / total_ma_enrollment) * 100,
      NA_real_
    )
  ) %>%
  summarize(
    # Sum of squared percentage shares. When no share could be computed for
    # the county, return NA rather than the 0 that sum(na.rm = TRUE) would
    # produce: a zero HHI would be classified as the most competitive market
    # and would also pull down the quantile cutoffs computed from hhi_ma.
    hhi_ma             = if (all(is.na(ma_share))) {
      NA_real_
    } else {
      sum(ma_share^2, na.rm = TRUE)
    },
    plan_count         = n_distinct(contractid, planid),
    avg_premium_partc  = mean(premium_partc, na.rm = TRUE),
    share_pos_premiums = mean(premium_partc > 0, na.rm = TRUE),
    avg_bid            = mean(bid, na.rm = TRUE),
    avg_eligibles      = first(avg_eligibles),
    ffs_cost           = first(avg_ffscost)
  ) %>%
  ungroup()

# Define HHI-based treatment -------------------------------------------
# Cutoffs are the 33rd and 66th percentiles of county HHI.
q_hhi <- quantile(ga_ma_2022$hhi_ma, probs = c(0.33, 0.66), na.rm = TRUE)

ga_tab <- ga_ma_2022 %>%
  mutate(
    # At or above the upper cutoff: "treated" (high HHI, low competition).
    # At or below the lower cutoff: "control" (low HHI, high competition).
    # Anything in between (or with missing HHI) is left NA and dropped below.
    hhi_group = if_else(
      hhi_ma >= q_hhi[[2]],
      "treated",
      if_else(hhi_ma <= q_hhi[[1]], "control", NA_character_)
    ),
    # 1L for treated, 0L for control, NA when hhi_group is NA.
    treated_dummy = as.integer(hhi_group == "treated")
  ) %>%
  filter(!is.na(hhi_group))

# Analysis dataset: lp.vars (outcome + treatment + covariates) ---------
# Keep the treatment flag, HHI, and the county summaries, then drop any row
# that has a missing value in one of those columns.
lp.vars <- select(
  ga_tab,
  treated_dummy, hhi_ma, plan_count, avg_premium_partc,
  share_pos_premiums, avg_bid, avg_eligibles, ffs_cost
)
lp.vars <- lp.vars[complete.cases(lp.vars), ]

# Covariates for matching / PS model
lp.covs <- select(lp.vars, ffs_cost, avg_eligibles)

# Propensity scores -----------------------------------------------------
# Logit of treatment on the two matching covariates.
logit.model <- glm(
  treated_dummy ~ avg_eligibles + ffs_cost,
  family = binomial,
  data   = lp.vars
)

# Bound the fitted probabilities away from 0 and 1 so that 1 / ps and
# 1 / (1 - ps) stay finite. The bound matches the clipping applied to the
# propensity scores in the Python IPW cell of this notebook.
ps_eps <- 1e-6
ps <- pmin(pmax(fitted(logit.model), ps_eps), 1 - ps_eps)

# Attach PS and IPW to lp.vars -----------------------------------------
# ipw is 1 / ps for treated rows and 1 / (1 - ps) for control rows.
lp.vars <- lp.vars %>%
  mutate(
    ps  = ps,
    ipw = case_when(
      treated_dummy == 1 ~ 1 / ps,
      treated_dummy == 0 ~ 1 / (1 - ps),
      TRUE ~ NA_real_
    )
  )

glimpse(lp.vars)


## 1b. Optional: construct analogous analysis DataFrame in Python

In [None]:
import pandas as pd
import numpy as np

# Load the raw (not yet aggregated) file; summarize_fips below collapses it to one row per county
ga_ma_2022_raw = pd.read_csv("https://raw.githubusercontent.com/imccart/empirical-methods-content/main/data/output/ma-snippets/ga-ma-data-2022.csv")

def summarize_fips(group: pd.DataFrame) -> pd.Series:
    """Collapse the rows of one ``fips`` group into county-level summaries.

    Parameters
    ----------
    group : pd.DataFrame
        Rows for a single county. Reads the columns ``avg_enrolled``,
        ``avg_enrollment``, ``contractid``, ``planid``, ``premium_partc``,
        ``bid``, ``avg_eligibles`` and ``avg_ffscost``.

    Returns
    -------
    pd.Series
        ``hhi_ma``, ``plan_count``, ``avg_premium_partc``,
        ``share_pos_premiums``, ``avg_bid``, ``avg_eligibles``, ``ffs_cost``.
    """
    # County total is read from the first row of the group.
    total_ma_enrollment = group["avg_enrolled"].iloc[0]

    # Row-level shares in percent; all NaN when the total is zero or missing.
    if total_ma_enrollment > 0:
        ma_share = (
            group["avg_enrollment"] / total_ma_enrollment
        ).to_numpy(dtype=float) * 100.0
    else:
        ma_share = np.full(len(group), np.nan)

    # np.nansum returns 0.0 when every share is NaN; an HHI of 0 would be read
    # as the most competitive market, so report NaN in that case instead.
    squared_shares = ma_share ** 2
    if np.isnan(squared_shares).all():
        hhi_ma = np.nan
    else:
        hhi_ma = np.nansum(squared_shares)

    plan_count = group[["contractid", "planid"]].drop_duplicates().shape[0]
    avg_premium_partc = group["premium_partc"].mean(skipna=True)

    # Drop missing premiums before taking the share. `NaN > 0` evaluates to
    # False, so without this step missing premiums would be counted as
    # non-positive, unlike the R cell's mean(premium_partc > 0, na.rm = TRUE).
    observed_premiums = group["premium_partc"].dropna()
    share_pos_premiums = (observed_premiums > 0).mean()

    avg_bid = group["bid"].mean(skipna=True)
    avg_eligibles = group["avg_eligibles"].iloc[0]
    ffs_cost = group["avg_ffscost"].iloc[0]

    return pd.Series(
        dict(
            hhi_ma=hhi_ma,
            plan_count=plan_count,
            avg_premium_partc=avg_premium_partc,
            share_pos_premiums=share_pos_premiums,
            avg_bid=avg_bid,
            avg_eligibles=avg_eligibles,
            ffs_cost=ffs_cost,
        )
    )

# Apply summarize_fips to each fips group to get one row per county.
county_groups = ga_ma_2022_raw.groupby("fips", as_index=False)
ga_ma_2022_py = county_groups.apply(summarize_fips).reset_index(drop=True)

# Define treatment in Python the same way
# (33rd / 66th percentile cutoffs of county HHI).
q_hhi_py = ga_ma_2022_py["hhi_ma"].quantile([0.33, 0.66])
low_q = q_hhi_py.iloc[0]
high_q = q_hhi_py.iloc[1]

lp_df = ga_ma_2022_py.copy()
hhi = lp_df["hhi_ma"]
# 1 at or above the upper cutoff, 0 at or below the lower cutoff, NaN between;
# the first matching condition wins, so "treated" takes priority on ties.
lp_df["treated_dummy"] = np.select(
    [hhi >= high_q, hhi <= low_q],
    [1.0, 0.0],
    default=np.nan,
)
# Keep only counties assigned to one of the two groups.
lp_df = lp_df[lp_df["treated_dummy"].notna()].copy()

lp_df.head()


## 2. Matching estimators (R)

In [None]:
%%R
# 2.1 Exact matching on ffs_cost only ----------------------------------
# Single-covariate matrix used for the exact match.
lp.covs2 <- lp.covs %>% select(ffs_cost)

m.exact2 <- Matching::Match(
  Y        = lp.vars$avg_bid,
  Tr       = lp.vars$treated_dummy,
  X        = lp.covs2,
  M        = 1,
  exact    = TRUE,
  estimand = "ATE"
)

# `$est` only exists when Match() returns its usual list result. Exact
# matching on a continuous covariate may leave no valid matches, in which case
# the returned object is not a list and `$est` would raise an error, so report
# that case explicitly instead.
if (is.list(m.exact2)) {
  cat("Exact matching (on ffs_cost), ATE:", m.exact2$est[1], "\n")
  summary(m.exact2)
} else {
  cat("Exact matching (on ffs_cost): no valid matches, ATE not available\n")
}


In [None]:
%%R
# 2.2 Nearest neighbor, inverse-variance (Weight=1), M=1 ---------------
# Outcome and treatment come from lp.vars; matching is on both columns of
# lp.covs with one match per unit.
m.nn.var2 <- with(
  lp.vars,
  Matching::Match(
    Y = avg_bid, Tr = treated_dummy, X = lp.covs,
    M = 1, Weight = 1, estimand = "ATE"
  )
)

cat("NN (Weight=1, M=1), ATE:", m.nn.var2[["est"]][1], "\n")
summary(m.nn.var2)


In [None]:
%%R
# 2.3 Nearest neighbor, Mahalanobis (Weight=2) -------------------------
# Same inputs as the Weight=1 match; only the Weight argument changes.
m.nn.md <- with(
  lp.vars,
  Matching::Match(
    Y = avg_bid, Tr = treated_dummy, X = lp.covs,
    M = 1, Weight = 2, estimand = "ATE"
  )
)

cat("NN (Mahalanobis), ATE:", m.nn.md[["est"]][1], "\n")
summary(m.nn.md)


In [None]:
%%R
# 2.4 Nearest neighbor, propensity score -------------------------------
# Match on the single ps column stored in lp.vars, one match per unit.
m.nn.ps <- with(
  lp.vars,
  Matching::Match(
    Y = avg_bid, Tr = treated_dummy, X = ps,
    M = 1, estimand = "ATE"
  )
)

cat("NN (propensity score), ATE:", m.nn.ps[["est"]][1], "\n")
summary(m.nn.ps)


## 3. IPW estimators (R)

In [None]:
%%R
# 3.1 IPW with simple averages -----------------------------------------
# IPW-weighted mean of avg_bid within the treated rows ...
mean.t1 <- summarize(
  filter(lp.vars, treated_dummy == 1),
  mean_bid = weighted.mean(avg_bid, w = ipw)
)

# ... and within the control rows.
mean.t0 <- summarize(
  filter(lp.vars, treated_dummy == 0),
  mean_bid = weighted.mean(avg_bid, w = ipw)
)

# ATE estimate: difference between the two weighted means.
ate_ipw_means <- mean.t1[["mean_bid"]] - mean.t0[["mean_bid"]]
cat("IPW (simple averages), ATE:", round(ate_ipw_means, 3), "\n")


In [None]:
%%R
# 3.2 IPW regression ---------------------------------------------------
# Weighted least squares of the outcome on the treatment dummy, using the
# inverse-probability weights stored in lp.vars.
ipw.reg <- lm(
  avg_bid ~ treated_dummy,
  data    = lp.vars,
  weights = ipw
)

# Print explicitly: summary() is not the last expression in this cell, so it
# cannot rely on auto-printing to show the table.
print(summary(ipw.reg))
cat("IPW regression coef on treated_dummy:", coef(ipw.reg)["treated_dummy"], "\n")


## 4. Regression-based estimators (R, no weighting)

In [None]:
%%R
# Two-step regression --------------------------------------------------
# Fit the outcome model separately on treated and control rows.
reg1.dat <- filter(lp.vars, treated_dummy == 1 & complete.cases(lp.vars))
reg1 <- lm(avg_bid ~ ffs_cost + avg_eligibles, data = reg1.dat)

reg0.dat <- filter(lp.vars, treated_dummy == 0 & complete.cases(lp.vars))
reg0 <- lm(avg_bid ~ ffs_cost + avg_eligibles, data = reg0.dat)

# Predict both potential outcomes for every row of lp.vars, then average the
# row-level differences.
pred1 <- predict(reg1, newdata = lp.vars)
pred0 <- predict(reg0, newdata = lp.vars)

ate_reg2 <- mean(pred1 - pred0, na.rm = TRUE)
cat("Two-step regression ATE (mean(pred1 - pred0)):", round(ate_reg2, 3), "\n")

# One-step regression with interactions -------------------------------
# Interact the treatment dummy with each covariate's deviation from its
# sample mean. Because the covariates are demeaned inside the interactions,
# the coefficient on treated_dummy is the treatment effect evaluated at the
# covariate means.
reg.dat <- lp.vars %>%
  ungroup() %>%
  filter(complete.cases(.)) %>%
  mutate(
    ffs_diff       = treated_dummy * (ffs_cost       - mean(ffs_cost)),
    eligibles_diff = treated_dummy * (avg_eligibles  - mean(avg_eligibles))
  )

reg <- lm(
  avg_bid ~ treated_dummy + ffs_cost + avg_eligibles +
    ffs_diff + eligibles_diff,
  data = reg.dat
)

# Print explicitly: summary() is not the last expression in this cell, so it
# cannot rely on auto-printing to show the table.
print(summary(reg))
cat("One-step regression coef on treated_dummy:", coef(reg)["treated_dummy"], "\n")


## 5. Optional: IPW and regression in Python

In [None]:
import statsmodels.formula.api as smf

# Keep only rows with every column the models below need.
cols_needed = ["avg_bid", "treated_dummy", "ffs_cost", "avg_eligibles"]
lp_df = lp_df.dropna(subset=cols_needed).copy()

# Propensity scores (Python) for IPW -----------------------------------
ps_formula = "treated_dummy ~ avg_eligibles + ffs_cost"
logit_res = smf.logit(ps_formula, data=lp_df).fit(disp=False)

# Clip the predicted probabilities to [eps, 1 - eps] so the inverse weights
# below stay finite.
eps = 1e-6
lp_df["ps"] = logit_res.predict(lp_df).clip(eps, 1-eps)

# Weight is 1 / ps for treated rows and 1 / (1 - ps) for everyone else.
is_treated = lp_df["treated_dummy"] == 1
lp_df["ipw"] = 1.0 / lp_df["ps"].where(is_treated, 1.0 - lp_df["ps"])

treated_py = lp_df[is_treated]
control_py = lp_df[lp_df["treated_dummy"] == 0]

# Difference of the IPW-weighted outcome means.
mean_t1_py = np.average(treated_py["avg_bid"], weights=treated_py["ipw"])
mean_t0_py = np.average(control_py["avg_bid"], weights=control_py["ipw"])
ate_ipw_py = mean_t1_py - mean_t0_py

print("Python IPW (simple averages), ATE:", round(ate_ipw_py, 3))

# IPW regression -------------------------------------------------------
# Weighted least squares of the outcome on the treatment dummy, weighted by
# the ipw column.
ipw_weights = lp_df["ipw"]
ipw_mod = smf.wls("avg_bid ~ treated_dummy", data=lp_df, weights=ipw_weights).fit()

print(ipw_mod.summary())
treated_coef = ipw_mod.params.loc["treated_dummy"]
print("Python IPW regression coef on treated_dummy:", treated_coef)


## 6. Where to look

- **Matching ATEs (R):** objects `m.exact2`, `m.nn.var2`, `m.nn.md`, `m.nn.ps`  
- **IPW ATEs (R):** `ate_ipw_means`, `ipw.reg`  
- **Regression ATEs (R):** `ate_reg2`, `reg`  
- **Python analogues:** see the last IPW / regression cell using `lp_df`.

You can now modify covariates, treatment definitions, or outcomes and re-run each block to explore how the estimators behave.
