# IV with Real Data: HRRP Penalties and Hospital Prices

*(Empirical-only notebook: data prep, plots, OLS, first stage/reduced form, and IV/2SLS in both R and Python.)*

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# Required for the Python IV/2SLS section (every other cell runs without it):
# !pip -q install linearmodels

# linearmodels is treated as an optional dependency: record whether the import
# succeeded so later cells can check the flag before using IV2SLS.
try:
    from linearmodels.iv import IV2SLS
except ImportError:
    HAVE_LINEARMODELS = False
    print("Note: linearmodels not installed. Python IV/2SLS section will not run until installed.")
else:
    HAVE_LINEARMODELS = True


In [None]:
%%R
# Bootstrap pacman if it is not installed, then let it install (when needed)
# and attach the packages used by the R cells.
# requireNamespace() only checks availability; pacman is always called with
# the `pacman::` prefix, so it does not need to be attached to the search path
# the way require() would attach it.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, fixest, modelsummary)


## Organize data (R)

In [None]:
%%R
# Load the HCRIS extract (one row per provider_number and year).
hcris.data <- read_csv("../data/output/hcris-snippets/hcris-data.csv")

# Prices in 2011 and 2014, reshaped wide: one row per hospital with columns
# price_2011 and price_2014.
hcris.price <- hcris.data %>%
  select(provider_number, price, year) %>%
  filter(year == 2011 | year == 2014) %>%
  pivot_wider(
    names_from = year,
    values_from = price,
    names_prefix = "price_"
  )

# HRRP penalty from the 2012 rows: positive payments are divided by 1000,
# missing or non-positive payments count as a zero penalty.
hcris.hrrp <- hcris.data %>%
  filter(year == 2012) %>%
  mutate(
    hrrp_penalty = if_else(
      !is.na(hrrp_payment) & hrrp_payment > 0,
      hrrp_payment / 1000,
      0
    ),
    any_penalty = hrrp_penalty > 0
  ) %>%
  select(provider_number, hrrp_penalty, any_penalty)

# Average Medicare discharges per hospital over the years before 2012,
# divided by 100.
hcris.mcare <- hcris.data %>%
  filter(year < 2012) %>%
  group_by(provider_number) %>%
  summarize(avg_mcare = mean(mcare_discharges, na.rm = TRUE) / 100) %>%
  ungroup()

# Analysis sample: hospitals with a price in both 2011 and 2014, plus the
# change in price between those two years.
hcris.final <- hcris.price %>%
  left_join(hcris.hrrp, by = "provider_number") %>%
  left_join(hcris.mcare, by = "provider_number") %>%
  filter(!is.na(price_2011) & !is.na(price_2014)) %>%
  mutate(price_change = price_2014 - price_2011)

# Quick look at the structure and first rows of the merged data.
dplyr::glimpse(hcris.final)
head(hcris.final)


## Organize data (Python)

In [None]:
# Load the HCRIS extract (one row per provider_number and year).
hcris_data = pd.read_csv("../data/output/hcris-snippets/hcris-data.csv")

# Prices in 2011 and 2014, reshaped wide: one row per hospital with columns
# price_2011 and price_2014.
hcris_price = (
    hcris_data[hcris_data["year"].isin([2011, 2014])][["provider_number", "price", "year"]]
    .pivot(index="provider_number", columns="year", values="price")
    .rename(columns=lambda yr: f"price_{int(yr)}")
    .reset_index()
)

# HRRP penalty from the 2012 rows: positive payments are divided by 1000,
# missing or non-positive payments count as a zero penalty (NaN > 0 is False,
# so missing payments fall through to 0.0).
hcris_hrrp = (
    hcris_data.loc[hcris_data["year"] == 2012, ["provider_number", "hrrp_payment"]]
    .assign(
        hrrp_penalty=lambda df: (df["hrrp_payment"] / 1000).where(df["hrrp_payment"] > 0, 0.0)
    )
    .drop(columns="hrrp_payment")
    .assign(any_penalty=lambda df: df["hrrp_penalty"].gt(0))
)

# Medicare discharges (pre-2012 average, in 100s).
# The built-in "mean" aggregation skips NaN by default, so it yields the same
# per-hospital average as np.nanmean without calling a Python lambda for every
# group and without a "Mean of empty slice" RuntimeWarning when a hospital has
# no non-missing discharges (the result is NaN in that case).
hcris_mcare = (
    hcris_data.loc[hcris_data["year"] < 2012, ["provider_number", "mcare_discharges"]]
    .groupby("provider_number", as_index=False)
    .agg(avg_mcare=("mcare_discharges", "mean"))
    .assign(avg_mcare=lambda df: df["avg_mcare"] / 100)
)

# Analysis sample: attach the penalty and Medicare-volume measures to the wide
# price table, keep hospitals with a price in both 2011 and 2014, and compute
# the change in price between those two years.
hcris_final = (
    hcris_price
    .merge(hcris_hrrp, how="left", on="provider_number")
    .merge(hcris_mcare, how="left", on="provider_number")
    .dropna(subset=["price_2011", "price_2014"])
    .assign(price_change=lambda df: df["price_2014"] - df["price_2011"])
)

# Preview the first rows of the merged data.
hcris_final.head()


## Naive OLS: price change on HRRP penalty

In [None]:
%%R
# Naive benchmark: regress the 2011-2014 price change directly on the 2012
# HRRP penalty, with no instrument.
ols_r <- lm(formula = price_change ~ hrrp_penalty, data = hcris.final)
summary(ols_r)


In [None]:
# Naive benchmark: regress the 2011-2014 price change directly on the 2012
# HRRP penalty, with no instrument.
ols_py = smf.ols(
    formula="price_change ~ hrrp_penalty",
    data=hcris_final,
).fit()
print(ols_py.summary())


## Instrument visualization: baseline Medicare volume vs HRRP penalty

In [None]:
%%R
# Scatter of the instrument (pre-2012 average Medicare discharges) against the
# 2012 HRRP penalty.
mcare.hrrp.plot <- hcris.final %>%
  ggplot(aes(x = avg_mcare, y = hrrp_penalty)) +
  geom_point(alpha = 0.6) +
  xlab("Medicare Discharges (before 2012) in 100s") +
  ylab("HRRP Penalty (2012) in thousands $") +
  ggtitle("HRRP Penalties vs Pre-Period Medicare Volume") +
  theme_bw()

# Print the plot object so the figure is rendered.
mcare.hrrp.plot


In [None]:
# Scatter of the instrument (pre-2012 average Medicare discharges) against the
# 2012 HRRP penalty.
plt.figure()
plt.scatter(hcris_final["avg_mcare"], hcris_final["hrrp_penalty"], alpha=0.6)
plt.gca().set(
    xlabel="Medicare Discharges (before 2012) in 100s",
    ylabel="HRRP Penalty (2012) in thousands $",
    title="HRRP Penalties vs Pre-Period Medicare Volume",
)
plt.show()


## First stage and reduced form

In [None]:
%%R
# First stage: the endogenous regressor (HRRP penalty) on the instrument.
first.stage <- feols(hrrp_penalty ~ avg_mcare, data = hcris.final)

# Reduced form: the outcome (price change) on the instrument.
red.form <- feols(price_change ~ avg_mcare, data = hcris.final)

# Side-by-side table of the two models, reporting the instrument's
# coefficient, the number of observations and the R-squared.
modelsummary(
  list(
    "First Stage" = first.stage,
    "Reduced Form" = red.form
  ),
  keep = "avg_mcare",
  coef_map = c("avg_mcare" = "Pre-HRRP Medicare Discharges"),
  gof_map = c("nobs", "r.squared")
)


In [None]:
# First stage: the endogenous regressor (HRRP penalty) on the instrument.
first_stage_py = smf.ols("hrrp_penalty ~ avg_mcare", data=hcris_final).fit()
# Reduced form: the outcome (price change) on the instrument.
red_form_py = smf.ols("price_change ~ avg_mcare", data=hcris_final).fit()

# Side-by-side summary of the instrument's coefficient, its standard error,
# the sample size and the R-squared for each model. Each column is built from
# the same four statistics, so it is generated once per fitted model.
# (pandas is already imported as `pd` in the notebook's first cell.)
table = pd.DataFrame(
    {
        label: [
            fit.params["avg_mcare"],
            fit.bse["avg_mcare"],
            int(fit.nobs),
            fit.rsquared,
        ]
        for label, fit in (
            ("First Stage", first_stage_py),
            ("Reduced Form", red_form_py),
        )
    },
    index=["Pre-HRRP Medicare Discharges", "Std. Error", "N", "R-squared"],
)

table


## IV / 2SLS estimates

In [None]:
%%R
# OLS benchmark: price change on the HRRP penalty.
ols <- feols(price_change ~ hrrp_penalty, data = hcris.final)

# IV/2SLS: the HRRP penalty is instrumented with pre-period Medicare volume;
# the `1` means there are no other exogenous regressors besides the intercept.
iv <- feols(
  price_change ~ 1 | hrrp_penalty ~ avg_mcare,
  data = hcris.final
)

# Side-by-side table. The IV model names its coefficient `fit_hrrp_penalty`,
# so both names are mapped to the same label to share one row.
modelsummary(
  list(
    "OLS" = ols,
    "IV" = iv
  ),
  keep = c("hrrp_penalty", "fit_hrrp_penalty"),
  coef_map = c(
    "hrrp_penalty" = "HRRP Penalty ($1000s)",
    "fit_hrrp_penalty" = "HRRP Penalty ($1000s)"
  ),
  gof_map = "nobs"
)


In [None]:
# OLS (Python): same specification as the naive OLS cell, re-fit here so the
# comparison table below does not depend on that cell having been run.
ols_py2 = smf.ols("price_change ~ hrrp_penalty", data=hcris_final).fit()

# IV/2SLS (Python): needs the linearmodels package; fail with an actionable
# message if the import at the top of the notebook did not succeed.
if not HAVE_LINEARMODELS:
    raise ImportError("Install linearmodels to run IV2SLS: pip install linearmodels")

# hrrp_penalty is the endogenous regressor, instrumented with avg_mcare.
# Standard errors: use the classical (homoskedastic) covariance with a
# degrees-of-freedom correction so the IV column is comparable to the OLS
# column beside it (statsmodels' default covariance) and to the R cell, which
# calls feols() without requesting a robust vcov. Mixing robust IV standard
# errors with classical OLS standard errors would make the two columns
# differ for reasons unrelated to the estimator.
iv_py = IV2SLS.from_formula(
    "price_change ~ 1 + [hrrp_penalty ~ avg_mcare]",
    data=hcris_final
).fit(cov_type="unadjusted", debiased=True)

# Compact side-by-side table: coefficient on the HRRP penalty, its standard
# error, and the number of observations used by each estimator.
out = pd.DataFrame(
    {
        "OLS": {
            "HRRP Penalty ($1000s)": ols_py2.params["hrrp_penalty"],
            "Std. Error": ols_py2.bse["hrrp_penalty"],
            "N": int(ols_py2.nobs),
        },
        "IV": {
            "HRRP Penalty ($1000s)": iv_py.params["hrrp_penalty"],
            "Std. Error": iv_py.std_errors["hrrp_penalty"],
            "N": int(iv_py.nobs),
        },
    }
)
out
