# HCRIS Data

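This notebook mirrors the HCRIS (Healthcare Cost Report Information System) data slide deck. It works with HCRIS snippet files to summarize hospital pricing, Hospital Readmissions Reduction Program (HRRP) penalties, and Medicare discharges, showing each step in both R and Python.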


In [None]:
!pip -q install rpy2

%load_ext rpy2.ipython


## Setup: file paths

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Base URL of the HCRIS snippet files hosted on GitHub.
base_dir = "https://raw.githubusercontent.com/imccart/empirical-methods-content/main/data/output/hcris-snippets"

# DATA_DIR is the name the path constants are built from; bind it to the base
# URL so the cell does not fail with a NameError.
DATA_DIR = base_dir

# Join with "/" rather than os.path.join: these are URLs, and os.path.join
# inserts the platform path separator (a backslash on Windows).
HCRIS_EMORY_PATH = f"{DATA_DIR}/hcris-emory.csv"
HCRIS_DATA_PATH = f"{DATA_DIR}/hcris-data.csv"

# Echo both URLs as the cell output.
HCRIS_EMORY_PATH, HCRIS_DATA_PATH


### R: load packages

In [None]:
%%R
# Install pacman on first use, then let it install (if needed) and attach the
# packages used by the R cells. requireNamespace() only checks availability;
# pacman itself does not need to be attached because it is called as pacman::.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(tidyverse, ggplot2, scales, patchwork)


## HCRIS for Emory hospitals

### R: load Emory snippet

In [None]:
%%R
# Read the Emory snippet directly from GitHub, then preview its columns.
hcris.emory <- readr::read_csv(
  "https://raw.githubusercontent.com/imccart/empirical-methods-content/main/data/output/hcris-snippets/hcris-emory.csv"
)
dplyr::glimpse(hcris.emory)


### Python: load Emory snippet

In [None]:
# Load the Emory snippet into pandas and show the first five rows.
hcris_emory = pd.read_csv(filepath_or_buffer=HCRIS_EMORY_PATH)
hcris_emory.head(n=5)


## Emory net patient revenue and Medicare discharges (means by year)

### R

In [None]:
%%R
# Yearly means for the Emory hospitals: net patient revenue (in $ millions)
# and Medicare discharges.
plot.dat <- hcris.emory %>%
  group_by(year) %>%
  summarize(net_rev = mean(net_pat_rev, na.rm = TRUE) / 1e6,
            mcare   = mean(mcare_discharges, na.rm = TRUE),
            .groups = "drop")

# The x axis is a factor, so ggplot2 puts each year in its own group and
# geom_line() has nothing to connect. group = 1 joins the years into one line.
rev.plot <- plot.dat %>%
  ggplot(aes(x = as.factor(year), y = net_rev)) +
  geom_line(aes(group = 1), linewidth = 1) +
  labs(x = "Year", y = "$ in millions", title = "Mean Patient Revenue Over Time") +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))

mcare.plot <- plot.dat %>%
  ggplot(aes(x = as.factor(year), y = mcare)) +
  geom_line(aes(group = 1), linewidth = 1) +
  labs(x = "Year", y = "Discharges", title = "Mean Medicare Discharges Over Time") +
  theme_bw() + theme(axis.text.x = element_text(angle = 90, hjust = 1))

# patchwork: place the two panels side by side.
rev.plot + mcare.plot


### Python

In [None]:
# Yearly means for the Emory hospitals; revenue is rescaled to $ millions.
plot_dat = hcris_emory.groupby("year", as_index=False).agg(
    net_rev=("net_pat_rev", lambda x: x.mean(skipna=True) / 1_000_000),
    mcare=("mcare_discharges", "mean"),
)

# Panel 1: mean net patient revenue by year (years shown as categorical labels).
fig_rev, ax_rev = plt.subplots()
ax_rev.plot(plot_dat["year"].astype(str), plot_dat["net_rev"], linewidth=1)
ax_rev.set(
    xlabel="Year",
    ylabel="$ in millions",
    title="Mean Patient Revenue Over Time",
)
ax_rev.tick_params(axis="x", rotation=90)
fig_rev.tight_layout()
plt.show()

# Panel 2: mean Medicare discharges by year.
fig_mc, ax_mc = plt.subplots()
ax_mc.plot(plot_dat["year"].astype(str), plot_dat["mcare"], linewidth=1)
ax_mc.set(
    xlabel="Year",
    ylabel="Discharges",
    title="Mean Medicare Discharges Over Time",
)
ax_mc.tick_params(axis="x", rotation=90)
fig_mc.tight_layout()
plt.show()


## HCRIS for all hospitals

### R: load and count hospitals per year

In [None]:
%%R
# Load the all-hospital snippet directly from GitHub.
hcris.data <- readr::read_csv(
  "https://raw.githubusercontent.com/imccart/empirical-methods-content/main/data/output/hcris-snippets/hcris-data.csv"
)

# One bar per year; geom_bar() counts the rows that fall in each year.
hosp.count.plot <- ggplot(hcris.data, aes(x = as.factor(year))) +
  geom_bar() +
  labs(
    x = "Year",
    y = "Number of Hospitals",
    title = "Number of Hospitals per Year"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

hosp.count.plot


### Python: load and count hospitals per year

In [None]:
# Load the all-hospital snippet.
hcris_data = pd.read_csv(filepath_or_buffer=HCRIS_DATA_PATH)

# Number of rows per year, ordered chronologically.
counts = hcris_data["year"].value_counts().sort_index()

# Bar chart of the yearly counts; years are drawn as categorical labels.
fig, ax = plt.subplots()
ax.bar(x=counts.index.astype(str), height=counts.values)
ax.set(
    xlabel="Year",
    ylabel="Number of Hospitals",
    title="Number of Hospitals per Year",
)
ax.tick_params(axis="x", rotation=90)
fig.tight_layout()
plt.show()


## Prices in the full HCRIS data

### R

In [None]:
%%R
# Mean price per year across all hospitals, drawn as a single line.
price.plot <- ggplot(
  hcris.data %>%
    group_by(year) %>%
    summarize(mean_price = mean(price, na.rm = TRUE), .groups = "drop"),
  aes(x = as.factor(year), y = mean_price, group = 1)
) +
  geom_line() +
  labs(
    x = "Year",
    y = "Average Hospital Price",
    title = "Hospital Prices per Year"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

price.plot


### Python

In [None]:
from matplotlib.ticker import FuncFormatter

# Mean price per year across all hospitals.
price_year = hcris_data.groupby("year", as_index=False).agg(
    mean_price=("price", "mean"),
)

# Tick formatter that prints values as integers with thousands separators.
comma_fmt = FuncFormatter(lambda x, pos: format(int(x), ","))

fig, ax = plt.subplots()
ax.plot(price_year["year"].astype(str), price_year["mean_price"], linewidth=1)
ax.set(
    xlabel="Year",
    ylabel="Average Hospital Price",
    title="Hospital Prices per Year",
)
ax.yaxis.set_major_formatter(comma_fmt)
ax.tick_params(axis="x", rotation=90)
fig.tight_layout()
plt.show()


## Removing very high (and very low) prices: winsorize within year

### R

In [None]:
%%R
# Keep positive prices, winsorize them within each year at the 5th and 95th
# percentiles, then plot the yearly mean of the clipped prices.
price.plot2 <- hcris.data %>%
  filter(price > 0) %>%
  group_by(year) %>%
  mutate(
    price = pmin(
      pmax(price, quantile(price, 0.05, na.rm = TRUE)),
      quantile(price, 0.95, na.rm = TRUE)
    )
  ) %>%
  summarize(mean_price = mean(price, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = as.factor(year), y = mean_price, group = 1)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Average Hospital Price",
    title = "Hospital Prices per Year (winsorized)"
  ) +
  scale_y_continuous(labels = scales::comma) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

price.plot2


### Python

In [None]:
# Working copy with only the year and price of rows whose price is positive.
df2 = hcris_data[hcris_data["price"] > 0][["year", "price"]].copy()

def winsorize_group(g):
    """Winsorize the ``price`` column of ``g`` at its 5th and 95th percentiles.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows to winsorize together; must contain a ``price`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``g`` whose ``price`` values are clipped to the
        [5th percentile, 95th percentile] range computed from ``g`` itself.
        The input frame is left unmodified.
    """
    prices = g["price"]
    p05 = prices.quantile(0.05)
    p95 = prices.quantile(0.95)
    # assign() returns a new frame, so the caller's data is not mutated.
    return g.assign(price=prices.clip(lower=p05, upper=p95))

# Winsorize each year's prices separately, then average within year.
# The groups are iterated explicitly instead of going through groupby.apply:
# pandas 2.2 deprecates apply operating on the grouping column, and without
# "year" in the result the second groupby below would fail.
winsorized_parts = [winsorize_group(g) for _, g in df2.groupby("year")]
# pd.concat rejects an empty list, so fall back to the (empty) input frame.
winsorized = pd.concat(winsorized_parts) if winsorized_parts else df2

price_year2 = (
    winsorized
    .groupby("year", as_index=False)
    .agg(mean_price=("price", "mean"))
)

fig, ax = plt.subplots()
ax.plot(price_year2["year"].astype(str), price_year2["mean_price"], linewidth=1)
ax.set_xlabel("Year")
ax.set_ylabel("Average Hospital Price")
ax.set_title("Hospital Prices per Year (winsorized)")
# comma_fmt is the thousands-separator tick formatter defined in the
# unwinsorized price cell.
ax.yaxis.set_major_formatter(comma_fmt)
ax.tick_params(axis="x", rotation=90)
plt.tight_layout()
plt.show()


## HRRP penalties: share of hospitals penalized

### R

In [None]:
%%R
# Flag hospital-years with a positive, non-missing HRRP payment and plot the
# flagged share by year.
share.hrrp <- hcris.data %>%
  mutate(
    penalized = if_else(!is.na(hrrp_payment) & hrrp_payment > 0, 1, 0)
  ) %>%
  group_by(year) %>%
  summarize(share_hrrp = mean(penalized, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = as.factor(year), y = share_hrrp, group = 1)) +
  geom_line() +
  labs(
    x = "Year",
    y = "Share of Hospitals",
    title = "Penalized Hospitals by Year"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

share.hrrp


### Python

In [None]:
# 1 where the HRRP payment is present and positive, 0 otherwise; the yearly
# mean of that flag is the share of flagged hospitals.
share_year = (
    hcris_data
    .assign(
        penalized=lambda d: (
            d["hrrp_payment"].notna() & (d["hrrp_payment"] > 0)
        ).astype(int)
    )
    .groupby("year", as_index=False)
    .agg(share_hrrp=("penalized", "mean"))
)

fig, ax = plt.subplots()
ax.plot(share_year["year"].astype(str), share_year["share_hrrp"], linewidth=1)
ax.set(
    xlabel="Year",
    ylabel="Share of Hospitals",
    title="Penalized Hospitals by Year",
)
ax.tick_params(axis="x", rotation=90)
fig.tight_layout()
plt.show()


## HRRP penalties: amounts and percent of Medicare payments

### R

In [None]:
%%R
# Among rows with a non-missing, non-zero HRRP payment: yearly mean and max of
# the payment, and of the payment as a fraction of total Medicare payments.
hrrp.stats <- hcris.data %>%
  filter(!is.na(hrrp_payment) & hrrp_payment != 0) %>%
  mutate(hrrp_percent = hrrp_payment / tot_mcare_payment) %>%
  group_by(year) %>%
  summarize(
    mean = mean(hrrp_payment),
    max = max(hrrp_payment),
    mean_percent = mean(hrrp_percent, na.rm = TRUE),
    max_percent = max(hrrp_percent, na.rm = TRUE),
    .groups = "drop"
  )

hrrp.stats


### Python

In [None]:
# Among rows with a non-missing, non-zero HRRP payment: yearly mean and max of
# the payment, and of the payment as a fraction of total Medicare payments.
hrrp_stats = (
    hcris_data
    .loc[lambda d: d["hrrp_payment"].notna() & d["hrrp_payment"].ne(0)]
    .assign(hrrp_percent=lambda d: d["hrrp_payment"] / d["tot_mcare_payment"])
    .groupby("year", as_index=False)
    .agg(
        mean=("hrrp_payment", "mean"),
        max=("hrrp_payment", "max"),
        mean_percent=("hrrp_percent", "mean"),
        max_percent=("hrrp_percent", "max"),
    )
)

# Preview the first five years of the summary.
hrrp_stats.head(n=5)


## HRRP penalties vs pre-period Medicare volume (2013 vs 2011)

### R

In [None]:
%%R
# One row per provider: 2011 Medicare discharges (thousands) paired with the
# 2013 HRRP payment ($ millions). Providers missing either value are dropped.
mcare.hrrp <- hcris.data %>%
  filter(year %in% c(2011, 2013)) %>%
  group_by(provider_number) %>%
  summarize(
    # [1] takes the first matching row, or NA when the year is absent.
    mcare_2011 = mcare_discharges[year == 2011][1] / 1e3,
    hrrp_pay_2013 = hrrp_payment[year == 2013][1] / 1e6,
    .groups = "drop"
  ) %>%
  drop_na(mcare_2011, hrrp_pay_2013)

mcare.hrrp.plot <- mcare.hrrp %>%
  ggplot(aes(x = mcare_2011, y = hrrp_pay_2013)) +
  geom_point(alpha = 0.6) +
  labs(
    x = "Medicare Discharges (2011) in thousands",
    y = "HRRP Penalty (2013) in millions $",
    title = "HRRP Penalties vs Pre-Period Medicare Volume"
  ) +
  theme_bw()

mcare.hrrp.plot


### Python

In [None]:
# Pivot-based construction of the 2011/2013 pairing, one row per provider.
# pivot() raises on duplicate (provider_number, year) pairs, so keep only the
# first row of each pair, matching the R cell's `[1]` selection.
wide = (
    hcris_data
    .loc[hcris_data["year"].isin([2011, 2013]),
         ["provider_number", "year", "mcare_discharges", "hrrp_payment"]]
    .drop_duplicates(subset=["provider_number", "year"], keep="first")
    .pivot(index="provider_number", columns="year")
)

# Build a flat two-column frame indexed by provider: 2011 discharges in
# thousands and the 2013 HRRP payment in $ millions. Providers missing either
# value are dropped.
mcare_hrrp = pd.DataFrame(
    {
        "mcare_2011": wide[("mcare_discharges", 2011)] / 1_000,
        "hrrp_pay_2013": wide[("hrrp_payment", 2013)] / 1_000_000,
    }
).dropna()

# Scatter of the 2013 HRRP payment against 2011 Medicare discharges.
fig, ax = plt.subplots()
ax.scatter(x=mcare_hrrp["mcare_2011"], y=mcare_hrrp["hrrp_pay_2013"], alpha=0.6)
ax.set(
    xlabel="Medicare Discharges (2011) in thousands",
    ylabel="HRRP Penalty (2013) in millions $",
    title="HRRP Penalties vs Pre-Period Medicare Volume",
)
fig.tight_layout()
plt.show()
