# Build data

This script builds the necessary data files for homework 1. We work from the raw data and save intermediate data for future analysis.

## Preliminary loading and packages

In [1]:
!pip -q install rpy2
%load_ext rpy2.ipython


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
%%R
options(repos = c(CRAN = "https://cloud.r-project.org"))

if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, ggplot2, dplyr, lubridate, stringr, readxl, data.table, gdata, scales)

Loading required package: pacman


## Read enrollment and contract data

In [4]:
%%R

# Read the 2018_01 contract-info file with readr's default column guessing and
# show its first rows, to inspect the raw column layout.
test.dat <- "../ma-data/ma/enrollment/Extracted Data/CPSC_Contract_Info_2018_01.csv" %>%
  read_csv()
test.dat %>% head()

Rows: 5235 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (12): Contract ID, Plan ID, Organization Type, Plan Type, Offers Part D,...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 6 × 12
  `Contract ID` `Plan ID` `Organization Type`        `Plan Type` `Offers Part D`
  <chr>         <chr>     <chr>                      <chr>       <chr>          
1 90091         <NA>      HCPP - 1833 Cost           HCPP - 183… No             
2 E0654         801       Employer/Union Only Direc… Employer/U… Yes            
3 E3014         801       Employer/Union Only Direc… Employer/U… Yes            
4 E4744         801       Employer/Union Only Direc… Employer/U… Yes            
5 H0022         001       Demo                       Medicare-M… Yes            
6 H0028         004       Local CCP                  HMO/HMO

In [3]:
%%R
  # Read one contract-info CSV.
  #
  # The file's own header row is skipped and replaced by the short names
  # below. `planid` is parsed as a number; every other column is kept as text.
  #
  # path: location of the CSV, passed straight to read_csv().
  # Returns the tibble produced by read_csv().
  read_contract <- function(path) {
    # One entry per column, in file order: the name is the new column name,
    # the value is the parser applied to that column.
    parsers <- list(
      contractid         = col_character(),
      planid             = col_double(),
      org_type           = col_character(),
      plan_type          = col_character(),
      partd              = col_character(),
      snp                = col_character(),
      eghp               = col_character(),
      org_name           = col_character(),
      org_marketing_name = col_character(),
      plan_name          = col_character(),
      parent_org         = col_character(),
      contract_date      = col_character()
    )

    read_csv(
      file = path,
      col_names = names(parsers),
      col_types = do.call(cols, parsers),
      skip = 1,
      progress = FALSE,
      show_col_types = FALSE
    )
  }

  # Read one enrollment CSV.
  #
  # The header row is skipped and replaced by the short names below. Cells
  # holding "*" are read as missing. `contractid`, `state` and `county` stay
  # text; the remaining columns are parsed as numbers.
  #
  # path: location of the CSV, passed straight to read_csv().
  # Returns the tibble produced by read_csv().
  read_enroll <- function(path) {
    # One entry per column, in file order: name = new column name,
    # value = parser for that column.
    parsers <- list(
      contractid = col_character(),
      planid     = col_double(),
      ssa        = col_double(),
      fips       = col_double(),
      state      = col_character(),
      county     = col_character(),
      enrollment = col_double()
    )

    read_csv(
      file = path,
      col_names = names(parsers),
      col_types = do.call(cols, parsers),
      na = "*",
      skip = 1,
      progress = FALSE,
      show_col_types = FALSE
    )
  }

  # One-month loader --------------------------------------------------------
  # Read the contract-info and enrollment files for one month and join them.
  #
  # m: month, either a zero-padded string ("01") or a number (1). It is
  #    normalised to two digits when building the file names, so both forms
  #    point at the same files.
  # y: year, used in the file names and stored as-is in the `year` column.
  # data_dir: folder holding the CPSC_*.csv files; defaults to the location
  #    that was previously hard-coded.
  #
  # Returns the contract rows (one per contractid/planid) left-joined to the
  # enrollment rows on contractid and planid, with an integer `month` column
  # and a `year` column added. Plans without enrollment rows are kept.
  load_month <- function(m, y,
                         data_dir = "../ma-data/ma/enrollment/Extracted Data") {
    month_num <- as.integer(m)
    stopifnot(
      length(month_num) == 1L,
      !is.na(month_num),
      month_num >= 1L,
      month_num <= 12L
    )
    suffix <- paste0(y, "_", sprintf("%02d", month_num), ".csv")

    c_path <- file.path(data_dir, paste0("CPSC_Contract_Info_", suffix))
    e_path <- file.path(data_dir, paste0("CPSC_Enrollment_Info_", suffix))

    # Keep a single row per contract/plan so the join below cannot multiply
    # enrollment rows.
    contract.info <- read_contract(c_path) %>%
      distinct(contractid, planid, .keep_all = TRUE)

    enroll.info <- read_enroll(e_path)

    contract.info %>%
      left_join(enroll.info, by = c("contractid", "planid")) %>%
      mutate(month = month_num, year = y)
  }

In [4]:
%%R

  # Months as zero-padded strings ("01" ... "12").
  monthlist <- sprintf("%02d", 1:12)
  y <- 2018

  # Read all months, then tidy once ----------------------------------------
  # Stack the monthly tables, then fill gaps in descriptive columns from other
  # rows of the same group (carrying values down first, then up):
  #   * fips                  within state/county
  #   * plan-level fields     within contract/plan
  #   * contract-level fields within contract
  plan.year <- map_dfr(monthlist, ~ load_month(.x, y)) %>%
    arrange(contractid, planid, state, county, month) %>%
    group_by(state, county) %>%
    fill(fips, .direction = "downup") %>%
    ungroup() %>%
    group_by(contractid, planid) %>%
    fill(plan_type, partd, snp, eghp, plan_name, .direction = "downup") %>%
    ungroup() %>%
    group_by(contractid) %>%
    fill(org_type, org_name, org_marketing_name, parent_org, .direction = "downup") %>%
    ungroup()


  # Collapse to yearly panel ------------------------------------------------
  # One row per contract x plan x fips x year. Enrollment statistics use only
  # the non-missing months and are NA when there are none (sd also needs at
  # least two). Descriptive columns take the value from the last month in the
  # group, since rows are sorted by month within each group.
  #
  # first()/last() are called as dplyr::first()/dplyr::last(): the setup cell
  # attaches data.table and gdata after dplyr, and both export functions with
  # the same names, so the unqualified calls depend on package load order.
  #
  # `year` is a grouping key and is carried through without being recomputed.
  final.plans <- plan.year %>%
    group_by(contractid, planid, fips, year) %>%
    arrange(month, .by_group = TRUE) %>%
    summarize(
      n_nonmiss        = sum(!is.na(enrollment)),
      avg_enrollment   = if (n_nonmiss > 0) mean(enrollment, na.rm = TRUE) else NA_real_,
      sd_enrollment    = if (n_nonmiss > 1) sd(enrollment, na.rm = TRUE) else NA_real_,
      min_enrollment   = if (n_nonmiss > 0) min(enrollment, na.rm = TRUE) else NA_real_,
      max_enrollment   = if (n_nonmiss > 0) max(enrollment, na.rm = TRUE) else NA_real_,
      first_enrollment = if (n_nonmiss > 0) {
        dplyr::first(enrollment[!is.na(enrollment)])
      } else {
        NA_real_
      },
      last_enrollment  = if (n_nonmiss > 0) {
        dplyr::last(enrollment[!is.na(enrollment)])
      } else {
        NA_real_
      },
      state            = dplyr::last(state),
      county           = dplyr::last(county),
      org_type         = dplyr::last(org_type),
      plan_type        = dplyr::last(plan_type),
      partd            = dplyr::last(partd),
      snp              = dplyr::last(snp),
      eghp             = dplyr::last(eghp),
      org_name         = dplyr::last(org_name),
      org_marketing_name = dplyr::last(org_marketing_name),
      plan_name        = dplyr::last(plan_name),
      parent_org       = dplyr::last(parent_org),
      contract_date    = dplyr::last(contract_date),
      .groups = "drop"
    )





In [9]:
%%R

# Make sure the output folder exists first: write_csv() does not create
# missing directories. Nothing happens if the folder is already there.
dir.create("data/output", recursive = TRUE, showWarnings = FALSE)
write_csv(final.plans, "data/output/plan_data.csv")

## Read service area data

In [5]:
%%R
  # Read one service-area CSV.
  #
  # The header row is skipped and replaced by the short names below. Cells
  # holding "*" are read as missing. `partial` is parsed as logical, `ssa` and
  # `fips` as numbers, and every other column as text.
  #
  # path: location of the CSV, passed straight to read_csv().
  # Returns the tibble produced by read_csv().
  read_service_area <- function(path) {
    # One entry per column, in file order: name = new column name,
    # value = parser for that column.
    parsers <- list(
      contractid = col_character(),
      org_name   = col_character(),
      org_type   = col_character(),
      plan_type  = col_character(),
      partial    = col_logical(),
      eghp       = col_character(),
      ssa        = col_double(),
      fips       = col_double(),
      county     = col_character(),
      state      = col_character(),
      notes      = col_character()
    )

    read_csv(
      file = path,
      col_names = names(parsers),
      col_types = do.call(cols, parsers),
      na = "*",
      skip = 1,
      progress = FALSE,
      show_col_types = FALSE
    )
  }

  # One-month loader --------------------------------------------------------
  # Read the service-area file for one month and tag it with month and year.
  #
  # m: month, either a zero-padded string ("01") or a number (1). It is
  #    normalised to two digits when building the file name, so both forms
  #    point at the same file.
  # y: year, used in the file name and stored as-is in the `year` column.
  # data_dir: folder holding the MA_Cnty_SA_*.csv files; defaults to the
  #    location that was previously hard-coded.
  #
  # Returns the table from read_service_area() with an integer `month` column
  # and a `year` column added.
  load_month_sa <- function(m, y,
                            data_dir = "../ma-data/ma/service-area/Extracted Data") {
    month_num <- as.integer(m)
    stopifnot(
      length(month_num) == 1L,
      !is.na(month_num),
      month_num >= 1L,
      month_num <= 12L
    )

    path <- file.path(
      data_dir,
      paste0("MA_Cnty_SA_", y, "_", sprintf("%02d", month_num), ".csv")
    )

    read_service_area(path) %>%
      mutate(month = month_num, year = y)
  }


In [6]:
%%R

  # Months as zero-padded strings ("01" ... "12").
  monthlist <- sprintf("%02d", 1:12)
  y <- 2018

  # Stack the monthly service-area tables, sort them into a stable order, then
  # fill missing identifiers/labels from other rows of the same group
  # (carrying values down first, then up):
  #   * fips                  within state/county
  #   * contract-level fields within contract
  service.year <- map_dfr(monthlist, ~ load_month_sa(.x, y)) %>%
    arrange(contractid, fips, state, county, month) %>%
    group_by(state, county) %>%
    fill(fips, .direction = "downup") %>%
    ungroup() %>%
    group_by(contractid) %>%
    fill(plan_type, partial, eghp, org_type, org_name, .direction = "downup") %>%
    ungroup()

  # Collapse to yearly: one row per contract × county (fips) × year --------
  # Each column takes the value from the last month in the group, since rows
  # are sorted by month within each group.
  #
  # last() is called as dplyr::last(): the setup cell attaches data.table and
  # gdata after dplyr, and both export a function with the same name, so the
  # unqualified call depends on package load order.
  final.service.area <- service.year %>%
    group_by(contractid, fips, year) %>%
    arrange(month, .by_group = TRUE) %>%
    summarize(
      state     = dplyr::last(state),
      county    = dplyr::last(county),
      org_name  = dplyr::last(org_name),
      org_type  = dplyr::last(org_type),
      plan_type = dplyr::last(plan_type),
      partial   = dplyr::last(partial),
      eghp      = dplyr::last(eghp),
      ssa       = dplyr::last(ssa),
      notes     = dplyr::last(notes),
      .groups = "drop"
    )




In [14]:
%%R

# Make sure the output folder exists first: write_csv() does not create
# missing directories. Nothing happens if the folder is already there.
dir.create("data/output", recursive = TRUE, showWarnings = FALSE)
write_csv(final.service.area, "data/output/service_area.csv")