Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
530 lines (431 sloc) 17.7 KB

Scraping roll call data for the New Mexico State Legislature

Code used to scrape/collect/make available roll call data and legislation details for New Mexico's 53rd State Legislature (NMSL53) from PDFs made available online at nmlegis.gov. The code presented here generates the data tables comprising the R package nmlegisdatr. Code will have to be modified some to accommodate the addition of new legislatures over time, especially with respect to hand edits.

library(tabulizer)
library(tidyverse)
library(curl)

1. Get legislation details

All introduced legislation (along with bill sponsors) for each legislative session are made available at nmlegis.gov --- to have a clean starting point, these data have been copy/pasted as CSVs here.

setwd("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\nmlegisdatr\\data-raw\\nmsl_legislation\\nmsl53")

raw_leg <- list.files(path="C:\\Users\\jason\\Google Drive\\GitHub\\packages\\nmlegisdatr\\data-raw\\nmsl_legislation\\nmsl53", 
                      pattern = ".csv")

nml_leg_list <- raw_leg %>%
  lapply(., read.csv, stringsAsFactors = FALSE,na.strings=c("")) %>%
  bind_rows()%>%
  fill(Title, Bill.ID, Actions, Session) %>%
  mutate (Bill_ID = gsub('\\*','',Bill.ID)) %>%
  mutate(Type = gsub(' [0-9].*$','', Bill_ID)) %>%
  mutate(Type =
           case_when(
             Type %in% c('HB','SB') ~ "Bill",
             Type %in% c('HJR','SJR','HCR','SCR','HR','SR') ~ "Resolution",
             Type %in% c('HM','SM', 'HJM','SJM') ~ "Memorial")
         ) %>%

  mutate(bnum = gsub('^[A-Z].* ','',Bill_ID)) %>%
  mutate(bnum = ifelse (Type == "Bill", 
                        str_pad(bnum, 4,pad = "0"), 
                        str_pad(bnum, 3, pad = "0"))) %>%
  mutate(bnum = ifelse(Type == "Resolution", 
                       gsub('^0','',bnum),
                       bnum)) %>%
  mutate(Bill_ID = paste0(gsub(' [0-9].*$','',Bill_ID), bnum),
         Sponsor = gsub('\\s$', '', Sponsor),
         Year = gsub(' .*$','',Session), 
         Session = gsub('^[0-9].* ','',Session),
         Bill_Code = paste0(gsub('[a-z]*$','',Session), gsub('20','',Year),'_',Bill_ID),
         Congress = '53') %>% #Will need to be smarter.
  select (Bill_Code, Congress, Year, Session, Bill_ID, Type, Title:Actions)

2. Scrape bill descriptions

2.1 Unique legislation

legislation <- nml_leg_list %>%
  select (-Sponsor) %>%
  unique() %>%
  select(Bill_Code,Congress, Year:Type,Title, Actions)

2.2 Build urls to legislation descriptions

base <- 'https://www.nmlegis.gov/Sessions/'
bill_desc_urls <- paste0(base,  
               #This needs to be kept/stored somewhere.  
               #Although the addresses are compositional.
               gsub('20','',legislation$Year), 
               '%20', 
               legislation$Session, 
               '/', 
               tolower(legislation$Type), 
               's/', 
               tolower(legislation$Chamber), 
               '/', 
               legislation$Bill_ID, 
               '.pdf')

2.3 Function to scrape/extract bill description from full bill

get_pdf_text <- function (x) {
  tmp <- tempfile()
  curl::curl_download(x, tmp)
  y <- pdftools::pdf_text(tmp)[1:2]
  y <- paste(y[1],y[2], collapse = " ")
  y <- readr::read_lines(y)

  y <- trimws(y)
  start <- grep(" A(N)? |^A(N)? ",y)[1]

  end <- grep("WHEREAS|BE IT",y)
  end <- subset(end, end > start)[1] - 1

  y <- y[start:end] %>%
    gsub ('^[0-9]+( +)?', '', .) #This doesn't seem to always work - ?
  y <- y[grepl("[a-z][a-z]",y)==FALSE]
  y <- subset(y, y!="")
  paste(y, collapse = " ") %>%
    trimws()
}

Apply function to web links

leg_summary <- lapply(bill_desc_urls, get_pdf_text)
#setwd("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\nmlegisdatr")
#saveRDS(leg_summary, 'inter_descs.rds')

2.4 Add bill descriptions & web links to legislation table

nml_legislation <- legislation %>%
  bind_cols(data.table::melt(leg_summary)) %>%
  rename(Bill_Description = value, Bill_Title = Title) %>%
  select(-L1)%>%
  bind_cols(data.frame(Bill_URL = bill_desc_urls))%>% #Add web link.
  mutate_if(is.factor, as.character)

3. Scrape roll calls

3.1 Search parameters

Congress <- rep('53',3)
Year <- c('2017','2017','2018')
Session <- c('Regular', 'Special', 'Regular')

leg_key <- data.frame(Congress, 
                      Year, 
                      Session, 
                      Year_Session=paste0(Year, " ", Session), #Why do we need this?
                      stringsAsFactors = FALSE)

3.2 Get url location of bills that went to vote:

web_votes <- data.frame()

for (i in 1:length(leg_key$Year)) {

  url <- paste0(base, gsub('20','',leg_key$Year[i]),
                '%20',
                leg_key$Session[i],
                '/votes/')
  
  h <- curl::new_handle(dirlistonly=TRUE)
  con <- curl::curl(url, "r", h)
  votes <- scan (con, what = "char", sep = "\n")
  close(con)

  v1 <- strsplit(votes, '<br>')[[2]] %>%
    gsub("(?<=[\\s])\\s*|^\\s+|\\s+$", "", ., perl=TRUE) %>%
    grep('^[0-9]',., value = TRUE) %>%
    grep('votes/H|votes/S', ., value = TRUE) %>%
    gsub('\">.*$|<A HREF=\"|amp;', '', .)

  v2 <- c ('Date time ap code Link', v1)

  v3 <- read.table(text = v2,
                   sep = ' ',
                   header = TRUE,
                   stringsAsFactors=FALSE) %>%
    select(Date, code, Link) %>%
    mutate(file = gsub('^.*/', '', Link),
           file = gsub('%20', '', file),
           Congress = leg_key$Congress[i],
           Session = leg_key$Session[i],
           Year = leg_key$Year[i],
           Link = paste0('https://www.nmlegis.gov', Link))

  web_votes <- rbind(web_votes, v3)
}

3.3 Identify redundant/funky roll calls

Hand checking for funky files (via nmlegis.gov) is really the simplest/most transparent way to address errors on nmlegis.gov side of things. Most issues have to deal with duplicate files with different file names (eg, 'HB405HVOTE.PDF' and 'HB0405HVOTE.PDF').

Fully automating roll call scraping has proven problematic. The table produced below is useful in identifying problematic PDF files (mostly via NA values post-join).

web_votes$Bill_ID <- gsub('([A-z][A-z]+[0-9][0-9]+)([A-Z].*$)','\\1',web_votes$file)
web_votes$chamber <- gsub('([A-z][A-z]+[0-9][0-9]+)([A-Z].*)(\\.PDF)','\\2',toupper(web_votes$file))
web_votes <- web_votes %>% left_join(nml_legislation %>% select(Bill_ID, Session, Year, Bill_Code))

Based on investigating the table above, the following files are identified as redundant/not relevant. (Note: Only roll calls for motions to pass, reconsider, or override a veto are included as legislative activity (or actions) in legislation descriptions/detail; hence, all other types of roll calls (eg, motions to concur) are removed here.)

removes53r17 <- c(
              'HB0001SVOTE2.PDF',
              'HB261HVOTE.PDF',
              'HB405HVOTE.PDF',
              'HJM008VOTE.PDF',
              'HJM018ETALLHVOTE.PDF',
              'HJR001HVOTE.PDF',
              'HJR008HVOTE.PDF',
              'HJR010HVOTE.PDF',
              'HM016VOTE.PDF',
              'HM058HVOTE.PDF', #Link to NMSL52 (2016) file.
              'SB076ASVOTE.PDF',
              'SB0976SVOTE2.PDF',
              'SB1582SVOTE.PDF',
              'SJR001HVOTE.PDF',
              'SJR020HVOTE.PDF')

removes53r18 <- c(
              'HB0019,215,217,266&271SVOTE.PDF',
              'HJR001HVOTE.PDF',
              'HJR009SVOTE.PDF',
              'HJR013SVOTE.PDF',
              'HM0120HVOTE.PDF',
              'SB0040S-SUBVOTE.PDF',
              'SB0040SVOTEsubfailed.PDF',
              'SB0079HTABLE.PDF',
              'SB019SVOTE.PDF',
              'SB029SVOTE.PDF',
              'SB040S-SUBVOTE.PDF',
              'SB040SVOTE(2).PDF',
              'SB040SVOTE.PDF',
              'SB043SVOTE.PDF',
              'SB052SVOTE.PDF',
              'SJM022SVOTE.PDF', #Wrong link.
              'SM008VOTE.PDF',
              'SM012VOTE.PDF',
              'SM048VOTE.PDF',
              'SM058SVOTE(1).PDF',
              'SM067VOTE.PDF',
              'SM074VOTE.PDF',
              'SM094VOTE.PDF',
              'SM103VOTE.PDF',
              'SM119VOTE.PDF',
              'SM121VOTE.PDF',
              'SR001SVOTE.PDF')

3.4 Function to clean roll call links

hand_clean_NMSL53 <- function (x) {

  #Remove any 'motions to concur', per nmlegis.gov interactive search tool.
  x <- x[grepl("CONCUR",toupper(x$file)) == FALSE,]

  #Remove junk/redundant PDFs.
  x <- rbind(subset(x, Year == 2017 & !file %in% removes53r17),
             subset(x, Year == 2018 & !file %in% removes53r18))

  #Sort weird 2018 issue. RE: sm22/sjm22.
  x[x$file == 'SM022SVOTE.PDF' & x$Year == '2018',]$Bill_ID <- 'SJM022'
  x[x$file == 'SM022SVOTE.PDF' & x$Year == '2018',]$Bill_Code <- 'R18_SJM022'
  x[x$file == 'SM022SVOTE(2).PDF' & x$Year == '2018',]$chamber <- 'SVOTE'
  
  #Sort 'Motion to override veto -- Motion to reconsider'
  x <- mutate(x, Motion =  
                case_when(
                  grepl('2', chamber) ~ 'Reconsider',
                  grepl('OVERRIDE', chamber) ~ 'Override',
                  grepl('VOTE$', chamber) ~ 'Passage'))
  
  #Rework chamber ids.
  x <- x %>%
    mutate(Chamber =
             case_when(
               grepl('HVOTE', chamber) ~ "House",
               grepl('SVOTE', chamber) ~ "Senate")
           ) %>%
    select(-chamber, -code, -file)
  
  #Remove dups that slipped through
    distinct(x, Bill_Code, Motion, Chamber,.keep_all = TRUE) #
}

Clean roll call web links:

Clean_Web_Votes <- hand_clean_NMSL53 (web_votes) ##This is now gospel.  

3.5 Function to scrape roll calls from web

get_pdf <- function (x) {
  tmp <- tempfile()
  curl::curl_download(x, tmp)
  tabulizer::extract_tables(tmp, 
                            output = "data.frame", 
                            encoding = 'UTF-8')[[1]]
}

Scrape roll calls:

pdfs <- get_pdf(Clean_Web_Votes$Link)

3.6 Function to reshape scraped roll calls

c5 <- c('Representative', 'Yea', 'Nay', 'Excused', 'Absent')
c6 <- c('Representative', 'Yea', 'Nay', 'Excused', 'Absent', 'Rec')

reshape_votes <- function (x) {
  if (ncol(x) ==12) {
    y <- x[,c(1:3,5,4,6)]
    colnames(y) <- c6
    z <- x[,c(7:9,11,10,12)]
    colnames(z) <- c6
    new <- rbind(y,z)} else {
      y <- x[,1:5]
      colnames(y) <- c5
      z <- x[,6:10]
      colnames(z) <- c5
      new <- rbind(y,z)}

  new %>%
    mutate(Representative = toupper(Representative))%>%
    filter (Representative !='' | !grepl('TOTAL',Representative))%>%
    gather(key=Member_Vote, value = yaynay, -Representative)%>%
    filter(yaynay == 'X') %>%
    dplyr::select(-yaynay)
}

Reshape roll calls:

pdfs_clean <- lapply(pdfs, reshape_votes) %>%
  bind_rows(.id = "Link") %>%
  inner_join(Clean_Web_Votes) %>% #Clean_Web_Votes preserves proper web links.
  dplyr::select(Bill_Code, Motion, Chamber,Representative, Member_Vote) %>%
  arrange(Bill_Code, Motion, Chamber, Representative) 

3.7 Standardize representative names in roll call data

rollcall <- pdfs_clean %>%
  mutate(Representative = gsub(' G\\.', ' GAIL', Representative),
         Representative = gsub(' J\\.', ' JAVIER', Representative),
         Representative = gsub(' R\\.', ' RUDY', Representative),
         Representative = gsub('GARCIA, M.P.', 'GARCÍA, M.P.', Representative))

4. Standardize legislator names across data sources

4.1 Read wiki-based legislator details

#Split to first and last names
nml_leg <- read.csv("C:\\Users\\jason\\Google Drive\\GitHub\\packages\\nmlegisdatr\\data-raw\\nmsl_legislators\\nmsl53_legislators.csv",
                    na.strings=c(""), 
                    stringsAsFactors =FALSE) %>%
  arrange(Chamber, Representative) %>%
  mutate(Rep_Last = gsub('^.* ', '', Representative))%>%
  rowwise()%>%
  mutate(Rep_First = gsub(Rep_Last, '', Representative))

#Hand edits
edits <- data.frame(last = c('Maestas Barnes', 
                             'Roybal Caballero', 
                             'Ortiz y Pino', 
                             'Dodge, Jr.', 
                             'Garcia Richard',
                             'Shendo, Jr.'), 
                    first = c('Sarah', 
                              'Patricia', 
                              'Jerry', 
                              'George', 
                              'Stephanie', 
                              'Benny'), 
                    stringsAsFactors = FALSE)

spots <- unlist(lapply(edits$last, grep, nml_leg$Representative))

nml_leg$Rep_Last[spots] <- edits$last
nml_leg$Rep_First[spots] <- edits$first

4.2 Representative names as presented in roll call data

reps <- rollcall %>%
  group_by (Chamber,Representative) %>%
  summarise(count = n()) %>%
  filter(!grepl('^LT', Representative)) %>%
  mutate(last = gsub(', .*$','',Representative),
         first = gsub('^.*, ','',Representative)) %>%
  arrange(Chamber, last,first)

4.3 Cross wiki representative names to roll call names

nml_legislators <- nml_leg %>%
  arrange(Chamber, Rep_Last, Rep_First) %>%
  bind_cols(reps) %>%
  mutate(Congress = '53') %>% #Will need to address per new Congress
  select(Congress, Chamber, Representative1, District, Party, First_elected,
         Representative, Rep_Last, Rep_First)%>%
  rename(Rep_Full = Representative, Representative = Representative1) %>%
  rbind(c('53','Senate','LT. GOV','','Rep', 'Lt. Governor', '', '', '')) %>% ##Add Lt. Gov -- better way.
  mutate(Rep_First = gsub('\\s$', '', Rep_First))

4.4 Join wiki names and bill sponsors table

nml_sponsors <- nml_leg_list %>% 
  select(Bill_Code,Year, Session, Bill_ID, Sponsor)%>%
  left_join(nml_legislators, by = c('Sponsor'='Rep_Full')) %>%
  select(Bill_Code, Congress, Chamber, Representative) #Perhaps add group number.

4.5 Add party affiliation to roll call data

nml_rollcall <- rollcall %>%
  left_join (nml_legislators %>% select (Chamber,Representative, Party)) %>%
  mutate(Party_Member_Vote = ifelse(Member_Vote %in% c('Absent','Rec','Excused'), 'Not Voting', Member_Vote)) %>%
  mutate(Party_Member_Vote = as.factor(paste0(Party, ': ',Party_Member_Vote))) %>%
  select(-Party)

nml_rollcall$Party_Member_Vote <- 
  factor(
    nml_rollcall$Party_Member_Vote,
    levels(nml_rollcall$Party_Member_Vote)[c(3,6,1,4,2,5)]
    )

5. Build summary tables

5.1 Roll call results table

result <- nml_rollcall %>%
  filter(Member_Vote %in% c('Yea','Nay')) %>%
  group_by(Bill_Code, Motion, Chamber, Member_Vote) %>%
  summarise(count=n()) %>%
  spread(Member_Vote, count)

result_by <- nml_rollcall %>%
  group_by(Bill_Code, Motion, Chamber, Party_Member_Vote) %>%
  summarise(count=n()) %>%
  spread(Party_Member_Vote, count)%>%
  left_join(result)

result_by[is.na(result_by)] <- as.integer(0)


nml_rollcall_results <-  
  result_by %>%
  mutate(Dem_Vote = `Dem: Yea`/ sum(`Dem: Yea`,`Dem: Nay`),
         Rep_Vote = `Rep: Yea`/ sum(`Rep: Yea`,`Rep: Nay`),
         Dem_Vote = case_when(Dem_Vote < .5 ~ "Nay",
                              Dem_Vote > .5 ~ "Yea",
                              Dem_Vote == .5 ~ "Tie"),
         Rep_Vote = case_when(Rep_Vote < .5 ~ "Nay",
                              Rep_Vote > .5 ~ "Yea",
                              Rep_Vote == .5 ~ "Tie"),
         Result = paste0(Yea, "-", Nay)) %>%
  left_join( nml_legislation) %>%  
  select(Bill_Code, Motion, Chamber,Result, Dem_Vote, Rep_Vote, `Dem: Yea`:`Rep: Nay`)%>%
  ungroup() %>%
  arrange(Bill_Code, Motion, Chamber) %>%
  left_join(Clean_Web_Votes %>% select(Bill_Code, Motion, Chamber,Link)) %>%
  rename(Roll_URL = Link)

5.2 Votes cast, Attendance & Party Loyalty.

descs <- nml_rollcall %>%
  filter(!Representative %in% 'LT. GOV') %>%
  select(Bill_Code, Motion, Chamber, Representative, Member_Vote) %>%
  left_join(nml_legislators %>% select(Congress, Chamber, Representative,Party)) %>%
  left_join(nml_rollcall_results %>% select(Bill_Code, Motion, Chamber,Dem_Vote, Rep_Vote))%>%
  mutate(Party_Vote = ifelse(Party == 'Rep',Rep_Vote, Dem_Vote)) %>%
  select(-Rep_Vote, -Dem_Vote)%>%
  filter(Member_Vote %in% c('Yea','Nay'))%>%
  mutate(Votes_Loyal = ifelse(Member_Vote == Party_Vote, 1, 0)) %>%
  group_by(Congress, Chamber, Representative, Party) %>%
  summarize(Votes_Loyal = as.integer(sum(Votes_Loyal)))%>%
  ungroup()


nml_legislator_descs <-
  nml_rollcall %>%
  filter(!Representative %in% 'LT. GOV') %>%
  group_by(Chamber, Representative, Member_Vote) %>%
  summarize(count = n()) %>%
  spread(Member_Vote, count) %>%
  replace_na(list(Rec = 0, Absent =0, Excused=0)) %>%
  left_join(descs) %>%
  mutate(Votes_Cast = sum(Yea, Nay),
         Party_Loyalty = round(Votes_Loyal/Votes_Cast*100, 1),
         Attendance = round(sum(Yea, Nay, Excused, Rec) / sum(Yea, Nay, Excused, Rec, Absent)*100, 1))%>%
  select(Congress, Chamber:Representative, Party,
         Yea, Nay, Excused, Rec, Absent, 
         Votes_Loyal:Attendance)%>%
  ungroup()

6. Output, etc.

Output consists of six tables, which are made available in three different formats:

You can’t perform that action at this time.