tidyverse tools, style guide, countrycode package

* use `tidyverse` tools, esp. `read_csv()`, `write_csv()`
* use tidyverse style guide, esp. single quotes `'` --> double quotes `"`
* use `countrycode` package, instead of `country.csv`
hdigital · Jun 15, 2017 · c71d911 · c71d911
1 parent 93ac6a2
commit c71d911
Show file tree

Hide file tree

Showing 14 changed files with 92 additions and 111 deletions.
diff --git a/import/ches/ches-party-info.R b/import/ches/ches-party-info.R
@@ -1,12 +1,11 @@
-library("dplyr")
-library("stringr")
+library(tidyverse)
 
 url <- "http://www.chesdata.eu/1999-2014/1999-2014_CHES_dataset_means.csv"
-file_name <- paste0("source__", str_replace(url, ".+/", ""))
+file_name <- "source__1999-2014_CHES_dataset_means.csv"
 if( ! file.exists(file_name)) {
-  download.file(url, file_name, mode="wb")
+  download.file(url, file_name, mode = "wb")
 }
-trend_raw <- read.csv(file_name, fileEncoding="utf-8", as.is=TRUE)
+trend_raw <- read_csv(file_name)
 
 trend <- trend_raw %>%
   group_by(party_id) %>%
@@ -19,4 +18,4 @@ trend <- trend_raw %>%
   distinct(party_id, .keep_all = TRUE) %>%
   select(country, party_id, party, cmp_id, electionyear, vote, year_first, year_last)
 
-write.csv(trend, "ches-party-info.csv", na="", row.names = FALSE, fileEncoding="utf-8")
+write_csv(trend, "ches-party-info.csv", na = "")
diff --git a/import/ches/ches.R b/import/ches/ches.R
@@ -1,4 +1,4 @@
-library("dplyr")
+library(dplyr)
 
 party <- read.csv("ches-parties.csv", fileEncoding="utf-8", as.is=TRUE)
 country <- read.csv("ches-country.csv", fileEncoding="utf-8", as.is=TRUE)

diff --git a/import/clea/clea-national-vote.R b/import/clea/clea-national-vote.R
@@ -1,26 +1,26 @@
 library(tidyverse)
 library(stringr)
 
-clea_version <- '20170530'
+clea_version <- "20170530"
 max_share <- 2.0
 
-path <- str_interp('source__clea/clea_${clea_version}')
+path <- str_interp("source__clea/clea_${clea_version}")
 
 # Stata exported RDS file to save disk space
-clea_rdata <- str_interp('${path}/clea_${clea_version}.Rds')
+clea_rdata <- str_interp("${path}/clea_${clea_version}.Rds")
 if( ! file.exists(clea_rdata)) {
   library(haven)
-  clea <- haven::read_dta(str_interp('${path}/clea_${clea_version}_stata.zip'))
+  clea <- haven::read_dta(str_interp("${path}/clea_${clea_version}_stata.zip"))
   saveRDS(clea, file = clea_rdata, ascii = TRUE)
 }
 
 # read CLEA data only once
-if( ! exists('clea_raw')) {
-  clea_raw <- readRDS(file=str_interp('${path}/clea_${clea_version}.Rds'))
+if( ! exists("clea_raw")) {
+  clea_raw <- readRDS(file=str_interp("${path}/clea_${clea_version}.Rds"))
 }
 clea <- clea_raw %>%
   filter(pv1 > 0) %>%
-  mutate(ctr_n = recode(ctr_n, UK='United Kingdom', US='United States of America'),
+  mutate(ctr_n = recode(ctr_n, UK="United Kingdom", US="United States of America"),
          pv1 = as.numeric(pv1),
          mn = if_else(ctr == 840, 0, as.numeric(mn)))  # unify US election months
 
@@ -63,16 +63,16 @@ elec_out <- pa_name %>%
   group_by() %>%
   arrange(ctr_n,  yr, mn, -pv1_share)
 
-write.csv(elec_out, 'source__clea/clea_national_vote.csv',
-          na = '', fileEncoding = 'utf-8', row.names = FALSE)
+write.csv(elec_out, "source__clea/clea_national_vote.csv",
+          na = "", fileEncoding = "utf-8", row.names = FALSE)
 
 
 ## Party information for Party Facts data import
 
 # filter none, others, alliances, independents
-# higher threshold because of votes not in 'pv1' parties
+# higher threshold because of votes not in "pv1" parties
 party_out <- party %>%
   mutate(ctr_pty = ctr*1000000 + pty) %>%
   filter(pty > 0, pty < 4000, pv1_share_max >= max_share)
 
-write_csv(party_out, 'clea-national-vote.csv', na = '')
+write_csv(party_out, "clea-national-vote.csv", na = "")
diff --git a/import/clea/clea.R b/import/clea/clea.R
@@ -2,30 +2,30 @@ library(tidyverse)
 library(stringr)
 library(countrycode)
 
-clea_version <- '20170530'
+clea_version <- "20170530"
 
 path <- str_interp("source__clea/clea_${clea_version}/clea_${clea_version}_appendix_II.csv")
 party_raw <- read_csv(path)
 
 # add CLEA data variable names to party information and clean-up data for import
 party <- party_raw
-names(party) <- c('ctr_n', 'pty', 'abbr', 'name', 'name_english', 'information')
+names(party) <- c("ctr_n", "pty", "abbr", "name", "name_english", "information")
 
 # add time and size information and select larger parties
 vote <- read_csv("clea-national-vote.csv")
 party <- party %>%
-  mutate(pty = as.integer(pty)) %>%
+  mutate(pty = as.integer(str_extract(pty, "\\d+"))) %>%  # first digit run, NA if none
   inner_join(vote)
 
 # add Party Facts country codes
 party <- party %>%
-  mutate(country = countrycode(ctr_n, 'country.name', 'iso3c',
-                            custom_match = c(Kosovo='XKX', Zambia='ZMB')))
+  mutate(country = countrycode(ctr_n, "country.name", "iso3c",
+                            custom_match = c(Kosovo="XKX", Zambia="ZMB")))
 if(any(is.na(party$country))) {
   warning("Country name clean-up needed")
 }
 
 # clean-up CLEA data for import
 party[nchar(party$abbr) > 25 & ! is.na(party$abbr), "abbr"] <- NA
 
-write_csv(party, 'clea.csv', na = '')
+write_csv(party, "clea.csv", na = "")
diff --git a/import/ees14/ees14.R b/import/ees14/ees14.R
@@ -1,11 +1,10 @@
-library('dplyr')
+library(tidyverse)
+library(countrycode)
 
-ees <- read.csv('parties-ees-ches-ess.csv', fileEncoding = 'utf-8', as.is=TRUE)
+ees_raw <- read_csv("parties-ees-ches-ess.csv")
 
-country <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE)
-country <- country %>% select(iso2, country_iso3 = iso3)
+ees <- ees_raw %>%
+  mutate(country_iso3 = countrycode(country, "iso2c", "iso3c",
+                                    custom_match = c(UK="GBR")))
 
-ees[ees$country == 'UK', 'country'] <- 'GB'
-ees <- ees %>% left_join(country, by = c('country' = 'iso2'))
-
-write.csv(ees, 'ees14.csv', na='', fileEncoding = 'utf-8', row.names = FALSE)
+write_csv(ees, "ees14.csv", na = "")
diff --git a/import/epac/epac.R b/import/epac/epac.R
@@ -1,27 +1,27 @@
-library("tidyverse")
-library("readxl")
-library("countrycode")
+library(tidyverse)
+library(readxl)
+library(countrycode)
 
 epac_raw <- read_excel("epac-parties-2016.xlsx")
 write_csv(epac_raw, "epac-parties-2016.csv", na = "")
 
 # add Party Facts country codes
 epac <- epac_raw %>%
-  mutate(country = countrycode(country_name, 'country.name', 'iso3c',
-                               custom_match = c(Kosovo='XKX')),
+  mutate(country = countrycode(country_name, "country.name", "iso3c",
+                               custom_match = c(Kosovo="XKX")),
          seat = round(seat, 1))
 if(any(is.na(epac$country))) {
   warning("Country name clean-up needed")
 }
 
 epac_2014_raw <- read_csv("import-2014/epac.csv")
 
-epac_2014_add <- epac_2014_raw %>% 
-  rename(country = country_name_short, country_name = country, party_id = id, 
-         party_accr = accronym, party_name_en = party_name_english) %>% 
-  mutate(round = 2011, pec = NA, elecyear = NA) %>% 
+epac_2014_add <- epac_2014_raw %>%
+  rename(country = country_name_short, country_name = country, party_id = id,
+         party_accr = accronym, party_name_en = party_name_english) %>%
+  mutate(round = 2011, pec = NA, elecyear = NA) %>%
   filter( ! party_id %in% epac$party_id)
 
 epac <- epac %>% bind_rows(epac_2014_add) %>% arrange(party_id)
 
-write.csv(epac, "epac.csv", na='', fileEncoding = "utf-8", row.names = FALSE)
+write.csv(epac, "epac.csv", na = "", fileEncoding = "utf-8", row.names = FALSE)
diff --git a/import/huber/huber.R b/import/huber/huber.R
@@ -1,24 +1,11 @@
-library('dplyr')
+library(tidyverse)
+library(countrycode)
 
-# reading huber data and renaming 'id' to 'party_id'
-party_raw <- read.csv('huber_inglehart_1995.csv', as.is=TRUE)
-party <- party_raw %>% rename(party_id=id)
+huber_raw <- read_csv("huber_inglehart_1995.csv")
 
-# reading contry data and convert 'country_name' to upper-case characters
-country_raw <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE)
-country <- country_raw %>%
-  rename(country_name_short = name_short) %>%
-  mutate(country = toupper(name))
+huber <- huber_raw %>%
+  rename(huber_id=id) %>%
+  mutate(country_name_short = countrycode(country, "country.name", "iso3c",
+                                          custom_match = c(`NORTHERN IRELAND`="NIR")))
 
-# merging country and huber data to get 'country_name_short'
-party <- party %>% left_join(country %>% select(country_name_short, country), by='country')
-
-# adding missing country abbreviations
-country_update <- list('BRITAIN'='GBR', 'SOUTH KOREA'='KOR', 'USA'='USA')
-for (to_update in names(country_update)) {
-  party[party$country == to_update, 'country_name_short']  <- country_update[[to_update]]
-}
-if(any(is.na(party$country_name_short))) warning("Not all observations have country keys")
-
-# creating the csv file
-write.csv(party, 'huber.csv', na='', fileEncoding='utf-8', row.names = FALSE)
+write_csv(huber, "huber.csv", na = "")
diff --git a/import/janda/janda.R b/import/janda/janda.R
@@ -1,17 +1,17 @@
-library("dplyr")
+library(tidyverse)
 
-janda <- read.csv('janda-parties.csv', fileEncoding='utf-8', as.is=TRUE)
-country <- read.csv('janda-country.csv', fileEncoding='utf-8', as.is=TRUE)
+janda <- read_csv("janda-parties.csv")
+country <- read_csv("janda-country.csv")
 
 # Extract country id from party id
 janda <- janda %>%
   mutate(country_id = substr(janda_id, 1, nchar(janda_id) - 1) %>% as.integer,
-         country_id = ifelse(janda_id >= 10, country_id, 0))  # add US country id '0'
+         country_id = ifelse(janda_id >= 10, country_id, 0))  # add US country id "0"
 
 # Merge parties and country list
-janda <- janda %>% 
+janda <- janda %>%
   left_join(country, by = c("country_id" = "id")) %>%
   select(-country_id, country_short = short) %>%
-  filter(country_short != '')
+  filter(country_short != "")
 
-write.csv(janda, "janda.csv", na='', fileEncoding = "utf-8", row.names = FALSE)
+write_csv(janda, "janda.csv", na = "")
diff --git a/import/marpor/marpor-share.R b/import/marpor/marpor-share.R
@@ -1,10 +1,9 @@
-library(dplyr)
-library(readr)
+library(tidyverse)
 
 marpor <- read_csv("source__MPDataset_MPDS2016b.csv")
 
 pa_share <- marpor %>%
-  select(party, country, countryname, date, pervote) %>% 
+  select(party, country, countryname, date, pervote) %>%
   mutate(year = date %/% 100) %>%
   group_by(party) %>%
   mutate(pervote_max = max(pervote, na.rm = TRUE)) %>%
@@ -14,4 +13,4 @@ pa_share <- marpor %>%
   arrange(party) %>%
   select(party, pervote_max_year = year, pervote_max)
 
-write.csv(pa_share, "marpor-share.csv", na="", row.names = FALSE, fileEncoding = "utf-8")
+write_csv(pa_share, "marpor-share.csv", na = "")
diff --git a/import/marpor/marpor.R b/import/marpor/marpor.R
@@ -1,22 +1,20 @@
-library(dplyr)
-library(readr)
+library(tidyverse)
 library(countrycode)
 
 marpor_raw <- read_csv("marpor-2016.csv")
-marpor <- marpor_raw %>% select(-country)
-
 marpor_share <- read_csv("marpor-share.csv")
-marpor <- marpor %>% left_join(marpor_share)
+
+marpor <- marpor_raw %>% select(-country) %>% left_join(marpor_share)
 
 # add Party Facts country codes
 marpor <- marpor %>%
-  mutate(country = countrycode(countryname, 'country.name', 'iso3c',
-                            custom_match = c(`Northern Ireland`='NIR')))
+  mutate(country = countrycode(countryname, "country.name", "iso3c",
+                            custom_match = c(`Northern Ireland`="NIR")))
 if(any(is.na(marpor$country))) {
   warning("Country name clean-up needed")
 }
 
 # replace party short longer than 25 chars
 marpor[nchar(marpor$abbrev) > 25 & ! is.na(marpor$abbrev), "abbrev"] <- NA
 
-write.csv(marpor, "marpor.csv", na="", row.names = FALSE, fileEncoding="utf-8")
+write_csv(marpor, "marpor.csv", na = "")
diff --git a/import/parlgov/parlgov.R b/import/parlgov/parlgov.R
@@ -1,18 +1,18 @@
-library('tidyverse')
-library('dbplyr')
+library(tidyverse)
+library(dbplyr)
 
-url <- 'http://www.parlgov.org/static/data/parlgov-development.db'
-db_file <- 'source__parlgov.db'
+url <- "http://www.parlgov.org/static/data/parlgov-development.db"
+db_file <- "source__parlgov.db"
 if( ! file.exists(db_file)) {
   download.file(url, db_file, mode = "wb")
 }
 
 con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
 tbl_parlgov <- function(table) tbl(con, table) %>% as_tibble()
 
-party <- tbl_parlgov('view_party')
-elec <- tbl_parlgov('view_election')
-party_raw <- tbl_parlgov('party')
+party <- tbl_parlgov("view_party")
+elec <- tbl_parlgov("view_election")
+party_raw <- tbl_parlgov("party")
 
 # calculate first and last year each party
 elec_year <- elec %>%
@@ -23,7 +23,7 @@ elec_year <- elec %>%
 
 # calculate max vote share each party
 elec_share <- elec %>%
-  filter(election_type == 'parliament') %>%
+  filter(election_type == "parliament") %>%
   group_by(party_id) %>%
   mutate(vote_share_max = max(vote_share, na.rm = TRUE)) %>%
   filter(vote_share == vote_share_max) %>%
@@ -33,15 +33,15 @@ elec_share <- elec %>%
          vote_share_max_year = substr(election_date, 1, 4)) %>%
   select(party_id, vote_share_max_year, vote_share_max)
 
-parlgov_url <- 'http://www.parlgov.org/explore/%s/party/%d/'
+parlgov_url <- "http://www.parlgov.org/explore/%s/party/%d/"
 parlgov <- party %>%
   select(party_id, country_name_short:family_name, -country_name, -party_name_ascii, -family_name) %>%
-  filter(family_name_short != 'none') %>%
+  filter(family_name_short != "none") %>%
   mutate(url = sprintf(parlgov_url, tolower(country_name_short), party_id)) %>%
   left_join(elec_year) %>%
   left_join(elec_share) %>%
-  left_join(party_raw %>% select(id, wikipedia), by = c('party_id' = 'id')) %>%
+  left_join(party_raw %>% select(id, wikipedia), by = c("party_id" = "id")) %>%
   arrange(country_name_short, party_name)
 
 # create import file and remove downloaded source files
-write_csv(parlgov, 'parlgov.csv', na = '')
+write_csv(parlgov, "parlgov.csv", na = "")
diff --git a/import/ray/ray.R b/import/ray/ray.R
@@ -1,17 +1,17 @@
-library("tidyverse")
-library("stringr")
-library("haven")
-library("countrycode")
+library(tidyverse)
+library(stringr)
+library(haven)
+library(countrycode)
 
 # get local copy of data file
 url <- "http://www.lsu.edu/faculty/lray2/data/1996survey/1996survey.sav?export=sav"
 data_file_local <- "source__1996survey.sav"
 if( ! file.exists(data_file_local)) {
-  download.file(url, data_file_local, mode="wb")
+  download.file(url, data_file_local, mode = "wb")
 }
 
 # read, select and clean data
-ray_raw <- read_spss("source__1996survey.sav")
+ray_raw <- haven::read_spss(data_file_local)
 ray <- ray_raw %>%
   select(NATID:CMPCODE) %>%  # select party information
   mutate_at(vars(PARTY, ENAME, NAME), str_trim) %>%  # trim white space

diff --git a/import/readme.md b/import/readme.md
@@ -22,8 +22,8 @@ Party Facts import requires ISO3 country codes. `country.csv` includes the respe
 ```r
 # Example country recoding from Marpor import
 marpor <- marpor %>%
-  mutate(country = countrycode(countryname, 'country.name', 'iso3c',
-                            custom_match = c(`Northern Ireland`='NIR')))
+  mutate(country = countrycode(countryname, "country.name", "iso3c",
+                            custom_match = c(`Northern Ireland`="NIR")))
 if(any(is.na(marpor$country))) {
   warning("Country name clean-up needed")
 }