Skip to content

Commit

Permalink
tidyverse tools, style guide, countrycode package
Browse files Browse the repository at this point in the history
 * use `tidyverse` tools, esp. `read_csv()`, `write_csv()`
 * use tidyverse style guide, esp. double quotes instead of single quotes (' --> ")
 * use `countrycode` package, instead of `country.csv`
  • Loading branch information
hdigital committed Jun 15, 2017
1 parent 93ac6a2 commit c71d911
Show file tree
Hide file tree
Showing 14 changed files with 92 additions and 111 deletions.
11 changes: 5 additions & 6 deletions import/ches/ches-party-info.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
library("dplyr")
library("stringr")
library(tidyverse)

url <- "http://www.chesdata.eu/1999-2014/1999-2014_CHES_dataset_means.csv"
file_name <- paste0("source__", str_replace(url, ".+/", ""))
file_name <- "source__1999-2014_CHES_dataset_means.csv"
if( ! file.exists(file_name)) {
download.file(url, file_name, mode="wb")
download.file(url, file_name, mode = "wb")
}
trend_raw <- read.csv(file_name, fileEncoding="utf-8", as.is=TRUE)
trend_raw <- read_csv(file_name)

trend <- trend_raw %>%
group_by(party_id) %>%
Expand All @@ -19,4 +18,4 @@ trend <- trend_raw %>%
distinct(party_id, .keep_all = TRUE) %>%
select(country, party_id, party, cmp_id, electionyear, vote, year_first, year_last)

write.csv(trend, "ches-party-info.csv", na="", row.names = FALSE, fileEncoding="utf-8")
write_csv(trend, "ches-party-info.csv", na = "")
2 changes: 1 addition & 1 deletion import/ches/ches.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
library("dplyr")
library(dplyr)

party <- read.csv("ches-parties.csv", fileEncoding="utf-8", as.is=TRUE)
country <- read.csv("ches-country.csv", fileEncoding="utf-8", as.is=TRUE)
Expand Down
22 changes: 11 additions & 11 deletions import/clea/clea-national-vote.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,26 @@
library(tidyverse)
library(stringr)

clea_version <- '20170530'
clea_version <- "20170530"
max_share <- 2.0

path <- str_interp('source__clea/clea_${clea_version}')
path <- str_interp("source__clea/clea_${clea_version}")

# Stata exported RDS file to save disk space
clea_rdata <- str_interp('${path}/clea_${clea_version}.Rds')
clea_rdata <- str_interp("${path}/clea_${clea_version}.Rds")
if( ! file.exists(clea_rdata)) {
library(haven)
clea <- haven::read_dta(str_interp('${path}/clea_${clea_version}_stata.zip'))
clea <- haven::read_dta(str_interp("${path}/clea_${clea_version}_stata.zip"))
saveRDS(clea, file = clea_rdata, ascii = TRUE)
}

# read CLEA data only once
if( ! exists('clea_raw')) {
clea_raw <- readRDS(file=str_interp('${path}/clea_${clea_version}.Rds'))
if( ! exists("clea_raw")) {
clea_raw <- readRDS(file=str_interp("${path}/clea_${clea_version}.Rds"))
}
clea <- clea_raw %>%
filter(pv1 > 0) %>%
mutate(ctr_n = recode(ctr_n, UK='United Kingdom', US='United States of America'),
mutate(ctr_n = recode(ctr_n, UK="United Kingdom", US="United States of America"),
pv1 = as.numeric(pv1),
mn = if_else(ctr == 840, 0, as.numeric(mn))) # unify US election months

Expand Down Expand Up @@ -63,16 +63,16 @@ elec_out <- pa_name %>%
group_by() %>%
arrange(ctr_n, yr, mn, -pv1_share)

write.csv(elec_out, 'source__clea/clea_national_vote.csv',
na = '', fileEncoding = 'utf-8', row.names = FALSE)
write.csv(elec_out, "source__clea/clea_national_vote.csv",
na = "", fileEncoding = "utf-8", row.names = FALSE)


## Party information for Party Facts data import

# filter none, others, alliances, independents
# higher threshold because of votes not in 'pv1' parties
# higher threshold because of votes not in "pv1" parties
party_out <- party %>%
mutate(ctr_pty = ctr*1000000 + pty) %>%
filter(pty > 0, pty < 4000, pv1_share_max >= max_share)

write_csv(party_out, 'clea-national-vote.csv', na = '')
write_csv(party_out, "clea-national-vote.csv", na = "")
12 changes: 6 additions & 6 deletions import/clea/clea.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,30 @@ library(tidyverse)
library(stringr)
library(countrycode)

clea_version <- '20170530'
clea_version <- "20170530"

path <- str_interp("source__clea/clea_${clea_version}/clea_${clea_version}_appendix_II.csv")
party_raw <- read_csv(path)

# add CLEA data variable names to party information and clean-up data for import
party <- party_raw
names(party) <- c('ctr_n', 'pty', 'abbr', 'name', 'name_english', 'information')
names(party) <- c("ctr_n", "pty", "abbr", "name", "name_english", "information")

# add time and size information and select larger parties
vote <- read_csv("clea-national-vote.csv")
party <- party %>%
mutate(pty = as.integer(pty)) %>%
mutate(pty = as.integer(str_extract_all(pty, "\\d+"))) %>%
inner_join(vote)

# add Party Facts country codes
party <- party %>%
mutate(country = countrycode(ctr_n, 'country.name', 'iso3c',
custom_match = c(Kosovo='XKX', Zambia='ZMB')))
mutate(country = countrycode(ctr_n, "country.name", "iso3c",
custom_match = c(Kosovo="XKX", Zambia="ZMB")))
if(any(is.na(party$country))) {
warning("Country name clean-up needed")
}

# clean-up CLEA data for import
party[nchar(party$abbr) > 25 & ! is.na(party$abbr), "abbr"] <- NA

write_csv(party, 'clea.csv', na = '')
write_csv(party, "clea.csv", na = "")
15 changes: 7 additions & 8 deletions import/ees14/ees14.R
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
library('dplyr')
library(tidyverse)
library(countrycode)

ees <- read.csv('parties-ees-ches-ess.csv', fileEncoding = 'utf-8', as.is=TRUE)
ees_raw <- read_csv("parties-ees-ches-ess.csv")

country <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE)
country <- country %>% select(iso2, country_iso3 = iso3)
ees <- ees_raw %>%
mutate(country_iso3 = countrycode(country, "iso2c", "iso3c",
custom_match = c(UK="GBR")))

ees[ees$country == 'UK', 'country'] <- 'GB'
ees <- ees %>% left_join(country, by = c('country' = 'iso2'))

write.csv(ees, 'ees14.csv', na='', fileEncoding = 'utf-8', row.names = FALSE)
write_csv(ees, "ees14.csv", na = "")
20 changes: 10 additions & 10 deletions import/epac/epac.R
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
library("tidyverse")
library("readxl")
library("countrycode")
library(tidyverse)
library(readxl)
library(countrycode)

epac_raw <- read_excel("epac-parties-2016.xlsx")
write_csv(epac_raw, "epac-parties-2016.csv", na = "")

# add Party Facts country codes
epac <- epac_raw %>%
mutate(country = countrycode(country_name, 'country.name', 'iso3c',
custom_match = c(Kosovo='XKX')),
mutate(country = countrycode(country_name, "country.name", "iso3c",
custom_match = c(Kosovo="XKX")),
seat = round(seat, 1))
if(any(is.na(epac$country))) {
warning("Country name clean-up needed")
}

epac_2014_raw <- read_csv("import-2014/epac.csv")

epac_2014_add <- epac_2014_raw %>%
rename(country = country_name_short, country_name = country, party_id = id,
party_accr = accronym, party_name_en = party_name_english) %>%
mutate(round = 2011, pec = NA, elecyear = NA) %>%
epac_2014_add <- epac_2014_raw %>%
rename(country = country_name_short, country_name = country, party_id = id,
party_accr = accronym, party_name_en = party_name_english) %>%
mutate(round = 2011, pec = NA, elecyear = NA) %>%
filter( ! party_id %in% epac$party_id)

epac <- epac %>% bind_rows(epac_2014_add) %>% arrange(party_id)

write.csv(epac, "epac.csv", na='', fileEncoding = "utf-8", row.names = FALSE)
write.csv(epac, "epac.csv", na = "", fileEncoding = "utf-8", row.names = FALSE)
29 changes: 8 additions & 21 deletions import/huber/huber.R
Original file line number Diff line number Diff line change
@@ -1,24 +1,11 @@
library('dplyr')
library(tidyverse)
library(countrycode)

# reading huber data and renaming 'id' to 'party_id'
party_raw <- read.csv('huber_inglehart_1995.csv', as.is=TRUE)
party <- party_raw %>% rename(party_id=id)
huber_raw <- read_csv("huber_inglehart_1995.csv")

# reading country data and converting the country name to upper-case characters
country_raw <- read.csv('../country.csv', fileEncoding = 'utf-8', as.is=TRUE)
country <- country_raw %>%
rename(country_name_short = name_short) %>%
mutate(country = toupper(name))
huber <- huber_raw %>%
rename(huber_id=id) %>%
mutate(country_name_short = countrycode(country, "country.name", "iso3c",
custom_match = c(`NORTHERN IRELAND`="NIR")))

# merging country and huber data to get 'country_name_short'
party <- party %>% left_join(country %>% select(country_name_short, country), by='country')

# adding missing country abbreviations
country_update <- list('BRITAIN'='GBR', 'SOUTH KOREA'='KOR', 'USA'='USA')
for (to_update in names(country_update)) {
party[party$country == to_update, 'country_name_short'] <- country_update[[to_update]]
}
if(any(is.na(party$country_name_short))) warning("Not all observations have country keys")

# creating the csv file
write.csv(party, 'huber.csv', na='', fileEncoding='utf-8', row.names = FALSE)
write_csv(huber, "huber.csv", na = "")
14 changes: 7 additions & 7 deletions import/janda/janda.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
library("dplyr")
library(tidyverse)

janda <- read.csv('janda-parties.csv', fileEncoding='utf-8', as.is=TRUE)
country <- read.csv('janda-country.csv', fileEncoding='utf-8', as.is=TRUE)
janda <- read_csv("janda-parties.csv")
country <- read_csv("janda-country.csv")

# Extract country id from party id
janda <- janda %>%
mutate(country_id = substr(janda_id, 1, nchar(janda_id) - 1) %>% as.integer,
country_id = ifelse(janda_id >= 10, country_id, 0)) # add US country id '0'
country_id = ifelse(janda_id >= 10, country_id, 0)) # add US country id "0"

# Merge parties and country list
janda <- janda %>%
janda <- janda %>%
left_join(country, by = c("country_id" = "id")) %>%
select(-country_id, country_short = short) %>%
filter(country_short != '')
filter(country_short != "")

write.csv(janda, "janda.csv", na='', fileEncoding = "utf-8", row.names = FALSE)
write_csv(janda, "janda.csv", na = "")
7 changes: 3 additions & 4 deletions import/marpor/marpor-share.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
library(dplyr)
library(readr)
library(tidyverse)

marpor <- read_csv("source__MPDataset_MPDS2016b.csv")

pa_share <- marpor %>%
select(party, country, countryname, date, pervote) %>%
select(party, country, countryname, date, pervote) %>%
mutate(year = date %/% 100) %>%
group_by(party) %>%
mutate(pervote_max = max(pervote, na.rm = TRUE)) %>%
Expand All @@ -14,4 +13,4 @@ pa_share <- marpor %>%
arrange(party) %>%
select(party, pervote_max_year = year, pervote_max)

write.csv(pa_share, "marpor-share.csv", na="", row.names = FALSE, fileEncoding = "utf-8")
write_csv(pa_share, "marpor-share.csv", na = "")
14 changes: 6 additions & 8 deletions import/marpor/marpor.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,20 @@
library(dplyr)
library(readr)
library(tidyverse)
library(countrycode)

marpor_raw <- read_csv("marpor-2016.csv")
marpor <- marpor_raw %>% select(-country)

marpor_share <- read_csv("marpor-share.csv")
marpor <- marpor %>% left_join(marpor_share)

marpor <- marpor_raw %>% select(-country) %>% left_join(marpor_share)

# add Party Facts country codes
marpor <- marpor %>%
mutate(country = countrycode(countryname, 'country.name', 'iso3c',
custom_match = c(`Northern Ireland`='NIR')))
mutate(country = countrycode(countryname, "country.name", "iso3c",
custom_match = c(`Northern Ireland`="NIR")))
if(any(is.na(marpor$country))) {
warning("Country name clean-up needed")
}

# replace party short longer than 25 chars
marpor[nchar(marpor$abbrev) > 25 & ! is.na(marpor$abbrev), "abbrev"] <- NA

write.csv(marpor, "marpor.csv", na="", row.names = FALSE, fileEncoding="utf-8")
write_csv(marpor, "marpor.csv", na = "")
24 changes: 12 additions & 12 deletions import/parlgov/parlgov.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
library('tidyverse')
library('dbplyr')
library(tidyverse)
library(dbplyr)

url <- 'http://www.parlgov.org/static/data/parlgov-development.db'
db_file <- 'source__parlgov.db'
url <- "http://www.parlgov.org/static/data/parlgov-development.db"
db_file <- "source__parlgov.db"
if( ! file.exists(db_file)) {
download.file(url, db_file, mode = "wb")
}

con <- DBI::dbConnect(RSQLite::SQLite(), db_file)
# Look up `table` by name through the script-level connection `con`
# and return the result converted to a tibble.
tbl_parlgov <- function(table) {
  remote_tbl <- tbl(con, table)
  as_tibble(remote_tbl)
}

party <- tbl_parlgov('view_party')
elec <- tbl_parlgov('view_election')
party_raw <- tbl_parlgov('party')
party <- tbl_parlgov("view_party")
elec <- tbl_parlgov("view_election")
party_raw <- tbl_parlgov("party")

# calculate first and last year each party
elec_year <- elec %>%
Expand All @@ -23,7 +23,7 @@ elec_year <- elec %>%

# calculate max vote share each party
elec_share <- elec %>%
filter(election_type == 'parliament') %>%
filter(election_type == "parliament") %>%
group_by(party_id) %>%
mutate(vote_share_max = max(vote_share, na.rm = TRUE)) %>%
filter(vote_share == vote_share_max) %>%
Expand All @@ -33,15 +33,15 @@ elec_share <- elec %>%
vote_share_max_year = substr(election_date, 1, 4)) %>%
select(party_id, vote_share_max_year, vote_share_max)

parlgov_url <- 'http://www.parlgov.org/explore/%s/party/%d/'
parlgov_url <- "http://www.parlgov.org/explore/%s/party/%d/"
parlgov <- party %>%
select(party_id, country_name_short:family_name, -country_name, -party_name_ascii, -family_name) %>%
filter(family_name_short != 'none') %>%
filter(family_name_short != "none") %>%
mutate(url = sprintf(parlgov_url, tolower(country_name_short), party_id)) %>%
left_join(elec_year) %>%
left_join(elec_share) %>%
left_join(party_raw %>% select(id, wikipedia), by = c('party_id' = 'id')) %>%
left_join(party_raw %>% select(id, wikipedia), by = c("party_id" = "id")) %>%
arrange(country_name_short, party_name)

# create import file and remove downloaded source files
write_csv(parlgov, 'parlgov.csv', na = '')
write_csv(parlgov, "parlgov.csv", na = "")
12 changes: 6 additions & 6 deletions import/ray/ray.R
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
library("tidyverse")
library("stringr")
library("haven")
library("countrycode")
library(tidyverse)
library(stringr)
library(haven)
library(countrycode)

# get local copy of data file
url <- "http://www.lsu.edu/faculty/lray2/data/1996survey/1996survey.sav?export=sav"
data_file_local <- "source__1996survey.sav"
if( ! file.exists(data_file_local)) {
download.file(url, data_file_local, mode="wb")
download.file(url, data_file_local, mode = "wb")
}

# read, select and clean data
ray_raw <- read_spss("source__1996survey.sav")
ray_raw <- haven::read_spss(data_file_local)
ray <- ray_raw %>%
select(NATID:CMPCODE) %>% # select party information
mutate_at(vars(PARTY, ENAME, NAME), str_trim) %>% # trim white space
Expand Down
4 changes: 2 additions & 2 deletions import/readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ Party Facts import requires ISO3 country codes. `country.csv` includes the respe
```r
# Example country recoding from Marpor import
marpor <- marpor %>%
mutate(country = countrycode(countryname, 'country.name', 'iso3c',
custom_match = c(`Northern Ireland`='NIR')))
mutate(country = countrycode(countryname, "country.name", "iso3c",
custom_match = c(`Northern Ireland`="NIR")))
if(any(is.na(marpor$country))) {
warning("Country name clean-up needed")
}
Expand Down
Loading

0 comments on commit c71d911

Please sign in to comment.