From e4a487fec584dc00d77cf6084da18c72a1613417 Mon Sep 17 00:00:00 2001 From: Julia Silge Date: Sun, 1 Apr 2018 21:43:01 -0600 Subject: [PATCH] Code style from styler --- NEWS.md | 2 +- R/bind_tf_idf.R | 5 +- R/corpus_tidiers.R | 12 +- R/dictionary_tidiers.R | 3 +- R/globals.R | 10 +- R/lda_tidiers.R | 6 +- R/sparse_casters.R | 32 +-- R/stm_tidiers.R | 19 +- R/stop_words.R | 6 +- R/unnest_tokens.R | 108 +++++++---- data-raw/setup_data.R | 30 +-- tests/testthat/test-corpus-tidiers.R | 7 +- tests/testthat/test-dictionary-tidiers.R | 12 +- tests/testthat/test-lda-tidiers.R | 1 - tests/testthat/test-sentiments.R | 28 ++- tests/testthat/test-sparse-casters.R | 12 +- tests/testthat/test-sparse-tidiers.R | 6 +- tests/testthat/test-stm-tidiers.R | 9 +- tests/testthat/test-stop-words.R | 2 - tests/testthat/test-tf-idf.R | 38 ++-- tests/testthat/test-unnest-tokens.R | 237 +++++++++++++++-------- 21 files changed, 363 insertions(+), 222 deletions(-) diff --git a/NEWS.md b/NEWS.md index 56d83050..8d70665a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ * Updates to documentation (#109) thanks to Emil Hvitfeldt. * Add new tokenizers for tweets, Penn Treebank to `unnest_tokens()`. -* Better error message (#111). +* Better error message (#111) and code styling. # tidytext 0.1.8 diff --git a/R/bind_tf_idf.R b/R/bind_tf_idf.R index 3c99f4d0..8ab17e73 100644 --- a/R/bind_tf_idf.R +++ b/R/bind_tf_idf.R @@ -37,7 +37,7 @@ #' bind_tf_idf(word, book, n) %>% #' arrange(desc(tf_idf)) #' -#'@export +#' @export bind_tf_idf <- function(tbl, term, document, n) { UseMethod("bind_tf_idf") @@ -54,7 +54,6 @@ bind_tf_idf.default <- function(tbl, term, document, n) { #' @export bind_tf_idf.data.frame <- function(tbl, term, document, n) { - term <- quo_name(enquo(term)) document <- quo_name(enquo(document)) n_col <- quo_name(enquo(n)) @@ -85,5 +84,5 @@ bind_tf_idf_.data.frame <- function(tbl, term, document, n) { term <- compat_lazy(term, caller_env()) document <- compat_lazy(document, caller_env()) n <- compat_lazy(n, caller_env()) - bind_tf_idf(tbl, !! term, !! document, !! n) + bind_tf_idf(tbl, !!term, !!document, !!n) } diff --git a/R/corpus_tidiers.R b/R/corpus_tidiers.R index 016765d1..93b69893 100644 --- a/R/corpus_tidiers.R +++ b/R/corpus_tidiers.R @@ -52,7 +52,7 @@ tidy.Corpus <- function(x, collapse = "\n", ...) { # keep as a list column return(m) } - m <- purrr::map_at(m, which(lengths == 0), ~NA) + m <- purrr::map_at(m, which(lengths == 0), ~ NA) ret <- unname(do.call(c, m)) ## tbl_df() doesn't support POSIXlt format @@ -115,9 +115,11 @@ tidy.Corpus <- function(x, collapse = "\n", ...) { #' #' @export tidy.corpus <- function(x, ...) { - dplyr::tbl_df(data.frame(text = quanteda::texts(x), - quanteda::docvars(x), - stringsAsFactors = FALSE)) + dplyr::tbl_df(data.frame( + text = quanteda::texts(x), + quanteda::docvars(x), + stringsAsFactors = FALSE + )) } @@ -127,7 +129,7 @@ glance.corpus <- function(x, ...) { md <- purrr::compact(quanteda::metacorpus(x)) # turn vectors into list columns - md <- purrr::map_if(md, ~length(.) > 1, list) + md <- purrr::map_if(md, ~ length(.) > 1, list) as_data_frame(md) } diff --git a/R/dictionary_tidiers.R b/R/dictionary_tidiers.R index 92b1564c..03380ba2 100644 --- a/R/dictionary_tidiers.R +++ b/R/dictionary_tidiers.R @@ -13,7 +13,8 @@ #' @export tidy.dictionary2 <- function(x, regex = FALSE, ...) 
{ ret <- purrr::map_df(x, function(e) data_frame(word = e), - .id = "category") %>% + .id = "category" + ) %>% mutate(category = as.character(category)) if (regex) { diff --git a/R/globals.R b/R/globals.R index 18f2b008..a49d558d 100644 --- a/R/globals.R +++ b/R/globals.R @@ -1,4 +1,6 @@ -globalVariables(c("X1", "X2", "X3", "word", "code", "category", "texts", - "i", "j", "x", "Var1", "Var2", "value", - ".ndocs", ".nterm", ".document_total", - "tf", "idf", "sentiments", "document", "term")) +globalVariables(c( + "X1", "X2", "X3", "word", "code", "category", "texts", + "i", "j", "x", "Var1", "Var2", "value", + ".ndocs", ".nterm", ".document_total", + "tf", "idf", "sentiments", "document", "term" +)) diff --git a/R/lda_tidiers.R b/R/lda_tidiers.R index 8d67a09b..d3459c21 100644 --- a/R/lda_tidiers.R +++ b/R/lda_tidiers.R @@ -134,8 +134,10 @@ augment.LDA <- function(x, data, ...) { data <- tidy(data) } else if (!inherits(data, "data.frame") && !(all(c("document", "term") %in% colnames(data)))) { - stop("data argument must either be a simple_triplet_matrix (such as ", - "a DocumentTermMatrix) or a table with document and term columns") + stop( + "data argument must either be a simple_triplet_matrix (such as ", + "a DocumentTermMatrix) or a table with document and term columns" + ) } ret <- left_join(data, ret, by = c("document", "term")) } diff --git a/R/sparse_casters.R b/R/sparse_casters.R index 13f4cca1..476dd000 100644 --- a/R/sparse_casters.R +++ b/R/sparse_casters.R @@ -65,8 +65,10 @@ cast_sparse <- function(data, row, column, value, ...) { j <- match(col_names, col_u) } - ret <- Matrix::sparseMatrix(i = i, j = j, x = values, - dimnames = list(row_u, col_u), ...) + ret <- Matrix::sparseMatrix( + i = i, j = j, x = values, + dimnames = list(row_u, col_u), ... + ) ret } @@ -78,10 +80,10 @@ cast_sparse_ <- function(data, row, column, value) { row <- compat_lazy(row, caller_env()) column <- compat_lazy(column, caller_env()) if (missing(value)) { - cast_sparse(data, !! row, !! column) + cast_sparse(data, !!row, !!column) } else { value_col <- compat_lazy(value, caller_env()) - cast_sparse(data, !! row, !! column, !! value_col) + cast_sparse(data, !!row, !!column, !!value_col) } } @@ -114,7 +116,7 @@ cast_tdm <- function(data, term, document, value, term <- quo_name(enquo(term)) document <- quo_name(enquo(document)) value <- quo_name(enquo(value)) - m <- cast_sparse(data, !! term, !! document, !! value, ...) + m <- cast_sparse(data, !!term, !!document, !!value, ...) tm::as.TermDocumentMatrix(m, weighting = weighting) } @@ -127,8 +129,9 @@ cast_tdm_ <- function(data, term, document, value, term <- compat_lazy(term, caller_env()) document <- compat_lazy(document, caller_env()) value <- compat_lazy(value, caller_env()) - cast_tdm(data, !! term, !! document, !! value, - weighting = weighting, ...) + cast_tdm(data, !!term, !!document, !!value, + weighting = weighting, ... + ) } @@ -136,11 +139,11 @@ cast_tdm_ <- function(data, term, document, value, #' @rdname document_term_casters #' @export cast_dtm <- function(data, document, term, value, - weighting = tm::weightTf, ...) { + weighting = tm::weightTf, ...) { document <- quo_name(enquo(document)) term <- quo_name(enquo(term)) value <- quo_name(enquo(value)) - m <- cast_sparse(data, !! document, !! term, !! value, ...) + m <- cast_sparse(data, !!document, !!term, !!value, ...) 
tm::as.DocumentTermMatrix(m, weighting = weighting) } @@ -149,12 +152,13 @@ cast_dtm <- function(data, document, term, value, #' @inheritParams cast_dtm #' @export cast_dtm_ <- function(data, document, term, value, - weighting = tm::weightTf, ...) { + weighting = tm::weightTf, ...) { document <- compat_lazy(document, caller_env()) term <- compat_lazy(term, caller_env()) value <- compat_lazy(value, caller_env()) - cast_dtm(data, !! document, !! term, !! value, - weighting = weighting, ...) + cast_dtm(data, !!document, !!term, !!value, + weighting = weighting, ... + ) } @@ -164,7 +168,7 @@ cast_dfm <- function(data, document, term, value, ...) { document <- quo_name(enquo(document)) term <- quo_name(enquo(term)) value <- quo_name(enquo(value)) - m <- cast_sparse(data, !! document, !! term, !! value, ...) + m <- cast_sparse(data, !!document, !!term, !!value, ...) quanteda::as.dfm(m) } @@ -176,5 +180,5 @@ cast_dfm_ <- function(data, document, term, value, ...) { document <- compat_lazy(document, caller_env()) term <- compat_lazy(term, caller_env()) value <- compat_lazy(value, caller_env()) - cast_dfm(data, !! document, !! term, !! value) + cast_dfm(data, !!document, !!term, !!value) } diff --git a/R/stm_tidiers.R b/R/stm_tidiers.R index 7339836a..602ea7cb 100644 --- a/R/stm_tidiers.R +++ b/R/stm_tidiers.R @@ -120,7 +120,6 @@ tidy.STM <- function(x, matrix = c("beta", "gamma", "theta"), log = FALSE, #' #' @export augment.STM <- function(x, data, ...) { - if (missing(data)) { stop("data argument must be provided in order to augment a stm model") } @@ -134,8 +133,10 @@ augment.STM <- function(x, data, ...) { mat <- data data <- tidy(mat) } else { - stop("data argument must either be a dfm ", - "(from quanteda) or a table with document and term columns") + stop( + "data argument must either be a dfm ", + "(from quanteda) or a table with document and term columns" + ) } beta <- t(as.matrix(x$beta$logbeta[[1]])) @@ -167,11 +168,13 @@ augment.STM <- function(x, data, ...) { #' @export glance.STM <- function(x, ...) { - ret <- data_frame(k = as.integer(x$settings$dim$K), - docs = x$settings$dim$N, - terms = x$settings$dim$V, - iter = length(x$convergence$bound), - alpha = x$settings$init$alpha) + ret <- data_frame( + k = as.integer(x$settings$dim$K), + docs = x$settings$dim$N, + terms = x$settings$dim$V, + iter = length(x$convergence$bound), + alpha = x$settings$init$alpha + ) ret } diff --git a/R/stop_words.R b/R/stop_words.R index 90afb7bb..0bc4c584 100644 --- a/R/stop_words.R +++ b/R/stop_words.R @@ -45,6 +45,8 @@ #' @export #' get_stopwords <- function(language = "en", source = "snowball") { - data_frame(word = stopwords::stopwords(language = language, source = source), - lexicon = source) + data_frame( + word = stopwords::stopwords(language = language, source = source), + lexicon = source + ) } diff --git a/R/unnest_tokens.R b/R/unnest_tokens.R index 5e41b166..484b7d2d 100644 --- a/R/unnest_tokens.R +++ b/R/unnest_tokens.R @@ -89,29 +89,36 @@ #' unnest_tokens(word, text, format = "html") #' unnest_tokens <- function(tbl, output, input, token = "words", - format = c("text", "man", "latex", - "html", "xml"), + format = c( + "text", "man", "latex", + "html", "xml" + ), to_lower = TRUE, drop = TRUE, collapse = NULL, ...) { UseMethod("unnest_tokens") } #' @export unnest_tokens.default <- function(tbl, output, input, token = "words", - format = c("text", "man", "latex", - "html", "xml"), + format = c( + "text", "man", "latex", + "html", "xml" + ), to_lower = TRUE, drop = TRUE, collapse = NULL, ...) 
{ - output <- compat_as_lazy(enquo(output)) input <- compat_as_lazy(enquo(input)) - unnest_tokens_(tbl, output, input, - token, format, to_lower, drop, collapse, ...) + unnest_tokens_( + tbl, output, input, + token, format, to_lower, drop, collapse, ... + ) } #' @export unnest_tokens.data.frame <- function(tbl, output, input, token = "words", - format = c("text", "man", "latex", - "html", "xml"), + format = c( + "text", "man", "latex", + "html", "xml" + ), to_lower = TRUE, drop = TRUE, collapse = NULL, ...) { output <- quo_name(enquo(output)) @@ -119,36 +126,49 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words", # retain top-level attributes attrs <- attributes(tbl) - custom_attributes <- attrs[setdiff(names(attrs), - c("dim", "dimnames", - "names", "row.names", - ".internal.selfref"))] + custom_attributes <- attrs[setdiff( + names(attrs), + c( + "dim", "dimnames", + "names", "row.names", + ".internal.selfref" + ) + )] format <- match.arg(format) if (is.function(token)) { tokenfunc <- token - } else if (token %in% c("word", "character", - "character_shingle", "ngram", - "skip_ngram", "sentence", "line", - "paragraph", "tweet")) { - stop(paste0("Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = ", - token, "s?")) + } else if (token %in% c( + "word", "character", + "character_shingle", "ngram", + "skip_ngram", "sentence", "line", + "paragraph", "tweet" + )) { + stop(paste0( + "Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = ", + token, "s?" + )) } else if (format != "text") { if (token != "words") { stop("Cannot tokenize by any unit except words when format is not text") } tokenfunc <- function(col, ...) hunspell::hunspell_parse(col, - format = format) + format = format + ) } else { - if (is.null(collapse) && token %in% c("ngrams", "skip_ngrams", "sentences", - "lines", "paragraphs", "regex", - "character_shingles")) { + if (is.null(collapse) && token %in% c( + "ngrams", "skip_ngrams", "sentences", + "lines", "paragraphs", "regex", + "character_shingles" + )) { collapse <- TRUE } tf <- get(paste0("tokenize_", token)) - if (token %in% c("characters", "words", "ngrams", "skip_ngrams", - "tweets", "ptb")) { + if (token %in% c( + "characters", "words", "ngrams", "skip_ngrams", + "tweets", "ptb" + )) { tokenfunc <- function(col, ...) tf(col, lowercase = FALSE, ...) } else { tokenfunc <- tf @@ -157,32 +177,37 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words", if (!is.null(collapse) && collapse) { if (any(!purrr::map_lgl(tbl, is.atomic))) { - stop("If collapse = TRUE (such as for unnesting by sentence or paragraph), ", - "unnest_tokens needs all input columns to be atomic vectors (not lists)") + stop( + "If collapse = TRUE (such as for unnesting by sentence or paragraph), ", + "unnest_tokens needs all input columns to be atomic vectors (not lists)" + ) } group_vars <- setdiff(names(tbl), input) - exps <- substitute(stringr::str_c(colname, collapse = "\n"), - list(colname = as.name(input))) + exps <- substitute( + stringr::str_c(colname, collapse = "\n"), + list(colname = as.name(input)) + ) if (is_empty(group_vars)) { - tbl <- summarise(tbl, col = !! exps) + tbl <- summarise(tbl, col = !!exps) } else { - tbl <- group_by(tbl, !!! syms(group_vars)) %>% - summarise(col = !! 
exps) %>% - ungroup + tbl <- group_by(tbl, !!!syms(group_vars)) %>% + summarise(col = !!exps) %>% + ungroup() } names(tbl)[names(tbl) == "col"] <- input - } col <- tbl[[input]] output_lst <- tokenfunc(col, ...) if (!(is.list(output_lst) && length(output_lst) == nrow(tbl))) { - stop("Expected output of tokenizing function to be a list of length ", - nrow(tbl)) + stop( + "Expected output of tokenizing function to be a list of length ", + nrow(tbl) + ) } ret <- tbl[rep(seq_len(nrow(tbl)), lengths(output_lst)), , drop = FALSE] @@ -220,13 +245,16 @@ unnest_tokens_ <- function(tbl, output, input, token = "words", #' @export unnest_tokens_.data.frame <- function(tbl, output, input, token = "words", - format = c("text", "man", "latex", - "html", "xml"), + format = c( + "text", "man", "latex", + "html", "xml" + ), to_lower = TRUE, drop = TRUE, collapse = NULL, ...) { output <- compat_lazy(output, caller_env()) input <- compat_lazy(input, caller_env()) - unnest_tokens(tbl, !! output, !! input, + unnest_tokens(tbl, !!output, !!input, token = token, format = format, - to_lower = to_lower, drop = drop, collapse = collapse, ...) + to_lower = to_lower, drop = drop, collapse = collapse, ... + ) } diff --git a/data-raw/setup_data.R b/data-raw/setup_data.R index cdeec009..d5ced421 100644 --- a/data-raw/setup_data.R +++ b/data-raw/setup_data.R @@ -6,15 +6,19 @@ library(dplyr) # sentiments dataset ------------------------------------------------------ nrc_lexicon <- readr::read_tsv("data-raw/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt.zip", - col_names = FALSE, skip = 46) -nrc_lexicon <- nrc_lexicon %>% filter(X3 == 1) %>% + col_names = FALSE, skip = 46 +) +nrc_lexicon <- nrc_lexicon %>% + filter(X3 == 1) %>% select(word = X1, sentiment = X2) %>% mutate(lexicon = "nrc") bing_lexicon1 <- readr::read_lines("data-raw/positive-words.txt", - skip = 35) + skip = 35 +) bing_lexicon2 <- readr::read_lines("data-raw/negative-words.txt", - skip = 35) + skip = 35 +) bing_lexicon1 <- data_frame(word = bing_lexicon1) %>% mutate(sentiment = "positive", lexicon = "bing") bing_lexicon2 <- data_frame(word = bing_lexicon2) %>% @@ -22,7 +26,8 @@ bing_lexicon2 <- data_frame(word = bing_lexicon2) %>% bing_lexicon <- bind_rows(bing_lexicon1, bing_lexicon2) %>% arrange(word) AFINN_lexicon <- readr::read_tsv("data-raw/AFINN-111.txt", - col_names = FALSE) + col_names = FALSE +) AFINN_lexicon <- AFINN_lexicon %>% transmute(word = X1, sentiment = NA, score = X2, lexicon = "AFINN") @@ -40,9 +45,11 @@ loughran_lexicon <- mcdonald_raw %>% tidyr::gather(sentiment, value, -word) %>% filter(value > 0) %>% select(-value) %>% - mutate(word = stringr::str_to_lower(word), - sentiment = stringr::str_to_lower(sentiment), - lexicon = "loughran") + mutate( + word = stringr::str_to_lower(word), + sentiment = stringr::str_to_lower(sentiment), + lexicon = "loughran" + ) sentiments <- bind_rows(nrc_lexicon, bing_lexicon, AFINN_lexicon, loughran_lexicon) %>% filter(!stringr::str_detect(word, "[^[:ascii:]]")) @@ -84,8 +91,10 @@ parts_of_speech <- readr::read_csv("Noun,N ", col_names = c("pos", "code")) # parts of speech -parts_of_speech <- readr::read_delim("data-raw/mobyposi.i.zip", delim = "\xd7", - col_names = c("word", "code")) %>% +parts_of_speech <- readr::read_delim("data-raw/mobyposi.i.zip", + delim = "\xd7", + col_names = c("word", "code") +) %>% tidyr::unnest(code = stringr::str_split(code, "")) %>% inner_join(parts_of_speech, by = "code") %>% filter(!stringr::str_detect(word, " ")) %>% @@ -113,4 +122,3 @@ nma_words <- 
readr::read_lines("data-raw/list-English-negators.txt") %>% readr::write_csv(nma_words, "data-raw/nma_words.csv") devtools::use_data(nma_words, overwrite = TRUE) - diff --git a/tests/testthat/test-corpus-tidiers.R b/tests/testthat/test-corpus-tidiers.R index 4e437414..a8f487c2 100644 --- a/tests/testthat/test-corpus-tidiers.R +++ b/tests/testthat/test-corpus-tidiers.R @@ -5,7 +5,8 @@ test_that("Can tidy corpus from tm package", { #' # tm package examples txt <- system.file("texts", "txt", package = "tm") ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"), - readerControl = list(language = "lat")) + readerControl = list(language = "lat") + ) td <- tidy(ovid, collapse = " ") @@ -31,7 +32,6 @@ test_that("Can tidy corpus from quanteda package", { test_that("Can tidy corpus from quanteda using accessor functions", { if (requireNamespace("quanteda", quietly = TRUE)) { - x <- quanteda::data_corpus_inaugural ## old method @@ -47,14 +47,13 @@ test_that("Can tidy corpus from quanteda using accessor functions", { test_that("Can glance a corpus from quanteda using accessor functions", { if (requireNamespace("quanteda", quietly = TRUE)) { - x <- quanteda::data_corpus_inaugural ## old method glance_old <- function(x, ...) { md <- purrr::compact(x$metadata) # turn vectors into list columns - md <- purrr::map_if(md, ~length(.) > 1, list) + md <- purrr::map_if(md, ~ length(.) > 1, list) as_data_frame(md) } ret_old <- glance_old(x) diff --git a/tests/testthat/test-dictionary-tidiers.R b/tests/testthat/test-dictionary-tidiers.R index eadbcc1c..6a0d10a5 100644 --- a/tests/testthat/test-dictionary-tidiers.R +++ b/tests/testthat/test-dictionary-tidiers.R @@ -2,8 +2,10 @@ context("dictionary tidiers") if (requireNamespace("quanteda", quietly = TRUE)) { test_that("can tidy a quanteda dictionary", { - lst <- list(terror = c("terrorism", "terrorists", "threat"), - economy = c("jobs", "business", "grow", "work")) + lst <- list( + terror = c("terrorism", "terrorists", "threat"), + economy = c("jobs", "business", "grow", "work") + ) d <- quanteda::dictionary(lst) td <- tidy(d) @@ -13,7 +15,9 @@ if (requireNamespace("quanteda", quietly = TRUE)) { expect_equal(nrow(td), 7) expect_equal(sort(unique(td$category)), c("economy", "terror")) - expect_equal(sort(unique(td$word)), - sort(unique(c(lst[[1]], lst[[2]])))) + expect_equal( + sort(unique(td$word)), + sort(unique(c(lst[[1]], lst[[2]]))) + ) }) } diff --git a/tests/testthat/test-lda-tidiers.R b/tests/testthat/test-lda-tidiers.R index bd2299a1..77aa65ad 100644 --- a/tests/testthat/test-lda-tidiers.R +++ b/tests/testthat/test-lda-tidiers.R @@ -3,7 +3,6 @@ context("LDA tidiers") suppressPackageStartupMessages(library(dplyr)) if (require("topicmodels", quietly = TRUE)) { - data(AssociatedPress) ap <- AssociatedPress[1:100, ] lda <- LDA(ap, control = list(alpha = 0.1), k = 4) diff --git a/tests/testthat/test-sentiments.R b/tests/testthat/test-sentiments.R index 8c621ef9..2cbe287c 100644 --- a/tests/testthat/test-sentiments.R +++ b/tests/testthat/test-sentiments.R @@ -2,9 +2,13 @@ context("sentiments") suppressPackageStartupMessages(library(dplyr)) -test_data <- data_frame(line = 1:2, - text = c("I am happy and joyful", - "I am sad and annoyed")) +test_data <- data_frame( + line = 1:2, + text = c( + "I am happy and joyful", + "I am sad and annoyed" + ) +) test_tokens <- unnest_tokens(test_data, word, text) @@ -14,8 +18,10 @@ test_that("get_sentiments works for nrc data", { # only positive included in NRC for some reason expect_equal(unique(nrc_joined$word), c("happy", 
"joyful")) - expect_equal(sort(unique(nrc_joined$sentiment)), - c("anticipation", "joy", "positive", "trust")) + expect_equal( + sort(unique(nrc_joined$sentiment)), + c("anticipation", "joy", "positive", "trust") + ) }) test_that("get_sentiments works for bing data", { @@ -23,15 +29,19 @@ test_that("get_sentiments works for bing data", { inner_join(get_sentiments("bing"), by = "word") expect_equal(bing_joined$word, c("happy", "joyful", "sad", "annoyed")) - expect_equal(bing_joined$sentiment, - c("positive", "positive", "negative", "negative")) + expect_equal( + bing_joined$sentiment, + c("positive", "positive", "negative", "negative") + ) }) test_that("get_sentiments works for afinn data", { afinn_joined <- test_tokens %>% inner_join(get_sentiments("afinn"), by = "word") - expect_equal(afinn_joined$word, - c("happy", "joyful", "sad", "annoyed")) + expect_equal( + afinn_joined$word, + c("happy", "joyful", "sad", "annoyed") + ) expect_equal(sign(afinn_joined$score), c(1, 1, -1, -1)) }) diff --git a/tests/testthat/test-sparse-casters.R b/tests/testthat/test-sparse-casters.R index a4cf67f4..2808c1f6 100644 --- a/tests/testthat/test-sparse-casters.R +++ b/tests/testthat/test-sparse-casters.R @@ -2,9 +2,11 @@ context("Sparse casters") library(Matrix) -dat <- data_frame(a = c("row1", "row1", "row2", "row2", "row2"), - b = c("col1", "col2", "col1", "col3", "col4"), - val = 1:5) +dat <- data_frame( + a = c("row1", "row1", "row2", "row2", "row2"), + b = c("col1", "col2", "col1", "col3", "col4"), + val = 1:5 +) test_that("Can cast tables into a sparse Matrix", { m <- cast_sparse(dat, a, b) @@ -35,7 +37,7 @@ test_that("cast_sparse ignores groups", { test_that("Can cast_sparse with tidyeval", { m <- cast_sparse(dat, a, b) rowvar <- quo("a") - m2 <- cast_sparse(dat, !! 
rowvar, b) + m2 <- cast_sparse(dat, !!rowvar, b) expect_identical(m, m2) }) @@ -63,7 +65,6 @@ test_that("Can cast tables into a sparse TermDocumentMatrix", { expect_equal(dim(d), c(4, 2)) expect_equal(sort(tm::Terms(d)), sort(unique(dat$b))) - }) test_that("Can cast tables into a sparse dfm", { @@ -77,5 +78,4 @@ test_that("Can cast tables into a sparse dfm", { expect_equal(dim(d), c(2, 4)) expect_equal(as.numeric(d[1, 1]), 1) expect_equal(as.numeric(d[2, 3]), 4) - }) diff --git a/tests/testthat/test-sparse-tidiers.R b/tests/testthat/test-sparse-tidiers.R index 1fa4f99b..66fc454e 100644 --- a/tests/testthat/test-sparse-tidiers.R +++ b/tests/testthat/test-sparse-tidiers.R @@ -4,7 +4,8 @@ test_that("Can tidy DocumentTermMatrices and TermDocumentMatrices", { if (require("tm", quietly = TRUE)) { txt <- system.file("texts", "txt", package = "tm") ovid <- VCorpus(DirSource(txt, encoding = "UTF-8"), - readerControl = list(language = "lat")) + readerControl = list(language = "lat") + ) ovid_dtm <- DocumentTermMatrix(ovid) ovid_dtm_td <- tidy(ovid_dtm) @@ -27,7 +28,8 @@ test_that("Can tidy dfm from quanteda", { if (requireNamespace("quanteda", quietly = TRUE)) { library(methods) capture.output(dfm_obj <- quanteda::dfm(quanteda::data_corpus_inaugural, - verbose = FALSE)) + verbose = FALSE + )) dfm_obj_td <- tidy(dfm_obj) diff --git a/tests/testthat/test-stm-tidiers.R b/tests/testthat/test-stm-tidiers.R index c3c8c6c1..c4465d5d 100644 --- a/tests/testthat/test-stm-tidiers.R +++ b/tests/testthat/test-stm-tidiers.R @@ -3,10 +3,11 @@ context("stm tidiers") suppressPackageStartupMessages(library(dplyr)) if (require("stm", quietly = TRUE)) { - - dat <- data_frame(document = c("row1", "row1", "row2", "row2", "row2"), - term = c("col1", "col2", "col1", "col3", "col4"), - n = 1:5) + dat <- data_frame( + document = c("row1", "row1", "row2", "row2", "row2"), + term = c("col1", "col2", "col1", "col3", "col4"), + n = 1:5 + ) m <- cast_sparse(dat, document, term) stm_model <- stm(m, seed = 1234, K = 3, verbose = FALSE) diff --git a/tests/testthat/test-stop-words.R b/tests/testthat/test-stop-words.R index f2d6dea4..7d46f58f 100644 --- a/tests/testthat/test-stop-words.R +++ b/tests/testthat/test-stop-words.R @@ -3,7 +3,6 @@ context("Stop words") suppressPackageStartupMessages(library(dplyr)) test_that("get_stopwords works for multiple languages", { - skip_if_not_installed("stopwords") de <- get_stopwords("de") ru <- get_stopwords("ru") @@ -12,5 +11,4 @@ test_that("get_stopwords works for multiple languages", { expect_is(ru, "tbl_df") expect_gt(nrow(de), nrow(ru)) expect_equal(unique(de$lexicon), "snowball") - }) diff --git a/tests/testthat/test-tf-idf.R b/tests/testthat/test-tf-idf.R index 7152b246..22edfe3f 100644 --- a/tests/testthat/test-tf-idf.R +++ b/tests/testthat/test-tf-idf.R @@ -1,10 +1,16 @@ context("tf-idf calculation") -w <- data_frame(document = rep(1:2, each = 5), - word = c("the", "quick", "brown", "fox", "jumped", - "over", "the", "lazy", "brown", "dog"), - frequency = c(1, 1, 1, 1, 2, - 1, 1, 1, 1, 2)) +w <- data_frame( + document = rep(1:2, each = 5), + word = c( + "the", "quick", "brown", "fox", "jumped", + "over", "the", "lazy", "brown", "dog" + ), + frequency = c( + 1, 1, 1, 1, 2, + 1, 1, 1, 1, 2 + ) +) test_that("Can calculate TF-IDF", { result <- w %>% @@ -13,8 +19,10 @@ test_that("Can calculate TF-IDF", { bind_tf_idf_("word", "document", "frequency") expect_equal(result, result2) - expect_equal(select(w, document, word, frequency), - select(result, document, word, frequency)) + expect_equal( 
+ select(w, document, word, frequency), + select(result, document, word, frequency) + ) expect_is(result, "tbl_df") expect_is(result$tf, "numeric") @@ -50,17 +58,21 @@ test_that("TF-IDF works when the document ID is a number", { test_that("tf-idf with tidyeval works", { - d <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) + d <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) termvar <- quo("word") documentvar <- quo("document") countvar <- quo("frequency") result <- w %>% - bind_tf_idf(!! termvar, !! documentvar, !! countvar) + bind_tf_idf(!!termvar, !!documentvar, !!countvar) - expect_equal(select(w, document, word, frequency), - select(result, document, word, frequency)) + expect_equal( + select(w, document, word, frequency), + select(result, document, word, frequency) + ) expect_is(result, "tbl_df") expect_is(result$tf, "numeric") @@ -73,7 +85,7 @@ test_that("tf-idf with tidyeval works", { result2 <- w %>% group_by(document) %>% - bind_tf_idf(!! termvar, !! documentvar, !! countvar) + bind_tf_idf(!!termvar, !!documentvar, !!countvar) expect_equal(length(groups(result2)), 1) expect_equal(as.character(groups(result2)[[1]]), "document") diff --git a/tests/testthat/test-unnest-tokens.R b/tests/testthat/test-unnest-tokens.R index 92e793e7..2b196a52 100644 --- a/tests/testthat/test-unnest-tokens.R +++ b/tests/testthat/test-unnest-tokens.R @@ -22,16 +22,20 @@ test_that("tokenizing by character shingles works", { test_that("tokenizing by character shingles can include whitespace/punctuation", { d <- data_frame(txt = "tidytext is the best!") - d <- d %>% unnest_tokens(char_ngram, txt, token = "character_shingles", - strip_non_alphanum = FALSE) + d <- d %>% unnest_tokens(char_ngram, txt, + token = "character_shingles", + strip_non_alphanum = FALSE + ) expect_equal(nrow(d), 19) expect_equal(ncol(d), 1) expect_equal(d$char_ngram[1], "tid") }) test_that("tokenizing by word works", { - d <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) + d <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) d <- d %>% unnest_tokens(word, txt) expect_equal(nrow(d), 12) expect_equal(ncol(d), 1) @@ -39,17 +43,23 @@ test_that("tokenizing by word works", { }) test_that("tokenizing errors with appropriate message", { - d <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) - expect_error(d %>% unnest_tokens(word, txt, token = "word"), - "Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = words?") + d <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) + expect_error( + d %>% unnest_tokens(word, txt, token = "word"), + "Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = words?" + ) }) test_that("tokenizing by sentence works", { - orig <- data_frame(txt = c("I'm Nobody! Who are you?", - "Are you - Nobody - too?", - "Then there’s a pair of us!", - "Don’t tell! they’d advertise - you know!")) + orig <- data_frame(txt = c( + "I'm Nobody! Who are you?", + "Are you - Nobody - too?", + "Then there’s a pair of us!", + "Don’t tell! they’d advertise - you know!" 
+ )) d <- orig %>% unnest_tokens(sentence, txt, token = "sentences") expect_equal(nrow(d), 6) expect_equal(ncol(d), 1) @@ -66,40 +76,43 @@ test_that("tokenizing by sentence works", { test_that("tokenizing by ngram and skip ngram works", { - d2 <- data_frame(txt = c("Hope is the thing with feathers", - "That perches in the soul", - "And sings the tune without the words", - "And never stops at all ", - "And sweetest in the Gale is heard ", - "And sore must be the storm ", - "That could abash the little Bird", - "That kept so many warm ", - "I’ve heard it in the chillest land ", - "And on the strangest Sea ", - "Yet never in Extremity,", - "It asked a crumb of me.")) + d2 <- data_frame(txt = c( + "Hope is the thing with feathers", + "That perches in the soul", + "And sings the tune without the words", + "And never stops at all ", + "And sweetest in the Gale is heard ", + "And sore must be the storm ", + "That could abash the little Bird", + "That kept so many warm ", + "I’ve heard it in the chillest land ", + "And on the strangest Sea ", + "Yet never in Extremity,", + "It asked a crumb of me." + )) # tokenize by ngram d <- d2 %>% unnest_tokens(ngram, txt, token = "ngrams", n = 2) - #expect_equal(nrow(d), 68) does not pass on appveyor + # expect_equal(nrow(d), 68) does not pass on appveyor expect_equal(ncol(d), 1) expect_equal(d$ngram[1], "hope is") expect_equal(d$ngram[10], "the soul") # tokenize by skip_ngram d <- d2 %>% unnest_tokens(ngram, txt, token = "skip_ngrams", n = 4, k = 2) - #expect_equal(nrow(d), 189) does not pass on appveyor + # expect_equal(nrow(d), 189) does not pass on appveyor expect_equal(ncol(d), 1) expect_equal(d$ngram[40], "hope thing that the") expect_equal(d$ngram[400], "the sings without and") - }) test_that("tokenizing with a custom function works", { - orig <- data_frame(txt = c("I'm Nobody! Who are you?", - "Are you - Nobody - too?", - "Then there’s a pair of us!", - "Don’t tell! they’d advertise - you know!")) + orig <- data_frame(txt = c( + "I'm Nobody! Who are you?", + "Are you - Nobody - too?", + "Then there’s a pair of us!", + "Don’t tell! they’d advertise - you know!" 
+ )) d <- orig %>% unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ") expect_equal(nrow(d), 7) @@ -107,44 +120,52 @@ test_that("tokenizing with a custom function works", { expect_equal(d$unit[4], "too?") d2 <- orig %>% - unnest_tokens(unit, txt, token = stringr::str_split, - pattern = " - ", collapse = TRUE) + unnest_tokens(unit, txt, + token = stringr::str_split, + pattern = " - ", collapse = TRUE + ) expect_equal(nrow(d2), 4) expect_equal(d2$unit[2], "nobody") expect_equal(d2$unit[4], "you know!") }) test_that("tokenizing with standard evaluation works", { - d <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) + d <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) d <- d %>% unnest_tokens_("word", "txt") expect_equal(nrow(d), 12) expect_equal(ncol(d), 1) expect_equal(d$word[1], "because") - }) test_that("tokenizing with tidyeval works", { - d <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) + d <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) outputvar <- quo("word") inputvar <- quo("txt") d <- d %>% unnest_tokens(!!outputvar, !!inputvar) expect_equal(nrow(d), 12) expect_equal(ncol(d), 1) expect_equal(d$word[1], "because") - }) test_that("tokenizing with to_lower = FALSE works", { - orig <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -")) + orig <- data_frame(txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + )) d <- orig %>% unnest_tokens(word, txt, to_lower = FALSE) expect_equal(nrow(d), 12) expect_equal(ncol(d), 1) expect_equal(d$word[1], "Because") - d2 <- orig %>% unnest_tokens(ngram, txt, token = "ngrams", - n = 2, to_lower = FALSE) + d2 <- orig %>% unnest_tokens(ngram, txt, + token = "ngrams", + n = 2, to_lower = FALSE + ) expect_equal(nrow(d2), 11) expect_equal(ncol(d2), 1) expect_equal(d2$ngram[1], "Because I") @@ -154,16 +175,22 @@ test_that("tokenizing with to_lower = FALSE works", { test_that("unnest_tokens raises an error if custom tokenizer gives bad output", { d <- data_frame(txt = "Emily Dickinson") - expect_error(unnest_tokens(d, word, txt, token = function(e) c("a", "b")), - "to be a list") - expect_error(unnest_tokens(d, word, txt, token = function(e) list("a", "b")), - "of length") + expect_error( + unnest_tokens(d, word, txt, token = function(e) c("a", "b")), + "to be a list" + ) + expect_error( + unnest_tokens(d, word, txt, token = function(e) list("a", "b")), + "of length" + ) }) test_that("tokenizing HTML works", { - h <- data_frame(row = 1:2, - text = c("
<h1>Text <b>is</b>", "<a href='example.com'>here</a>")) + h <- data_frame( + row = 1:2, + text = c("<h1>
Text is", "here") + ) res1 <- unnest_tokens(h, word, text) expect_gt(nrow(res1), 3) @@ -177,9 +204,13 @@ test_that("tokenizing HTML works", { test_that("tokenizing LaTeX works", { - h <- data_frame(row = 1:4, - text = c("\\textbf{text} \\emph{is}", "\\begin{itemize}", - "\\item here", "\\end{itemize}")) + h <- data_frame( + row = 1:4, + text = c( + "\\textbf{text} \\emph{is}", "\\begin{itemize}", + "\\item here", "\\end{itemize}" + ) + ) res1 <- unnest_tokens(h, word, text) expect_gt(nrow(res1), 3) @@ -192,9 +223,13 @@ test_that("tokenizing LaTeX works", { }) test_that("Tokenizing a one-column data.frame works", { - text <- data.frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -"), - stringsAsFactors = FALSE) + text <- data.frame( + txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + ), + stringsAsFactors = FALSE + ) d <- unnest_tokens(text, word, txt) expect_is(d, "data.frame") @@ -204,10 +239,14 @@ test_that("Tokenizing a one-column data.frame works", { }) test_that("Tokenizing a two-column data.frame with one non-text column works", { - text <- data.frame(line = 1:2, - txt = c("Because I could not stop for Death -", - "He kindly stopped for me -"), - stringsAsFactors = FALSE) + text <- data.frame( + line = 1:2, + txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + ), + stringsAsFactors = FALSE + ) d <- unnest_tokens(text, word, txt) expect_is(d, "data.frame") @@ -219,10 +258,14 @@ test_that("Tokenizing a two-column data.frame with one non-text column works", { test_that("Tokenizing with NA values in columns behaves as expected", { - text <- data_frame(line = c(1:2, NA), - txt = c(NA, - "Because I could not stop for Death -", - "He kindly stopped for me -")) + text <- data_frame( + line = c(1:2, NA), + txt = c( + NA, + "Because I could not stop for Death -", + "He kindly stopped for me -" + ) + ) d <- unnest_tokens(text, word, txt) expect_is(d, "data.frame") @@ -238,16 +281,22 @@ test_that("Tokenizing with NA values in columns behaves as expected", { test_that("Trying to tokenize a non-text format with words raises an error", { d <- data_frame(txt = "Emily Dickinson") - expect_error(unnest_tokens(d, word, txt, token = "sentences", - format = "latex"), - "except words") + expect_error( + unnest_tokens(d, word, txt, + token = "sentences", + format = "latex" + ), + "except words" + ) }) test_that("unnest_tokens keeps top-level attributes", { # first check data.frame - d <- data.frame(row = 1:2, - txt = c("Call me Ishmael.", "OK, I will."), - stringsAsFactors = FALSE) + d <- data.frame( + row = 1:2, + txt = c("Call me Ishmael.", "OK, I will."), + stringsAsFactors = FALSE + ) lst <- list(1, 2, 3, 4) attr(d, "custom") <- lst @@ -264,8 +313,10 @@ test_that("unnest_tokens keeps top-level attributes", { test_that("Trying to tokenize a data.table works", { skip_if_not_installed("data.table") - text <- data.table::data.table(txt = "Write till my fingers look like a bouquet of roses", - author = "Watsky") + text <- data.table::data.table( + txt = "Write till my fingers look like a bouquet of roses", + author = "Watsky" + ) output <- unnest_tokens(text, word, txt) expect_equal(ncol(output), 2) expect_equal(nrow(output), 10) @@ -296,10 +347,14 @@ test_that("custom attributes are preserved for a data.table", { }) test_that("Tokenizing a data frame with list columns works", { - df <- data.frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -"), - line = 1L:2L, - stringsAsFactors 
= FALSE) + df <- data.frame( + txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + ), + line = 1L:2L, + stringsAsFactors = FALSE + ) df$list_col <- list(1L:3L, c("a", "b")) @@ -314,10 +369,14 @@ test_that("Tokenizing a data frame with list columns works", { }) test_that("Tokenizing a tbl_df with list columns works", { - df <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -"), - line = 1L:2L, - list_col = list(1L:3L, c("a", "b"))) + df <- data_frame( + txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + ), + line = 1L:2L, + list_col = list(1L:3L, c("a", "b")) + ) ret <- unnest_tokens(df, word, txt) expect_is(ret, "tbl_df") @@ -330,13 +389,19 @@ test_that("Tokenizing a tbl_df with list columns works", { }) test_that("Can't tokenize with list columns with collapse = TRUE", { - df <- data_frame(txt = c("Because I could not stop for Death -", - "He kindly stopped for me -"), - line = 1L:2L, - list_col = list(1L:3L, c("a", "b"))) - - expect_error(unnest_tokens(df, word, txt, token = "sentences"), - "to be atomic vectors") + df <- data_frame( + txt = c( + "Because I could not stop for Death -", + "He kindly stopped for me -" + ), + line = 1L:2L, + list_col = list(1L:3L, c("a", "b")) + ) + + expect_error( + unnest_tokens(df, word, txt, token = "sentences"), + "to be atomic vectors" + ) # Can tokenize by sentence without collapsing # though it sort of defeats the purpose