
Commit

Make a better error message when the user misses the name of the tokenizer. Also add info for new tokenizers. Closes #111.
juliasilge committed Apr 2, 2018
1 parent b66c5c6 commit b85c20a
Showing 5 changed files with 48 additions and 23 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,6 +1,8 @@
# tidytext 0.1.9 [WIP]

* Updates to documentation (#109) thanks to Emil Hvitfeldt.
* Add new tokenizers for tweets and Penn Treebank to `unnest_tokens()`.
* Better error message (#111).

# tidytext 0.1.8

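A minimal usage sketch for the two new tokenizers mentioned in the NEWS entry above, assuming a small illustrative data frame (not part of this commit). `strip_url` is an argument of `tokenizers::tokenize_tweets()` that `unnest_tokens()` forwards through `...`, as documented in the diff below.

```r
library(tidytext)
library(dplyr)

# Illustrative example data, not from this commit
tweets <- data_frame(
  txt = c("Tokenize me @juliasilge #rstats https://www.tidytextmining.com",
          "The Penn Treebank tokenizer handles clitics like ca n't")
)

# "tweets" tokenizes by word while preserving usernames, hashtags, and URLs;
# strip_url = TRUE is forwarded to tokenizers::tokenize_tweets()
tweets %>%
  unnest_tokens(word, txt, token = "tweets", strip_url = TRUE)

# "ptb" tokenizes using Penn Treebank conventions via tokenizers::tokenize_ptb()
tweets %>%
  unnest_tokens(word, txt, token = "ptb")
```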
26 changes: 18 additions & 8 deletions R/unnest_tokens.R
@@ -1,15 +1,17 @@
#' Split a column into tokens using the tokenizers package
#'
#' Split a column into tokens using the tokenizers package, splitting the table
#' into one-token-per-row. This function supports non-standard evaluation through
#' the tidyeval framework.
#' into one-token-per-row. This function supports non-standard evaluation
#' through the tidyeval framework.
#'
#' @param tbl A data frame
#'
#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
#' options are "words" (default), "characters", "character_shingles", "ngrams", "skip_ngrams",
#' "sentences", "lines", "paragraphs", and "regex". If a function, should take
#' a character vector and return a list of character vectors of the same length.
#' options are "words" (default), "characters", "character_shingles", "ngrams",
#' "skip_ngrams", "sentences", "lines", "paragraphs", "regex", "tweets"
#' (tokenization by word that preserves usernames, hashtags, and URLs), and
#' "ptb" (Penn Treebank). If a function, should take a character vector and
#' return a list of character vectors of the same length.
#'
#' @param format Either "text", "man", "latex", "html", or "xml". If not text,
#' this uses the hunspell tokenizer, and can tokenize only by "word"
@@ -31,8 +33,10 @@
#' when token method is "ngrams", "skip_ngrams", "sentences", "lines",
#' "paragraphs", or "regex".
#'
#' @param ... Extra arguments passed on to the tokenizer, such as \code{n} and
#' \code{k} for "ngrams" and "skip_ngrams" or \code{pattern} for "regex".
#' @param ... Extra arguments passed on to \link[tokenizers]{tokenizers}, such
#' as \code{strip_punct} for "words" and "tweets", \code{n} and \code{k} for
#' "ngrams" and "skip_ngrams", \code{strip_url} for "tweets", and
#' \code{pattern} for "regex".
#'
#' @details If the unit for tokenizing is ngrams, skip_ngrams, sentences, lines,
#' paragraphs, or regex, the entire input will be collapsed together before
@@ -124,6 +128,12 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words",

if (is.function(token)) {
tokenfunc <- token
} else if (token %in% c("word", "character",
"character_shingle", "ngram",
"skip_ngram", "sentence", "line",
"paragraph", "tweet")) {
stop(paste0("Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = ",
token, "s?"))
} else if (format != "text") {
if (token != "words") {
stop("Cannot tokenize by any unit except words when format is not text")
@@ -136,7 +146,7 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words",
collapse <- TRUE
}
tf <- get(paste0("tokenize_", token))
if (token %in% c("characters", "words", "ngrams", "skip_ngrams")) {
if (token %in% c("characters", "words", "ngrams", "skip_ngrams", "tweets", "ptb")) {
tokenfunc <- function(col, ...) tf(col, lowercase = FALSE, ...)
} else {
tokenfunc <- tf
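The new branch at the top of `unnest_tokens.data.frame()` is what produces the friendlier error. A hedged sketch of the behavior, using a throwaway data frame; the printed message is transcribed from the `stop()` call in the diff above.

```r
library(tidytext)
library(dplyr)

d <- data_frame(txt = "Because I could not stop for Death -")

# A singular token name such as "word" is caught early, with a suggestion
d %>% unnest_tokens(word, txt, token = "word")
#> Error: Token must be a supported type, or a function that takes a character
#> vector as input
#> Did you mean token = words?

# A custom tokenizer still works: any function that takes a character vector
# and returns a list of character vectors of the same length
d %>% unnest_tokens(word, txt, token = function(x) strsplit(x, " +"))
```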
8 changes: 5 additions & 3 deletions man/deprecated-se.Rd

Some generated files are not rendered by default.

18 changes: 11 additions & 7 deletions man/unnest_tokens.Rd

Some generated files are not rendered by default.

17 changes: 12 additions & 5 deletions tests/testthat/test-unnest-tokens.R
@@ -38,6 +38,13 @@ test_that("tokenizing by word works", {
expect_equal(d$word[1], "because")
})

test_that("tokenizing errors with appropriate message", {
d <- data_frame(txt = c("Because I could not stop for Death -",
"He kindly stopped for me -"))
expect_error(d %>% unnest_tokens(word, txt, token = "word"),
"Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = words?")
})

test_that("tokenizing by sentence works", {
orig <- data_frame(txt = c("I'm Nobody! Who are you?",
"Are you - Nobody - too?",
Expand Down Expand Up @@ -131,13 +138,13 @@ test_that("tokenizing with tidyeval works", {

test_that("tokenizing with to_lower = FALSE works", {
orig <- data_frame(txt = c("Because I could not stop for Death -",
"He kindly stopped for me -"))
"He kindly stopped for me -"))
d <- orig %>% unnest_tokens(word, txt, to_lower = FALSE)
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "Because")
d2 <- orig %>% unnest_tokens(ngram, txt, token = "ngrams",
n = 2, to_lower = FALSE)
n = 2, to_lower = FALSE)
expect_equal(nrow(d2), 11)
expect_equal(ncol(d2), 1)
expect_equal(d2$ngram[1], "Because I")
@@ -239,8 +246,8 @@ test_that("Trying to tokenize a non-text format with words raises an error", {
test_that("unnest_tokens keeps top-level attributes", {
# first check data.frame
d <- data.frame(row = 1:2,
txt = c("Call me Ishmael.", "OK, I will."),
stringsAsFactors = FALSE)
txt = c("Call me Ishmael.", "OK, I will."),
stringsAsFactors = FALSE)

lst <- list(1, 2, 3, 4)
attr(d, "custom") <- lst
@@ -258,7 +265,7 @@ test_that("unnest_tokens keeps top-level attributes", {
test_that("Trying to tokenize a data.table works", {
skip_if_not_installed("data.table")
text <- data.table::data.table(txt = "Write till my fingers look like a bouquet of roses",
author = "Watsky")
author = "Watsky")
output <- unnest_tokens(text, word, txt)
expect_equal(ncol(output), 2)
expect_equal(nrow(output), 10)
