
Commit

Make a better error message when the user misses the name of the tokenizer. Also add info for new tokenizers. Closes #111.
juliasilge committed Apr 2, 2018
1 parent b66c5c6 commit b85c20a
Showing 5 changed files with 48 additions and 23 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,6 +1,8 @@
# tidytext 0.1.9 [WIP]

* Updates to documentation (#109) thanks to Emil Hvitfeldt.
* Add new tokenizers for tweets and Penn Treebank to `unnest_tokens()`.
* Better error message (#111).

# tidytext 0.1.8

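A minimal usage sketch for the two new tokenizers mentioned in the NEWS entry above, assuming a small illustrative data frame (not part of this commit). `strip_url` is an argument of `tokenizers::tokenize_tweets()` that `unnest_tokens()` forwards through `...`, as documented in the diff below.

```r
library(tidytext)
library(dplyr)

# Illustrative example data, not from this commit
tweets <- data_frame(
  txt = c("Tokenize me @juliasilge #rstats https://www.tidytextmining.com",
          "The Penn Treebank tokenizer handles clitics like ca n't")
)

# "tweets" tokenizes by word while preserving usernames, hashtags, and URLs;
# strip_url = TRUE is forwarded to tokenizers::tokenize_tweets()
tweets %>%
  unnest_tokens(word, txt, token = "tweets", strip_url = TRUE)

# "ptb" tokenizes using Penn Treebank conventions via tokenizers::tokenize_ptb()
tweets %>%
  unnest_tokens(word, txt, token = "ptb")
```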
26 changes: 18 additions & 8 deletions R/unnest_tokens.R
@@ -1,15 +1,17 @@
#' Split a column into tokens using the tokenizers package
#'
#' Split a column into tokens using the tokenizers package, splitting the table
#' into one-token-per-row. This function supports non-standard evaluation through
#' the tidyeval framework.
#' into one-token-per-row. This function supports non-standard evaluation
#' through the tidyeval framework.
#'
#' @param tbl A data frame
#'
#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
#' options are "words" (default), "characters", "character_shingles", "ngrams", "skip_ngrams",
#' "sentences", "lines", "paragraphs", and "regex". If a function, should take
#' a character vector and return a list of character vectors of the same length.
#' options are "words" (default), "characters", "character_shingles", "ngrams",
#' "skip_ngrams", "sentences", "lines", "paragraphs", "regex", "tweets"
#' (tokenization by word that preserves usernames, hashtags, and URLs), and
#' "ptb" (Penn Treebank). If a function, should take a character vector and
#' return a list of character vectors of the same length.
#'
#' @param format Either "text", "man", "latex", "html", or "xml". If not text,
#' this uses the hunspell tokenizer, and can tokenize only by "word"
@@ -31,8 +33,10 @@
#' when token method is "ngrams", "skip_ngrams", "sentences", "lines",
#' "paragraphs", or "regex".
#'
#' @param ... Extra arguments passed on to the tokenizer, such as \code{n} and
#' \code{k} for "ngrams" and "skip_ngrams" or \code{pattern} for "regex".
#' @param ... Extra arguments passed on to \link[tokenizers]{tokenizers}, such
#' as \code{strip_punct} for "words" and "tweets", \code{n} and \code{k} for
#' "ngrams" and "skip_ngrams", \code{strip_url} for "tweets", and
#' \code{pattern} for "regex".
#'
#' @details If the unit for tokenizing is ngrams, skip_ngrams, sentences, lines,
#' paragraphs, or regex, the entire input will be collapsed together before
@@ -124,6 +128,12 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words",

if (is.function(token)) {
tokenfunc <- token
} else if (token %in% c("word", "character",
"character_shingle", "ngram",
"skip_ngram", "sentence", "line",
"paragraph", "tweet")) {
stop(paste0("Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = ",
token, "s?"))
} else if (format != "text") {
if (token != "words") {
stop("Cannot tokenize by any unit except words when format is not text")
@@ -136,7 +146,7 @@ unnest_tokens.data.frame <- function(tbl, output, input, token = "words",
collapse <- TRUE
}
tf <- get(paste0("tokenize_", token))
if (token %in% c("characters", "words", "ngrams", "skip_ngrams")) {
if (token %in% c("characters", "words", "ngrams", "skip_ngrams", "tweets", "ptb")) {
tokenfunc <- function(col, ...) tf(col, lowercase = FALSE, ...)
} else {
tokenfunc <- tf
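The new branch at the top of `unnest_tokens.data.frame()` is what produces the friendlier error. A hedged sketch of the behavior, using a throwaway data frame; the printed message is transcribed from the `stop()` call in the diff above.

```r
library(tidytext)
library(dplyr)

d <- data_frame(txt = "Because I could not stop for Death -")

# A singular token name such as "word" is caught early, with a suggestion
d %>% unnest_tokens(word, txt, token = "word")
#> Error: Token must be a supported type, or a function that takes a character
#> vector as input
#> Did you mean token = words?

# A custom tokenizer still works: any function that takes a character vector
# and returns a list of character vectors of the same length
d %>% unnest_tokens(word, txt, token = function(x) strsplit(x, " +"))
```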
8 changes: 5 additions & 3 deletions man/deprecated-se.Rd

Some generated files are not rendered by default.

18 changes: 11 additions & 7 deletions man/unnest_tokens.Rd

Some generated files are not rendered by default.

17 changes: 12 additions & 5 deletions tests/testthat/test-unnest-tokens.R
@@ -38,6 +38,13 @@ test_that("tokenizing by word works", {
expect_equal(d$word[1], "because")
})

test_that("tokenizing errors with appropriate message", {
d <- data_frame(txt = c("Because I could not stop for Death -",
"He kindly stopped for me -"))
expect_error(d %>% unnest_tokens(word, txt, token = "word"),
"Error: Token must be a supported type, or a function that takes a character vector as input\nDid you mean token = words?")
})

test_that("tokenizing by sentence works", {
orig <- data_frame(txt = c("I'm Nobody! Who are you?",
"Are you - Nobody - too?",
Expand Down Expand Up @@ -131,13 +138,13 @@ test_that("tokenizing with tidyeval works", {

test_that("tokenizing with to_lower = FALSE works", {
orig <- data_frame(txt = c("Because I could not stop for Death -",
"He kindly stopped for me -"))
"He kindly stopped for me -"))
d <- orig %>% unnest_tokens(word, txt, to_lower = FALSE)
expect_equal(nrow(d), 12)
expect_equal(ncol(d), 1)
expect_equal(d$word[1], "Because")
d2 <- orig %>% unnest_tokens(ngram, txt, token = "ngrams",
n = 2, to_lower = FALSE)
n = 2, to_lower = FALSE)
expect_equal(nrow(d2), 11)
expect_equal(ncol(d2), 1)
expect_equal(d2$ngram[1], "Because I")
@@ -239,8 +246,8 @@ test_that("Trying to tokenize a non-text format with words raises an error", {
test_that("unnest_tokens keeps top-level attributes", {
# first check data.frame
d <- data.frame(row = 1:2,
txt = c("Call me Ishmael.", "OK, I will."),
stringsAsFactors = FALSE)
txt = c("Call me Ishmael.", "OK, I will."),
stringsAsFactors = FALSE)

lst <- list(1, 2, 3, 4)
attr(d, "custom") <- lst
@@ -258,7 +265,7 @@ test_that("unnest_tokens keeps top-level attributes", {
test_that("Trying to tokenize a data.table works", {
skip_if_not_installed("data.table")
text <- data.table::data.table(txt = "Write till my fingers look like a bouquet of roses",
author = "Watsky")
author = "Watsky")
output <- unnest_tokens(text, word, txt)
expect_equal(ncol(output), 2)
expect_equal(nrow(output), 10)
