
Commit 338cc6f
Added ability to pass a (custom tokenizing) function to token. Also added a collapse argument that makes the choice whether to combine lines before tokenizing explicit.

See #10
Dave Robinson committed May 17, 2016
1 parent 4669d0a commit 338cc6f
Showing 4 changed files with 65 additions and 20 deletions.
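
In practice the two new arguments look like this (a minimal sketch based on the examples and tests in this commit; the data frame d is illustrative):

library(dplyr)
library(tidytext)

d <- data_frame(txt = c("Are you - Nobody - too?",
                        "Don't tell! they'd advertise - you know!"))

# token can now be a custom tokenizing function; extra arguments such as
# pattern are passed through to it
d %>%
  unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")

# collapse = TRUE joins the lines with newlines before tokenizing, so a
# token can span the original line breaks
d %>%
  unnest_tokens(chunk, txt, token = stringr::str_split, pattern = " - ",
                collapse = TRUE)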
1 change: 1 addition & 0 deletions NEWS.md
@@ -4,6 +4,7 @@
 * Add codecov
 * Add tidiers for LDA objects from topicmodels
 * Fixed a bug when tidying by line/sentence/paragraph/regex and there are multiple non-text columns
+* Added ability to pass a (custom tokenizing) function to token. Also added a collapse argument that makes the choice whether to combine lines before tokenizing explicit.
 
 # tidytext 0.1.0
 
48 changes: 33 additions & 15 deletions R/unnest_tokens.R
@@ -3,14 +3,18 @@
 #' @param tbl Data frame
 #' @param output_col Output column to be created
 #' @param input_col Input column that gets split
-#' @param token Unit for tokenizing. Options are "characters", "words",
-#' "ngrams", "skip_ngrams", "sentences", "lines", "paragraphs", and "regex".
-#' Default is "words".
+#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
+#' options are "words" (default), "characters", "ngrams", "skip_ngrams",
+#' "sentences", "lines", "paragraphs", and "regex". If a function, should take
+#' a character vector and return a list of character vectors of the same length.
 #' @param to_lower Whether to turn column lowercase
 #' @param drop Whether original input column should get dropped. Ignored
 #' if the original input and new output column have the same name.
 #' @param output Output column to be created as bare name
 #' @param input Input column that gets split as bare name
+#' @param collapse Whether to combine text with newlines first in case tokens
+#' (such as sentences or paragraphs) span multiple lines. If NULL, collapses
+#' when token method is "sentences", "lines", "paragraphs", or "regex"
 #' @param ... Extra arguments passed on to the tokenizer, such as \code{n} and
 #' \code{k} for "ngrams" and "skip_ngrams"
 #'
@@ -43,10 +47,29 @@
 #' d %>%
 #'   unnest_tokens(ngram, txt, token = "skip_ngrams", n = 4, k = 2)
 #'
+#' # custom function
+#' d %>%
+#'   unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")
+#'
 #' @export
 unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
-                           to_lower = TRUE, drop = TRUE, ...) {
-  if (token %in% c("sentences", "lines", "paragraphs", "regex")) {
+                           to_lower = TRUE, drop = TRUE, collapse = NULL, ...) {
+  if (is.function(token)) {
+    tokenfunc <- token
+  } else {
+    if (is.null(collapse) && token %in% c("sentences", "lines", "paragraphs", "regex")) {
+      collapse <- TRUE
+    }
+
+    tf <- get(paste0("tokenize_", token))
+    if (token %in% c("characters", "words")) {
+      tokenfunc <- function(col, ...) tf(col, lowercase = FALSE, ...)
+    } else {
+      tokenfunc <- tf
+    }
+  }
+
+  if (!is.null(collapse) && collapse) {
     exps <- list(substitute(stringr::str_c(colname, collapse = "\n"),
                             list(colname = as.name(input_col))))
     names(exps) <- input_col
@@ -56,14 +79,7 @@ unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
   }
 
   col <- tbl[[input_col]]
-
-  token <- paste0("tokenize_", token)
-  tokenfunc <- get(token)
-  if (token == "tokenize_characters" || token == "tokenize_words") {
-    tbl[[output_col]] <- tokenfunc(col, lowercase = FALSE, ...)
-  } else { # mash the whole character string together here for other tokenizer functions
-    tbl[[output_col]] <- tokenfunc(col, ...)
-  }
+  tbl[[output_col]] <- tokenfunc(col, ...)
 
   if (drop && input_col != output_col) {
     tbl[[input_col]] <- NULL
@@ -84,10 +100,12 @@ unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
 #' @rdname unnest_tokens
 #' @export
 unnest_tokens <- function(tbl, output, input, token = "words",
-                          to_lower = TRUE, drop = TRUE, ...) {
+                          to_lower = TRUE, drop = TRUE,
+                          collapse = NULL, ...) {
   output_col <- col_name(substitute(output))
   input_col <- col_name(substitute(input))
 
   unnest_tokens_(tbl, output_col, input_col, token = token,
-                 to_lower = to_lower, drop = drop, ...)
+                 to_lower = to_lower, drop = drop,
+                 collapse = collapse, ...)
 }
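
As a reading aid (not part of the diff): the contract for a function passed to token, and what the collapse branch does before tokenizing. tokenize_on_dashes is a hypothetical name.

library(stringr)

# a function passed to token must take a character vector and return a
# list of character vectors of the same length (hypothetical example)
tokenize_on_dashes <- function(x) str_split(x, pattern = " - ")
tokenize_on_dashes(c("Are you - Nobody - too?", "you know!"))
#> [[1]]
#> [1] "Are you" "Nobody"  "too?"
#>
#> [[2]]
#> [1] "you know!"

# the collapse branch joins the input column into one string with newlines
# (via stringr::str_c) before handing it to the tokenizer
str_c(c("Then there's a pair of us!", "Don't tell!"), collapse = "\n")
#> [1] "Then there's a pair of us!\nDon't tell!"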
19 changes: 14 additions & 5 deletions man/unnest_tokens.Rd

Some generated files are not rendered by default.

17 changes: 17 additions & 0 deletions tests/testthat/test-unnest-tokens.R
@@ -69,3 +69,20 @@ test_that("tokenizing by ngram and skip ngram works", {

 })
 
+test_that("tokenizing with a custom function works", {
+  orig <- data_frame(txt = c("I'm Nobody! Who are you?",
+                             "Are you - Nobody - too?",
+                             "Then there’s a pair of us!",
+                             "Don’t tell! they’d advertise - you know!"))
+  d <- orig %>%
+    unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ")
+  expect_equal(nrow(d), 7)
+  expect_equal(d$unit[3], "nobody")
+  expect_equal(d$unit[4], "too?")
+
+  d2 <- orig %>%
+    unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ", collapse = TRUE)
+  expect_equal(nrow(d2), 4)
+  expect_equal(d2$unit[2], "nobody")
+  expect_equal(d2$unit[4], "you know!")
+})
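
To trace the expected row counts above (a sketch, reusing orig from the test): without collapse each of the four lines is split on " - " independently, giving 1 + 3 + 1 + 2 = 7 rows; with collapse = TRUE the poem is first joined with newlines, so the split yields 4 pieces.

# pieces per line without collapse: sums to the 7 rows expected above
lengths(stringr::str_split(orig$txt, " - "))
#> [1] 1 3 1 2

# pieces after joining with newlines: the 4 rows expected above
# (piece 2 is "Nobody" and piece 4 is "you know!", before lowercasing)
length(stringr::str_split(stringr::str_c(orig$txt, collapse = "\n"), " - ")[[1]])
#> [1] 4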
