
Commit 338cc6f
Added ability to pass a (custom tokenizing) function to token. Also added a collapse argument that makes the choice whether to combine lines before tokenizing explicit.

See #10
Dave Robinson committed May 17, 2016
1 parent 4669d0a commit 338cc6f
Showing 4 changed files with 65 additions and 20 deletions.
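
In practice the two new arguments look like this (a minimal sketch based on the examples and tests in this commit; the data frame d is illustrative):

library(dplyr)
library(tidytext)

d <- data_frame(txt = c("Are you - Nobody - too?",
                        "Don't tell! they'd advertise - you know!"))

# token can now be a custom tokenizing function; extra arguments such as
# pattern are passed through to it
d %>%
  unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")

# collapse = TRUE joins the lines with newlines before tokenizing, so a
# token can span the original line breaks
d %>%
  unnest_tokens(chunk, txt, token = stringr::str_split, pattern = " - ",
                collapse = TRUE)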
1 change: 1 addition & 0 deletions NEWS.md
@@ -4,6 +4,7 @@
 * Add codecov
 * Add tidiers for LDA objects from topicmodels
 * Fixed a bug when tidying by line/sentence/paragraph/regex and there are multiple non-text columns
+* Added ability to pass a (custom tokenizing) function to token. Also added a collapse argument that makes the choice whether to combine lines before tokenizing explicit.
 
 # tidytext 0.1.0
 
48 changes: 33 additions & 15 deletions R/unnest_tokens.R
@@ -3,14 +3,18 @@
 #' @param tbl Data frame
 #' @param output_col Output column to be created
 #' @param input_col Input column that gets split
-#' @param token Unit for tokenizing. Options are "characters", "words",
-#' "ngrams", "skip_ngrams", "sentences", "lines", "paragraphs", and "regex".
-#' Default is "words".
+#' @param token Unit for tokenizing, or a custom tokenizing function. Built-in
+#' options are "words" (default), "characters", "ngrams", "skip_ngrams",
+#' "sentences", "lines", "paragraphs", and "regex". If a function, should take
+#' a character vector and return a list of character vectors of the same length.
 #' @param to_lower Whether to turn column lowercase
 #' @param drop Whether original input column should get dropped. Ignored
 #' if the original input and new output column have the same name.
 #' @param output Output column to be created as bare name
 #' @param input Input column that gets split as bare name
+#' @param collapse Whether to combine text with newlines first in case tokens
+#' (such as sentences or paragraphs) span multiple lines. If NULL, collapses
+#' when token method is "sentences", "lines", "paragraphs", or "regex"
 #' @param ... Extra arguments passed on to the tokenizer, such as \code{n} and
 #' \code{k} for "ngrams" and "skip_ngrams"
 #'
@@ -43,10 +47,29 @@
 #' d %>%
 #'   unnest_tokens(ngram, txt, token = "skip_ngrams", n = 4, k = 2)
 #'
+#' # custom function
+#' d %>%
+#'   unnest_tokens(word, txt, token = stringr::str_split, pattern = " ")
+#'
 #' @export
 unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
-                           to_lower = TRUE, drop = TRUE, ...) {
-  if (token %in% c("sentences", "lines", "paragraphs", "regex")) {
+                           to_lower = TRUE, drop = TRUE, collapse = NULL, ...) {
+  if (is.function(token)) {
+    tokenfunc <- token
+  } else {
+    if (is.null(collapse) && token %in% c("sentences", "lines", "paragraphs", "regex")) {
+      collapse <- TRUE
+    }
+
+    tf <- get(paste0("tokenize_", token))
+    if (token %in% c("characters", "words")) {
+      tokenfunc <- function(col, ...) tf(col, lowercase = FALSE, ...)
+    } else {
+      tokenfunc <- tf
+    }
+  }
+
+  if (!is.null(collapse) && collapse) {
     exps <- list(substitute(stringr::str_c(colname, collapse = "\n"),
                             list(colname = as.name(input_col))))
     names(exps) <- input_col
@@ -56,14 +79,7 @@ unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
   }
 
   col <- tbl[[input_col]]
-
-  token <- paste0("tokenize_", token)
-  tokenfunc <- get(token)
-  if (token == "tokenize_characters" || token == "tokenize_words") {
-    tbl[[output_col]] <- tokenfunc(col, lowercase = FALSE, ...)
-  } else { # mash the whole character string together here for other tokenizer functions
-    tbl[[output_col]] <- tokenfunc(col, ...)
-  }
+  tbl[[output_col]] <- tokenfunc(col, ...)
 
   if (drop && input_col != output_col) {
     tbl[[input_col]] <- NULL
@@ -84,10 +100,12 @@ unnest_tokens_ <- function(tbl, output_col, input_col, token = "words",
 #' @rdname unnest_tokens
 #' @export
 unnest_tokens <- function(tbl, output, input, token = "words",
-                          to_lower = TRUE, drop = TRUE, ...) {
+                          to_lower = TRUE, drop = TRUE,
+                          collapse = NULL, ...) {
   output_col <- col_name(substitute(output))
   input_col <- col_name(substitute(input))
 
   unnest_tokens_(tbl, output_col, input_col, token = token,
-                 to_lower = to_lower, drop = drop, ...)
+                 to_lower = to_lower, drop = drop,
+                 collapse = collapse, ...)
 }
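
As a reading aid (not part of the diff): the contract for a function passed to token, and what the collapse branch does before tokenizing. tokenize_on_dashes is a hypothetical name.

library(stringr)

# a function passed to token must take a character vector and return a
# list of character vectors of the same length (hypothetical example)
tokenize_on_dashes <- function(x) str_split(x, pattern = " - ")
tokenize_on_dashes(c("Are you - Nobody - too?", "you know!"))
#> [[1]]
#> [1] "Are you" "Nobody"  "too?"
#>
#> [[2]]
#> [1] "you know!"

# the collapse branch joins the input column into one string with newlines
# (via stringr::str_c) before handing it to the tokenizer
str_c(c("Then there's a pair of us!", "Don't tell!"), collapse = "\n")
#> [1] "Then there's a pair of us!\nDon't tell!"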
19 changes: 14 additions & 5 deletions man/unnest_tokens.Rd

Some generated files are not rendered by default.

17 changes: 17 additions & 0 deletions tests/testthat/test-unnest-tokens.R
@@ -69,3 +69,20 @@ test_that("tokenizing by ngram and skip ngram works", {

 })
 
+test_that("tokenizing with a custom function works", {
+  orig <- data_frame(txt = c("I'm Nobody! Who are you?",
+                             "Are you - Nobody - too?",
+                             "Then there’s a pair of us!",
+                             "Don’t tell! they’d advertise - you know!"))
+  d <- orig %>%
+    unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ")
+  expect_equal(nrow(d), 7)
+  expect_equal(d$unit[3], "nobody")
+  expect_equal(d$unit[4], "too?")
+
+  d2 <- orig %>%
+    unnest_tokens(unit, txt, token = stringr::str_split, pattern = " - ", collapse = TRUE)
+  expect_equal(nrow(d2), 4)
+  expect_equal(d2$unit[2], "nobody")
+  expect_equal(d2$unit[4], "you know!")
+})
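
To trace the expected row counts above (a sketch, reusing orig from the test): without collapse each of the four lines is split on " - " independently, giving 1 + 3 + 1 + 2 = 7 rows; with collapse = TRUE the poem is first joined with newlines, so the split yields 4 pieces.

# pieces per line without collapse: sums to the 7 rows expected above
lengths(stringr::str_split(orig$txt, " - "))
#> [1] 1 3 1 2

# pieces after joining with newlines: the 4 rows expected above
# (piece 2 is "Nobody" and piece 4 is "you know!", before lowercasing)
length(stringr::str_split(stringr::str_c(orig$txt, collapse = "\n"), " - ")[[1]])
#> [1] 4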
