diff --git a/.Rbuildignore b/.Rbuildignore index 91114bf..ced8b8a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,2 +1,4 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +^README.Rmd$ + diff --git a/DESCRIPTION b/DESCRIPTION index 90cd704..7ca3531 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,16 +1,23 @@ Package: LIWCalike Type: Package Title: Text analysis similar to the Linguistic Inquiry and Word Count (LIWC) -Version: 0.1.0 +Version: 0.1.1 Date: 2016-04-22 Author: Kenneth Benoit Maintainer: Kenneth Benoit -Description: Built on the quanteda package for text analysis, LIWCalikes provides a simple interface to the analysis of text by counting words and other textual features, including the application of a dictionary to produce a tabular report of percentages. This provides similar functionality to the LIWC stand-alone software. The user must a dictionary, which can include one of the custom LIWC dictionaries if these have been purchased from http://liwc.wpengine.com. +Description: Built on the quanteda package for text analysis, LIWCalikes + provides a simple interface to the analysis of text by counting words and other + textual features, including the application of a dictionary to produce a tabular + report of percentages. This provides similar functionality to the LIWC stand- + alone software. The user must a dictionary, which can include one of the custom + LIWC dictionaries if these have been purchased from http://liwc.wpengine.com. 
License: GPL-3 LazyData: TRUE -Depends: quanteda (>= 0.9.5.20) -Imports: stringi +Depends: + quanteda (>= 0.9.5-20) +Imports: + stringi URL: http://github.com/kbenoit/LIWCalike Encoding: UTF-8 BugReports: https://github.com/kbenoit/LIWCalike/issues -VignetteBuilder: knitr +RoxygenNote: 5.0.1 diff --git a/NAMESPACE b/NAMESPACE index d75f824..88e32cf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1 +1,6 @@ -exportPattern("^[[:alpha:]]+") +# Generated by roxygen2: do not edit by hand + +S3method(liwcalike,character) +S3method(liwcalike,corpus) +export(liwcalike) +import(quanteda) diff --git a/R/data.R b/R/data.R new file mode 100644 index 0000000..e184d34 --- /dev/null +++ b/R/data.R @@ -0,0 +1,13 @@ + +#' @name testphrases +#' @docType data +#' @title sample short documents for testing +#' @description Some sample short documents in plain text format for testing +#' with \code{\link{liwcalike}}. +#' @examples +#' liwcalike(testphrases) +#' +NULL + +# save(testphrases, file = "data/testphrases.RData") +# writeLines(testphrases, "inst/extdata/testphrases.txt") diff --git a/R/liwc.R b/R/liwcalike.R similarity index 67% rename from R/liwc.R rename to R/liwcalike.R index 0bd0094..f3f894d 100644 --- a/R/liwc.R +++ b/R/liwcalike.R @@ -8,6 +8,8 @@ #' vector for analysis #' @param dictionary a \pkg{quanteda} \link[quanteda]{dictionary} object #' supplied for analysis +#' @param toLower convert to common (lower) case before tokenizing +#' @param verbose if \code{TRUE} print status messages during processing #' @param ... options passed to \code{\link[quanteda]{tokenize}} offering #' finer-grained control over how "words" are defined #' @return a data.frame object containing the analytic results, one row per @@ -20,25 +22,41 @@ #' texts into smaller units based on user-supplied tags, sentence, or #' paragraph boundaries. 
#' @examples +#' liwcalike(testphrases) +#' +#' # examples for comparison +#' txt <- c("The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(.") +#' myDict <- dictionary(list(people = c("lawyer", "boyfriend"), +#' colorFixed = "red", +#' colorGlob = "red*", +#' mwe = "out of")) +#' liwcalike(txt, myDict, what = "word") +#' liwcalike(txt, myDict, what = "fasterword") +#' (toks <- tokenize(txt, what = "fasterword", removeHyphens = TRUE)) +#' length(toks[[1]]) +#' # LIWC says 12 words +#' +#' \dontrun{# works with LIWC 2015 dictionary too #' liwcDict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic", #' format = "LIWC") -#' inaugLIWCanalysis <- liwc(inaugTexts, liwcDict) -#' +#' inaugLIWCanalysis <- liwcalike(inaugTexts, liwcDict) +#' } #' @export -liwc <- function(x, ...) { - UseMethod("liwc") +#' @import quanteda +liwcalike <- function(x, ...) { + UseMethod("liwcalike") } -#' @rdname liwc +#' @rdname liwcalike #' @export -liwc.corpus <- function(x, ...) { - liwc(texts(x), ...) +liwcalike.corpus <- function(x, ...) { + liwcalike(texts(x), ...) } -#' @rdname liwc +#' @rdname liwcalike #' @export -liwc.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, ...) { +liwcalike.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, ...) { ## initialize results data.frame ## similar to "Filename" and Segment @@ -48,7 +66,7 @@ liwc.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, stringsAsFactors = FALSE) ## get readability before lowercasing - WPS <- readability(x, "meanSentenceLength", ...) + WPS <- readability(x, "meanSentenceLength") #, ...) ## lower case the texts if required if (toLower) x <- toLower(x) @@ -62,7 +80,7 @@ liwc.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, } ## tokenize and form the dfm - toks <- tokenize(x, ...) + toks <- tokenize(x, removePunct = TRUE, removeHyphens = TRUE, ...) 
dfmAll <- dfm(toks, verbose = FALSE) if (!is.null(dictionary)) dfmDict <- dfm(toks, verbose = FALSE, dictionary = dictionary) @@ -86,7 +104,8 @@ liwc.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, ## add the dictionary counts, transformed to percentages of total words if (!is.null(dictionary)) result <- cbind(result, - as.data.frame(dfmDict / rep(result[["WC"]], each = nfeature(dfmDict)) * 100)) + quanteda::as.data.frame(dfmDict / rep(result[["WC"]], + each = nfeature(dfmDict))) * 100) ## add punctuation counts # AllPunc @@ -102,9 +121,12 @@ liwc.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, # Parenth -- note this is specified as "pairs of parentheses" # OtherP + # format the result + result[, which(names(result)=="Sixltr") : ncol(result)] <- + format(result[, which(names(result)=="Sixltr") : ncol(result)], + digits = 4, trim = TRUE) + result } -# the word counts - diff --git a/README.Rmd b/README.Rmd new file mode 100644 index 0000000..566ff84 --- /dev/null +++ b/README.Rmd @@ -0,0 +1,83 @@ +--- +output: + md_document: + variant: markdown_github +--- + +```{r, echo = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + fig.path = "README-" +) +``` + +**Master branch** [![Build Status](https://travis-ci.org/kbenoit/LIWCalike.svg?branch=master)] +[![codecov.io](https://codecov.io/github/kbenoit/LIWCalike/coverage.svg?branch=master)](https://codecov.io/github/kbenoit/LIWCalike/coverage.svg?branch=master) + + +## LIWCalike: an R implementation of the Linguistic Inquiry and Word Count + +Built on the quanteda package for text analysis, LIWCalikes provides a simple interface to the analysis of text by counting words and other textual features, including the application of a dictionary to produce a tabular report of percentages. This provides similar functionality to the LIWC stand-alone software. 
The user must supply a dictionary, which can include one of the custom LIWC dictionaries if these have been purchased from http://liwc.wpengine.com, or any other dictionary supplied by the user. + +### Differences from the LIWC standalone software + +This package is designed for R users and those wishing to build functionality by extending the [**quanteda**](https://github.com/kbenoit/quanteda) package for text analysis. If you prefer to have a complete, stand-alone user interface, then you should purchase and use the [LIWC standalone software](http://liwc.wpengine.com). This has several advantages: + +* LIWC allows direct importing of files, including binary (Word, pdf, etc) formats. To use + **LIWCalike**, you will need to import these into the **quanteda** package first. + **LIWCalike** also works fine with simple character vectors, if you prefer to use + standard R methods to create your input object (e.g. `readLines()`, `read.csv()`, etc.) + +* LIWC provides direct outputs in the form of csv, Excel files, etc. By contrast, **LIWCalike** returns a `data.frame`, which you have to export yourself (e.g. using `write.csv()`.) + +* LIWC provides easy segmentation, through a GUI. By contrast, with **LIWCalike** you will + have to segment the texts yourself. (**quanteda** provides easy ways to do this using + `segment()` and `changeunits()`.) + +* LIWC color codes the dictionary value matches in your texts and displays these in a nice graphical window. + + +## Using dictionaries with LIWCalike + +No dictionaries are supplied with **LIWCalike**; it is up to you to supply these. With the **quanteda** functions for creating or importing dictionaries, however, this is quite easy. + +With the LIWC 2007, external dictionaries were distributed with the software that could be used in the format read by Provalis Research's [*Wordstat*](http://provalisresearch.com/products/content-analysis-software/). 
Because I purchased a license for this product, I have that file and can use it with **LIWCalike**. + +Using it is quite straightforward: + +```{r} +require(LIWCalike) + +# read in the dictionary +liwc2007dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2007.cat", + format = "wordstat") +tail(liwc2007dict, 1) + +# our test data +testphrases + +# call LIWCalike +output <- liwcalike(testphrases, liwc2007dict) + +# view some results +output[, c(1:7, ncol(output)-2)] +``` + + +## How to Install + +``` +devtools::install_github("kbenoit/quanteda") +devtools::install_github("kbenoit/LIWCalike") +``` + +You need to have installed the **quanteda** package of at least version 0.9.5-20 for this +to work, since that update implemented multi-word dictionary values. + + +## Comments and feedback + +I welcome your comments and feedback. Please file issues on the issues page, and/or send me comments at kbenoit@lse.ac.uk. + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..0ae9b5a --- /dev/null +++ b/README.md @@ -0,0 +1,104 @@ +**Master branch** \[![Build Status](https://travis-ci.org/kbenoit/LIWCalike.svg?branch=master)\]\[![codecov.io](https://codecov.io/github/kbenoit/LIWCalike/coverage.svg?branch=master)\]() + +LIWCalike: an R implementation of the Linguistic Inquiry and Word Count +----------------------------------------------------------------------- + +Built on the quanteda package for text analysis, LIWCalikes provides a simple interface to the analysis of text by counting words and other textual features, including the application of a dictionary to produce a tabular report of percentages. This provides similar functionality to the LIWC stand-alone software. The user must supply a dictionary, which can include one of the custom LIWC dictionaries if these have been purchased from http://liwc.wpengine.com, or any other dictionary supplied by the user. 
+ +### Differences from the LIWC standalone software + +This package is designed for R users and those wishing to build functionality by extending the [**quanteda**](https://github.com/kbenoit/quanteda) package for text analysis. If you prefer to have a complete, stand-alone user interface, then you should purchase and use the [LIWC standalone software](http://liwc.wpengine.com). This has several advantages: + +- LIWC allows direct importing of files, including binary (Word, pdf, etc) formats. To use **LIWCalike**, you will need to import these into the **quanteda** package first. + **LIWCalike** also works fine with simple character vectors, if you prefer to use standard R methods to create your input object (e.g. `readLines()`, `read.csv()`, etc.) + +- LIWC provides direct outputs in the form of csv, Excel files, etc. By contrast, **LIWCalike** returns a `data.frame`, which you have to export yourself (e.g. using `write.csv()`.) + +- LIWC provides easy segmentation, through a GUI. By contrast, with **LIWCalike** you will have to segment the texts yourself. (**quanteda** provides easy ways to do this using `segment()` and `changeunits()`.) + +- LIWC color codes the dictionary value matches in your texts and displays these in a nice graphical window. + +Using dictionaries with LIWCalike +--------------------------------- + +No dictionaries are supplied with **LIWCalike**; it is up to you to supply these. With the **quanteda** functions for creating or importing dictionaries, however, this is quite easy. + +With the LIWC 2007, external dictionaries were distributed with the software that could be used in the format read by Provalis Research's [*Wordstat*](http://provalisresearch.com/products/content-analysis-software/). Because I purchased a license for this product, I have that file and can use it with **LIWCalike**. 
+ +Using it is quite straightforward: + +``` r +require(LIWCalike) +#> Loading required package: LIWCalike +#> Loading required package: quanteda +#> quanteda version 0.9.5.20 +#> +#> Attaching package: 'quanteda' +#> The following object is masked from 'package:base': +#> +#> sample + +# read in the dictionary +liwc2007dict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2007.cat", + format = "wordstat") +#> Warning in strsplit(w, "\\("): input string 1 is invalid in this locale +tail(liwc2007dict, 1) +#> $`SPOKEN CATEGORIES.FILLERS` +#> [1] "blah" NA "idontknow" "imean" +#> [5] "ohwell" "oranything*" "orsomething*" "orwhatever*" +#> [9] "rr*" "yakn*" "ykn*" "youknow*" + +# our test data +testphrases +#> [1] "Test sentence for LIWCalike. Second sentence." +#> [2] "Each row is a document." +#> [3] "Comma, period." +#> [4] "The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(." +#> [5] "LOL :-)." +#> [6] "(Parentheses) for $100." +#> [7] "Say \"what\" again!!" +#> [8] "Why are we here?" +#> [9] "Other punctation: §; ±." +#> [10] "Sentence one. Sentence two! 
:-)" + +# call LIWCalike +output <- liwcalike(testphrases, liwc2007dict) + +# view some results +output[, c(1:7, ncol(output)-2)] +#> docname Segment WC WPS Sixltr Dic +#> text1 text1 1 6 3 50.00 120.00 +#> text2 text2 2 5 5 20.00 50.00 +#> text3 text3 3 2 2 0.00 100.00 +#> text4 text4 4 12 12 16.67 40.00 +#> text5 text5 5 1 1 0.00 33.33 +#> text6 text6 6 3 3 33.33 75.00 +#> text7 text7 7 3 3 0.00 30.00 +#> text8 text8 8 4 4 0.00 26.67 +#> text9 text9 9 2 2 50.00 66.67 +#> text10 text10 10 4 2 50.00 100.00 +#> LINGUISTIC PROCESSES.FUNCTION WORDS SPOKEN CATEGORIES.ASSENT +#> text1 33.33 0 +#> text2 50.00 0 +#> text3 0.00 0 +#> text4 66.67 0 +#> text5 0.00 25 +#> text6 16.67 0 +#> text7 33.33 0 +#> text8 50.00 0 +#> text9 16.67 0 +#> text10 33.33 0 +``` + +How to Install +-------------- + + devtools::install_github("kbenoit/quanteda") + devtools::install_github("kbenoit/LIWCalike") + +You need to have installed the **quanteda** package of at least version 0.9.5-20 for this to work, since that update implemented multi-word dictionary values. + +Comments and feedback +--------------------- + +I welcome your comments and feedback. Please file issues on the issues page, and/or send me comments at kbenoit@lse.ac.uk. diff --git a/data/testphrases.RData b/data/testphrases.RData new file mode 100644 index 0000000..94bf6de Binary files /dev/null and b/data/testphrases.RData differ diff --git a/inst/extdata/testphrases.txt b/inst/extdata/testphrases.txt new file mode 100644 index 0000000..0b653af --- /dev/null +++ b/inst/extdata/testphrases.txt @@ -0,0 +1,10 @@ +Test sentence for LIWCalike. Second sentence. +Each row is a document. +Comma, period. +The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(. +LOL :-). +(Parentheses) for $100. +Say "what" again!! +Why are we here? +Other punctation: ^; %, &. +Sentence one. Sentence two! 
:-) diff --git a/man/hello.Rd b/man/hello.Rd deleted file mode 100644 index 0fa7c4b..0000000 --- a/man/hello.Rd +++ /dev/null @@ -1,12 +0,0 @@ -\name{hello} -\alias{hello} -\title{Hello, World!} -\usage{ -hello() -} -\description{ -Prints 'Hello, world!'. -} -\examples{ -hello() -} diff --git a/man/liwcalike.Rd b/man/liwcalike.Rd new file mode 100644 index 0000000..1d34253 --- /dev/null +++ b/man/liwcalike.Rd @@ -0,0 +1,70 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/liwcalike.R +\name{liwcalike} +\alias{liwcalike} +\alias{liwcalike.character} +\alias{liwcalike.corpus} +\title{analyze text in a LIWC-alike fashion} +\usage{ +liwcalike(x, ...) + +\method{liwcalike}{corpus}(x, ...) + +\method{liwcalike}{character}(x, dictionary = NULL, toLower = TRUE, + verbose = TRUE, ...) +} +\arguments{ +\item{x}{input object, a \pkg{quanteda} \link[quanteda]{corpus} or character +vector for analysis} + +\item{...}{options passed to \code{\link[quanteda]{tokenize}} offering +finer-grained control over how "words" are defined} + +\item{dictionary}{a \pkg{quanteda} \link[quanteda]{dictionary} object +supplied for analysis} + +\item{toLower}{convert to common (lower) case before tokenizing} + +\item{verbose}{if \code{TRUE} print status messages during processing} +} +\value{ +a data.frame object containing the analytic results, one row per + document supplied +} +\description{ +Analyze a set of texts to produce a dataset of percentages and other +quantities describing the text, similar to the functionality supplied by the +Linguistic Inquiry and Word Count standalone software distributed at +\url{http://liwc.wpengine.com}. +} +\section{Segmentation}{ + The LIWC standalone software has many options for + segmenting the text. 
While this function does not supply segmentation + options, you can easily achieve the same effect by converting the input + object into a corpus (if it is not already a corpus) and using + \link[quanteda]{changeunits} or \link[quanteda]{segment} to split the input + texts into smaller units based on user-supplied tags, sentence, or + paragraph boundaries. +} +\examples{ +liwcalike(testphrases) + +# examples for comparison +txt <- c("The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(.") +myDict <- dictionary(list(people = c("lawyer", "boyfriend"), + colorFixed = "red", + colorGlob = "red*", + mwe = "out of")) +liwcalike(txt, myDict, what = "word") +liwcalike(txt, myDict, what = "fasterword") +(toks <- tokenize(txt, what = "fasterword", removeHyphens = TRUE)) +length(toks[[1]]) +# LIWC says 12 words + +\dontrun{# works with LIWC 2015 dictionary too +liwcDict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic", + format = "LIWC") +inaugLIWCanalysis <- liwcalike(inaugTexts, liwcDict) +} +} + diff --git a/man/testphrases.Rd b/man/testphrases.Rd new file mode 100644 index 0000000..25a4dc6 --- /dev/null +++ b/man/testphrases.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data.R +\docType{data} +\name{testphrases} +\alias{testphrases} +\title{sample short documents for testing} +\description{ +Some sample short documents in plain text format for testing + with \code{\link{liwcalike}}. +} +\examples{ +liwcalike(testphrases) + +} +