This repository has been archived by the owner on Apr 24, 2018. It is now read-only.
/
liwcalike.R
132 lines (113 loc) · 4.78 KB
/
liwcalike.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#' analyze text in a LIWC-alike fashion
#'
#' Analyze a set of texts to produce a dataset of percentages and other
#' quantities describing the text, similar to the functionality supplied by the
#' Linguistic Inquiry and Word Count standalone software distributed at
#' \url{http://liwc.wpengine.com}.
#' @param x input object, a \pkg{quanteda} \link[quanteda]{corpus} or character
#' vector for analysis
#' @param dictionary a \pkg{quanteda} \link[quanteda]{dictionary} object
#' supplied for analysis
#' @param toLower convert to common (lower) case before tokenizing
#' @param verbose if \code{TRUE} print status messages during processing
#' @param ... options passed to \code{\link[quanteda]{tokenize}} offering
#' finer-grained control over how "words" are defined
#' @return a data.frame object containing the analytic results, one row per
#' document supplied
#' @section Segmentation: The LIWC standalone software has many options for
#' segmenting the text. While this function does not supply segmentation
#' options, you can easily achieve the same effect by converting the input
#' object into a corpus (if it is not already a corpus) and using
#' \link[quanteda]{changeunits} or \link[quanteda]{segment} to split the input
#' texts into smaller units based on user-supplied tags, sentence, or
#' paragraph boundaries.
#' @examples
#' liwcalike(testphrases)
#'
#' # examples for comparison
#' txt <- c("The red-shirted lawyer gave her ex-boyfriend $300 out of pity :(.")
#' myDict <- dictionary(list(people = c("lawyer", "boyfriend"),
#' colorFixed = "red",
#' colorGlob = "red*",
#' mwe = "out of"))
#' liwcalike(txt, myDict, what = "word")
#' liwcalike(txt, myDict, what = "fasterword")
#' (toks <- tokenize(txt, what = "fasterword", removeHyphens = TRUE))
#' length(toks[[1]])
#' # LIWC says 12 words
#'
#' \dontrun{# works with LIWC 2015 dictionary too
#' liwcDict <- dictionary(file = "~/Dropbox/QUANTESS/dictionaries/LIWC/LIWC2015_English_Flat.dic",
#' format = "LIWC")
#' inaugLIWCanalysis <- liwcalike(inaugTexts, liwcDict)
#' }
#' @export
#' @import quanteda
# S3 generic: dispatches to liwcalike.corpus or liwcalike.character
# depending on the class of x
liwcalike <- function(x, ...) {
UseMethod("liwcalike")
}
#' @rdname liwcalike
#' @export
liwcalike.corpus <- function(x, ...) {
    # pull the document texts out of the corpus, then delegate all of the
    # real work to the character method
    txts <- texts(x)
    liwcalike(txts, ...)
}
#' @rdname liwcalike
#' @export
liwcalike.character <- function(x, dictionary = NULL, toLower = TRUE, verbose = TRUE, ...) {
    # NOTE(review): `verbose` is currently accepted but never used; kept in the
    # signature for backward compatibility with existing callers.

    ## initialize results data.frame
    ## similar to "Filename" and Segment
    ## seq_along() is safe for zero-length input, unlike 1:length(x)
    result <-
        data.frame(docname = if (is.null(names(x))) paste0("text", seq_along(x)) else names(x),
                   Segment = seq_along(x),
                   stringsAsFactors = FALSE)

    ## get readability before lowercasing (sentence detection needs case)
    WPS <- readability(x, "meanSentenceLength") #, ...)

    ## lower case the texts if required
    if (toLower) x <- toLower(x)

    ## if a dictionary is supplied, apply it to the dfm
    # first pre-process the text for multi-word dictionary values, joining
    # phrases (e.g. "out of") into single tokens so they can be matched
    if (!is.null(dictionary)) {
        x <- phrasetotoken(x, dictionary, case_insensitive = toLower)
        # normalize the dictionary's phrase concatenator to "_" so it agrees
        # with the token form produced by phrasetotoken()
        if (dictionary@concatenator != "_")
            dictionary <- lapply(dictionary, stringi::stri_replace_all_fixed,
                                 dictionary@concatenator, "_")
    }

    ## tokenize and form the dfm
    toks <- tokenize(x, removePunct = TRUE, removeHyphens = TRUE, ...)
    dfmAll <- dfm(toks, verbose = FALSE)
    if (!is.null(dictionary))
        dfmDict <- dfm(toks, verbose = FALSE, dictionary = dictionary)
    # or applyDictionary() to dfm

    ## WC (word count per document)
    result[["WC"]] <- ntoken(toks)
    # maybe this should be ntoken(dfmAll) - does LIWC count punctuation??

    ## no implementation for: Analytic Clout Authentic Tone

    ## WPS (mean words per sentence)
    result[["WPS"]] <- WPS

    ## Sixltr (percentage of tokens longer than six letters)
    result[["Sixltr"]] <- sapply(toks, function(y) sum(stringi::stri_length(y) > 6)) /
        result[["WC"]] * 100

    ## Dic (percentage of words matched by the dictionary)
    ## BUGFIX: dictionary matches must be the numerator and total tokens the
    ## denominator; the previous version had the ratio inverted
    result[["Dic"]] <- if (!is.null(dictionary)) ntoken(dfmDict) / ntoken(dfmAll) * 100 else NA

    ## add the dictionary counts, transformed to percentages of total words
    if (!is.null(dictionary))
        result <- cbind(result,
                        quanteda::as.data.frame(dfmDict / rep(result[["WC"]],
                                                              each = nfeature(dfmDict))) * 100)

    ## add punctuation counts
    # AllPunc
    # Period
    # Comma
    # Colon
    # SemiC
    # QMark
    # Exclam
    # Dash
    # Quote
    # Apostro
    # Parenth -- note this is specified as "pairs of parentheses"
    # OtherP

    # format the percentage columns (Sixltr through the last column) to four
    # significant digits for display
    result[, which(names(result) == "Sixltr") : ncol(result)] <-
        format(result[, which(names(result) == "Sixltr") : ncol(result)],
               digits = 4, trim = TRUE)

    result
}