Skip to content

Commit

Permalink
Update MFD to v2.0
Browse files Browse the repository at this point in the history
- Closes #6
  • Loading branch information
kbenoit committed Oct 20, 2018
1 parent da4f4e8 commit a310906
Show file tree
Hide file tree
Showing 9 changed files with 683 additions and 8 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Package: quanteda.dictionaries
Type: Package
Title: Text Analysis Dictionaries
Version: 0.2
Version: 0.21
Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
c("aut", "cre", "cph")),
person("Stefan", "Müller", email = "mullers@tcd.ie", role = "aut"))
Expand All @@ -26,4 +26,4 @@ Remotes:
Encoding: UTF-8
LazyData: true
VignetteBuilder: knitr
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
1 change: 1 addition & 0 deletions NEWS.md
Expand Up @@ -3,6 +3,7 @@
* Added a `NEWS.md` file to track changes to the package.
* Fixed bug #14: word count failing when dictionaries are used.
* Fixed bug #16: `liwcalike(x, ...)` not passing arguments to `tokens()`.
* (v0.21) Updated the Moral Foundations Dictionary to 2.0.



Expand Down
12 changes: 10 additions & 2 deletions R/data.R
Expand Up @@ -144,13 +144,21 @@

#' Moral Foundations Dictionary
#'
#' A \pkg{quanteda} \link[quanteda]{dictionary} object containing
#' @description A \pkg{quanteda} \link[quanteda]{dictionary} object containing
#' the Moral Foundations Dictionary, a publicly available dictionary with
#' information on the proportions of virtue and vice words for each foundation.
#' The categories are harm (vice/virtue), fairness (vice/virtue), ingroup (vice/virtue),
#' authority (vice/virtue), purity (vice/virtue) and morality (general).
#' @source \url{http://moralfoundations.org/othermaterials}
#'
#' @description This is version 2.0 of the dictionary,
#' \href{http://www.jeremyfrimer.com/uploads/2/1/2/7/21278832/summary.pdf}{recommended}
#' over the first version of the MFD by its authors.
#' @source http://www.jeremyfrimer.com/research-downloads.html; a previous
#' version is available at \url{http://moralfoundations.org/othermaterials}
#' @references
#' Frimer, Jeremy et al. (2017). "Moral Foundations Dictionaries for
#' Linguistic Analyses, 2.0." University of Winnipeg manuscript.
#' \url{http://www.jeremyfrimer.com/uploads/2/1/2/7/21278832/summary.pdf}.
#'
#' Haidt, J., Graham, J., and Nosek, B.A. (2009). "Liberals and Conservatives
#' Rely on Different Sets of Moral Foundations." \emph{Journal of Personality and Social
Expand Down
Binary file modified data/data_dictionary_MFD.rda
Binary file not shown.
15 changes: 12 additions & 3 deletions man/data_dictionary_MFD.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion sources/MFD/create-data_dictionary_MFD.R
Expand Up @@ -2,6 +2,7 @@

library("quanteda")

data_dictionary_MFD <- dictionary(file = "sources/MFD/moral_foundations_dictionary.dic")
#data_dictionary_MFD <- dictionary(file = "sources/MFD/moral_foundations_dictionary.dic")
# Build the dictionary object from the MFD 2.0 file. The path is relative,
# so this presumably has to be run from the package root -- confirm.
data_dictionary_MFD <- dictionary(file = "sources/MFD/mfd2.0.dic")

# Store the object as package data, replacing any existing copy
# (overwrite = TRUE).
devtools::use_data(data_dictionary_MFD, overwrite = TRUE)
1 change: 1 addition & 0 deletions sources/MFD/mfd2.0.dic

Large diffs are not rendered by default.

113 changes: 113 additions & 0 deletions vignettes/quanteda.dictionaries_vignette.R
@@ -0,0 +1,113 @@
## ----echo = FALSE--------------------------------------------------------
# Global chunk options for the vignette: set `collapse` and the prefix used
# for printed output lines.
knitr::opts_chunk$set(comment = "##", collapse = TRUE)

## ----eval=TRUE, warning=FALSE, message=FALSE-----------------------------
library(quanteda)
library(quanteda.dictionaries)

## ------------------------------------------------------------------------
# Load the movie review corpus shipped with the quanteda.corpora package.
data("data_corpus_movies", package = "quanteda.corpora")

## ---- eval=FALSE---------------------------------------------------------
# liwc2007dict <- dictionary(file = "LIWC2007.cat", format = "wordstat")
# tail(liwc2007dict, 1)
# # Dictionary object with 1 primary key entry and 2 nested levels.
# # - [SPOKEN CATEGORIES]:
# # - [ASSENT]:
# # - absolutely, agree, ah, alright*, aok, aw, awesome, cool, duh, ha, hah, haha*, heh*, hm*, huh, lol, mm*, oh, ok, okay, okey*, rofl, uhhu*, uhuh, yah, yay, yea, yeah, yep*, yes, yup
# # - [NON-FLUENCIES]:
# # - er, hm*, sigh, uh, um, umm*, well, zz*
# # - [FILLERS]:
# # - blah, idon'tknow, idontknow, imean, ohwell, oranything*, orsomething*, orwhatever*, rr*, yakn*, ykn*, youknow*

## ------------------------------------------------------------------------
# Apply the NRC dictionary to the movie review corpus and preview the result.
output_nrc <- liwcalike(data_corpus_movies, data_dictionary_NRC)
head(output_nrc)

## ----fig.width=7, fig.height=6-------------------------------------------
# Net sentiment per document: positive score minus negative score.
output_nrc$net_positive <- output_nrc$positive - output_nrc$negative
output_nrc$sentiment <- docvars(data_corpus_movies, "Sentiment")

library(ggplot2)
# set ggplot2 theme
theme_set(theme_bw())
# The plot title is set via `title`; labs() has no `main` argument, so a
# `main =` label would be silently ignored.
ggplot(output_nrc, aes(x = sentiment, y = net_positive)) +
  geom_boxplot() +
  labs(
    x = "Classified sentiment",
    y = "Net positive sentiment",
    title = "NRC Sentiment Dictionary"
  )

## ----fig.width=7, fig.height=6-------------------------------------------
# Apply the General Inquirer positive/negative dictionary to the same corpus.
output_geninq <- liwcalike(data_corpus_movies, data_dictionary_geninqposneg)
names(output_geninq)

# Net sentiment per document: positive score minus negative score.
output_geninq$net_positive <- output_geninq$positive - output_geninq$negative
output_geninq$sentiment <- docvars(data_corpus_movies, "Sentiment")

# The plot title is set via `title`; labs() has no `main` argument, so a
# `main =` label would be silently ignored.
ggplot(output_geninq, aes(x = sentiment, y = net_positive)) +
  geom_boxplot() +
  labs(
    x = "Classified sentiment",
    y = "Net positive sentiment",
    title = "General Inquirer Sentiment Association"
  )

## ----fig.width=7, fig.height=6-------------------------------------------
# Test the correlation between the two dictionaries' net positive scores.
cor.test(output_nrc$net_positive, output_geninq$net_positive)

# Pair the two score vectors document by document for plotting.
cor_dictionaries <- data.frame(
  nrc = output_nrc$net_positive,
  geninq = output_geninq$net_positive
)

# The plot title is set via `title`; labs() has no `main` argument, so a
# `main =` label would be silently ignored.
ggplot(data = cor_dictionaries, aes(x = nrc, y = geninq)) +
  geom_point(alpha = 0.2) +
  labs(
    x = "NRC Word-Emotion Association Lexicon",
    y = "General Inquirer Net Positive Sentiment",
    title = "Correlation for Net Positive Sentiment in Movie Reviews"
  )

## ------------------------------------------------------------------------
# A small user-defined sentiment dictionary with two keys.
# "fantastic" replaces the misspelt pattern "phantastic", which would never
# match English text.
mydict <- dictionary(list(
  positive = c("great", "fantastic", "wonderful"),
  negative = c("bad", "horrible", "terrible")
))

# Apply the custom dictionary to the movie review corpus.
output_custom_dict <- liwcalike(data_corpus_movies, mydict)

head(output_custom_dict)

## ------------------------------------------------------------------------
# Number of documents in the inaugural corpus before reshaping.
ndoc(data_corpus_inaugural)

## ------------------------------------------------------------------------
# Reshape the corpus to paragraphs and count the documents again.
inaug_corpus_paragraphs <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
ndoc(inaug_corpus_paragraphs)

## ------------------------------------------------------------------------
# Apply the NRC dictionary to the reshaped corpus and preview that result
# (previously this printed the unrelated output_custom_dict object).
output_paragraphs <- liwcalike(inaug_corpus_paragraphs, data_dictionary_NRC)
head(output_paragraphs)

## ---- eval=FALSE---------------------------------------------------------
# # save as csv file
# write.csv(output_custom_dict, file = "output_dictionary.csv",
# fileEncoding = "utf-8")
#
# # save as Excel file (xlsx)
# library(rio)
# rio::export(output_custom_dict, file = "output_dictionary.xlsx")

## ------------------------------------------------------------------------
# Two renderings of the same sentence, named "uk" and "us".
txt <- c(
  uk = "endeavour to prioritise honour over esthetics",
  us = "endeavor to prioritize honor over aesthetics"
)
toks <- quanteda::tokens(txt)

## ------------------------------------------------------------------------
# Replace tokens using the uk2us dictionary.
quanteda::tokens_replace(toks, data_dictionary_uk2us)

## ------------------------------------------------------------------------
# Replace tokens using the us2uk dictionary.
quanteda::tokens_replace(toks, data_dictionary_us2uk)

## ------------------------------------------------------------------------
# dfm built from the tokens as they are
quanteda::dfm(toks)

# dfm built after the uk2us replacement has been applied
quanteda::dfm(
  quanteda::tokens_replace(toks, data_dictionary_uk2us)
)

0 comments on commit a310906

Please sign in to comment.