Update MFD to v2.0

- Closes #6
kbenoit · Oct 20, 2018 · a310906 · a310906
1 parent da4f4e8
commit a310906
Show file tree

Hide file tree

Showing 9 changed files with 683 additions and 8 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: quanteda.dictionaries
 Type: Package
 Title: Text Analysis Dictionaries
-Version: 0.2
+Version: 0.21
 Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
     c("aut", "cre", "cph")),
     person("Stefan", "Müller", email = "mullers@tcd.ie", role = "aut"))
@@ -26,4 +26,4 @@ Remotes:
 Encoding: UTF-8
 LazyData: true
 VignetteBuilder: knitr
-RoxygenNote: 6.0.1
+RoxygenNote: 6.1.0
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,7 @@
 * Added a `NEWS.md` file to track changes to the package.  
 * Fixed bug #14: word count failing when dictionaries are used.  
 * Fixed bug #16: `liwcalike(x, ...)` not passing arguments to `tokens()`.  
+* (v0.21) Updated the Moral Foundations Dictionary to 2.0.
 
 
 

diff --git a/R/data.R b/R/data.R
@@ -144,13 +144,21 @@
 
 #' Moral Foundations Dictionary
 #'
-#' A \pkg{quanteda} \link[quanteda]{dictionary} object containing
+#' @description A \pkg{quanteda} \link[quanteda]{dictionary} object containing
 #' the Moral Foundations Dictionary, a publicly available dictionary with
 #' information on the proportions of virtue and vice words for each foundation.
 #' The categories are harm (vice/virtue), fairness (vice/virtue), ingroup (vice/virtue),
 #' authority (vice/virtue), purity (vice/virtue) and morality (general).
-#' @source \url{http://moralfoundations.org/othermaterials}
+#'
+#' @description This is version 2.0 of the dictionary,
+#'   \href{http://www.jeremyfrimer.com/uploads/2/1/2/7/21278832/summary.pdf}{recommended}
+#'   over the first version of the MFD by its authors.
+#' @source \url{http://www.jeremyfrimer.com/research-downloads.html}; a previous
+#'   version is available at \url{http://moralfoundations.org/othermaterials}
 #' @references
+#'   Frimer, Jeremy et al. (2017).  "Moral Foundations Dictionaries for
+#'   Linguistic Analyses, 2.0."  University of Winnipeg manuscript.
+#'   \url{http://www.jeremyfrimer.com/uploads/2/1/2/7/21278832/summary.pdf}.
 #'
 #'   Haidt, J., Graham, J., and Nosek, B.A. (2009). "Liberals and Conservatives
 #'   Rely on Different Sets of Moral Foundations." \emph{Journal of Personality and Social

diff --git a/data/data_dictionary_MFD.rda b/data/data_dictionary_MFD.rda
diff --git a/man/data_dictionary_MFD.Rd b/man/data_dictionary_MFD.Rd
diff --git a/sources/MFD/create-data_dictionary_MFD.R b/sources/MFD/create-data_dictionary_MFD.R
@@ -2,6 +2,7 @@
 
 library("quanteda")
 
-data_dictionary_MFD <- dictionary(file = "sources/MFD/moral_foundations_dictionary.dic")
+#data_dictionary_MFD <- dictionary(file = "sources/MFD/moral_foundations_dictionary.dic")
+data_dictionary_MFD <- dictionary(file = "sources/MFD/mfd2.0.dic")
 
 devtools::use_data(data_dictionary_MFD, overwrite = TRUE)
diff --git a/sources/MFD/mfd2.0.dic b/sources/MFD/mfd2.0.dic
diff --git a/vignettes/quanteda.dictionaries_vignette.R b/vignettes/quanteda.dictionaries_vignette.R
@@ -0,0 +1,113 @@
+## ----echo = FALSE--------------------------------------------------------
+knitr::opts_chunk$set(collapse = TRUE, 
+                      comment = "##")
+
+## ----eval=TRUE, warning=FALSE, message=FALSE-----------------------------
+library(quanteda)
+library(quanteda.dictionaries)
+
+## ------------------------------------------------------------------------
+data(data_corpus_movies, package = "quanteda.corpora")
+
+## ---- eval=FALSE---------------------------------------------------------
+#  liwc2007dict <- dictionary(file = "LIWC2007.cat", format = "wordstat")
+#  tail(liwc2007dict, 1)
+#  # Dictionary object with 1 primary key entry and 2 nested levels.
+#  # - [SPOKEN CATEGORIES]:
+#  #   - [ASSENT]:
+#  #     - absolutely, agree, ah, alright*, aok, aw, awesome, cool, duh, ha, hah, haha*, heh*, hm*, huh, lol, mm*, oh, ok, okay, okey*, rofl, uhhu*, uhuh, yah, yay, yea, yeah, yep*, yes, yup
+#  #   - [NON-FLUENCIES]:
+#  #     - er, hm*, sigh, uh, um, umm*, well, zz*
+#  #   - [FILLERS]:
+#  #     - blah, idon'tknow, idontknow, imean, ohwell, oranything*, orsomething*, orwhatever*, rr*, yakn*, ykn*, youknow*
+
+## ------------------------------------------------------------------------
+output_nrc <- liwcalike(data_corpus_movies, data_dictionary_NRC)
+head(output_nrc)
+
+## ----fig.width=7, fig.height=6-------------------------------------------
+# Net sentiment per document: NRC "positive" score minus "negative" score.
+output_nrc$net_positive <- output_nrc$positive - output_nrc$negative
+# Attach the corpus-level "Sentiment" document variable as the grouping factor.
+output_nrc$sentiment <- docvars(data_corpus_movies, "Sentiment")
+
+library(ggplot2)
+# set ggplot2 theme
+theme_set(theme_bw())
+# Boxplot of net sentiment by classified sentiment. The heading is passed as
+# `title`: labs() has no `main` argument, so `main = ` never drew a title.
+ggplot(output_nrc, aes(x = sentiment, y = net_positive)) +
+    geom_boxplot() +
+    labs(x = "Classified sentiment",
+         y = "Net positive sentiment",
+         title = "NRC Sentiment Dictionary")
+
+## ----fig.width=7, fig.height=6-------------------------------------------
+# Score the movie reviews with the General Inquirer positive/negative dictionary.
+output_geninq <- liwcalike(data_corpus_movies, data_dictionary_geninqposneg)
+names(output_geninq)
+
+# Net sentiment per document and the grouping variable taken from the corpus.
+output_geninq$net_positive <- output_geninq$positive - output_geninq$negative
+output_geninq$sentiment <- docvars(data_corpus_movies, "Sentiment")
+
+# The heading is passed as `title`: labs() has no `main` argument, so
+# `main = ` never drew a title.
+ggplot(output_geninq, aes(x = sentiment, y = net_positive)) +
+    geom_boxplot() +
+    labs(x = "Classified sentiment",
+         y = "Net positive sentiment",
+         title = "General Inquirer Sentiment Association")
+
+## ----fig.width=7, fig.height=6-------------------------------------------
+# Correlation test between the net sentiment scores of the two dictionaries.
+cor.test(output_nrc$net_positive, output_geninq$net_positive)
+
+# Pair the two score vectors document by document for plotting.
+cor_dictionaries <- data.frame(
+    nrc = output_nrc$net_positive,
+    geninq = output_geninq$net_positive
+)
+
+# Scatter plot of the paired scores. The heading is passed as `title`:
+# labs() has no `main` argument, so `main = ` never drew a title.
+ggplot(data = cor_dictionaries, aes(x = nrc, y = geninq)) +
+    geom_point(alpha = 0.2) +
+    labs(x = "NRC Word-Emotion Association Lexicon",
+         y = "General Inquirer Net Positive Sentiment",
+         title = "Correlation for Net Positive Sentiment in Movie Reviews")
+
+## ------------------------------------------------------------------------
+# Custom two-category sentiment dictionary. Entries must be spelled the way
+# they occur in the texts to be matched ("fantastic", not "phantastic").
+mydict <- dictionary(list(positive = c("great", "fantastic", "wonderful"),
+                          negative = c("bad", "horrible", "terrible")))
+
+# Score the movie reviews against the custom dictionary.
+output_custom_dict <- liwcalike(data_corpus_movies, mydict)
+
+head(output_custom_dict)
+
+## ------------------------------------------------------------------------
+ndoc(data_corpus_inaugural)
+
+## ------------------------------------------------------------------------
+inaug_corpus_paragraphs <- corpus_reshape(data_corpus_inaugural, to = "paragraphs")
+ndoc(inaug_corpus_paragraphs)
+
+## ------------------------------------------------------------------------
+# Apply the NRC dictionary to the paragraph-level inaugural corpus.
+output_paragraphs <- liwcalike(inaug_corpus_paragraphs, data_dictionary_NRC)
+# Preview the result computed in this chunk (previously printed the unrelated
+# `output_custom_dict` from an earlier chunk).
+head(output_paragraphs)
+
+## ---- eval=FALSE---------------------------------------------------------
+#  # save as csv file
+#  write.csv(output_custom_dict, file = "output_dictionary.csv",
+#           fileEncoding = "utf-8")
+#  
+#  # save as Excel file (xlsx)
+#  library(rio)
+#  rio::export(output_custom_dict, file = "output_dictionary.xlsx")
+
+## ------------------------------------------------------------------------
+txt <- c(uk = "endeavour to prioritise honour over esthetics",
+         us = "endeavor to prioritize honor over aesthetics")
+toks <- quanteda::tokens(txt)
+
+## ------------------------------------------------------------------------
+quanteda::tokens_replace(toks, data_dictionary_uk2us)
+
+## ------------------------------------------------------------------------
+quanteda::tokens_replace(toks, data_dictionary_us2uk)
+
+## ------------------------------------------------------------------------
+# original dfm
+quanteda::dfm(toks)
+
+# homogeni[zs]ed dfm
+quanteda::dfm(quanteda::tokens_replace(toks, data_dictionary_uk2us))
+