Commit: various updates

kasperwelbers committed May 20, 2014
1 parent 6cc4e7d commit c5a6abc
Showing 31 changed files with 991 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
20 changes: 20 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,20 @@
Package: corpustools
Version: 0.100
Date: 2014-05-20
Title: Tools for Corpus Analysis
Author: Kasper Welbers & Wouter van Atteveldt
Maintainer: Kasper Welbers <kasperwelbers@gmail.com>
Depends:
slam,
Matrix,
lda,
tm,
topicmodels,
RColorBrewer,
wordcloud
Description: Several convenient functions for bag-of-words style text analysis
    methods.
License: MIT
URL: http://github.com/kasperwelbers/corpus-tools
Roxygen: list(wrap = FALSE)

17 changes: 17 additions & 0 deletions NAMESPACE
@@ -0,0 +1,17 @@
# Generated by roxygen2 (4.0.0): do not edit by hand

export(cast.sparse.matrix)
export(corpora.compare)
export(dtm.create)
export(fill.time.gaps)
export(lda.fit)
export(lda.match.meta)
export(lda.plot.alltopics)
export(lda.plot.category)
export(lda.plot.time)
export(lda.plot.topic)
export(lda.plot.wordcloud)
export(lda.topics.per.document)
export(prepare.plot.values)
export(prepare.time.var)
export(term.statistics)
102 changes: 102 additions & 0 deletions R/corpus.r
@@ -0,0 +1,102 @@

#' Cast data.frame to sparse matrix
#'
#' Create a sparse matrix from matching vectors of row indices, column indices and values
#'
#' @param rows a vector of row indices: [i,]
#' @param columns a vector of column indices: [,j]
#' @param values a vector of the values for each (non-zero) cell: [i,j] = value
#' @return a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package)
#' @export
cast.sparse.matrix <- function(rows, columns, values=NULL) {
if(is.null(values)) values = rep(1, length(rows))
d = data.frame(rows=rows, columns=columns, values=values)
if(nrow(d) > nrow(unique(d[,c('rows','columns')]))){
message('(Duplicate row-column matches occurred. Values of duplicates are added up)')
d = aggregate(values ~ rows + columns, d, FUN='sum')
}
unit_index = unique(d$rows)
char_index = unique(d$columns)
sm = spMatrix(nrow=length(unit_index), ncol=length(char_index),
match(d$rows, unit_index), match(d$columns, char_index), d$values)
rownames(sm) = unit_index
colnames(sm) = char_index
sm
}
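
A minimal usage sketch (not part of this commit; the toy vectors are made up to illustrate the expected input format):

rows = c('doc1', 'doc1', 'doc2')
cols = c('apple', 'pear', 'apple')
vals = c(2, 1, 3)
m = cast.sparse.matrix(rows, cols, vals)
# m is a 2 x 2 sparse dgTMatrix with the documents as rows and the terms as columns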

#' Create a document term matrix from a list of tokens
#'
#' Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies.
#'
#' @param ids a vector of document ids
#' @param terms a vector of words of the same length as ids
#' @param freqs a vector of the frequency of a term in a document
#' @return a document-term matrix \code{\link{DocumentTermMatrix}}
#' @export
dtm.create <- function(ids, terms, freqs) {
# remove NA terms
d = data.frame(ids=ids, terms=terms, freqs=freqs)
if (sum(is.na(d$terms)) > 0) {
warning("Removing ", sum(is.na(d$terms)), "rows with missing term names")
d = d[!is.na(d$terms), ]
}
sparsemat = cast.sparse.matrix(rows=d$ids, columns=d$terms, values=d$freqs)
as.DocumentTermMatrix(sparsemat, weighting=weightTf)
}
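
A usage sketch (illustrative only; the ids, terms and counts are assumptions):

ids   = c(1, 1, 2, 2)
terms = c('economy', 'tax', 'economy', 'election')
freqs = c(2, 1, 1, 3)
dtm = dtm.create(ids, terms, freqs)   # a 2 x 3 DocumentTermMatrix with term frequency weighting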

#' Compute some useful corpus statistics for a dtm
#'
#' Compute a number of useful statistics for filtering words: term frequency, idf, etc.
#'
#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns
#' @export
term.statistics <- function(dtm) {
dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0] # get rid of empty rows/columns
vocabulary = colnames(dtm)
data.frame(term = vocabulary,
characters = nchar(vocabulary),
number = grepl("[0-9]", vocabulary),
nonalpha = grepl("\\W", vocabulary),
termfreq = col_sums(dtm),
docfreq = col_sums(dtm > 0),
reldocfreq = col_sums(dtm > 0) / nDocs(dtm),
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0)))
}
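
A sketch of how these statistics might be used to filter the vocabulary, assuming dtm is a DocumentTermMatrix (e.g. from dtm.create); the thresholds are arbitrary examples:

voc = term.statistics(dtm)
voc = voc[order(-voc$tfidf), ]   # terms with the highest average tf-idf first
head(voc)
keep = voc$term[voc$docfreq >= 5 & !voc$nonalpha & !voc$number]
dtm.filtered = dtm[, colnames(dtm) %in% keep]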

#' Compute the chi^2 statistic for a 2x2 crosstab containing the values
#' [[a, b], [c, d]]
chi2 <- function(a,b,c,d) {
ooe <- function(o, e) {(o-e)*(o-e) / e}
tot = 0.0 + a+b+c+d
a = as.numeric(a)
b = as.numeric(b)
c = as.numeric(c)
d = as.numeric(d)
(ooe(a, (a+c)*(a+b)/tot)
+ ooe(b, (b+d)*(a+b)/tot)
+ ooe(c, (a+c)*(c+d)/tot)
+ ooe(d, (d+b)*(c+d)/tot))
}
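
A small worked instance with invented counts: a term that occurs 20 times in a corpus of 1000 tokens versus 5 times in a reference corpus of 1200 tokens:

chi2(20, 5, 1000 - 20, 1200 - 5)   # chi-squared value for the 2x2 table [[20, 5], [980, 1195]]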

#' Compare two corpora
#'
#' Compare the term use in corpus dtm.x with a reference corpus dtm.y, returning relative frequencies
#' and overrepresentation using various measures
#'
#' @param dtm.x the main document-term matrix
#' @param dtm.y the 'reference' document-term matrix
#' @param smooth the smoothing parameter for computing overrepresentation
#' @return A data frame with one row per term and the comparison statistics in the columns
#' @export
corpora.compare <- function(dtm.x, dtm.y, smooth=.001) {
freqs = term.statistics(dtm.x)[, c("term", "termfreq")]
freqs.rel = term.statistics(dtm.y)[, c("term", "termfreq")]
f = merge(freqs, freqs.rel, all=T, by="term")
f[is.na(f)] = 0
f$relfreq.x = f$termfreq.x / sum(freqs$termfreq)
f$relfreq.y = f$termfreq.y / sum(freqs.rel$termfreq)
f$over = (f$relfreq.x + smooth) / (f$relfreq.y + smooth)
f$chi = chi2(f$termfreq.x, f$termfreq.y, sum(f$termfreq.x) - f$termfreq.x, sum(f$termfreq.y) - f$termfreq.y)
f
}
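
A usage sketch, assuming dtm.x and dtm.y are DocumentTermMatrix objects built (e.g. with dtm.create) from two sets of documents:

cmp = corpora.compare(dtm.x, dtm.y, smooth=.001)
cmp = cmp[order(-cmp$over), ]
head(cmp)   # terms most overrepresented in dtm.x relative to dtm.y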
228 changes: 228 additions & 0 deletions R/lda.r
@@ -0,0 +1,228 @@
### PREPARE DATA

#' Estimate a topic model using the lda package
#'
#' Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function
#' The parameters other than dtm are passed on to the sampler, with workable defaults provided.
#' See the documentation of that function for more information.
#'
#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
#' @param K the number of clusters
#' @param num.iterations the number of iterations
#' @param alpha the alpha parameter
#' @param eta the eta parameter
#' @return A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}})
#' @export
lda.fit <- function(dtm, K=50, num.iterations=100, alpha=50/K, eta=.01, burnin=100, compute.log.likelihood=F) {
dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0]
x = dtm2ldaformat(dtm)
m = lda.collapsed.gibbs.sampler(x$documents, vocab=x$vocab, K=K, num.iterations=num.iterations,
alpha=alpha, eta=eta, burnin=burnin, compute.log.likelihood=compute.log.likelihood)
m$dtm = dtm
m
}
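
A usage sketch; K and the number of iterations are arbitrary illustrations, and top.topic.words is a helper from the lda package:

m = lda.fit(dtm, K=25, num.iterations=500)
top.topic.words(m$topics, 10)   # the 10 most frequent terms per topic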

#' Add document metadata to LDA output
#'
#' Add a data.frame with document metadata to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}.
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param document.ids A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums)
#' @param meta A data.frame with document metadata. Has to contain a column that matches document.ids
#' @param match.by The name of the column in meta that matches document.ids
#' @return The LDA output with the document metadata appended
#' @export
lda.match.meta <- function(m, document.ids, meta, match.by = 'id'){
matched = meta[match(document.ids, meta[,match.by]),]
if('meta' %in% names(m)) m$meta = matched else m = c(m, list(meta=matched))
m
}
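
A usage sketch, assuming meta is a data.frame with an 'id' column (plus, for example, date and medium columns; these names are assumptions):

ids = as.numeric(rownames(m$dtm))   # document ids in the order of the fitted model (assumes numeric ids)
m = lda.match.meta(m, ids, meta, match.by='id')
head(m$meta)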


#' Get the topics per document
#'
#' Return a data frame containing the document ids and the topic occurrence per document
#'
#' @param topics A fitted LDA model, e.g. the output of \code{\link{lda.fit}} (which has the dtm attached as topics$dtm)
#' @return A data frame with one row per document, containing the document id and the number of tokens assigned to each topic
#' @export
lda.topics.per.document <- function(topics) {
ids = as.numeric(rownames(topics$dtm))
cbind(id=ids, data.frame(t(topics$document_sums)))
}

### PLOT LDA TOPICS

#' Plot all topics
#'
#' Write plots for all topics with \code{\link{lda.plot.topic}} to the designated folder
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param path The path for a folder where output will be saved
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @return Nothing
#' @export
lda.plot.alltopics <- function(m, time_var, category_var, path, date_interval='day'){
for(topic_nr in 1:nrow(m$document_sums)){
print(paste('Plotting:',topic_nr))
fn = paste(path, topic_nr, ".png", sep="")
if (!is.null(fn)) png(fn, width=1280,height=800)
lda.plot.topic(m, topic_nr, time_var, category_var, date_interval)
if (!is.null(fn)) dev.off()
}
par(mfrow=c(1,1))
}

#' Plots topic wordcloud, and attention over time and per category
#'
#' Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}}
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @return Nothing, just plots
#' @export
lda.plot.topic <- function(m, topic_nr, time_var, category_var, date_interval='day', pct=F, value='total'){
par(mar=c(4.5,3,2,1), cex.axis=1.7)
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE), widths=c(2.5,1.5), heights=c(1,2))
lda.plot.time(m, topic_nr, time_var, date_interval, pct=pct, value=value)
lda.plot.wordcloud(m, topic_nr)
lda.plot.category(m, topic_nr, category_var, pct=pct, value=value)
par(mfrow=c(1,1))
}
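
A usage sketch, assuming m$meta was attached with lda.match.meta and contains 'date' (Date class) and 'medium' columns; the column names and output folder are assumptions:

lda.plot.topic(m, topic_nr=1, time_var=m$meta$date, category_var=m$meta$medium, date_interval='month')
lda.plot.alltopics(m, m$meta$date, m$meta$medium, path='topicplots/', date_interval='month')   # one png per topic (the folder must exist)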

#' Round dates down to a date interval
#'
#' Round a vector of Date values down to the start of the given date_interval, so that values can be aggregated per interval
#'
#' @param time_var A vector of Date values
#' @param date_interval The desired date_interval ('day','week','month', or 'year')
#' @return A vector of Date values
#' @export
prepare.time.var <- function(time_var, date_interval){
if(class(time_var) == 'Date'){
if(date_interval == 'day') time_var = as.Date(format(time_var, '%Y-%m-%d'))
if(date_interval == 'month') time_var = as.Date(paste(format(time_var, '%Y-%m'),'-01',sep=''))
if(date_interval == 'week') time_var = as.Date(paste(format(time_var, '%Y-%W'),1), '%Y-%W %u')
if(date_interval == 'year') time_var = as.Date(paste(format(time_var, '%Y'),'-01-01',sep=''))
}
time_var
}
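
For example, with a monthly interval both dates below map to the first day of their month:

prepare.time.var(as.Date(c('2014-05-03', '2014-05-20')), 'month')   # "2014-05-01" "2014-05-01"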

#' Add empty values for pretty plotting
#'
#' When plotting a timeline, missing date intervals would otherwise simply be skipped. For topic attention, such gaps should be treated as having the value 0.
#'
#' @param d A data.frame with the columns 'time' (Date) and 'value' (numeric)
#' @param date_interval The date interval, required to determine where the gaps are
#' @return A data.frame with the columns 'time' (Date) and 'value' (numeric)
#' @export
fill.time.gaps <- function(d, date_interval){
if(class(d$time) == 'numeric'){
for(t in min(d$time):max(d$time))
if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
}
if(class(d$time) == 'Date'){
date_sequence = seq.Date(from=min(d$time), to=max(d$time), by=date_interval)
for(i in 1:length(date_sequence)){
t = date_sequence[i]
if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
}
}
d[order(d$time),]
}

#' Prepare topic values for plotting
#'
#' Aggregate the topic values per document by a break variable, to prepare them for plotting
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param break_var A break vector to aggregate the topic values per document
#' @param topic_nr The index of the topic (1 to K)
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param filter Currently not used
#' @return The aggregated/transformed topic values
#' @export
prepare.plot.values <- function(m, break_var, topic_nr, pct=F, value='total', filter=NULL){
hits = m$document_sums[topic_nr,]
d = aggregate(hits, by=list(break_var=break_var), FUN='sum')
if(value == 'relative'){
total_hits = colSums(m$document_sums)
totals = aggregate(total_hits, by=list(break_var=break_var), FUN='sum')
d$x = d$x / totals$x
}
if(pct == T) d$x = d$x / sum(d$x)
d
}

#' Plots topic values over time
#'
#' Plots the attention for a topic over time
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param return.values Logical. If TRUE, the plotted data is returned as a data.frame
#' @return If return.values is TRUE, a data.frame with the plotted values
#' @export
lda.plot.time <- function(m, topic_nr, time_var, date_interval='day', pct=F, value='total', return.values=F){
par(mar=c(3,3,3,1))
time_var = prepare.time.var(time_var, date_interval)
d = prepare.plot.values(m, break_var=time_var, topic_nr=topic_nr, pct=pct, value=value)
colnames(d) = c('time','value')
d = fill.time.gaps(d, date_interval)
plot(d$time, d$value, type='l', xlab='', main='', ylab='', xlim=c(min(d$time), max(d$time)), ylim=c(0, max(d$value)), bty='L', lwd=5, col='darkgrey')
if(return.values==T) d
}
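
A usage sketch, again assuming m$meta$date holds Date values:

d = lda.plot.time(m, topic_nr=1, time_var=m$meta$date, date_interval='week', value='relative', return.values=T)
head(d)   # the plotted time/value pairs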

#' Plots topic values per category
#'
#' Plots the attention for a topic per category
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param return.values Logical. If TRUE, the plotted data is returned as a data.frame
#' @return If return.values is TRUE, a data.frame with the plotted values
#' @export
lda.plot.category <- function(m, topic_nr, category_var, pct=F, value='total', return.values=F){
par(mar=c(15,0,1,2))
d = prepare.plot.values(m, break_var=category_var, topic_nr=topic_nr, pct=pct, value=value)
colnames(d) = c('category','value')
barplot(as.matrix(t(d[,c('value')])), main='', beside=TRUE,horiz=FALSE,
density=NA,
col='darkgrey',
xlab='',
ylab="",
axes=T, names.arg=d$category, cex.names=0.8, cex.axis=0.8, adj=1, las=2)
if(return.values==T) d
}

#' Plot wordcloud for LDA topic
#'
#' Plots a wordcloud of the top words per topic
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @return Nothing, just plots
#' @export
lda.plot.wordcloud <- function(m, topic_nr){
x = m$topics[topic_nr,]
x = sort(x[x>5], decreasing=T)[1:100]
x = x[!is.na(x)]
names = sub("/.*", "", names(x))
freqs = x**.5
pal <- brewer.pal(6,"YlGnBu")
wordcloud(names, freqs, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)
}
17 changes: 17 additions & 0 deletions corpus-tools.Rproj
@@ -0,0 +1,17 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: knitr
LaTeX: pdfLaTeX

BuildType: Package
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,namespace
