Commit: various updates

kasperwelbers committed May 20, 2014
1 parent 6cc4e7d commit c5a6abc
Showing 31 changed files with 991 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .Rbuildignore
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
3 changes: 3 additions & 0 deletions .gitignore
@@ -0,0 +1,3 @@
.Rproj.user
.Rhistory
.RData
20 changes: 20 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,20 @@
Package: corpustools
Version: 0.100
Date: 2014-05-20
Title: Tools for Corpus Analysis
Author: Kasper Welbers & Wouter van Atteveldt
Maintainer: Kasper Welbers <kasperwelbers@gmail.com>
Depends:
slam,
Matrix,
lda,
tm,
topicmodels,
RColorBrewer,
wordcloud
Description: Several convenient functions for bag-of-words style text analysis
    methods.
License: MIT
URL: http://github.com/kasperwelbers/corpus-tools
Roxygen: list(wrap = FALSE)

17 changes: 17 additions & 0 deletions NAMESPACE
@@ -0,0 +1,17 @@
# Generated by roxygen2 (4.0.0): do not edit by hand

export(cast.sparse.matrix)
export(corpora.compare)
export(dtm.create)
export(fill.time.gaps)
export(lda.fit)
export(lda.match.meta)
export(lda.plot.alltopics)
export(lda.plot.category)
export(lda.plot.time)
export(lda.plot.topic)
export(lda.plot.wordcloud)
export(lda.topics.per.document)
export(prepare.plot.values)
export(prepare.time.var)
export(term.statistics)
102 changes: 102 additions & 0 deletions R/corpus.r
@@ -0,0 +1,102 @@

#' Cast data.frame to sparse matrix
#'
#' Create a sparse matrix from matching vectors of row indices, column indices and values
#'
#' @param rows a vector of row indices: [i,]
#' @param columns a vector of column indices: [,j]
#' @param values a vector of the values for each (non-zero) cell: [i,j] = value
#' @return a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package)
#' @export
cast.sparse.matrix <- function(rows, columns, values=NULL) {
if(is.null(values)) values = rep(1, length(rows))
d = data.frame(rows=rows, columns=columns, values=values)
if(nrow(d) > nrow(unique(d[,c('rows','columns')]))){
message('(Duplicate row-column matches occurred. Values of duplicates are added up)')
d = aggregate(values ~ rows + columns, d, FUN='sum')
}
unit_index = unique(d$rows)
char_index = unique(d$columns)
sm = spMatrix(nrow=length(unit_index), ncol=length(char_index),
match(d$rows, unit_index), match(d$columns, char_index), d$values)
rownames(sm) = unit_index
colnames(sm) = char_index
sm
}
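
A minimal usage sketch (not part of this commit; the toy vectors are made up to illustrate the expected input format):

rows = c('doc1', 'doc1', 'doc2')
cols = c('apple', 'pear', 'apple')
vals = c(2, 1, 3)
m = cast.sparse.matrix(rows, cols, vals)
# m is a 2 x 2 sparse dgTMatrix with the documents as rows and the terms as columns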

#' Create a document term matrix from a list of tokens
#'
#' Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies.
#'
#' @param ids a vector of document ids
#' @param terms a vector of words of the same length as ids
#' @param freqs a vector of the frequency of a term in a document
#' @return a document-term matrix \code{\link{DocumentTermMatrix}}
#' @export
dtm.create <- function(ids, terms, freqs) {
# remove NA terms
d = data.frame(ids=ids, terms=terms, freqs=freqs)
if (sum(is.na(d$terms)) > 0) {
warning("Removing ", sum(is.na(d$terms)), "rows with missing term names")
d = d[!is.na(d$terms), ]
}
sparsemat = cast.sparse.matrix(rows=d$ids, columns=d$terms, values=d$freqs)
as.DocumentTermMatrix(sparsemat, weighting=weightTf)
}
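
A usage sketch (illustrative only; the ids, terms and counts are assumptions):

ids   = c(1, 1, 2, 2)
terms = c('economy', 'tax', 'economy', 'election')
freqs = c(2, 1, 1, 3)
dtm = dtm.create(ids, terms, freqs)   # a 2 x 3 DocumentTermMatrix with term frequency weighting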

#' Compute some useful corpus statistics for a dtm
#'
#' Compute a number of useful statistics for filtering words: term frequency, idf, etc.
#'
#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns
#' @export
term.statistics <- function(dtm) {
dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0] # get rid of empty rows/columns
vocabulary = colnames(dtm)
data.frame(term = vocabulary,
characters = nchar(vocabulary),
number = grepl("[0-9]", vocabulary),
nonalpha = grepl("\\W", vocabulary),
termfreq = col_sums(dtm),
docfreq = col_sums(dtm > 0),
reldocfreq = col_sums(dtm > 0) / nDocs(dtm),
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0)))
}
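
A sketch of how these statistics might be used to filter the vocabulary, assuming dtm is a DocumentTermMatrix (e.g. from dtm.create); the thresholds are arbitrary examples:

voc = term.statistics(dtm)
voc = voc[order(-voc$tfidf), ]   # terms with the highest average tf-idf first
head(voc)
keep = voc$term[voc$docfreq >= 5 & !voc$nonalpha & !voc$number]
dtm.filtered = dtm[, colnames(dtm) %in% keep]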

#' Compute the chi^2 statistic for a 2x2 crosstab containing the values
#' [[a, b], [c, d]]
chi2 <- function(a,b,c,d) {
ooe <- function(o, e) {(o-e)*(o-e) / e}
tot = 0.0 + a+b+c+d
a = as.numeric(a)
b = as.numeric(b)
c = as.numeric(c)
d = as.numeric(d)
(ooe(a, (a+c)*(a+b)/tot)
+ ooe(b, (b+d)*(a+b)/tot)
+ ooe(c, (a+c)*(c+d)/tot)
+ ooe(d, (d+b)*(c+d)/tot))
}
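
A small worked instance with invented counts: a term that occurs 20 times in a corpus of 1000 tokens versus 5 times in a reference corpus of 1200 tokens:

chi2(20, 5, 1000 - 20, 1200 - 5)   # chi-squared value for the 2x2 table [[20, 5], [980, 1195]]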

#' Compare two corpora
#'
#' Compare the term use in corpus dtm.x with a reference corpus dtm.y, returning relative frequencies
#' and overrepresentation using various measures
#'
#' @param dtm.x the main document-term matrix
#' @param dtm.y the 'reference' document-term matrix
#' @param smooth the smoothing parameter for computing overrepresentation
#' @return A data frame with one row per term and the comparison statistics in the columns
#' @export
corpora.compare <- function(dtm.x, dtm.y, smooth=.001) {
freqs = term.statistics(dtm.x)[, c("term", "termfreq")]
freqs.rel = term.statistics(dtm.y)[, c("term", "termfreq")]
f = merge(freqs, freqs.rel, all=T, by="term")
f[is.na(f)] = 0
f$relfreq.x = f$termfreq.x / sum(freqs$termfreq)
f$relfreq.y = f$termfreq.y / sum(freqs.rel$termfreq)
f$over = (f$relfreq.x + smooth) / (f$relfreq.y + smooth)
f$chi = chi2(f$termfreq.x, f$termfreq.y, sum(f$termfreq.x) - f$termfreq.x, sum(f$termfreq.y) - f$termfreq.y)
f
}
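
A usage sketch, assuming dtm.x and dtm.y are DocumentTermMatrix objects built (e.g. with dtm.create) from two sets of documents:

cmp = corpora.compare(dtm.x, dtm.y, smooth=.001)
cmp = cmp[order(-cmp$over), ]
head(cmp)   # terms most overrepresented in dtm.x relative to dtm.y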
228 changes: 228 additions & 0 deletions R/lda.r
@@ -0,0 +1,228 @@
### PREPARE DATA

#' Estimate a topic model using the lda package
#'
#' Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function
#' The parameters other than dtm are passed on to the sampler, with workable defaults provided.
#' See the documentation of that function for more information.
#'
#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
#' @param K the number of clusters
#' @param num.iterations the number of iterations
#' @param alpha the alpha parameter
#' @param eta the eta parameter
#' @return A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}})
#' @export
lda.fit <- function(dtm, K=50, num.iterations=100, alpha=50/K, eta=.01, burnin=100, compute.log.likelihood=F) {
dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0]
x = dtm2ldaformat(dtm)
m = lda.collapsed.gibbs.sampler(x$documents, vocab=x$vocab, K=K, num.iterations=num.iterations,
alpha=alpha, eta=eta, burnin=burnin, compute.log.likelihood=compute.log.likelihood)
m$dtm = dtm
m
}
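
A usage sketch; K and the number of iterations are arbitrary illustrations, and top.topic.words is a helper from the lda package:

m = lda.fit(dtm, K=25, num.iterations=500)
top.topic.words(m$topics, 10)   # the 10 most frequent terms per topic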

#' Add document metadata to LDA output
#'
#' Add a data.frame with document metadata to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}.
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param document.ids A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums)
#' @param meta A data.frame with document metadata. Has to contain a column that matches document.ids
#' @param match.by The name of the column in meta that matches document.ids
#' @return The LDA output with the document metadata appended
#' @export
lda.match.meta <- function(m, document.ids, meta, match.by = 'id'){
matched = meta[match(document.ids, meta[,match.by]),]
if('meta' %in% names(m)) m$meta = matched else m = c(m, list(meta=matched))
m
}
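
A usage sketch, assuming meta is a data.frame with an 'id' column (plus, for example, date and medium columns; these names are assumptions):

ids = as.numeric(rownames(m$dtm))   # document ids in the order of the fitted model (assumes numeric ids)
m = lda.match.meta(m, ids, meta, match.by='id')
head(m$meta)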


#' Get the topics per document
#'
#' Return a data frame containing the document ids and the topic occurrence per document
#'
#' @param topics A fitted LDA model, e.g. the output of \code{\link{lda.fit}} (which has the dtm attached as topics$dtm)
#' @return A data frame with one row per document, containing the document id and the number of tokens assigned to each topic
#' @export
lda.topics.per.document <- function(topics) {
ids = as.numeric(rownames(topics$dtm))
cbind(id=ids, data.frame(t(topics$document_sums)))
}

### PLOT LDA TOPICS

#' Plot all topics
#'
#' Write plots for all topics with \code{\link{lda.plot.topic}} to the designated folder
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param path The path for a folder where output will be saved
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @return Nothing
#' @export
lda.plot.alltopics <- function(m, time_var, category_var, path, date_interval='day'){
for(topic_nr in 1:nrow(m$document_sums)){
print(paste('Plotting:',topic_nr))
fn = paste(path, topic_nr, ".png", sep="")
if (!is.null(fn)) png(fn, width=1280,height=800)
lda.plot.topic(m, topic_nr, time_var, category_var, date_interval)
if (!is.null(fn)) dev.off()
}
par(mfrow=c(1,1))
}

#' Plots topic wordcloud, and attention over time and per category
#'
#' Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}}
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @return Nothing, just plots
#' @export
lda.plot.topic <- function(m, topic_nr, time_var, category_var, date_interval='day', pct=F, value='total'){
par(mar=c(4.5,3,2,1), cex.axis=1.7)
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE), widths=c(2.5,1.5), heights=c(1,2))
lda.plot.time(m, topic_nr, time_var, date_interval, pct=pct, value=value)
lda.plot.wordcloud(m, topic_nr)
lda.plot.category(m, topic_nr, category_var, pct=pct, value=value)
par(mfrow=c(1,1))
}
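
A usage sketch, assuming m$meta was attached with lda.match.meta and contains 'date' (Date class) and 'medium' columns; the column names and output folder are assumptions:

lda.plot.topic(m, topic_nr=1, time_var=m$meta$date, category_var=m$meta$medium, date_interval='month')
lda.plot.alltopics(m, m$meta$date, m$meta$medium, path='topicplots/', date_interval='month')   # one png per topic (the folder must exist)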

#' Round dates down to a date interval
#'
#' Round a vector of Date values down to the start of the given date_interval, so that values can be aggregated per interval
#'
#' @param time_var A vector of Date values
#' @param date_interval The desired date_interval ('day','week','month', or 'year')
#' @return A vector of Date values
#' @export
prepare.time.var <- function(time_var, date_interval){
if(class(time_var) == 'Date'){
if(date_interval == 'day') time_var = as.Date(format(time_var, '%Y-%m-%d'))
if(date_interval == 'month') time_var = as.Date(paste(format(time_var, '%Y-%m'),'-01',sep=''))
if(date_interval == 'week') time_var = as.Date(paste(format(time_var, '%Y-%W'),1), '%Y-%W %u')
if(date_interval == 'year') time_var = as.Date(paste(format(time_var, '%Y'),'-01-01',sep=''))
}
time_var
}
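
For example, with a monthly interval both dates below map to the first day of their month:

prepare.time.var(as.Date(c('2014-05-03', '2014-05-20')), 'month')   # "2014-05-01" "2014-05-01"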

#' Add empty values for pretty plotting
#'
#' When plotting a timeline, missing date intervals would otherwise simply be skipped. For topic attention, such gaps should be treated as having the value 0.
#'
#' @param d A data.frame with the columns 'time' (Date) and 'value' (numeric)
#' @param date_interval The date interval, required to determine where the gaps are
#' @return A data.frame with the columns 'time' (Date) and 'value' (numeric)
#' @export
fill.time.gaps <- function(d, date_interval){
if(class(d$time) == 'numeric'){
for(t in min(d$time):max(d$time))
if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
}
if(class(d$time) == 'Date'){
date_sequence = seq.Date(from=min(d$time), to=max(d$time), by=date_interval)
for(i in 1:length(date_sequence)){
t = date_sequence[i]
if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
}
}
d[order(d$time),]
}

#' Prepare topic values for plotting
#'
#' Aggregate the topic values per document by a break variable, to prepare them for plotting
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param break_var A break vector to aggregate the topic values per document
#' @param topic_nr The index of the topic (1 to K)
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param filter Currently not used
#' @return The aggregated/transformed topic values
#' @export
prepare.plot.values <- function(m, break_var, topic_nr, pct=F, value='total', filter=NULL){
hits = m$document_sums[topic_nr,]
d = aggregate(hits, by=list(break_var=break_var), FUN='sum')
if(value == 'relative'){
total_hits = colSums(m$document_sums)
totals = aggregate(total_hits, by=list(break_var=break_var), FUN='sum')
d$x = d$x / totals$x
}
if(pct == T) d$x = d$x / sum(d$x)
d
}

#' Plots topic values over time
#'
#' Plots the attention for a topic over time
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order as the documents (columns) of m$document_sums
#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param return.values Logical. If TRUE, the plotted data is returned as a data.frame
#' @return If return.values is TRUE, a data.frame with the plotted values
#' @export
lda.plot.time <- function(m, topic_nr, time_var, date_interval='day', pct=F, value='total', return.values=F){
par(mar=c(3,3,3,1))
time_var = prepare.time.var(time_var, date_interval)
d = prepare.plot.values(m, break_var=time_var, topic_nr=topic_nr, pct=pct, value=value)
colnames(d) = c('time','value')
d = fill.time.gaps(d, date_interval)
plot(d$time, d$value, type='l', xlab='', main='', ylab='', xlim=c(min(d$time), max(d$time)), ylim=c(0, max(d$value)), bty='L', lwd=5, col='darkgrey')
if(return.values==T) d
}
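
A usage sketch, again assuming m$meta$date holds Date values:

d = lda.plot.time(m, topic_nr=1, time_var=m$meta$date, date_interval='week', value='relative', return.values=T)
head(d)   # the plotted time/value pairs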

#' Plots topic values per category
#'
#' Plots the attention for a topic per category
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @param category_var A vector with category values of the same length and order as the documents (columns) of m$document_sums
#' @param pct Show topic values as percentages
#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
#' @param return.values Logical. If TRUE, the plotted data is returned as a data.frame
#' @return If return.values is TRUE, a data.frame with the plotted values
#' @export
lda.plot.category <- function(m, topic_nr, category_var, pct=F, value='total', return.values=F){
par(mar=c(15,0,1,2))
d = prepare.plot.values(m, break_var=category_var, topic_nr=topic_nr, pct=pct, value=value)
colnames(d) = c('category','value')
barplot(as.matrix(t(d[,c('value')])), main='', beside=TRUE,horiz=FALSE,
density=NA,
col='darkgrey',
xlab='',
ylab="",
axes=T, names.arg=d$category, cex.names=0.8, cex.axis=0.8, adj=1, las=2)
if(return.values==T) d
}

#' Plot wordcloud for LDA topic
#'
#' Plots a wordcloud of the top words per topic
#'
#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
#' @param topic_nr The index of the topic (1 to K)
#' @return Nothing, just plots
#' @export
lda.plot.wordcloud <- function(m, topic_nr){
x = m$topics[topic_nr,]
x = sort(x[x>5], decreasing=T)[1:100]
x = x[!is.na(x)]
names = sub("/.*", "", names(x))
freqs = x**.5
pal <- brewer.pal(6,"YlGnBu")
wordcloud(names, freqs, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)
}
17 changes: 17 additions & 0 deletions corpus-tools.Rproj
@@ -0,0 +1,17 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: knitr
LaTeX: pdfLaTeX

BuildType: Package
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,namespace
