diff --git a/.Rbuildignore b/.Rbuildignore new file mode 100644 index 0000000..91114bf --- /dev/null +++ b/.Rbuildignore @@ -0,0 +1,2 @@ +^.*\.Rproj$ +^\.Rproj\.user$ diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..807ea25 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +.Rproj.user +.Rhistory +.RData diff --git a/DESCRIPTION b/DESCRIPTION new file mode 100644 index 0000000..818e33a --- /dev/null +++ b/DESCRIPTION @@ -0,0 +1,20 @@ +Package: corpustools +Version: 0.100 +Date: 2014-05-20 +Title: tools for corpus analysis +Author: Kasper Welbers & Wouter van Atteveldt +Maintainer: Kasper Welbers +Depends: + slam, + Matrix, + lda, + tm, + topicmodels, + RColorBrewer, + wordcloud +Description: Several convenient functions for bag-of-word type text analysis + methods +License: MIT +URL: http://github.com/kasperwelbers/corpus-tools +Roxygen: list(wrap = FALSE) + diff --git a/NAMESPACE b/NAMESPACE new file mode 100644 index 0000000..6eb6ca8 --- /dev/null +++ b/NAMESPACE @@ -0,0 +1,17 @@ +# Generated by roxygen2 (4.0.0): do not edit by hand + +export(cast.sparse.matrix) +export(corpora.compare) +export(dtm.create) +export(fill.time.gaps) +export(lda.fit) +export(lda.match.meta) +export(lda.plot.alltopics) +export(lda.plot.category) +export(lda.plot.time) +export(lda.plot.topic) +export(lda.plot.wordcloud) +export(lda.topics.per.document) +export(prepare.plot.values) +export(prepare.time.var) +export(term.statistics) diff --git a/R/corpus.r b/R/corpus.r new file mode 100644 index 0000000..165ad47 --- /dev/null +++ b/R/corpus.r @@ -0,0 +1,102 @@ + +#' Cast data.frame to sparse matrix +#' +#' Create a sparse matrix from matching vectors of row indices, column indices and values +#' +#' @param rows a vector of row indices: [i,] +#' @param columns a vector of column indices: [,j] +#' @param values a vector of the values for each (non-zero) cell: [i,j] = value +#' @return a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package) +#' @export +cast.sparse.matrix <- function(rows, columns, values=NULL) { + if(is.null(values)) values = rep(1, length(rows)) + d = data.frame(rows=rows, columns=columns, values=values) + if(nrow(d) > nrow(unique(d[,c('rows','columns')]))){ + message('(Duplicate row-column matches occured. Values of duplicates are added up)') + d = aggregate(values ~ rows + columns, d, FUN='sum') + } + unit_index = unique(d$rows) + char_index = unique(d$columns) + sm = spMatrix(nrow=length(unit_index), ncol=length(char_index), + match(d$rows, unit_index), match(d$columns, char_index), d$values) + rownames(sm) = unit_index + colnames(sm) = char_index + sm +} + +#' Create a document term matrix from a list of tokens +#' +#' Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies. +#' +#' @param ids a vector of document ids +#' @param terms a vector of words of the same length as ids +#' @param freqs a vector of the frequency a a term in a document +#' @return a document-term matrix \code{\link{DocumentTermMatrix}} +#' @export +dtm.create <- function(ids, terms, freqs) { + # remove NA terms + d = data.frame(ids=ids, terms=terms, freqs=freqs) + if (sum(is.na(d$terms)) > 0) { + warning("Removing ", sum(is.na(d$terms)), "rows with missing term names") + d = d[!is.na(d$terms), ] + } + sparsemat = cast.sparse.matrix(rows=d$ids, columns=d$terms, values=d$freqs) + as.DocumentTermMatrix(sparsemat, weighting=weightTf) +} + +#' Compute some useful corpus statistics for a dtm +#' +#' Compute a number of useful statistics for filtering words: term frequency, idf, etc. +#' +#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}}) +#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns +#' @export +term.statistics <- function(dtm) { + dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0] # get rid of empty rows/columns + vocabulary = colnames(dtm) + data.frame(term = vocabulary, + characters = nchar(vocabulary), + number = grepl("[0-9]", vocabulary), + nonalpha = grepl("\\W", vocabulary), + termfreq = col_sums(dtm), + docfreq = col_sums(dtm > 0), + reldocfreq = col_sums(dtm > 0) / nDocs(dtm), + tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0))) +} + +#' Compute the chi^2 statistic for a 2x2 crosstab containing the values +#' [[a, b], [c, d]] +chi2 <- function(a,b,c,d) { + ooe <- function(o, e) {(o-e)*(o-e) / e} + tot = 0.0 + a+b+c+d + a = as.numeric(a) + b = as.numeric(b) + c = as.numeric(c) + d = as.numeric(d) + (ooe(a, (a+c)*(a+b)/tot) + + ooe(b, (b+d)*(a+b)/tot) + + ooe(c, (a+c)*(c+d)/tot) + + ooe(d, (d+b)*(c+d)/tot)) +} + +#' Compare two corpora +#' +#' Compare the term use in corpus dtm with a refernece corpus dtm.ref, returning relative frequencies +#' and overrepresentation using various measures +#' +#' @param dtm.x the main document-term matrix +#' @param dtm.y the 'reference' document-term matrix +#' @param smooth the smoothing parameter for computing overrepresentation +#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns +#' @export +corpora.compare <- function(dtm.x, dtm.y, smooth=.001) { + freqs = term.statistics(dtm.x)[, c("term", "termfreq")] + freqs.rel = term.statistics(dtm.y)[, c("term", "termfreq")] + f = merge(freqs, freqs.rel, all=T, by="term") + f[is.na(f)] = 0 + f$relfreq.x = f$termfreq.x / sum(freqs$termfreq) + f$relfreq.y = f$termfreq.y / sum(freqs.rel$termfreq) + f$over = (f$relfreq.x + smooth) / (f$relfreq.y + smooth) + f$chi = chi2(f$termfreq.x, f$termfreq.y, sum(f$termfreq.x) - f$termfreq.x, sum(f$termfreq.y) - f$termfreq.y) + f +} diff --git a/R/lda.r b/R/lda.r new file mode 100644 index 0000000..a629e5c --- /dev/null +++ b/R/lda.r @@ -0,0 +1,228 @@ +### PREPARE DATA + +#' Estimate a topic model using the lda package +#' +#' Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function +#' The parameters other than dtm are simply passed to the sampler but provide a workable default. +#' See the description of that function for more information +#' +#' @param dtm a document term matrix (e.g. the output of \code{\link{amcat.dtm.create}}) +#' @param K the number of clusters +#' @param num.iterations the number of iterations +#' @param alpha the alpha parameter +#' @param eta the eta parameter +#' @return A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}}) +#' @export +lda.fit <- function(dtm, K=50, num.iterations=100, alpha=50/K, eta=.01, burnin=100, compute.log.likelihood=F) { + dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0] + x = dtm2ldaformat(dtm) + m = lda.collapsed.gibbs.sampler(x$documents, vocab=x$vocab, K=K, num.iterations=num.iterations, + alpha=alpha, eta=eta, burnin=burnin, compute.log.likelihood=compute.log.likelihood) + m$dtm = dtm + m +} + +#' Add document meta to LDA output +#' +#' Add a dataframe containing document meta to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}. +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param lda.document.ids A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums) +#' @param meta A data.frame with document meta. Has to contain a vector to match the lda.document.ids +#' @param match.by The name of the vector in meta that matches the lda.document.ids +#' @return The LDA output appended with document meta +#' @export +lda.match.meta <- function(m, document.ids, meta, match.by = 'id'){ + if('meta' %in% names(m)) {m$meta = meta[match(document.ids, meta[,match.by]),] + } else m = c(m, list(meta=meta[match(document.ids, meta[,match.by]),])) + m +} + + +#' Get the topics per document, optionally merged with +#' +#' Return a data frame containing article metadata and topic occurence per document +#' +#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}}) +#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns +#' @export +lda.topics.per.document <- function(topics) { + ids = as.numeric(rownames(topics$dtm)) + cbind(id=ids, data.frame(t(topics$document_sums))) +} + +### PLOT LDA TOPICS + +#' Plot all topics +#' +#' Write plots for all topics with \code{\link{lda.plot.topic}} in designated folder +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums +#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums +#' @param path The path for a folder where output will be saved +#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year' +#' @return Nothing +#' @export +lda.plot.alltopics <- function(m, time_var, category_var, path, date_interval='day'){ + for(topic_nr in 1:nrow(m$document_sums)){ + print(paste('Plotting:',topic_nr)) + fn = paste(path, topic_nr, ".png", sep="") + if (!is.null(fn)) png(fn, width=1280,height=800) + lda.plot.topic(m, topic_nr, time_var, category_var, date_interval) + if (!is.null(fn)) dev.off() + } + par(mfrow=c(1,1)) +} + +#' Plots topic wordcloud, and attention over time and per category +#' +#' Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}} +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param The index of the topic (1 to K) +#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums +#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums +#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year' +#' @param pct Show topic values as percentages +#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics +#' @return Nothing, just plots +#' @export +lda.plot.topic <- function(m, topic_nr, time_var, category_var, date_interval='day', pct=F, value='total'){ + par(mar=c(4.5,3,2,1), cex.axis=1.7) + layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE), widths=c(2.5,1.5), heights=c(1,2)) + lda.plot.time(m, topic_nr, time_var, date_interval, pct=pct, value=value) + lda.plot.wordcloud(m, topic_nr) + lda.plot.category(m, topic_nr, category_var, pct=pct, value=value) + par(mfrow=c(1,1)) +} + +#' Change date object to date_interval +#' +#' Change date object to date_interval +#' +#' @param time_var A vector of Date values +#' @param date_interval The desired date_interval ('day','week','month', or 'year') +#' @return A vector of Date values +#' @export +prepare.time.var <- function(time_var, date_interval){ + if(class(time_var) == 'Date'){ + if(date_interval == 'day') time_var = as.Date(format(time_var, '%Y-%m-%d')) + if(date_interval == 'month') time_var = as.Date(paste(format(time_var, '%Y-%m'),'-01',sep='')) + if(date_interval == 'week') time_var = as.Date(paste(format(time_var, '%Y-%W'),1), '%Y-%W %u') + if(date_interval == 'year') time_var = as.Date(paste(format(time_var, '%Y'),'-01-01',sep='')) + } + time_var +} + +#' Add empty values for pretty plotting +#' +#' When plotting a timeline, gaps in date_intervals are ignored. For the attention for topics gaps should be considered as having value 0. +#' +#' @param d A data.frame with the columns 'time' (Date) and 'value' (numeric) +#' @param date_interval The date_interval is required to know what the gaps are +#' @return A data.frame with the columns 'time' (Date) and 'value' (numeric) +#' @export +fill.time.gaps <- function(d, date_interval){ + if(class(d$time) == 'numeric'){ + for(t in min(d$time):max(d$time)) + if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0)) + } + if(class(d$time) == 'Date'){ + date_sequence = seq.Date(from=min(d$time), to=max(d$time), by=date_interval) + for(i in 1:length(date_sequence)){ + t = date_sequence[i] + if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0)) + } + } + d[order(d$time),] +} + +#' Prepares the topic values per document for plotting +#' +#' Prepares the topic values per document for plotting +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param break_var A break vector to aggregate topic values per document +#' @param The index of the topic (1 to K) +#' @param pct Show topic values as percentages +#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics +#' @return The aggregated/transformed topic values +#' @export +prepare.plot.values <- function(m, break_var, topic_nr, pct=F, value='total', filter=NULL){ + hits = m$document_sums[topic_nr,] + d = aggregate(hits, by=list(break_var=break_var), FUN='sum') + if(value == 'relative'){ + total_hits = colSums(m$document_sums) + totals = aggregate(total_hits, by=list(break_var=break_var), FUN='sum') + d$x = d$x / totals$x + } + if(pct == T) d$x = d$x / sum(d$x) + d +} + +#' Plots topic values over time +#' +#' Plots the attention for a topic over time +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param The index of the topic (1 to K) +#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums +#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year' +#' @param pct Show topic values as percentages +#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics +#' @param return.values Logical. If true, data that is plotted is returned as a data.frame +#' @return data.frame for plotted values +#' @export +lda.plot.time <- function(m, topic_nr, time_var, date_interval='day', pct=F, value='total', return.values=F){ + par(mar=c(3,3,3,1)) + time_var = prepare.time.var(time_var, date_interval) + d = prepare.plot.values(m, break_var=time_var, topic_nr=topic_nr, pct=pct, value=value) + colnames(d) = c('time','value') + d = fill.time.gaps(d, date_interval) + plot(d$time, d$value, type='l', xlab='', main='', ylab='', xlim=c(min(d$time), max(d$time)), ylim=c(0, max(d$value)), bty='L', lwd=5, col='darkgrey') + if(return.values==T) d +} + +#' Plots topic values per category +#' +#' Plots the attention for a topic per category +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param The index of the topic (1 to K) +#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums +#' @param pct Show topic values as percentages +#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics +#' @param return.values Logical. If true, data that is plotted is returned as a data.frame +#' @return data.frame for plotted values +#' @export +lda.plot.category <- function(m, topic_nr, category_var, pct=F, value='total', return.values=F){ + par(mar=c(15,0,1,2)) + d = prepare.plot.values(m, break_var=category_var, topic_nr=topic_nr, pct=pct, value=value) + colnames(d) = c('category','value') + barplot(as.matrix(t(d[,c('value')])), main='', beside=TRUE,horiz=FALSE, + density=NA, + col='darkgrey', + xlab='', + ylab="", + axes=T, names.arg=d$category, cex.names=0.8, cex.axis=0.8, adj=1, las=2) + if(return.values==T) d +} + +#' Plot wordcloud for LDA topic +#' +#' Plots a wordcloud of the top words per topic +#' +#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}} +#' @param topic_nr The index of the topic (1 to K) +#' @return Nothing, just plots +#' @export +lda.plot.wordcloud <- function(m, topic_nr){ + x = m$topics[topic_nr,] + x = sort(x[x>5], decreasing=T)[1:100] + x = x[!is.na(x)] + names = sub("/.*", "", names(x)) + freqs = x**.5 + pal <- brewer.pal(6,"YlGnBu") + wordcloud(names, freqs, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal) +} diff --git a/corpus-tools.Rproj b/corpus-tools.Rproj new file mode 100644 index 0000000..dfe7b37 --- /dev/null +++ b/corpus-tools.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +BuildType: Package +PackageInstallArgs: --no-multiarch --with-keep.source +PackageRoxygenize: rd,namespace diff --git a/demo/demo_content_similarity_network.r b/demo/demo_content_similarity_network.r new file mode 100644 index 0000000..0bdcaf9 --- /dev/null +++ b/demo/demo_content_similarity_network.r @@ -0,0 +1,169 @@ +library(corpustools) +library(networktools) + +load('demo/wos_comsci_lda.rdata') + +names(m) +document.topic.matrix = t(m$document_sums) +document.topic.matrix = Matrix(document.topic.matrix, sparse=T) + +dim(document.topic.matrix) +head(m$meta) + +g = content.similarity.graph(document.topic.matrix, + vertex.grouping.vars=list(journal=m$meta$journal, + journal.top10=m$meta$journal.top10), + similarity.measure='correlation', + min.similarity=0) + +g = graph.color.vertices(g, V(g)$journal.top10) +V(g)$color[V(g)$journal.top10 == 'Other'] = 'white' +V(g)$size = sqrt(V(g)$n)*2 +V(g)$label = '' +E(g)$width = E(g)$width / 5 + +graph.plot(g, min.edge=0.5) + +##### over time + +g = content.similarity.graph(document.topic.matrix, + vertex.grouping.vars=list(journal=m$meta$journal, + journal.top10=m$meta$journal.top10, + year=format(m$meta$date,'%Y')), + similarity.measure='correlation', + min.similarity=0) + +g = graph.color.vertices(g, V(g)$journal.top10) +V(g)$color[V(g)$journal.top10 == 'Other'] = 'white' +V(g)$size = sqrt(V(g)$n)*6 +V(g)$label = '' +E(g)$width = E(g)$width / 2 + +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2000) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2001) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2002) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2003) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2004) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2005) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2006) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2007) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2008) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2009) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2010) + + +##### with topics coloured + +lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative') + +g = content.similarity.graph(document.topic.matrix, + vertex.grouping.vars=list(journal=m$meta$journal, + journal.top10=m$meta$journal.top10, + year=format(m$meta$date,'%Y')), + similarity.measure='correlation', + min.similarity=0, + content.totals.as.vertexmeta=1) + +g = graph.color.vertices(g, V(g)$C1) +V(g)$size = sqrt(V(g)$n)*6 +V(g)$label = '' +E(g)$width = E(g)$width / 2 + +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2000) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2001) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2002) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2003) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2004) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2005) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2006) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2007) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2008) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2009) +graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2010) + + + +## +## + + + + + + + + + + + + +load('demo/parlproceedings_lda.rdata') + +document.topic.matrix = t(m$document_sums) +document.topic.matrix = Matrix(document.topic.matrix, sparse=T) + +dim(document.topic.matrix) +head(m$meta) + +g = content.similarity.graph(document.topic.matrix, + vertex.grouping.vars=list(party=m$meta$party, + year=format(m$meta$date, '%Y')), + similarity.measure='correlation') + +g = graph.color.vertices(g, V(g)$party) # color vertices by party +V(g)$label = as.character(V(g)$year) # use year as vertex label + +graph.plot(g, min.edge=0.1) + +#### + +dtm = weightTfIdf(dtm) + +g = content.similarity.graph(m$dtm, + vertex.grouping.vars=list(party=m$meta$party, + year=format(m$meta$date, '%Y')), + similarity.measure='correlation', + min.similarity=0.5) + +min(E(g)$weight) + + +g = graph.color.vertices(g, V(g)$party) # color vertices by party +V(g)$label = as.character(V(g)$year) # use year as vertex label + +graph.plot(g, min.edge=0.6) + + + + +g = content.similarity.graph(m$dtm, + vertex.grouping.vars=list(id=m$meta$id), + similarity.measure='correlation') + + + + + + + +##### + +lda.plot.topic(m, 1, m$meta$date, m$meta$party, date_interval='year') + + +g = content.similarity.graph(document.topic.matrix, + vertex.grouping.vars=list(party=m$meta$party, + year=format(m$meta$date, '%Y')), + similarity.measure='correlation', + content.totals.as.vertexmeta=1:10) + +data.frame(vertex.attributes(g)) +g = graph.color.vertices(g, V(g)$party) # color vertices by party +V(g)$label = as.character(V(g)$year) # use year as vertex label + +graph.plot(g, min.edge=0.1) + + + +##### + diff --git a/demo/demo_dtm.r b/demo/demo_dtm.r new file mode 100644 index 0000000..ec4af0e --- /dev/null +++ b/demo/demo_dtm.r @@ -0,0 +1,9 @@ +library(corpustools) + +tokens = load('demo/rwanda_tokens.rdata') +head(td) +head(tokens) + +dtm = dtm.create(tokens$aid, tokens$lemma, tokens$freq) +dtm + diff --git a/demo/demo_lda.r b/demo/demo_lda.r new file mode 100644 index 0000000..c6584c7 --- /dev/null +++ b/demo/demo_lda.r @@ -0,0 +1,33 @@ +library(corpustools) + +load('demo/wos_comsci_dtm.rdata') +dtm + +termstats = term.statistics(dtm) +head(termstats) +head(termstats[order(termstats$tfidf, decreasing=T),]) + +termstats = termstats[termstats$docfreq > 1 & termstats$nonalpha==F & termstats$number==F,] +voca = as.character(termstats[order(termstats$tfidf, decreasing=T),][1:3000,'term']) +filtered_dtm = dtm[,voca] + +m = lda.fit(filtered_dtm, K=25, num.iterations=1000) +top.topic.words(m$topics) +names(m) + +## add meta +load('demo/wos_comsci_meta.rdata') +head(meta) + +m = lda.match.meta(m, rownames(m$dtm), meta, match.by='id') +names(m) + +load('demo/wos_comsci_lda.rdata') + +## visualize topics +lda.plot.wordcloud(m, 1) +lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, 'year') + +lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative') +lda.plot.topic(m, 2, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative') +lda.plot.topic(m, 17, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative') diff --git a/demo/demo_social_network.r b/demo/demo_social_network.r new file mode 100644 index 0000000..a1a1293 --- /dev/null +++ b/demo/demo_social_network.r @@ -0,0 +1,33 @@ +d = data.frame(conversation=c(1,1,1,1,1,1,2,2,2,2), + author=c('Alice','Bob','Alice','Charlie','Bob','Bob','Alice','Bob','Alice','Bob'), + order.nr=c(1,2,3,4,5,6,1,2,3,4)) + +d +g = author.coincidence.graph(d$conversation, d$author) # In how many conversations did author.X and author.Y communicate? +plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) +g = author.coincidence.graph(d$conversation, d$author, 'overlap_jacard') # Similar to default (coincidence_count) but with direction (by dividing coincidence by number of conversations author participated in) +plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) +g = author.coincidence.graph(d$conversation, d$author, 'cosine') # Cosine can be used to also take into account how many times each author participated within conversations +plot(g, edge.label=round(E(g)$weight,2), vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) + +d +g = previous.authors.graph(d$conversation, d$author, d$order.nr, lookback=1) # how many times did author.X communicate directly after author.Y? +plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) +g = previous.authors.graph(d$conversation, d$author, d$order.nr, lookback=2) # how many times did author.X communicate within two messages after author.Y? +plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) + +### + +load('demo/parlproceedings_lda.rdata') + +head(m$meta) +m$meta$author = paste(m$meta$name, m$meta$party) + +g = previous.authors.graph(m$meta$meeting, m$meta$author, m$meta$order.nr, lookback=5) # how many times did author.X communicate directly after author.Y? + +E(g)$width = log(E(g)$weight) +E(g)$arrow.size = E(g)$width / 50 +V(g)$label = '' +V(g)$size = log(V(g)$n.messages)/2 + +graph.plot(g, min.edge=1) diff --git a/demo/parlproceedings_lda.rdata b/demo/parlproceedings_lda.rdata new file mode 100644 index 0000000..e81bf0d Binary files /dev/null and b/demo/parlproceedings_lda.rdata differ diff --git a/demo/wos_comsci_dtm.rdata b/demo/wos_comsci_dtm.rdata new file mode 100644 index 0000000..bad2419 Binary files /dev/null and b/demo/wos_comsci_dtm.rdata differ diff --git a/demo/wos_comsci_lda.rdata b/demo/wos_comsci_lda.rdata new file mode 100644 index 0000000..df8153b Binary files /dev/null and b/demo/wos_comsci_lda.rdata differ diff --git a/demo/wos_comsci_meta.rdata b/demo/wos_comsci_meta.rdata new file mode 100644 index 0000000..4e51a6a Binary files /dev/null and b/demo/wos_comsci_meta.rdata differ diff --git a/man/cast.sparse.matrix.Rd b/man/cast.sparse.matrix.Rd new file mode 100644 index 0000000..763e39a --- /dev/null +++ b/man/cast.sparse.matrix.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{cast.sparse.matrix} +\alias{cast.sparse.matrix} +\title{Cast data.frame to sparse matrix} +\usage{ +cast.sparse.matrix(rows, columns, values = NULL) +} +\arguments{ +\item{rows}{a vector of row indices: [i,]} + +\item{columns}{a vector of column indices: [,j]} + +\item{values}{a vector of the values for each (non-zero) cell: [i,j] = value} +} +\value{ +a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package) +} +\description{ +Create a sparse matrix from matching vectors of row indices, column indices and values +} + diff --git a/man/chi2.Rd b/man/chi2.Rd new file mode 100644 index 0000000..ecf8c7a --- /dev/null +++ b/man/chi2.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{chi2} +\alias{chi2} +\title{Compute the chi^2 statistic for a 2x2 crosstab containing the values +[[a, b], [c, d]]} +\usage{ +chi2(a, b, c, d) +} +\description{ +Compute the chi^2 statistic for a 2x2 crosstab containing the values +[[a, b], [c, d]] +} + diff --git a/man/corpora.compare.Rd b/man/corpora.compare.Rd new file mode 100644 index 0000000..f9a2581 --- /dev/null +++ b/man/corpora.compare.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{corpora.compare} +\alias{corpora.compare} +\title{Compare two corpora} +\usage{ +corpora.compare(dtm.x, dtm.y, smooth = 0.001) +} +\arguments{ +\item{dtm.x}{the main document-term matrix} + +\item{dtm.y}{the 'reference' document-term matrix} + +\item{smooth}{the smoothing parameter for computing overrepresentation} +} +\value{ +A data frame with rows corresponding to the terms in dtm and the statistics in the columns +} +\description{ +Compare the term use in corpus dtm with a refernece corpus dtm.ref, returning relative frequencies +and overrepresentation using various measures +} + diff --git a/man/dtm.create.Rd b/man/dtm.create.Rd new file mode 100644 index 0000000..1b22aa7 --- /dev/null +++ b/man/dtm.create.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{dtm.create} +\alias{dtm.create} +\title{Create a document term matrix from a list of tokens} +\usage{ +dtm.create(ids, terms, freqs) +} +\arguments{ +\item{ids}{a vector of document ids} + +\item{terms}{a vector of words of the same length as ids} + +\item{freqs}{a vector of the frequency a a term in a document} +} +\value{ +a document-term matrix \code{\link{DocumentTermMatrix}} +} +\description{ +Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies. +} + diff --git a/man/fill.time.gaps.Rd b/man/fill.time.gaps.Rd new file mode 100644 index 0000000..355df3d --- /dev/null +++ b/man/fill.time.gaps.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{fill.time.gaps} +\alias{fill.time.gaps} +\title{Add empty values for pretty plotting} +\usage{ +fill.time.gaps(d, date_interval) +} +\arguments{ +\item{d}{A data.frame with the columns 'time' (Date) and 'value' (numeric)} + +\item{date_interval}{The date_interval is required to know what the gaps are} +} +\value{ +A data.frame with the columns 'time' (Date) and 'value' (numeric) +} +\description{ +When plotting a timeline, gaps in date_intervals are ignored. For the attention for topics gaps should be considered as having value 0. +} + diff --git a/man/lda.fit.Rd b/man/lda.fit.Rd new file mode 100644 index 0000000..1bb3047 --- /dev/null +++ b/man/lda.fit.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.fit} +\alias{lda.fit} +\title{Estimate a topic model using the lda package} +\usage{ +lda.fit(dtm, K = 50, num.iterations = 100, alpha = 50/K, eta = 0.01, + burnin = 100, compute.log.likelihood = F) +} +\arguments{ +\item{dtm}{a document term matrix (e.g. the output of \code{\link{amcat.dtm.create}})} + +\item{K}{the number of clusters} + +\item{num.iterations}{the number of iterations} + +\item{alpha}{the alpha parameter} + +\item{eta}{the eta parameter} +} +\value{ +A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}}) +} +\description{ +Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function +The parameters other than dtm are simply passed to the sampler but provide a workable default. +See the description of that function for more information +} + diff --git a/man/lda.match.meta.Rd b/man/lda.match.meta.Rd new file mode 100644 index 0000000..ee7ad99 --- /dev/null +++ b/man/lda.match.meta.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.match.meta} +\alias{lda.match.meta} +\title{Add document meta to LDA output} +\usage{ +lda.match.meta(m, document.ids, meta, match.by = "id") +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{lda.document.ids}{A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums)} + +\item{meta}{A data.frame with document meta. Has to contain a vector to match the lda.document.ids} + +\item{match.by}{The name of the vector in meta that matches the lda.document.ids} +} +\value{ +The LDA output appended with document meta +} +\description{ +Add a dataframe containing document meta to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}. +} + diff --git a/man/lda.plot.alltopics.Rd b/man/lda.plot.alltopics.Rd new file mode 100644 index 0000000..ec00ac8 --- /dev/null +++ b/man/lda.plot.alltopics.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.plot.alltopics} +\alias{lda.plot.alltopics} +\title{Plot all topics} +\usage{ +lda.plot.alltopics(m, time_var, category_var, path, date_interval = "day") +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums} + +\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums} + +\item{path}{The path for a folder where output will be saved} + +\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'} +} +\value{ +Nothing +} +\description{ +Write plots for all topics with \code{\link{lda.plot.topic}} in designated folder +} + diff --git a/man/lda.plot.category.Rd b/man/lda.plot.category.Rd new file mode 100644 index 0000000..c9a0cf9 --- /dev/null +++ b/man/lda.plot.category.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.plot.category} +\alias{lda.plot.category} +\title{Plots topic values per category} +\usage{ +lda.plot.category(m, topic_nr, category_var, pct = F, value = "total", + return.values = F) +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{The}{index of the topic (1 to K)} + +\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums} + +\item{pct}{Show topic values as percentages} + +\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics} + +\item{return.values}{Logical. If true, data that is plotted is returned as a data.frame} +} +\value{ +data.frame for plotted values +} +\description{ +Plots the attention for a topic per category +} + diff --git a/man/lda.plot.time.Rd b/man/lda.plot.time.Rd new file mode 100644 index 0000000..f8931d9 --- /dev/null +++ b/man/lda.plot.time.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.plot.time} +\alias{lda.plot.time} +\title{Plots topic values over time} +\usage{ +lda.plot.time(m, topic_nr, time_var, date_interval = "day", pct = F, + value = "total", return.values = F) +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{The}{index of the topic (1 to K)} + +\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums} + +\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'} + +\item{pct}{Show topic values as percentages} + +\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics} + +\item{return.values}{Logical. If true, data that is plotted is returned as a data.frame} +} +\value{ +data.frame for plotted values +} +\description{ +Plots the attention for a topic over time +} + diff --git a/man/lda.plot.topic.Rd b/man/lda.plot.topic.Rd new file mode 100644 index 0000000..8c0d49f --- /dev/null +++ b/man/lda.plot.topic.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.plot.topic} +\alias{lda.plot.topic} +\title{Plots topic wordcloud, and attention over time and per category} +\usage{ +lda.plot.topic(m, topic_nr, time_var, category_var, date_interval = "day", + pct = F, value = "total") +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{The}{index of the topic (1 to K)} + +\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums} + +\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums} + +\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'} + +\item{pct}{Show topic values as percentages} + +\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics} +} +\value{ +Nothing, just plots +} +\description{ +Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}} +} + diff --git a/man/lda.plot.wordcloud.Rd b/man/lda.plot.wordcloud.Rd new file mode 100644 index 0000000..185917d --- /dev/null +++ b/man/lda.plot.wordcloud.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.plot.wordcloud} +\alias{lda.plot.wordcloud} +\title{Plot wordcloud for LDA topic} +\usage{ +lda.plot.wordcloud(m, topic_nr) +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{topic_nr}{The index of the topic (1 to K)} +} +\value{ +Nothing, just plots +} +\description{ +Plots a wordcloud of the top words per topic +} + diff --git a/man/lda.topics.per.document.Rd b/man/lda.topics.per.document.Rd new file mode 100644 index 0000000..2687cf9 --- /dev/null +++ b/man/lda.topics.per.document.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{lda.topics.per.document} +\alias{lda.topics.per.document} +\title{Get the topics per document, optionally merged with} +\usage{ +lda.topics.per.document(topics) +} +\arguments{ +\item{dtm}{a document term matrix (e.g. the output of \code{\link{dtm.create}})} +} +\value{ +A data frame with rows corresponding to the terms in dtm and the statistics in the columns +} +\description{ +Return a data frame containing article metadata and topic occurence per document +} + diff --git a/man/prepare.plot.values.Rd b/man/prepare.plot.values.Rd new file mode 100644 index 0000000..18faabd --- /dev/null +++ b/man/prepare.plot.values.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{prepare.plot.values} +\alias{prepare.plot.values} +\title{Prepares the topic values per document for plotting} +\usage{ +prepare.plot.values(m, break_var, topic_nr, pct = F, value = "total", + filter = NULL) +} +\arguments{ +\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}} + +\item{break_var}{A break vector to aggregate topic values per document} + +\item{The}{index of the topic (1 to K)} + +\item{pct}{Show topic values as percentages} + +\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics} +} +\value{ +The aggregated/transformed topic values +} +\description{ +Prepares the topic values per document for plotting +} + diff --git a/man/prepare.time.var.Rd b/man/prepare.time.var.Rd new file mode 100644 index 0000000..164633c --- /dev/null +++ b/man/prepare.time.var.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{prepare.time.var} +\alias{prepare.time.var} +\title{Change date object to date_interval} +\usage{ +prepare.time.var(time_var, date_interval) +} +\arguments{ +\item{time_var}{A vector of Date values} + +\item{date_interval}{The desired date_interval ('day','week','month', or 'year')} +} +\value{ +A vector of Date values +} +\description{ +Change date object to date_interval +} + diff --git a/man/term.statistics.Rd b/man/term.statistics.Rd new file mode 100644 index 0000000..0065da4 --- /dev/null +++ b/man/term.statistics.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2 (4.0.0): do not edit by hand +\name{term.statistics} +\alias{term.statistics} +\title{Compute some useful corpus statistics for a dtm} +\usage{ +term.statistics(dtm) +} +\arguments{ +\item{dtm}{a document term matrix (e.g. the output of \code{\link{dtm.create}})} +} +\value{ +A data frame with rows corresponding to the terms in dtm and the statistics in the columns +} +\description{ +Compute a number of useful statistics for filtering words: term frequency, idf, etc. +} +