diff --git a/.Rbuildignore b/.Rbuildignore
new file mode 100644
index 0000000..91114bf
--- /dev/null
+++ b/.Rbuildignore
@@ -0,0 +1,2 @@
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..807ea25
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+.Rproj.user
+.Rhistory
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
new file mode 100644
index 0000000..818e33a
--- /dev/null
+++ b/DESCRIPTION
@@ -0,0 +1,20 @@
+Package: corpustools
+Version: 0.100
+Date: 2014-05-20
+Title: tools for corpus analysis
+Author: Kasper Welbers & Wouter van Atteveldt
+Maintainer: Kasper Welbers <kasperwelbers@gmail.com>
+Depends:
+    slam,
+    Matrix,
+    lda,
+    tm,
+    topicmodels,
+    RColorBrewer,
+    wordcloud
+Description: Several convenient functions for bag-of-word type text analysis
+    methods
+License: MIT
+URL: http://github.com/kasperwelbers/corpus-tools
+Roxygen: list(wrap = FALSE)
+
diff --git a/NAMESPACE b/NAMESPACE
new file mode 100644
index 0000000..6eb6ca8
--- /dev/null
+++ b/NAMESPACE
@@ -0,0 +1,17 @@
+# Generated by roxygen2 (4.0.0): do not edit by hand
+
+export(cast.sparse.matrix)
+export(corpora.compare)
+export(dtm.create)
+export(fill.time.gaps)
+export(lda.fit)
+export(lda.match.meta)
+export(lda.plot.alltopics)
+export(lda.plot.category)
+export(lda.plot.time)
+export(lda.plot.topic)
+export(lda.plot.wordcloud)
+export(lda.topics.per.document)
+export(prepare.plot.values)
+export(prepare.time.var)
+export(term.statistics)
diff --git a/R/corpus.r b/R/corpus.r
new file mode 100644
index 0000000..165ad47
--- /dev/null
+++ b/R/corpus.r
@@ -0,0 +1,102 @@
+
+#' Cast data.frame to sparse matrix
+#' 
+#' Create a sparse matrix from matching vectors of row indices, column indices and values
+#' 
+#' @param rows a vector of row indices: [i,]
+#' @param columns a vector of column indices: [,j]
+#' @param values a vector of the values for each (non-zero) cell: [i,j] = value
+#' @return a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package) 
+#' @export
+cast.sparse.matrix <- function(rows, columns, values=NULL) {
+  if(is.null(values)) values = rep(1, length(rows))
+  d = data.frame(rows=rows, columns=columns, values=values)
+  if(nrow(d) > nrow(unique(d[,c('rows','columns')]))){
+    message('(Duplicate row-column matches occured. Values of duplicates are added up)')
+    d = aggregate(values ~ rows + columns, d, FUN='sum')
+  }
+  unit_index = unique(d$rows)
+  char_index = unique(d$columns)
+  sm = spMatrix(nrow=length(unit_index), ncol=length(char_index),
+                match(d$rows, unit_index), match(d$columns, char_index), d$values)
+  rownames(sm) = unit_index
+  colnames(sm) = char_index
+  sm
+}
+
+#' Create a document term matrix from a list of tokens
+#' 
+#' Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies. 
+#' 
+#' @param ids a vector of document ids
+#' @param terms a vector of words of the same length as ids
+#' @param freqs a vector of the frequency a a term in a document
+#' @return a document-term matrix  \code{\link{DocumentTermMatrix}}
+#' @export
+dtm.create <- function(ids, terms, freqs) {
+  # remove NA terms
+  d = data.frame(ids=ids, terms=terms, freqs=freqs)
+  if (sum(is.na(d$terms)) > 0) {
+    warning("Removing ", sum(is.na(d$terms)), "rows with missing term names")
+    d = d[!is.na(d$terms), ]
+  }
+  sparsemat = cast.sparse.matrix(rows=d$ids, columns=d$terms, values=d$freqs)
+  as.DocumentTermMatrix(sparsemat, weighting=weightTf)
+}
+
+#' Compute some useful corpus statistics for a dtm
+#' 
+#' Compute a number of useful statistics for filtering words: term frequency, idf, etc.
+#' 
+#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
+#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+#' @export
+term.statistics <- function(dtm) {
+  dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0]    # get rid of empty rows/columns
+  vocabulary = colnames(dtm)
+  data.frame(term = vocabulary,
+             characters = nchar(vocabulary),
+             number = grepl("[0-9]", vocabulary),
+             nonalpha = grepl("\\W", vocabulary),
+             termfreq = col_sums(dtm),
+             docfreq = col_sums(dtm > 0),
+             reldocfreq = col_sums(dtm > 0) / nDocs(dtm),
+             tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) * log2(nDocs(dtm)/col_sums(dtm > 0)))
+}
+
+#' Compute the chi^2 statistic for a 2x2 crosstab containing the values
+#' [[a, b], [c, d]]
+chi2 <- function(a,b,c,d) {
+  ooe <- function(o, e) {(o-e)*(o-e) / e}
+  tot = 0.0 + a+b+c+d
+  a = as.numeric(a)
+  b = as.numeric(b)
+  c = as.numeric(c)
+  d = as.numeric(d)
+  (ooe(a, (a+c)*(a+b)/tot)
+   +  ooe(b, (b+d)*(a+b)/tot)
+   +  ooe(c, (a+c)*(c+d)/tot)
+   +  ooe(d, (d+b)*(c+d)/tot))
+}
+
+#' Compare two corpora
+#' 
+#' Compare the term use in corpus dtm with a refernece corpus dtm.ref, returning relative frequencies
+#' and overrepresentation using various measures
+#' 
+#' @param dtm.x the main document-term matrix
+#' @param dtm.y the 'reference' document-term matrix
+#' @param smooth the smoothing parameter for computing overrepresentation
+#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+#' @export
+corpora.compare <- function(dtm.x, dtm.y, smooth=.001) {
+  freqs = term.statistics(dtm.x)[, c("term", "termfreq")]
+  freqs.rel = term.statistics(dtm.y)[, c("term", "termfreq")]
+  f = merge(freqs, freqs.rel, all=T, by="term")    
+  f[is.na(f)] = 0
+  f$relfreq.x = f$termfreq.x / sum(freqs$termfreq)
+  f$relfreq.y = f$termfreq.y / sum(freqs.rel$termfreq)
+  f$over = (f$relfreq.x + smooth) / (f$relfreq.y + smooth)
+  f$chi = chi2(f$termfreq.x, f$termfreq.y, sum(f$termfreq.x) - f$termfreq.x, sum(f$termfreq.y) - f$termfreq.y)
+  f
+}
diff --git a/R/lda.r b/R/lda.r
new file mode 100644
index 0000000..a629e5c
--- /dev/null
+++ b/R/lda.r
@@ -0,0 +1,228 @@
+### PREPARE DATA
+
+#' Estimate a topic model using the lda package
+#' 
+#' Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function
+#' The parameters other than dtm are simply passed to the sampler but provide a workable default.
+#' See the description of that function for more information
+#' 
+#' @param dtm a document term matrix (e.g. the output of \code{\link{amcat.dtm.create}})
+#' @param K the number of clusters
+#' @param num.iterations the number of iterations
+#' @param alpha the alpha parameter
+#' @param eta the eta parameter
+#' @return A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}})
+#' @export
+lda.fit <- function(dtm, K=50, num.iterations=100, alpha=50/K, eta=.01, burnin=100, compute.log.likelihood=F) {
+  dtm = dtm[row_sums(dtm) > 0,col_sums(dtm) > 0]
+  x = dtm2ldaformat(dtm)
+  m = lda.collapsed.gibbs.sampler(x$documents, vocab=x$vocab, K=K, num.iterations=num.iterations, 
+                                  alpha=alpha, eta=eta, burnin=burnin, compute.log.likelihood=compute.log.likelihood)
+  m$dtm = dtm
+  m
+}
+
+#' Add document meta to LDA output
+#' 
+#' Add a dataframe containing document meta to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}. 
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}   
+#' @param lda.document.ids A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums)
+#' @param meta A data.frame with document meta. Has to contain a vector to match the lda.document.ids
+#' @param match.by The name of the vector in meta that matches the lda.document.ids 
+#' @return The LDA output appended with document meta
+#' @export
+lda.match.meta <- function(m, document.ids, meta, match.by = 'id'){
+  if('meta' %in% names(m)) {m$meta = meta[match(document.ids, meta[,match.by]),]
+  } else m = c(m, list(meta=meta[match(document.ids, meta[,match.by]),]))
+  m
+}
+
+
+#' Get the topics per document, optionally merged with 
+#' 
+#' Return a data frame containing article metadata and topic occurence per document
+#' 
+#' @param dtm a document term matrix (e.g. the output of \code{\link{dtm.create}})
+#' @return A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+#' @export
+lda.topics.per.document <- function(topics) {
+  ids = as.numeric(rownames(topics$dtm))
+  cbind(id=ids, data.frame(t(topics$document_sums)))
+}
+
+### PLOT LDA TOPICS
+
+#' Plot all topics
+#' 
+#' Write plots for all topics with \code{\link{lda.plot.topic}} in designated folder
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}   
+#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums
+#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums
+#' @param path The path for a folder where output will be saved
+#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
+#' @return Nothing
+#' @export
+lda.plot.alltopics <- function(m, time_var, category_var, path, date_interval='day'){
+  for(topic_nr in 1:nrow(m$document_sums)){
+    print(paste('Plotting:',topic_nr))
+    fn = paste(path, topic_nr, ".png", sep="")
+    if (!is.null(fn)) png(fn, width=1280,height=800)
+    lda.plot.topic(m, topic_nr, time_var, category_var, date_interval)
+    if (!is.null(fn)) dev.off()
+  }
+  par(mfrow=c(1,1))
+}
+
+#' Plots topic wordcloud, and attention over time and per category
+#' 
+#' Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}}
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
+#' @param The index of the topic (1 to K)
+#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums
+#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums
+#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
+#' @param pct Show topic values as percentages
+#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
+#' @return Nothing, just plots
+#' @export
+lda.plot.topic <- function(m, topic_nr, time_var, category_var, date_interval='day', pct=F, value='total'){
+  par(mar=c(4.5,3,2,1), cex.axis=1.7)
+  layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE), widths=c(2.5,1.5), heights=c(1,2))
+  lda.plot.time(m, topic_nr, time_var, date_interval, pct=pct, value=value)
+  lda.plot.wordcloud(m, topic_nr)
+  lda.plot.category(m, topic_nr, category_var, pct=pct, value=value)
+  par(mfrow=c(1,1))
+}
+
+#' Change date object to date_interval
+#' 
+#' Change date object to date_interval
+#' 
+#' @param time_var A vector of Date values
+#' @param date_interval The desired date_interval ('day','week','month', or 'year')
+#' @return A vector of Date values
+#' @export
+prepare.time.var <- function(time_var, date_interval){
+  if(class(time_var) == 'Date'){
+    if(date_interval == 'day') time_var = as.Date(format(time_var, '%Y-%m-%d'))
+    if(date_interval == 'month') time_var = as.Date(paste(format(time_var, '%Y-%m'),'-01',sep=''))
+    if(date_interval == 'week') time_var = as.Date(paste(format(time_var, '%Y-%W'),1), '%Y-%W %u')
+    if(date_interval == 'year') time_var = as.Date(paste(format(time_var, '%Y'),'-01-01',sep=''))
+  } 
+  time_var
+}
+
+#' Add empty values for pretty plotting
+#' 
+#' When plotting a timeline, gaps in date_intervals are ignored. For the attention for topics gaps should be considered as having value 0.   
+#' 
+#' @param d A data.frame with the columns 'time' (Date) and 'value' (numeric)  
+#' @param date_interval The date_interval is required to know what the gaps are
+#' @return A data.frame with the columns 'time' (Date) and 'value' (numeric)  
+#' @export
+fill.time.gaps <- function(d, date_interval){
+  if(class(d$time) == 'numeric'){
+    for(t in min(d$time):max(d$time)) 
+      if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
+  }
+  if(class(d$time) == 'Date'){
+    date_sequence = seq.Date(from=min(d$time), to=max(d$time), by=date_interval)
+    for(i in 1:length(date_sequence)){
+      t = date_sequence[i]
+      if(!t %in% d$time) d = rbind(d, data.frame(time=t, value=0))
+    }
+  }
+  d[order(d$time),]
+}
+
+#' Prepares the topic values per document for plotting 
+#' 
+#' Prepares the topic values per document for plotting
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
+#' @param break_var A break vector to aggregate topic values per document
+#' @param The index of the topic (1 to K)
+#' @param pct Show topic values as percentages
+#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
+#' @return The aggregated/transformed topic values
+#' @export
+prepare.plot.values <- function(m, break_var, topic_nr, pct=F, value='total', filter=NULL){
+  hits = m$document_sums[topic_nr,]
+  d = aggregate(hits, by=list(break_var=break_var), FUN='sum') 
+  if(value == 'relative'){
+    total_hits = colSums(m$document_sums)  
+    totals = aggregate(total_hits, by=list(break_var=break_var), FUN='sum')
+    d$x = d$x / totals$x
+  }
+  if(pct == T) d$x = d$x / sum(d$x)
+  d
+}
+
+#' Plots topic values over time
+#' 
+#' Plots the attention for a topic over time
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
+#' @param The index of the topic (1 to K)
+#' @param time_var A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums
+#' @param date_interval The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'
+#' @param pct Show topic values as percentages
+#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
+#' @param return.values Logical. If true, data that is plotted is returned as a data.frame
+#' @return data.frame for plotted values
+#' @export
+lda.plot.time <- function(m, topic_nr, time_var, date_interval='day', pct=F, value='total', return.values=F){
+  par(mar=c(3,3,3,1))
+  time_var = prepare.time.var(time_var, date_interval)  
+  d = prepare.plot.values(m, break_var=time_var, topic_nr=topic_nr, pct=pct, value=value)
+  colnames(d) = c('time','value')
+  d = fill.time.gaps(d, date_interval)
+  plot(d$time, d$value, type='l', xlab='', main='', ylab='', xlim=c(min(d$time), max(d$time)), ylim=c(0, max(d$value)), bty='L', lwd=5, col='darkgrey')
+  if(return.values==T) d
+}
+
+#' Plots topic values per category
+#' 
+#' Plots the attention for a topic per category
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
+#' @param The index of the topic (1 to K)
+#' @param category_var A vector with id values of the same length and order of the documents (rows) in m$document_sums
+#' @param pct Show topic values as percentages
+#' @param value Show topic values as 'total', or as 'relative' to the attention for other topics
+#' @param return.values Logical. If true, data that is plotted is returned as a data.frame
+#' @return data.frame for plotted values
+#' @export
+lda.plot.category <- function(m, topic_nr, category_var, pct=F, value='total', return.values=F){
+  par(mar=c(15,0,1,2))
+  d = prepare.plot.values(m, break_var=category_var, topic_nr=topic_nr, pct=pct, value=value)
+  colnames(d) = c('category','value')
+  barplot(as.matrix(t(d[,c('value')])), main='', beside=TRUE,horiz=FALSE,
+          density=NA,
+          col='darkgrey',
+          xlab='',
+          ylab="",
+          axes=T, names.arg=d$category, cex.names=0.8, cex.axis=0.8, adj=1, las=2)
+  if(return.values==T) d
+}
+
+#' Plot wordcloud for LDA topic
+#' 
+#' Plots a wordcloud of the top words per topic
+#' 
+#' @param m The output of \code{\link{lda.collapsed.gibbs.sampler}}
+#' @param topic_nr The index of the topic (1 to K)
+#' @return Nothing, just plots
+#' @export
+lda.plot.wordcloud <- function(m, topic_nr){
+  x = m$topics[topic_nr,]
+  x = sort(x[x>5], decreasing=T)[1:100]
+  x = x[!is.na(x)]
+  names = sub("/.*", "", names(x))
+  freqs = x**.5
+  pal <- brewer.pal(6,"YlGnBu")
+  wordcloud(names, freqs, scale=c(6,.5), min.freq=1, max.words=Inf, random.order=FALSE, rot.per=.15, colors=pal)
+}
diff --git a/corpus-tools.Rproj b/corpus-tools.Rproj
new file mode 100644
index 0000000..dfe7b37
--- /dev/null
+++ b/corpus-tools.Rproj
@@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: knitr
+LaTeX: pdfLaTeX
+
+BuildType: Package
+PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,namespace
diff --git a/demo/demo_content_similarity_network.r b/demo/demo_content_similarity_network.r
new file mode 100644
index 0000000..0bdcaf9
--- /dev/null
+++ b/demo/demo_content_similarity_network.r
@@ -0,0 +1,169 @@
+library(corpustools)
+library(networktools)
+
+load('demo/wos_comsci_lda.rdata')
+
+names(m)
+document.topic.matrix = t(m$document_sums)
+document.topic.matrix = Matrix(document.topic.matrix, sparse=T)
+
+dim(document.topic.matrix)
+head(m$meta)
+
+g = content.similarity.graph(document.topic.matrix, 
+                             vertex.grouping.vars=list(journal=m$meta$journal,
+                                                       journal.top10=m$meta$journal.top10),
+                             similarity.measure='correlation',
+                             min.similarity=0)
+
+g = graph.color.vertices(g, V(g)$journal.top10) 
+V(g)$color[V(g)$journal.top10 == 'Other'] = 'white'
+V(g)$size = sqrt(V(g)$n)*2
+V(g)$label = ''
+E(g)$width = E(g)$width / 5
+
+graph.plot(g, min.edge=0.5)
+
+##### over time
+
+g = content.similarity.graph(document.topic.matrix, 
+                             vertex.grouping.vars=list(journal=m$meta$journal,
+                                                       journal.top10=m$meta$journal.top10,
+                                                       year=format(m$meta$date,'%Y')), 
+                             similarity.measure='correlation',
+                             min.similarity=0)
+
+g = graph.color.vertices(g, V(g)$journal.top10) 
+V(g)$color[V(g)$journal.top10 == 'Other'] = 'white'
+V(g)$size = sqrt(V(g)$n)*6
+V(g)$label = ''
+E(g)$width = E(g)$width / 2
+
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2000)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2001)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2002)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2003)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2004)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2005)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2006)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2007)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2008)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2009)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2010)
+
+
+##### with topics coloured
+
+lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative')
+
+g = content.similarity.graph(document.topic.matrix, 
+                             vertex.grouping.vars=list(journal=m$meta$journal,
+                                                       journal.top10=m$meta$journal.top10,
+                                                       year=format(m$meta$date,'%Y')), 
+                             similarity.measure='correlation',
+                             min.similarity=0,
+                             content.totals.as.vertexmeta=1)
+
+g = graph.color.vertices(g, V(g)$C1)
+V(g)$size = sqrt(V(g)$n)*6
+V(g)$label = ''
+E(g)$width = E(g)$width / 2
+
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2000)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2001)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2002)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2003)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2004)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2005)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2006)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2007)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2008)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2009)
+graph.plot(g, min.edge=0.5, select.vertices=V(g)$year == 2010)
+
+
+
+##
+##
+
+
+
+
+
+
+
+
+
+
+
+
+load('demo/parlproceedings_lda.rdata')
+
+document.topic.matrix = t(m$document_sums)
+document.topic.matrix = Matrix(document.topic.matrix, sparse=T)
+
+dim(document.topic.matrix)
+head(m$meta)
+
+g = content.similarity.graph(document.topic.matrix, 
+                             vertex.grouping.vars=list(party=m$meta$party, 
+                                                       year=format(m$meta$date, '%Y')), 
+                             similarity.measure='correlation')
+
+g = graph.color.vertices(g, V(g)$party) # color vertices by party
+V(g)$label = as.character(V(g)$year) # use year as vertex label
+
+graph.plot(g, min.edge=0.1)
+
+####
+
+dtm = weightTfIdf(dtm)
+
+g = content.similarity.graph(m$dtm, 
+                             vertex.grouping.vars=list(party=m$meta$party, 
+                                                       year=format(m$meta$date, '%Y')), 
+                             similarity.measure='correlation',
+                             min.similarity=0.5)
+
+min(E(g)$weight)
+
+
+g = graph.color.vertices(g, V(g)$party) # color vertices by party
+V(g)$label = as.character(V(g)$year) # use year as vertex label
+
+graph.plot(g, min.edge=0.6)
+
+
+
+
+g = content.similarity.graph(m$dtm, 
+                             vertex.grouping.vars=list(id=m$meta$id), 
+                             similarity.measure='correlation')
+
+
+
+
+
+
+
+#####
+
+lda.plot.topic(m, 1, m$meta$date, m$meta$party, date_interval='year')
+
+
+g = content.similarity.graph(document.topic.matrix, 
+                             vertex.grouping.vars=list(party=m$meta$party, 
+                                                       year=format(m$meta$date, '%Y')), 
+                             similarity.measure='correlation',
+                             content.totals.as.vertexmeta=1:10)
+
+data.frame(vertex.attributes(g))
+g = graph.color.vertices(g, V(g)$party) # color vertices by party
+V(g)$label = as.character(V(g)$year) # use year as vertex label
+
+graph.plot(g, min.edge=0.1)
+
+
+
+##### 
+
diff --git a/demo/demo_dtm.r b/demo/demo_dtm.r
new file mode 100644
index 0000000..ec4af0e
--- /dev/null
+++ b/demo/demo_dtm.r
@@ -0,0 +1,9 @@
+library(corpustools)
+
+tokens = load('demo/rwanda_tokens.rdata')
+head(td)
+head(tokens)
+
+dtm = dtm.create(tokens$aid, tokens$lemma, tokens$freq)
+dtm
+
diff --git a/demo/demo_lda.r b/demo/demo_lda.r
new file mode 100644
index 0000000..c6584c7
--- /dev/null
+++ b/demo/demo_lda.r
@@ -0,0 +1,33 @@
+library(corpustools)
+
+load('demo/wos_comsci_dtm.rdata')
+dtm
+
+termstats = term.statistics(dtm)
+head(termstats)
+head(termstats[order(termstats$tfidf, decreasing=T),])
+
+termstats = termstats[termstats$docfreq > 1 & termstats$nonalpha==F & termstats$number==F,]
+voca = as.character(termstats[order(termstats$tfidf, decreasing=T),][1:3000,'term'])
+filtered_dtm = dtm[,voca]
+
+m = lda.fit(filtered_dtm, K=25, num.iterations=1000)
+top.topic.words(m$topics)
+names(m)
+
+## add meta
+load('demo/wos_comsci_meta.rdata')
+head(meta)
+
+m = lda.match.meta(m, rownames(m$dtm), meta, match.by='id')
+names(m)
+
+load('demo/wos_comsci_lda.rdata')
+
+## visualize topics
+lda.plot.wordcloud(m, 1)
+lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, 'year')
+
+lda.plot.topic(m, 1, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative')
+lda.plot.topic(m, 2, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative')
+lda.plot.topic(m, 17, m$meta$date, m$meta$journal.top10, date_interval='year', value='relative')
diff --git a/demo/demo_social_network.r b/demo/demo_social_network.r
new file mode 100644
index 0000000..a1a1293
--- /dev/null
+++ b/demo/demo_social_network.r
@@ -0,0 +1,33 @@
+d = data.frame(conversation=c(1,1,1,1,1,1,2,2,2,2),
+               author=c('Alice','Bob','Alice','Charlie','Bob','Bob','Alice','Bob','Alice','Bob'),
+               order.nr=c(1,2,3,4,5,6,1,2,3,4))
+
+d
+g = author.coincidence.graph(d$conversation, d$author) # In how many conversations did author.X and author.Y communicate?
+plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5) 
+g = author.coincidence.graph(d$conversation, d$author, 'overlap_jacard') # Similar to default (coincidence_count) but with direction (by dividing coincidence by number of conversations author participated in)
+plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5)
+g = author.coincidence.graph(d$conversation, d$author, 'cosine') # Cosine can be used to also take into account how many times each author participated within conversations
+plot(g, edge.label=round(E(g)$weight,2), vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5)
+
+d
+g = previous.authors.graph(d$conversation, d$author, d$order.nr, lookback=1) # how many times did author.X communicate directly after author.Y? 
+plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5)
+g = previous.authors.graph(d$conversation, d$author, d$order.nr, lookback=2) # how many times did author.X communicate within two messages after author.Y? 
+plot(g, edge.label=E(g)$weight, vertex.size=50, edge.label.cex=3, edge.width=E(g)$weight*5)
+
+###
+
+load('demo/parlproceedings_lda.rdata')
+
+head(m$meta)
+m$meta$author = paste(m$meta$name, m$meta$party)
+
+g = previous.authors.graph(m$meta$meeting, m$meta$author, m$meta$order.nr, lookback=5) # how many times did author.X communicate directly after author.Y? 
+
+E(g)$width = log(E(g)$weight)
+E(g)$arrow.size = E(g)$width / 50
+V(g)$label = ''
+V(g)$size = log(V(g)$n.messages)/2
+
+graph.plot(g, min.edge=1)
diff --git a/demo/parlproceedings_lda.rdata b/demo/parlproceedings_lda.rdata
new file mode 100644
index 0000000..e81bf0d
Binary files /dev/null and b/demo/parlproceedings_lda.rdata differ
diff --git a/demo/wos_comsci_dtm.rdata b/demo/wos_comsci_dtm.rdata
new file mode 100644
index 0000000..bad2419
Binary files /dev/null and b/demo/wos_comsci_dtm.rdata differ
diff --git a/demo/wos_comsci_lda.rdata b/demo/wos_comsci_lda.rdata
new file mode 100644
index 0000000..df8153b
Binary files /dev/null and b/demo/wos_comsci_lda.rdata differ
diff --git a/demo/wos_comsci_meta.rdata b/demo/wos_comsci_meta.rdata
new file mode 100644
index 0000000..4e51a6a
Binary files /dev/null and b/demo/wos_comsci_meta.rdata differ
diff --git a/man/cast.sparse.matrix.Rd b/man/cast.sparse.matrix.Rd
new file mode 100644
index 0000000..763e39a
--- /dev/null
+++ b/man/cast.sparse.matrix.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{cast.sparse.matrix}
+\alias{cast.sparse.matrix}
+\title{Cast data.frame to sparse matrix}
+\usage{
+cast.sparse.matrix(rows, columns, values = NULL)
+}
+\arguments{
+\item{rows}{a vector of row indices: [i,]}
+
+\item{columns}{a vector of column indices: [,j]}
+
+\item{values}{a vector of the values for each (non-zero) cell: [i,j] = value}
+}
+\value{
+a sparse matrix of the dgTMatrix class (\code{\link{Matrix}} package)
+}
+\description{
+Create a sparse matrix from matching vectors of row indices, column indices and values
+}
+
diff --git a/man/chi2.Rd b/man/chi2.Rd
new file mode 100644
index 0000000..ecf8c7a
--- /dev/null
+++ b/man/chi2.Rd
@@ -0,0 +1,13 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{chi2}
+\alias{chi2}
+\title{Compute the chi^2 statistic for a 2x2 crosstab containing the values
+[[a, b], [c, d]]}
+\usage{
+chi2(a, b, c, d)
+}
+\description{
+Compute the chi^2 statistic for a 2x2 crosstab containing the values
+[[a, b], [c, d]]
+}
+
diff --git a/man/corpora.compare.Rd b/man/corpora.compare.Rd
new file mode 100644
index 0000000..f9a2581
--- /dev/null
+++ b/man/corpora.compare.Rd
@@ -0,0 +1,22 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{corpora.compare}
+\alias{corpora.compare}
+\title{Compare two corpora}
+\usage{
+corpora.compare(dtm.x, dtm.y, smooth = 0.001)
+}
+\arguments{
+\item{dtm.x}{the main document-term matrix}
+
+\item{dtm.y}{the 'reference' document-term matrix}
+
+\item{smooth}{the smoothing parameter for computing overrepresentation}
+}
+\value{
+A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+}
+\description{
+Compare the term use in corpus dtm with a refernece corpus dtm.ref, returning relative frequencies
+and overrepresentation using various measures
+}
+
diff --git a/man/dtm.create.Rd b/man/dtm.create.Rd
new file mode 100644
index 0000000..1b22aa7
--- /dev/null
+++ b/man/dtm.create.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{dtm.create}
+\alias{dtm.create}
+\title{Create a document term matrix from a list of tokens}
+\usage{
+dtm.create(ids, terms, freqs)
+}
+\arguments{
+\item{ids}{a vector of document ids}
+
+\item{terms}{a vector of words of the same length as ids}
+
+\item{freqs}{a vector of the frequency a a term in a document}
+}
+\value{
+a document-term matrix  \code{\link{DocumentTermMatrix}}
+}
+\description{
+Create a \code{\link{DocumentTermMatrix}} from a list of ids, terms, and frequencies.
+}
+
diff --git a/man/fill.time.gaps.Rd b/man/fill.time.gaps.Rd
new file mode 100644
index 0000000..355df3d
--- /dev/null
+++ b/man/fill.time.gaps.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{fill.time.gaps}
+\alias{fill.time.gaps}
+\title{Add empty values for pretty plotting}
+\usage{
+fill.time.gaps(d, date_interval)
+}
+\arguments{
+\item{d}{A data.frame with the columns 'time' (Date) and 'value' (numeric)}
+
+\item{date_interval}{The date_interval is required to know what the gaps are}
+}
+\value{
+A data.frame with the columns 'time' (Date) and 'value' (numeric)
+}
+\description{
+When plotting a timeline, gaps in date_intervals are ignored. For the attention for topics gaps should be considered as having value 0.
+}
+
diff --git a/man/lda.fit.Rd b/man/lda.fit.Rd
new file mode 100644
index 0000000..1bb3047
--- /dev/null
+++ b/man/lda.fit.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.fit}
+\alias{lda.fit}
+\title{Estimate a topic model using the lda package}
+\usage{
+lda.fit(dtm, K = 50, num.iterations = 100, alpha = 50/K, eta = 0.01,
+  burnin = 100, compute.log.likelihood = F)
+}
+\arguments{
+\item{dtm}{a document term matrix (e.g. the output of \code{\link{amcat.dtm.create}})}
+
+\item{K}{the number of clusters}
+
+\item{num.iterations}{the number of iterations}
+
+\item{alpha}{the alpha parameter}
+
+\item{eta}{the eta parameter}
+}
+\value{
+A fitted LDA model (see \code{\link{lda.collapsed.gibbs.sampler}})
+}
+\description{
+Estimate an LDA topic model using the \code{\link{lda.collapsed.gibbs.sampler}} function
+The parameters other than dtm are simply passed to the sampler but provide a workable default.
+See the description of that function for more information
+}
+
diff --git a/man/lda.match.meta.Rd b/man/lda.match.meta.Rd
new file mode 100644
index 0000000..ee7ad99
--- /dev/null
+++ b/man/lda.match.meta.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.match.meta}
+\alias{lda.match.meta}
+\title{Add document meta to LDA output}
+\usage{
+lda.match.meta(m, document.ids, meta, match.by = "id")
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{lda.document.ids}{A vector with document ids of the same length and order as the LDA output (matching the columns of m$document_sums)}
+
+\item{meta}{A data.frame with document meta. Has to contain a vector to match the lda.document.ids}
+
+\item{match.by}{The name of the vector in meta that matches the lda.document.ids}
+}
+\value{
+The LDA output appended with document meta
+}
+\description{
+Add a dataframe containing document meta to the output (a list) of \code{\link{lda.collapsed.gibbs.sampler}}.
+}
+
diff --git a/man/lda.plot.alltopics.Rd b/man/lda.plot.alltopics.Rd
new file mode 100644
index 0000000..ec00ac8
--- /dev/null
+++ b/man/lda.plot.alltopics.Rd
@@ -0,0 +1,25 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.plot.alltopics}
+\alias{lda.plot.alltopics}
+\title{Plot all topics}
+\usage{
+lda.plot.alltopics(m, time_var, category_var, path, date_interval = "day")
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums}
+
+\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums}
+
+\item{path}{The path for a folder where output will be saved}
+
+\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'}
+}
+\value{
+Nothing
+}
+\description{
+Write plots for all topics with \code{\link{lda.plot.topic}} in designated folder
+}
+
diff --git a/man/lda.plot.category.Rd b/man/lda.plot.category.Rd
new file mode 100644
index 0000000..c9a0cf9
--- /dev/null
+++ b/man/lda.plot.category.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.plot.category}
+\alias{lda.plot.category}
+\title{Plots topic values per category}
+\usage{
+lda.plot.category(m, topic_nr, category_var, pct = F, value = "total",
+  return.values = F)
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{The}{index of the topic (1 to K)}
+
+\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums}
+
+\item{pct}{Show topic values as percentages}
+
+\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics}
+
+\item{return.values}{Logical. If true, data that is plotted is returned as a data.frame}
+}
+\value{
+data.frame for plotted values
+}
+\description{
+Plots the attention for a topic per category
+}
+
diff --git a/man/lda.plot.time.Rd b/man/lda.plot.time.Rd
new file mode 100644
index 0000000..f8931d9
--- /dev/null
+++ b/man/lda.plot.time.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.plot.time}
+\alias{lda.plot.time}
+\title{Plots topic values over time}
+\usage{
+lda.plot.time(m, topic_nr, time_var, date_interval = "day", pct = F,
+  value = "total", return.values = F)
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{The}{index of the topic (1 to K)}
+
+\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums}
+
+\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'}
+
+\item{pct}{Show topic values as percentages}
+
+\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics}
+
+\item{return.values}{Logical. If true, data that is plotted is returned as a data.frame}
+}
+\value{
+data.frame for plotted values
+}
+\description{
+Plots the attention for a topic over time
+}
+
diff --git a/man/lda.plot.topic.Rd b/man/lda.plot.topic.Rd
new file mode 100644
index 0000000..8c0d49f
--- /dev/null
+++ b/man/lda.plot.topic.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.plot.topic}
+\alias{lda.plot.topic}
+\title{Plots topic wordcloud, and attention over time and per category}
+\usage{
+lda.plot.topic(m, topic_nr, time_var, category_var, date_interval = "day",
+  pct = F, value = "total")
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{The}{index of the topic (1 to K)}
+
+\item{time_var}{A vector with time stamps (either numeric or Date class) of the same length and order of the documents (rows) in m$document_sums}
+
+\item{category_var}{A vector with id values of the same length and order of the documents (rows) in m$document_sums}
+
+\item{date_interval}{The interval for plotting the values over time. Can be: 'day', 'week', 'month' or 'year'}
+
+\item{pct}{Show topic values as percentages}
+
+\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics}
+}
+\value{
+Nothing, just plots
+}
+\description{
+Plots \code{\link{lda.plot.wordcloud}}, \code{\link{lda.plot.time}} and \code{\link{lda.plot.category}}
+}
+
diff --git a/man/lda.plot.wordcloud.Rd b/man/lda.plot.wordcloud.Rd
new file mode 100644
index 0000000..185917d
--- /dev/null
+++ b/man/lda.plot.wordcloud.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.plot.wordcloud}
+\alias{lda.plot.wordcloud}
+\title{Plot wordcloud for LDA topic}
+\usage{
+lda.plot.wordcloud(m, topic_nr)
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{topic_nr}{The index of the topic (1 to K)}
+}
+\value{
+Nothing, just plots
+}
+\description{
+Plots a wordcloud of the top words per topic
+}
+
diff --git a/man/lda.topics.per.document.Rd b/man/lda.topics.per.document.Rd
new file mode 100644
index 0000000..2687cf9
--- /dev/null
+++ b/man/lda.topics.per.document.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{lda.topics.per.document}
+\alias{lda.topics.per.document}
+\title{Get the topics per document, optionally merged with}
+\usage{
+lda.topics.per.document(topics)
+}
+\arguments{
+\item{dtm}{a document term matrix (e.g. the output of \code{\link{dtm.create}})}
+}
+\value{
+A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+}
+\description{
+Return a data frame containing article metadata and topic occurence per document
+}
+
diff --git a/man/prepare.plot.values.Rd b/man/prepare.plot.values.Rd
new file mode 100644
index 0000000..18faabd
--- /dev/null
+++ b/man/prepare.plot.values.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{prepare.plot.values}
+\alias{prepare.plot.values}
+\title{Prepares the topic values per document for plotting}
+\usage{
+prepare.plot.values(m, break_var, topic_nr, pct = F, value = "total",
+  filter = NULL)
+}
+\arguments{
+\item{m}{The output of \code{\link{lda.collapsed.gibbs.sampler}}}
+
+\item{break_var}{A break vector to aggregate topic values per document}
+
+\item{The}{index of the topic (1 to K)}
+
+\item{pct}{Show topic values as percentages}
+
+\item{value}{Show topic values as 'total', or as 'relative' to the attention for other topics}
+}
+\value{
+The aggregated/transformed topic values
+}
+\description{
+Prepares the topic values per document for plotting
+}
+
diff --git a/man/prepare.time.var.Rd b/man/prepare.time.var.Rd
new file mode 100644
index 0000000..164633c
--- /dev/null
+++ b/man/prepare.time.var.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{prepare.time.var}
+\alias{prepare.time.var}
+\title{Change date object to date_interval}
+\usage{
+prepare.time.var(time_var, date_interval)
+}
+\arguments{
+\item{time_var}{A vector of Date values}
+
+\item{date_interval}{The desired date_interval ('day','week','month', or 'year')}
+}
+\value{
+A vector of Date values
+}
+\description{
+Change date object to date_interval
+}
+
diff --git a/man/term.statistics.Rd b/man/term.statistics.Rd
new file mode 100644
index 0000000..0065da4
--- /dev/null
+++ b/man/term.statistics.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2 (4.0.0): do not edit by hand
+\name{term.statistics}
+\alias{term.statistics}
+\title{Compute some useful corpus statistics for a dtm}
+\usage{
+term.statistics(dtm)
+}
+\arguments{
+\item{dtm}{a document term matrix (e.g. the output of \code{\link{dtm.create}})}
+}
+\value{
+A data frame with rows corresponding to the terms in dtm and the statistics in the columns
+}
+\description{
+Compute a number of useful statistics for filtering words: term frequency, idf, etc.
+}
+