Skip to content

Commit

Permalink
Merge branch 'cut-by-chisquare'
Browse files Browse the repository at this point in the history
Implements Rolecek et al., JVS 20, 596-602 (2009) modified TWINSPAN
where divisions are picked by heterogeneity (but honouring hierarchy).
Basically there are two functions achieve this: cuth() that returns
a classification vector for quadrats or species for most heterogeneous
classes, and modification of as.hclust that now can display the
TWINSPAN tree using heterogeneity as height. The heterogeneity is
extracted from twinspan object, and it is the total chi-square (or
inertia or sum of all eigenvalues) of the matrix internally used by
twinspan in that division. Wish of github issue #2.
  • Loading branch information
jarioksa committed Dec 21, 2021
2 parents 18fa394 + ab84d9c commit 2f027b4
Show file tree
Hide file tree
Showing 12 changed files with 496 additions and 83 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ S3method(predict,twinspan)
S3method(print,misclassified)
S3method(print,twinspan)
S3method(summary,twinspan)
export(cuth)
export(eigenvals)
export(misclassified)
export(totalchi)
Expand Down
160 changes: 138 additions & 22 deletions R/as.hclust.twinspan.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
### tree. Hclust trees can not have polytomies, and quadrats must be
### omitted. There already is as.dendrogram, but try to see if this
### can be developed into a decision tree.

#' Extract twinspan Grouping as Hierarchical Cluster Tree
#'
#' Function extracts classification as an \code{\link{hclust}}
Expand All @@ -13,31 +12,74 @@
#' number and number of items in the terminal group are used as group
#' names and are displayed in plots.
#'
#' @seealso \code{\link{as.dendrogram.twinspan}},
#' \code{\link{hclust}}, \code{\link{plot.twinspan}},
#' \code{\link{image.twinspan}}.
#'
#' @return an \code{\link{hclust}} object.
#' Function can return either a tree showing the \code{twinspan}
#' hierarchy or showing the heterogeneity of each group or
#' division. In the first case, all divisions and groups at a certain
#' level of hierarchy are at the same height, but in the latter the
#' divisions are at the height defined by their heterogeneity. The
#' criterion of heterogeneity is the total chi-square (also known as
#' inertia) of the matrix that \code{\link{twinspan}} internally uses
#' in that division (see \code{\link{twintotalchi}}). This tree gives
#' the visual presentation of the modified method of Roleček et
#' al. (2009).
#'
#' When tree heights are based on heterogeneity, subgroups can be more
#' heterogeneous than their parent group. These appear as reversed
#' branches in the tree. A warning is issued for each such case.
#'
#' @encoding UTF-8
#'
#' @seealso \code{\link{as.dendrogram.twinspan}} provides an
#' alternative which also shows the sampling units (quadrats or
#' species). The result is based \code{\link{hclust}} and can be
#' handled with its support
#' functions. \code{\link{plot.twinspan}},
#' \code{\link{image.twinspan}} display the tree. Function
#' \code{\link{cut.twinspan}} cuts the tree by a level of
#' hierarchy, and \code{\link{cuth}} by heterogeneity for original
#' sampling units (quadrats, species).
#'
#' @return an \code{\link{hclust}} object amended with labels for
#' internal nodes (\code{nodelabels}).
#'
#' @references
#' Roleček, J, Tichý, L., Zelený, D. & Chytrý, M. (2009). Modified
#' TWINSPAN classification in which the hierarchy respects cluster
#' heterogeneity. \emph{J Veg Sci} 20: 596--602.
#'
#' @examples
#'
#' data(ahti)
#' tw <- twinspan(ahti)
#' plot(as.hclust(tw))
#' plot(as.hclust(tw, "species"))
#' cl <- as.hclust(tw)
#' ## plot and 8 groups by hierarchy level
#' plot(cl)
#' rect.hclust(cl, 8)
#' ## plot and 8 groups by heterogeneity
#' cl <- as.hclust(tw, height="chi")
#' plot(cl)
#' rect.hclust(cl, 8)
#'
#'
#'
#' @param x \code{\link{twinspan}} result object.
#' @param what Extract \code{"quadrat"} or \code{"species"}
#' classification tree.
#' @param height Use either division levels (\code{"level"}) or total
#' Chi-squares of division (\code{"chi"}) as heights of internal
#' nodes in the tree.
#' @param \dots Other parameters to the function (ignored).
#'
#' @importFrom stats as.hclust
#'
#' @export
`as.hclust.twinspan` <-
function(x, what = c("quadrat","species"), ...)
function(x, what = c("quadrat","species"), height = c("level", "chi"),
...)
{
what = match.arg(what)
what <- match.arg(what)
height <- match.arg(height)
class <- cut(x, what=what)
n <- max(class)
state <- character(n)
Expand All @@ -46,7 +88,7 @@
nclus <- sum(state == "clust")
line <- numeric(n)
merge <- matrix(0, nclus-1, 2)
height <- numeric(nclus-1)
treeheight <- numeric(nclus-1)
pow2 <- 2^(seq_len(x$levelmax+1))
maxh <- sum(max(class) >= pow2) + 1
nro <- nclus
Expand All @@ -55,7 +97,7 @@
if (state[i] == "")
next
now <- now + 1
height[now] <- maxh - sum(i >= pow2)
treeheight[now] <- maxh - sum(i >= pow2)
for(j in 2:1) {
line[i+j-1] <- now
if(state[i+j-1] == "clust") {
Expand All @@ -69,14 +111,78 @@
labels <- table(class)
labels <- paste0(names(labels), " (N=", labels, ")")
nodelabels <- rev(which(state=="div"))
if (height == "chi") {
## replace levels with Chi-squares of internal nodes
treeheight <- twintotalchi(x, what)[nodelabels]
## height should be ordered in hclust tree so that hclust
## methods know how to handle tree
if (is.unsorted(treeheight)) {
o <- order(treeheight)
o <- o[rev(fixTreeReversal(rev(nodelabels[o]), index = TRUE))]
oo <- order(o)
for (i in 1:nrow(merge))
for (j in 1:2)
if (merge[i,j] > 0) # a (reordered) division
merge[i,j] <- oo[merge[i,j]]
merge <- merge[o,]
treeheight <- treeheight[o]
nodelabels <- nodelabels[o]
## tree reversals should be handled, but check anyway
if (any(merge > row(merge))) # mother before her daughters
stop("some split groups are more heterogeneous than their mother")
}
}
ind <- x[[what]]$index
order <- order(tapply(order(ind), class, min))
out <- list(merge = merge, labels = labels, height = height, order = order,
nodelabels = nodelabels, method = "twinspan")
out <- list(merge = merge, labels = labels, height = treeheight,
order = order, nodelabels = nodelabels, method = "twinspan")
class(out) <- "hclust"
out
}

## If within-group heterogeneity (such as Chi-square) is used as
## height in a twinspan tree, there may be reversed branches or the
## subgroup is more heterogeneous than her mother group. This function
## goes through order of groups and if it finds a subgroup was used
## before the parent group, it will move the kid behind her
## parent. Non-exported support function tuned to work only with our
## as.hclust.twinspan and cuth functions.

fixTreeReversal <-
function(order, index = FALSE)
{
n <- length(order)
## parent of group i is i %/% 2
if(index)
idx <- rev(seq_along(order)) # reverse index
repeat{
## may need several passages
SWAPPED <- FALSE
for (i in 2:n) {
a <- order[1:(i-1)] %/% 2 == order[i]
if (any(a)) {
SWAPPED <- TRUE
k <- min(which(a)) # there should be no two kids, but check
## move kid immediately after her parent
if (index) {
if (i-k == 1)
idx[c(i,k)] <- idx[c(k,i)]
else
idx[k:i] <- idx[c((k+1):(i-1), i, k)]
}
if (i-k == 1)
order[c(i,k)] <- order[c(k,i)]
else
order[k:i] <- order[c((k+1):(i-1), i, k)]
warning(
gettextf("tree reversal: group %d more heterogeneous than parent %d",
order[i], order[i-1]))
}
}
if (!SWAPPED) break
}
if(index) idx else order
}
### plot.twinspan as plot of hclust tree

#' Plot Classification Tree
Expand All @@ -87,12 +193,15 @@
#' are the same numbers as used in \code{\link{summary.twinspan}} and
#' returned by \code{\link{cut.twinspan}} or
#' \code{\link{predict.twinspan}} for the same classification
#' level. For terminal groups the plot shows the number of the group
#' and the number of items (quadrats or species) in the group. For
#' division number \eqn{k}, its daughter divisions or groups are
#' \eqn{2k}{2*k} and \eqn{2k+1}{2*k+1}. The tree is similar as a plot
#' of \code{\link{as.hclust.twinspan}}, but adds numbers of internal
#' nodes.
#' level. For terminal groups the plot shows the numeric code of the
#' group and the number of items (quadrats or species) in the
#' group. For division number \eqn{k}, its daughter divisions or
#' groups are coded \eqn{2k}{2*k} and \eqn{2k+1}{2*k+1}. The tree is
#' similar as a plot of \code{\link{as.hclust.twinspan}}, but adds
#' numbers of internal nodes. The tree can be based either on the
#' levels of hierarchy or on the heterogeneity of division as assessed
#' by chi-square (or inertia) of the division (see
#' \code{\link{as.hclust.twinspan}}).
#'
#' @seealso \code{\link{summary.twinspan}} for similar textual
#' presentation also showing the items (quadrats, species) in
Expand All @@ -107,24 +216,31 @@
#' data(ahti)
#' tw <- twinspan(ahti)
#' plot(tw, "species")
#' ## default plot for quadrats
#' plot(tw)
#' ## plot by the heterogeneity of divisions
#' plot(tw, height = "chi")
#'
#' @param x \code{\link{twinspan}} result object.
#' @param what Plot \code{"quadrat"} or \code{"species"}
#' classification tree.
#' @param main Main title of the plot.
#' @param \dots Other parameters passed to \code{\link{plot}} and
#' \code{\link[vegan]{ordilabel}}.
#'
#' @param height Use either division levels (\code{"level"}) or total
#' Chi-squares of division (\code{"chi"}) as heights of internal
#' nodes in the tree.
#' @importFrom vegan ordilabel
#' @importFrom graphics plot
#'
#' @export
`plot.twinspan` <-
function(x, what = c("quadrat", "species"), main = "Twinspan Dendrogram",
function(x, what = c("quadrat", "species"), height = c("level", "chi"),
main = "Twinspan Dendrogram",
...)
{
what <- match.arg(what)
x <- as.hclust(x, what = what)
x <- as.hclust(x, what = what, height = height)
plot(x, main = main, ...)
ordilabel(x, "internal", labels = x$nodelabels, ...)
}
75 changes: 72 additions & 3 deletions R/cut.twinspan.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
#' Return twinspan Classification at Given Level
#'
#' Function returns a vector of \code{twinspan} classes at given level
#' of hierarcy for quadrats or species.
#' Returns a vector of \code{twinspan} classes at a given level of
#' hierarchy or classes respecting group heterogeneity for quadrats or
#' species.
#'
#' @encoding UTF-8
#'
#' @details
#'
#' \code{\link{twinspan}} returns only the the classification at the
#' final level, but any upper level classes can be found by integer
Expand All @@ -15,17 +20,41 @@
#' \code{\link{misclassified}} analyses the differences of these
#' classifications.
#'
#' Function \code{cuth} cuts the classification by class heterogeneity
#' instead of level, and can be used to implement the modified method
#' of Roleček et al. (2009). The groups are formed with decreasing
#' heterogeneity but respecting the hierarchy. Total chi-square (also
#' known as inertia) is used as the criterion of heterogeneity. The
#' criterion is calculated with \code{\link{twintotalchi}} and the
#' criterion is based on the same data matrix as internally used in
#' \code{\link{twinspan}}. The function can also be used for species
#' classification, also with the internally used modified species
#' matrix.
#'
#' @references
#'
#' Roleček, J, Tichý, L., Zelený, D. & Chytrý, M. (2009). Modified
#' TWINSPAN classification in which the hierarchy respects cluster
#' heterogeneity. \emph{J Veg Sci} 20: 596--602.
#'
#' @seealso \code{\link{predict.twinspan}} gives similar classes, but
#' based on indicator pseudospecies. \code{\link{cutree}} provides
#' a similar functionality for \code{\link{hclust}} trees.
#' a similar functionality for \code{\link{hclust}}
#' trees. Function \code{\link{as.hclust.twinspan}} generates
#' corresponding tree presentation, and
#' \code{\link{plot.twinspan}} will print that tree labelling
#' internal nodes (divisions).
#'
#' @examples
#'
#' data(ahti)
#' tw <- twinspan(ahti)
#' cut(tw)
#' ## traditional twinspan classification by level of hierarchy
#' cut(tw, level=3)
#' cut(tw, what = "species")
#' ## number of groups as with level=3, but by group heterogeneity
#' cuth(tw, ngroups = 8)
#'
#' @param x \code{twinspan} result.
#' @param level Level of hierarchy for classification. If missing, the
Expand Down Expand Up @@ -54,3 +83,43 @@
}
cl
}

## cut by group homogeneity as defined by within-group
## Chi-square. This could give similar clustering as Rolecek's
## modified twinspan which only splits the most heterogeneous group at
## each step instead of dichotomizing. There is no guarantee that the
## tree will be ordered, but the code should handle this.

#' @rdname cut.twinspan
#'
#' @param ngroups Number of groups.
#'
#' @export
`cuth` <-
function(x, what = c("quadrat", "species"), ngroups)
{
what <- match.arg(what)
if (missing(ngroups))
ngroups <- 1 # return one group if nothing is asked
chi <- twintotalchi(x, what = what)
## latter half of chi have items that cannot be split
k <- length(chi) %/% 2L + 1L
chi[k:length(chi)] <- 0
## terminal nodes (leaves) cannot be split
chi[x[[what]]$eig <= 0] <- 0
ix <- order(chi, decreasing = TRUE) # order by heterogeneity
ix <- ix[seq_len(sum(chi > 0))]
ix <- fixTreeReversal(ix)
clim <- 2^(0:x$levelmax) - 1L
nobs <- switch(what,
"quadrat" = x$nquadrat,
"species" = x$nspecies)
class <- rep(1, nobs)
for(i in seq_len(ngroups - 1)) {
lev <- max(which(ix[i] > clim))
id <- 2L * ix[i]
class[cut(x, what = what, level=lev) == id] <- id
class[cut(x, what = what, level=lev) == id+1L] <- id + 1L
}
class
}
Loading

0 comments on commit 2f027b4

Please sign in to comment.