#81: retain row names from input data

jlmelville committed Aug 22, 2021
1 parent fd9bae0 commit 0ad9eda
Showing 13 changed files with 210 additions and 66 deletions.
23 changes: 23 additions & 0 deletions NEWS.md
@@ -1,3 +1,26 @@
# uwot 0.1.11

## Bug fixes and minor improvements

* If row names are provided in the input data (or the nearest neighbor data, or the
initialization data if it's a matrix), they will be used to name the rows of the
output embedding (<https://github.com/jlmelville/uwot/issues/81>), and also the
nearest neighbor data if you set `ret_nn = TRUE`. If names exist in more than
one of the input data parameters listed above but are inconsistent, no
guarantees are made about which names will be used. Thank you
[jwijffels](https://github.com/jwijffels) for reporting this.
* Setting `nn_method = "annoy"` with `verbose = TRUE` would lead to an error for
datasets with fewer than 50 items.
* Using multiple pre-computed nearest neighbor blocks is now supported with
`umap_transform` (this was previously documented to work, but did not).
* Documentation around pre-calculated nearest neighbor data for `umap_transform`
was wrong in other ways: it has now been corrected to indicate that there should
be neighbor data for each item in the test data, but the neighbors and distances
should refer to items in the training data (i.e. the data used to build the model).
* The `n_neighbors` parameter is now correctly ignored in model generation if
pre-calculated nearest neighbor data is provided.
* The documentation incorrectly said `grain_size` didn't do anything.

# uwot 0.1.10

This release is mainly to allow for some internal changes to keep compatibility
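The row-name retention described in the first bullet above can be sketched as follows (a minimal example, not part of this commit; it assumes the default Euclidean metric, so the returned neighbor block is named `euclidean`):

```r
library(uwot)

# Row names on the input should carry through to the embedding rows and,
# with ret_nn = TRUE, to the returned nearest neighbor matrices.
X <- as.matrix(iris[, 1:4])
row.names(X) <- paste0("iris", seq_len(nrow(X)))

res <- umap(X, ret_nn = TRUE)
head(row.names(res$embedding))        # "iris1" "iris2" ...
head(row.names(res$nn$euclidean$idx)) # the same names on the neighbor data
```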
8 changes: 4 additions & 4 deletions R/neighbors.R
@@ -128,8 +128,8 @@ annoy_build <- function(X, metric = "euclidean", n_trees = 50,
", n_trees = ", n_trees
)
ann <- annoy$ann
if (verbose) {
nstars <- 50
nstars <- 50
if (verbose && nr > nstars) {
progress_for(
nr, nstars,
function(chunk_start, chunk_end) {
@@ -238,8 +238,8 @@ annoy_search_serial <- function(X, k, ann,
nr <- nrow(X)
idx <- matrix(nrow = nr, ncol = k)
dist <- matrix(nrow = nr, ncol = k)
if (verbose) {
nstars <- 50
nstars <- 50
if (verbose && nr > nstars) {
progress_for(
nr, nstars,
function(chunk_start, chunk_end) {
72 changes: 58 additions & 14 deletions R/transform.R
@@ -15,13 +15,14 @@
#' have the same columns in the same order as the input data used to generate
#' the \code{model}.
#' @param model Data associated with an existing embedding.
#' @param nn_method Optional pre-calculated nearest neighbor data. It must be a
#' list consisting of two elements:
#' @param nn_method Optional pre-calculated nearest neighbor data. The format is
#' a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#' containing the integer indexes of the nearest neighbors in \code{X}. Each
#' vertex is considered to be its own nearest neighbor, i.e.
#' \code{idx[, 1] == 1:n_vertices}.
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix where
#' \code{n_vertices} is the number of items to be transformed. The contents
#' of the matrix should be the integer indexes of the data used to generate
#' the \code{model}, which are the \code{n_neighbors}-nearest neighbors of
#' the data to be transformed.
#' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#' containing the distances of the nearest neighbors.
#' }
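A minimal sketch of the `nn_method` format documented above (not part of this commit): the indexes and distances for the new data must refer to items in the training data. `FNN` is an assumption here, used only to build the example matrices; any k-nearest-neighbor routine returning index and distance matrices would do.

```r
library(uwot)
library(FNN)  # assumption: used only to produce the example neighbor matrices

train <- as.matrix(iris[1:100, 1:4])
test  <- as.matrix(iris[101:150, 1:4])

model <- umap(train, n_neighbors = 10, ret_model = TRUE)

# The 10 nearest training items for each test item.
knn <- FNN::get.knnx(train, test, k = 10)
nn  <- list(idx = knn$nn.index, dist = knn$nn.dist)

test_embedding <- umap_transform(X = NULL, model = model, nn_method = nn)
dim(test_embedding)  # one row per test item
```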
@@ -143,6 +144,7 @@ umap_transform <- function(X = NULL, model = NULL,
method <- model$method
scale_info <- model$scale_info
metric <- model$metric
nblocks <- length(metric)
pca_models <- model$pca_models

a <- model$a
@@ -158,7 +160,9 @@
pcg_rand <- TRUE
}

if(!is.null(X)){
n_vertices <- NULL
Xnames <- NULL
if (!is.null(X)){
if (ncol(X) != norig_col) {
stop("Incorrect dimensions: X must have ", norig_col, " columns")
}
@@ -170,8 +174,36 @@
X <- as.matrix(X[, indexes])
}
n_vertices <- nrow(X)
} else if (nn_is_precomputed(nn_method)){
n_vertices <- nrow(nn_method$idx)
if (!is.null(row.names(X))) {
Xnames <- row.names(X)
}
} else if (nn_is_precomputed(nn_method)) {
if (nblocks == 1) {
if (length(nn_method) == 1) {
graph <- nn_method[[1]]
}
else {
graph <- nn_method
}
n_vertices <- nrow(graph$idx)
check_graph(graph, n_vertices, n_neighbors)
if (is.null(Xnames)) {
Xnames <- nn_graph_row_names(graph)
}
}
else {
stopifnot(length(nn_method) == nblocks)
for (i in 1:nblocks) {
graph <- nn_method[[i]]
if (is.null(n_vertices)) {
n_vertices <- nrow(graph$idx)
}
check_graph(graph, n_vertices, n_neighbors)
if (is.null(Xnames)) {
Xnames <- nn_graph_row_names(graph)
}
}
}
}

if (!is.null(init)) {
@@ -198,6 +230,9 @@
xdim[1], ", ", xdim[2], "), but was (",
indim[1], ", ", indim[2], ")")
}
if (is.null(Xnames) && !is.null(row.names(init))) {
Xnames <- row.names(init)
}
init_weighted <- NULL
}
else {
@@ -215,7 +250,6 @@

adjusted_local_connectivity <- max(0, local_connectivity - 1.0)

nblocks <- length(metric)
graph <- NULL
embedding <- NULL
for (i in 1:nblocks) {
@@ -239,18 +273,24 @@
verbose = verbose
)
}
if(!is.null(X)){
if (!is.null(X)) {
nn <- annoy_search(Xsub,
k = n_neighbors, ann = ann, search_k = search_k,
prep_data = TRUE,
tmpdir = tmpdir,
n_threads = n_threads, grain_size = grain_size,
verbose = verbose
)
} else if (nn_is_precomputed(nn_method)){
nn <- nn_method
} else if (nn_is_precomputed(nn_method)) {
if (nblocks == 1 && !is.null(nn_method$idx)) {
# When there's only one block, the NN graph can be passed directly
nn <- nn_method
}
else {
# otherwise we expect a list of NN graphs
nn <- nn_method[[i]]
}
}

graph_block <- smooth_knn(nn,
local_connectivity = adjusted_local_connectivity,
n_threads = n_threads,
@@ -309,6 +349,7 @@
)

embedding <- t(embedding)
row.names(train_embedding) <- NULL
train_embedding <- t(train_embedding)
if (tolower(method) == "umap") {
embedding <- optimize_layout_umap(
@@ -349,6 +390,9 @@
embedding <- t(embedding)
}
tsmessage("Finished")
if (!is.null(Xnames)) {
row.names(embedding) <- Xnames
}
embedding
}

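Two smaller behaviors from this file, sketched by reusing `model` and `nn` from the example above: with a single metric block the pre-computed graph can be passed bare or wrapped in a one-element list, and if neither `X` nor the graph carries row names, names on a matrix `init` (an argument assumed from the handling shown in this diff) are used for the output.

```r
# Both shapes are accepted when the model has a single metric block.
emb1 <- umap_transform(X = NULL, model = model, nn_method = nn)
emb2 <- umap_transform(X = NULL, model = model, nn_method = list(nn))

# Row names on a matrix init end up on the output when nothing else is named.
init <- matrix(rnorm(nrow(nn$idx) * 2), ncol = 2,
               dimnames = list(paste0("new", seq_len(nrow(nn$idx))), NULL))
emb3 <- umap_transform(X = NULL, model = model, nn_method = nn, init = init)
head(row.names(emb3))  # "new1" "new2" ...
```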
44 changes: 43 additions & 1 deletion R/util.R
@@ -115,7 +115,6 @@ progress_for <- function(n, nchunks, fun) {
chunk_start <- chunk_end + 1
chunk_end <- chunk_start + round(remaining / (nchunks - i + 1)) - 1
remaining <- remaining - (chunk_end - chunk_start + 1)

fun(chunk_start, chunk_end)

message("*", appendLF = FALSE)
@@ -129,3 +128,46 @@ checkna <- function(X) {
stop("Missing values found in 'X'")
}
}

check_graph <- function(graph, expected_rows, expected_cols) {
idx <- graph$idx
dist <- graph$dist
stopifnot(methods::is(idx, "matrix"))
stopifnot(methods::is(dist, "matrix"))
stopifnot(dim(idx) == dim(dist))
stopifnot(nrow(idx) == expected_rows)
stopifnot(ncol(idx) == expected_cols)
}

check_graph_list <- function(graph, expected_rows, expected_cols) {
if (!is.null(graph$idx)) {
return(check_graph(graph, expected_rows, expected_cols))
}
ngraphs <- length(graph)
for (i in 1:ngraphs) {
check_graph(graph[[i]], expected_rows, expected_cols)
}
}

# from a nn graph (or list) get the first non-NULL row names
nn_graph_row_names <- function(graph) {
if (is.null(graph$idx)) {
graph <- graph[[1]]
}
xnames <- NULL
if (!is.null(row.names(graph$idx))) {
xnames <- row.names(graph$idx)
}
if (is.null(xnames) && !is.null(row.names(graph$dist))) {
xnames <- row.names(graph$dist)
}
xnames
}

# from a nn graph (or list) get the number of neighbors
nn_graph_nbrs <- function(graph) {
if (is.null(graph$idx)) {
graph <- graph[[1]]
}
ncol(graph$idx)
}
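These helpers are internal (outside the package they would need the `uwot:::` prefix), but a hand-made graph illustrates them: `check_graph` is silent for a well-formed graph, and `nn_graph_row_names` takes row names from `idx` if present, falling back to `dist`.

```r
idx  <- matrix(c(1L, 2L, 3L, 2L, 3L, 1L), nrow = 3)
dist <- matrix(runif(6), nrow = 3,
               dimnames = list(c("a", "b", "c"), NULL))
graph <- list(idx = idx, dist = dist)

check_graph(graph, expected_rows = 3, expected_cols = 2)  # passes silently
nn_graph_row_names(graph)  # "a" "b" "c" (from dist, since idx is unnamed)
nn_graph_nbrs(graph)       # 2
```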
38 changes: 25 additions & 13 deletions R/uwot.R
@@ -5,9 +5,6 @@
#' the following help text is lifted verbatim from the Python reference
#' implementation at \url{https://github.com/lmcinnes/umap}.
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row. Data frames
@@ -179,7 +176,7 @@
#' }
#' By default, if \code{X} has less than 4,096 vertices, the exact nearest
#' neighbors are found. Otherwise, approximate nearest neighbors are used.
#' You may also pass precalculated nearest neighbor data to this argument. It
#' You may also pass pre-calculated nearest neighbor data to this argument. It
#' must be a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
@@ -465,9 +462,6 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
#' get back the Cauchy distribution as used in t-SNE and LargeVis. It also
#' results in a substantially simplified gradient expression. This can give
#' a speed improvement of around 50\%.
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
@@ -870,9 +864,6 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
#' dataset. Using \code{init = "spectral"} can help.
#' }
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row. Data frames
@@ -1287,6 +1278,9 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
# number of original columns in data frame (or matrix)
# will be used only if using df or matrix and ret_model = TRUE
norig_col <- NULL
# row names for the input data, which we will apply to the embedding if
# needed
Xnames <- NULL
if (is.null(X)) {
if (!is.list(nn_method)) {
stop("If X is NULL, must provide NN data in nn_method")
@@ -1295,6 +1289,11 @@
stop("init = 'pca' and 'spca' can't be used with X = NULL")
}
n_vertices <- x2nv(nn_method)
stopifnot(n_vertices > 0)
n_neighbors <- nn_graph_nbrs(nn_method)
stopifnot(n_neighbors > 1 && n_neighbors <= n_vertices)
check_graph_list(nn_method, n_vertices, n_neighbors)
Xnames <- nn_graph_row_names(nn_method)
}
else if (methods::is(X, "dist")) {
if (ret_model) {
@@ -1303,6 +1302,7 @@
checkna(X)
n_vertices <- attr(X, "Size")
tsmessage("Read ", n_vertices, " rows")
Xnames <- labels(X)
}
else if (methods::is(X, "sparseMatrix")) {
if (ret_model) {
@@ -1314,6 +1314,7 @@
stop("Sparse matrices are only supported as distance matrices")
}
tsmessage("Read ", n_vertices, " rows of sparse distance matrix")
Xnames <- row.names(X)
}
else {
cat_ids <- NULL
@@ -1349,6 +1350,7 @@
time_stamp = FALSE
)
}
Xnames <- row.names(X)
X <- scale_input(X,
scale_type = scale, ret_model = ret_model,
verbose = verbose
@@ -1360,11 +1362,12 @@
}

if (n_neighbors > n_vertices) {
# If nn_method is a list, we will determine n_neighbors later
# pre-calculated nearest neighbors ignores the user-supplied n_neighbors
# which is handled later
if (!is.list(nn_method)) {
# Otherwise, for LargeVis, n_neighbors is normally determined from perplexity,
# so it's not an error for it to be too large
if (method == "largevis") {
# for LargeVis, n_neighbors normally determined from perplexity not an
# error to be too large
tsmessage("Setting n_neighbors to ", n_vertices)
n_neighbors <- n_vertices
}
@@ -1675,6 +1678,11 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
gc()
# Center the points before returning
embedding <- scale(embedding, center = TRUE, scale = FALSE)

if (is.null(row.names(embedding)) &&
!is.null(Xnames) && length(Xnames) == nrow(embedding)) {
row.names(embedding) <- Xnames
}
tsmessage("Optimization finished")
}

@@ -1730,6 +1738,10 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
res$nn <- list()
for (i in 1:nblocks) {
res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist)
if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) {
row.names(res$nn[[i]]$idx) <- Xnames
row.names(res$nn[[i]]$dist) <- Xnames
}
}
names(res$nn) <- names(nns)
}
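Rounding out the uwot.R changes, a sketch (not part of this commit) of calling `umap` with `X = NULL` and pre-computed neighbor data: the graph is validated by the new checks and its row names, if any, end up on the embedding. As above, `FNN` is only an assumption used to build the example matrices.

```r
library(uwot)
library(FNN)  # assumption: used only to produce the example neighbor matrices

X <- as.matrix(iris[, 1:4])
row.names(X) <- paste0("iris", seq_len(nrow(X)))

# nn_method format for umap(): each item is its own first neighbor.
knn  <- FNN::get.knn(X, k = 14)
idx  <- cbind(seq_len(nrow(X)), knn$nn.index)
dist <- cbind(0, knn$nn.dist)
row.names(idx) <- row.names(X)  # names on the graph are what gets picked up

res <- umap(X = NULL, nn_method = list(idx = idx, dist = dist))
head(row.names(res))  # "iris1" "iris2" ...
```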
3 changes: 0 additions & 3 deletions man/lvish.Rd

Some generated files are not rendered by default.

3 changes: 0 additions & 3 deletions man/tumap.Rd

Some generated files are not rendered by default.
