#81: retain row names from input data

jlmelville committed Aug 22, 2021
1 parent fd9bae0 commit 0ad9eda
Showing 13 changed files with 210 additions and 66 deletions.
23 changes: 23 additions & 0 deletions NEWS.md
@@ -1,3 +1,26 @@
# uwot 0.1.11

## Bug fixes and minor improvements

* If row names are provided in the input data (or the nearest neighbor data, or the
initialization data if it's a matrix), they will be used to name the rows of the
output embedding (<https://github.com/jlmelville/uwot/issues/81>), and also the
nearest neighbor data if you set `ret_nn = TRUE`. If names exist in more than
one of the input data parameters listed above but are inconsistent, no
guarantees are made about which names will be used. Thank you
[jwijffels](https://github.com/jwijffels) for reporting this.
* Setting `nn_method = "annoy"` with `verbose = TRUE` would lead to an error for
datasets with fewer than 50 items.
* Using multiple pre-computed nearest neighbor blocks is now supported with
`umap_transform` (this was previously documented to work, but did not).
* Documentation around pre-calculated nearest neighbor data for `umap_transform`
was wrong in other ways: it has now been corrected to indicate that there should
be neighbor data for each item in the test data, but the neighbors and distances
should refer to items in the training data (i.e. the data used to build the model).
* The `n_neighbors` parameter is now correctly ignored in model generation if
pre-calculated nearest neighbor data is provided.
* The documentation incorrectly said `grain_size` didn't do anything.

# uwot 0.1.10

This release is mainly to allow for some internal changes to keep compatibility
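The row-name retention described in the first bullet above can be sketched as follows (a minimal example, not part of this commit; it assumes the default Euclidean metric, so the returned neighbor block is named `euclidean`):

```r
library(uwot)

# Row names on the input should carry through to the embedding rows and,
# with ret_nn = TRUE, to the returned nearest neighbor matrices.
X <- as.matrix(iris[, 1:4])
row.names(X) <- paste0("iris", seq_len(nrow(X)))

res <- umap(X, ret_nn = TRUE)
head(row.names(res$embedding))        # "iris1" "iris2" ...
head(row.names(res$nn$euclidean$idx)) # the same names on the neighbor data
```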
8 changes: 4 additions & 4 deletions R/neighbors.R
@@ -128,8 +128,8 @@ annoy_build <- function(X, metric = "euclidean", n_trees = 50,
", n_trees = ", n_trees
)
ann <- annoy$ann
if (verbose) {
nstars <- 50
nstars <- 50
if (verbose && nr > nstars) {
progress_for(
nr, nstars,
function(chunk_start, chunk_end) {
@@ -238,8 +238,8 @@ annoy_search_serial <- function(X, k, ann,
nr <- nrow(X)
idx <- matrix(nrow = nr, ncol = k)
dist <- matrix(nrow = nr, ncol = k)
if (verbose) {
nstars <- 50
nstars <- 50
if (verbose && nr > nstars) {
progress_for(
nr, nstars,
function(chunk_start, chunk_end) {
72 changes: 58 additions & 14 deletions R/transform.R
@@ -15,13 +15,14 @@
#' have the same columns in the same order as the input data used to generate
#' the \code{model}.
#' @param model Data associated with an existing embedding.
#' @param nn_method Optional pre-calculated nearest neighbor data. It must be a
#' list consisting of two elements:
#' @param nn_method Optional pre-calculated nearest neighbor data. The format is
#' a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
#' containing the integer indexes of the nearest neighbors in \code{X}. Each
#' vertex is considered to be its own nearest neighbor, i.e.
#' \code{idx[, 1] == 1:n_vertices}.
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix where
#' \code{n_vertices} is the number of items to be transformed. The contents
#' of the matrix should be the integer indexes of the data used to generate
#' the \code{model}, which are the \code{n_neighbors}-nearest neighbors of
#' the data to be transformed.
#' \item \code{"dist"}. A \code{n_vertices x n_neighbors} matrix
#' containing the distances of the nearest neighbors.
#' }
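A minimal sketch of the `nn_method` format documented above (not part of this commit): the indexes and distances for the new data must refer to items in the training data. `FNN` is an assumption here, used only to build the example matrices; any k-nearest-neighbor routine returning index and distance matrices would do.

```r
library(uwot)
library(FNN)  # assumption: used only to produce the example neighbor matrices

train <- as.matrix(iris[1:100, 1:4])
test  <- as.matrix(iris[101:150, 1:4])

model <- umap(train, n_neighbors = 10, ret_model = TRUE)

# The 10 nearest training items for each test item.
knn <- FNN::get.knnx(train, test, k = 10)
nn  <- list(idx = knn$nn.index, dist = knn$nn.dist)

test_embedding <- umap_transform(X = NULL, model = model, nn_method = nn)
dim(test_embedding)  # one row per test item
```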
@@ -143,6 +144,7 @@ umap_transform <- function(X = NULL, model = NULL,
method <- model$method
scale_info <- model$scale_info
metric <- model$metric
nblocks <- length(metric)
pca_models <- model$pca_models

a <- model$a
@@ -158,7 +160,9 @@
pcg_rand <- TRUE
}

if(!is.null(X)){
n_vertices <- NULL
Xnames <- NULL
if (!is.null(X)){
if (ncol(X) != norig_col) {
stop("Incorrect dimensions: X must have ", norig_col, " columns")
}
@@ -170,8 +174,36 @@
X <- as.matrix(X[, indexes])
}
n_vertices <- nrow(X)
} else if (nn_is_precomputed(nn_method)){
n_vertices <- nrow(nn_method$idx)
if (!is.null(row.names(X))) {
Xnames <- row.names(X)
}
} else if (nn_is_precomputed(nn_method)) {
if (nblocks == 1) {
if (length(nn_method) == 1) {
graph <- nn_method[[1]]
}
else {
graph <- nn_method
}
n_vertices <- nrow(graph$idx)
check_graph(graph, n_vertices, n_neighbors)
if (is.null(Xnames)) {
Xnames <- nn_graph_row_names(graph)
}
}
else {
stopifnot(length(nn_method) == nblocks)
for (i in 1:nblocks) {
graph <- nn_method[[i]]
if (is.null(n_vertices)) {
n_vertices <- nrow(graph$idx)
}
check_graph(graph, n_vertices, n_neighbors)
if (is.null(Xnames)) {
Xnames <- nn_graph_row_names(graph)
}
}
}
}

if (!is.null(init)) {
@@ -198,6 +230,9 @@
xdim[1], ", ", xdim[2], "), but was (",
indim[1], ", ", indim[2], ")")
}
if (is.null(Xnames) && !is.null(row.names(init))) {
Xnames <- row.names(init)
}
init_weighted <- NULL
}
else {
@@ -215,7 +250,6 @@

adjusted_local_connectivity <- max(0, local_connectivity - 1.0)

nblocks <- length(metric)
graph <- NULL
embedding <- NULL
for (i in 1:nblocks) {
@@ -239,18 +273,24 @@
verbose = verbose
)
}
if(!is.null(X)){
if (!is.null(X)) {
nn <- annoy_search(Xsub,
k = n_neighbors, ann = ann, search_k = search_k,
prep_data = TRUE,
tmpdir = tmpdir,
n_threads = n_threads, grain_size = grain_size,
verbose = verbose
)
} else if (nn_is_precomputed(nn_method)){
nn <- nn_method
} else if (nn_is_precomputed(nn_method)) {
if (nblocks == 1 && !is.null(nn_method$idx)) {
# When there's only one block, the NN graph can be passed directly
nn <- nn_method
}
else {
# otherwise we expect a list of NN graphs
nn <- nn_method[[i]]
}
}

graph_block <- smooth_knn(nn,
local_connectivity = adjusted_local_connectivity,
n_threads = n_threads,
@@ -309,6 +349,7 @@
)

embedding <- t(embedding)
row.names(train_embedding) <- NULL
train_embedding <- t(train_embedding)
if (tolower(method) == "umap") {
embedding <- optimize_layout_umap(
@@ -349,6 +390,9 @@
embedding <- t(embedding)
}
tsmessage("Finished")
if (!is.null(Xnames)) {
row.names(embedding) <- Xnames
}
embedding
}

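Two smaller behaviors from this file, sketched by reusing `model` and `nn` from the example above: with a single metric block the pre-computed graph can be passed bare or wrapped in a one-element list, and if neither `X` nor the graph carries row names, names on a matrix `init` (an argument assumed from the handling shown in this diff) are used for the output.

```r
# Both shapes are accepted when the model has a single metric block.
emb1 <- umap_transform(X = NULL, model = model, nn_method = nn)
emb2 <- umap_transform(X = NULL, model = model, nn_method = list(nn))

# Row names on a matrix init end up on the output when nothing else is named.
init <- matrix(rnorm(nrow(nn$idx) * 2), ncol = 2,
               dimnames = list(paste0("new", seq_len(nrow(nn$idx))), NULL))
emb3 <- umap_transform(X = NULL, model = model, nn_method = nn, init = init)
head(row.names(emb3))  # "new1" "new2" ...
```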
44 changes: 43 additions & 1 deletion R/util.R
@@ -115,7 +115,6 @@ progress_for <- function(n, nchunks, fun) {
chunk_start <- chunk_end + 1
chunk_end <- chunk_start + round(remaining / (nchunks - i + 1)) - 1
remaining <- remaining - (chunk_end - chunk_start + 1)

fun(chunk_start, chunk_end)

message("*", appendLF = FALSE)
@@ -129,3 +128,46 @@ checkna <- function(X) {
stop("Missing values found in 'X'")
}
}

check_graph <- function(graph, expected_rows, expected_cols) {
idx <- graph$idx
dist <- graph$dist
stopifnot(methods::is(idx, "matrix"))
stopifnot(methods::is(dist, "matrix"))
stopifnot(dim(idx) == dim(dist))
stopifnot(nrow(idx) == expected_rows)
stopifnot(ncol(idx) == expected_cols)
}

check_graph_list <- function(graph, expected_rows, expected_cols) {
if (!is.null(graph$idx)) {
return(check_graph(graph, expected_rows, expected_cols))
}
ngraphs <- length(graph)
for (i in 1:ngraphs) {
check_graph(graph[[i]], expected_rows, expected_cols)
}
}

# from a nn graph (or list) get the first non-NULL row names
nn_graph_row_names <- function(graph) {
if (is.null(graph$idx)) {
graph <- graph[[1]]
}
xnames <- NULL
if (!is.null(row.names(graph$idx))) {
xnames <- row.names(graph$idx)
}
if (is.null(xnames) && !is.null(row.names(graph$dist))) {
xnames <- row.names(graph$dist)
}
xnames
}

# from a nn graph (or list) get the number of neighbors
nn_graph_nbrs <- function(graph) {
if (is.null(graph$idx)) {
graph <- graph[[1]]
}
ncol(graph$idx)
}
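These helpers are internal (outside the package they would need the `uwot:::` prefix), but a hand-made graph illustrates them: `check_graph` is silent for a well-formed graph, and `nn_graph_row_names` takes row names from `idx` if present, falling back to `dist`.

```r
idx  <- matrix(c(1L, 2L, 3L, 2L, 3L, 1L), nrow = 3)
dist <- matrix(runif(6), nrow = 3,
               dimnames = list(c("a", "b", "c"), NULL))
graph <- list(idx = idx, dist = dist)

check_graph(graph, expected_rows = 3, expected_cols = 2)  # passes silently
nn_graph_row_names(graph)  # "a" "b" "c" (from dist, since idx is unnamed)
nn_graph_nbrs(graph)       # 2
```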
38 changes: 25 additions & 13 deletions R/uwot.R
@@ -5,9 +5,6 @@
#' the following help text is lifted verbatim from the Python reference
#' implementation at \url{https://github.com/lmcinnes/umap}.
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row. Data frames
@@ -179,7 +176,7 @@
#' }
#' By default, if \code{X} has less than 4,096 vertices, the exact nearest
#' neighbors are found. Otherwise, approximate nearest neighbors are used.
#' You may also pass precalculated nearest neighbor data to this argument. It
#' You may also pass pre-calculated nearest neighbor data to this argument. It
#' must be a list consisting of two elements:
#' \itemize{
#' \item \code{"idx"}. A \code{n_vertices x n_neighbors} matrix
@@ -465,9 +462,6 @@ umap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
#' get back the Cauchy distribution as used in t-SNE and LargeVis. It also
#' results in a substantially simplified gradient expression. This can give
#' a speed improvement of around 50\%.
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
@@ -870,9 +864,6 @@ tumap <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
#' dataset. Using \code{init = "spectral"} can help.
#' }
#'
#' Note that the \code{grain_size} parameter no longer does anything and is
#' present to avoid break backwards compatibility only.
#'
#' @param X Input data. Can be a \code{\link{data.frame}}, \code{\link{matrix}},
#' \code{\link[stats]{dist}} object or \code{\link[Matrix]{sparseMatrix}}.
#' Matrix and data frames should contain one observation per row. Data frames
@@ -1287,6 +1278,9 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
# number of original columns in data frame (or matrix)
# will be used only if using df or matrix and ret_model = TRUE
norig_col <- NULL
# row names for the input data, which we will apply to the embedding if
# needed
Xnames <- NULL
if (is.null(X)) {
if (!is.list(nn_method)) {
stop("If X is NULL, must provide NN data in nn_method")
@@ -1295,6 +1289,11 @@
stop("init = 'pca' and 'spca' can't be used with X = NULL")
}
n_vertices <- x2nv(nn_method)
stopifnot(n_vertices > 0)
n_neighbors <- nn_graph_nbrs(nn_method)
stopifnot(n_neighbors > 1 && n_neighbors <= n_vertices)
check_graph_list(nn_method, n_vertices, n_neighbors)
Xnames <- nn_graph_row_names(nn_method)
}
else if (methods::is(X, "dist")) {
if (ret_model) {
@@ -1303,6 +1302,7 @@
checkna(X)
n_vertices <- attr(X, "Size")
tsmessage("Read ", n_vertices, " rows")
Xnames <- labels(X)
}
else if (methods::is(X, "sparseMatrix")) {
if (ret_model) {
@@ -1314,6 +1314,7 @@
stop("Sparse matrices are only supported as distance matrices")
}
tsmessage("Read ", n_vertices, " rows of sparse distance matrix")
Xnames <- row.names(X)
}
else {
cat_ids <- NULL
@@ -1349,6 +1350,7 @@
time_stamp = FALSE
)
}
Xnames <- row.names(X)
X <- scale_input(X,
scale_type = scale, ret_model = ret_model,
verbose = verbose
@@ -1360,11 +1362,12 @@
}

if (n_neighbors > n_vertices) {
# If nn_method is a list, we will determine n_neighbors later
# pre-calculated nearest neighbors ignores the user-supplied n_neighbors
# which is handled later
if (!is.list(nn_method)) {
# Otherwise, for LargeVis, n_neighbors is normally determined from perplexity,
# so it's not an error for it to be too large
if (method == "largevis") {
# for LargeVis, n_neighbors normally determined from perplexity not an
# error to be too large
tsmessage("Setting n_neighbors to ", n_vertices)
n_neighbors <- n_vertices
}
@@ -1675,6 +1678,11 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
gc()
# Center the points before returning
embedding <- scale(embedding, center = TRUE, scale = FALSE)

if (is.null(row.names(embedding)) &&
!is.null(Xnames) && length(Xnames) == nrow(embedding)) {
row.names(embedding) <- Xnames
}
tsmessage("Optimization finished")
}

@@ -1730,6 +1738,10 @@ uwot <- function(X, n_neighbors = 15, n_components = 2, metric = "euclidean",
res$nn <- list()
for (i in 1:nblocks) {
res$nn[[i]] <- list(idx = nns[[i]]$idx, dist = nns[[i]]$dist)
if (!is.null(Xnames) && nrow(res$nn[[i]]$idx) == length(Xnames)) {
row.names(res$nn[[i]]$idx) <- Xnames
row.names(res$nn[[i]]$dist) <- Xnames
}
}
names(res$nn) <- names(nns)
}
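Rounding out the uwot.R changes, a sketch (not part of this commit) of calling `umap` with `X = NULL` and pre-computed neighbor data: the graph is validated by the new checks and its row names, if any, end up on the embedding. As above, `FNN` is only an assumption used to build the example matrices.

```r
library(uwot)
library(FNN)  # assumption: used only to produce the example neighbor matrices

X <- as.matrix(iris[, 1:4])
row.names(X) <- paste0("iris", seq_len(nrow(X)))

# nn_method format for umap(): each item is its own first neighbor.
knn  <- FNN::get.knn(X, k = 14)
idx  <- cbind(seq_len(nrow(X)), knn$nn.index)
dist <- cbind(0, knn$nn.dist)
row.names(idx) <- row.names(X)  # names on the graph are what gets picked up

res <- umap(X = NULL, nn_method = list(idx = idx, dist = dist))
head(row.names(res))  # "iris1" "iris2" ...
```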
3 changes: 0 additions & 3 deletions man/lvish.Rd

Some generated files are not rendered by default.

3 changes: 0 additions & 3 deletions man/tumap.Rd

Some generated files are not rendered by default.
