-
Notifications
You must be signed in to change notification settings - Fork 43
/
slingshot.R
524 lines (508 loc) · 26.6 KB
/
slingshot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
#' @rdname slingshot
#'
#' @description Perform trajectory inference by (1) identifying lineage
#' structure with a cluster-based minimum spanning tree, and (2) constructing
#' smooth representations of each lineage using simultaneous principal curves.
#' This function wraps the \code{\link{getLineages}} and
#' \code{\link{getCurves}} functions and is the primary function of the
#' \code{slingshot} package.
#'
#' @param data a data object containing the matrix of coordinates to be used for
#' lineage inference. Supported types include \code{matrix},
#' \code{\link{SingleCellExperiment}}, \code{\link{SlingshotDataSet}}, and
#' \code{\link[TrajectoryUtils]{PseudotimeOrdering}}.
#' @param clusterLabels each cell's cluster assignment. This can be a single
#' vector of labels, or a \code{#cells} by \code{#clusters} matrix
#' representing weighted cluster assignment. Either representation may
#' optionally include a \code{"-1"} group meaning "unclustered."
#' @param reducedDim (optional) the dimensionality reduction to be used. Can be
#' a matrix or a character identifying which element of
#' \code{reducedDim(data)} is to be used. If multiple dimensionality
#' reductions are present and this argument is not provided, the first element
#' will be used by default.
#' @param start.clus (optional) character, indicates the starting cluster(s)
#' from which lineages will be drawn.
#' @param end.clus (optional) character, indicates which cluster(s) will be
#' forced to be leaf nodes in the graph.
#' @param dist.method (optional) character, specifies the method for calculating
#' distances between clusters. Default is \code{"slingshot"}, see
#' \code{\link[TrajectoryUtils]{createClusterMST}} for details.
#' @param use.median logical, whether to use the median (instead of mean) when
#' calculating cluster centroid coordinates.
#' @param omega (optional) numeric, this granularity parameter determines the
#' distance between every real cluster and the artificial cluster,
#' \code{.OMEGA}. In practice, this makes \code{omega} the maximum allowable
#' distance between two connected clusters. By default, \code{omega = Inf}. If
#' \code{omega = TRUE}, the maximum edge length will be set to the median edge
#' length of the unsupervised MST times a scaling factor (\code{omega_scale},
#' default \code{= 3}). This value is provided as a potentially useful rule of
#' thumb for datasets with outlying clusters or multiple, distinct
#' trajectories. See \code{outgroup} in
#' \code{\link[TrajectoryUtils]{createClusterMST}}.
#' @param omega_scale (optional) numeric, scaling factor to use when \code{omega
#' = TRUE}. The maximum edge length will be set to the median edge length of
#' the unsupervised MST times \code{omega_scale} (default \code{= 1.5}). See
#' \code{outscale} in \code{\link[TrajectoryUtils]{createClusterMST}}.
#' @param times numeric, vector of external times associated with either
#' clusters or cells. See \code{\link[TrajectoryUtils]{defineMSTPaths}} for
#' details.
#' @param shrink logical or numeric between 0 and 1, determines whether and how
#' much to shrink branching lineages toward their average prior to the split
#' (default \code{= TRUE}).
#' @param extend character, how to handle root and leaf clusters of lineages
#' when constructing the initial, piece-wise linear curve. Accepted values are
#' \code{'y'} (default), \code{'n'}, and \code{'pc1'}. See 'Details' for more.
#' @param reweight logical, whether to allow cells shared between lineages to be
#' reweighted during curve fitting. If \code{TRUE} (default), cells shared
#' between lineages will be iteratively reweighted based on the quantiles of
#' their projection distances to each curve. See 'Details' for more.
#' @param reassign logical, whether to reassign cells to lineages at each
#' iteration. If \code{TRUE} (default), cells will be added to a lineage when
#' their projection distance to the curve is less than the median distance for
#' all cells currently assigned to the lineage. Additionally, shared cells
#' will be removed from a lineage if their projection distance to the curve is
#' above the 90th percentile and their weight along the curve is less than
#' \code{0.1}.
#' @param thresh numeric, determines the convergence criterion. Percent change
#' in the total distance from cells to their projections along curves must be
#' less than \code{thresh}. Default is \code{0.001}, similar to
#' \code{\link[princurve]{principal_curve}}.
#' @param maxit numeric, maximum number of iterations (default \code{= 15}), see
#' \code{\link[princurve]{principal_curve}}.
#' @param stretch numeric factor by which curves can be extrapolated beyond
#' endpoints. Default is \code{2}, see
#' \code{\link[princurve]{principal_curve}}.
#' @param approx_points numeric, whether curves should be approximated by a
#' fixed number of points. If \code{FALSE} (or 0), no approximation will be
#' performed and curves will contain as many points as the input data. If
#' numeric, curves will be approximated by this number of points (default
#' \code{= 150} or \code{#cells}, whichever is smaller). See 'Details' and
#' \code{\link[princurve]{principal_curve}} for more.
#' @param smoother choice of scatter plot smoother. Same as
#' \code{\link[princurve]{principal_curve}}, but \code{"lowess"} option is
#' replaced with \code{"loess"} for additional flexibility.
#' @param shrink.method character denoting how to determine the appropriate
#' amount of shrinkage for a branching lineage. Accepted values are the same
#' as for \code{kernel} in \code{\link{density}} (default is \code{"cosine"}),
#' as well as \code{"tricube"} and \code{"density"}. See 'Details' for more.
#' @param allow.breaks logical, determines whether curves that branch very close
#' to the origin should be allowed to have different starting points.
#' @param ... Additional parameters to pass to scatter plot smoothing function,
#' \code{smoother}.
#'
#'
#' @details Given a reduced-dimensional data matrix \code{n} by \code{p} and a
#' vector of cluster labels (or matrix of soft cluster assignments,
#' potentially including a \code{-1} label for "unclustered"), this function
#' performs trajectory inference using a cluster-based minimum spanning tree
#' on the clusters and simultaneous principal curves for smooth, branching
#' paths.
#'
#' @details The graph of this structure is learned by fitting a (possibly
#' constrained) minimum-spanning tree on the clusters, plus the artificial
#' cluster, \code{.OMEGA}, which is a fixed distance away from every real
#' cluster. This effectively limits the maximum branch length in the MST to
#' the chosen distance, meaning that the output may contain multiple trees.
#'
#' @details Once the graph is known, lineages are identified in
#' any tree with at least two clusters. For a given tree, if there is an
#' annotated starting cluster, every possible path out of a starting cluster
#' and ending in a leaf that isn't another starting cluster will be returned.
#' If no starting cluster is annotated, one will be chosen by a heuristic
#' method, but this is not recommended.
#'
#' @details When there is only a single lineage, the curve-fitting algorithm is
#' nearly identical to that of \code{\link[princurve]{principal_curve}}. When
#' there are multiple lineages and \code{shrink > 0}, an additional step
#' is added to the iterative procedure, forcing curves to be similar in the
#' neighborhood of shared points (ie., before they branch).
#'
#' @details The \code{approx_points} argument, which sets the number of points
#' to be used for each curve, can have a large effect on computation time. Due
#' to this consideration, we set the default value to \code{150} whenever the
#' input dataset contains more than that many cells. This setting should help
#' with exploratory analysis while having little to no impact on the final
#' curves. To disable this behavior and construct curves with the maximum
#' number of points, set \code{approx_points = FALSE}.
#'
#' @details The \code{extend} argument determines how to construct the
#' piece-wise linear curve used to initiate the recursive algorithm. The
#' initial curve is always based on the lines between cluster centers and if
#' \code{extend = 'n'}, this curve will terminate at the center of the
#' endpoint clusters. Setting \code{extend = 'y'} will allow the first and
#' last segments to extend beyond the cluster center to the orthogonal
#' projection of the furthest point. Setting \code{extend = 'pc1'} is similar
#' to \code{'y'}, but uses the first principal component of the cluster to
#' determine the direction of the curve beyond the cluster center. These
#' options typically have limited impact on the final curve, but can
#' occasionally help with stability issues.
#'
#' @details When \code{shink = TRUE}, we compute a percent shrinkage curve,
#' \eqn{w_l(t)}, for each lineage, a non-increasing function of pseudotime
#' that determines how much that lineage should be shrunk toward a shared
#' average curve. We set \eqn{w_l(0) = 1} (complete shrinkage), so that the
#' curves will always perfectly overlap the average curve at pseudotime
#' \code{0}. The weighting curve decreases from \code{1} to \code{0} over the
#' non-outlying pseudotime values of shared cells (where outliers are defined
#' by the \code{1.5*IQR} rule). The exact shape of the curve in this region is
#' controlled by \code{shrink.method}, and can follow the shape of any
#' standard kernel function's cumulative density curve (or more precisely,
#' survival curve, since we require a decreasing function). Different choices
#' of \code{shrink.method} to have no discernable impact on the final curves,
#' in most cases.
#'
#' @details When \code{reweight = TRUE}, weights for shared cells are based on
#' the quantiles of their projection distances onto each curve. The
#' distances are ranked and converted into quantiles between \code{0} and
#' \code{1}, which are then transformed by \code{1 - q^2}. Each cell's weight
#' along a given lineage is the ratio of this value to the maximum value for
#' this cell across all lineages.
#'
#' @references Hastie, T., and Stuetzle, W. (1989). "Principal Curves."
#' \emph{Journal of the American Statistical Association}, 84:502-516.
#'
#' @references Street, K., et al. (2018). "Slingshot: cell lineage and
#' pseudotime inference for single-cell transcriptomics." \emph{BMC Genomics},
#' 19:477.
#'
#' @return An object of class \code{\link{PseudotimeOrdering}} containing the
#' pseudotime estimates and lineage assignment weights in the \code{assays}.
#' The \code{reducedDim} and \code{clusterLabels} matrices will be stored in
#' the \code{\link[TrajectoryUtils]{cellData}}. Additionally, the
#' \code{metadata} slot will contain an \code{\link[igraph]{igraph}} object
#' named \code{mst}, a list of parameter values named \code{slingParams}, a
#' list of lineages (ordered sets of clusters) named \code{lineages}, and a
#' list of \code{\link[princurve]{principal_curve}} objects named
#' \code{curves}.
#'
#' @examples
#' data("slingshotExample")
#' rd <- slingshotExample$rd
#' cl <- slingshotExample$cl
#' pto <- slingshot(rd, cl, start.clus = '1')
#'
#' # plotting
#' sds <- as.SlingshotDataSet(pto)
#' plot(rd, col = cl, asp = 1)
#' lines(sds, type = 'c', lwd = 3)
#'
#' @export
#'
setMethod(f = "slingshot",
signature = signature(data = "matrix",
clusterLabels = "character"),
definition = function(data, clusterLabels,
reducedDim = NULL,
start.clus = NULL, end.clus = NULL,
dist.method = "slingshot",
use.median = FALSE,
omega = FALSE, omega_scale = 1.5,
times = NULL,
shrink = TRUE,
extend = 'y',
reweight = TRUE,
reassign = TRUE,
thresh = 0.001, maxit = 15, stretch = 2,
approx_points = NULL,
smoother = 'smooth.spline',
shrink.method = 'cosine',
allow.breaks = TRUE, ...){
pto <- getLineages(data, clusterLabels, reducedDim = reducedDim,
start.clus = start.clus, end.clus = end.clus,
dist.method = dist.method,
use.median = use.median, omega = omega,
omega_scale = omega_scale, times = times)
pto <- getCurves(pto,
shrink = shrink, extend = extend,
reweight = reweight, reassign = reassign,
thresh = thresh, maxit = maxit,
approx_points = approx_points,
stretch = stretch, smoother = smoother,
shrink.method = shrink.method,
allow.breaks = allow.breaks, ...)
return(pto)
}
)
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "matrix",
clusterLabels = "matrix"),
definition = function(data, clusterLabels,
reducedDim = NULL,
start.clus = NULL, end.clus = NULL,
dist.method = "slingshot",
use.median = FALSE,
omega = FALSE, omega_scale = 1.5,
times = NULL,
shrink = TRUE,
extend = 'y',
reweight = TRUE,
reassign = TRUE,
thresh = 0.001, maxit = 15, stretch = 2,
approx_points = NULL,
smoother = 'smooth.spline',
shrink.method = 'cosine',
allow.breaks = TRUE, ...){
pto <- getLineages(data, clusterLabels, reducedDim = reducedDim,
start.clus = start.clus, end.clus = end.clus,
dist.method = dist.method,
use.median = use.median, omega = omega,
omega_scale = omega_scale, times = times)
pto <- getCurves(pto,
shrink = shrink, extend = extend,
reweight = reweight, reassign = reassign,
thresh = thresh, maxit = maxit,
approx_points = approx_points,
stretch = stretch, smoother = smoother,
shrink.method = shrink.method,
allow.breaks = allow.breaks, ...)
return(pto)
})
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "SlingshotDataSet",
clusterLabels = "ANY"),
definition = function(data, clusterLabels, ...){
return(slingshot(data = reducedDim(data),
clusterLabels = slingClusterLabels(data), ...))
})
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "data.frame",
clusterLabels = "ANY"),
definition = function(data, clusterLabels, ...){
RD <- as.matrix(data)
rownames(RD) <- rownames(data)
return(slingshot(data = RD,
clusterLabels = clusterLabels, ...))
})
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "matrix",
clusterLabels = "numeric"),
definition = function(data, clusterLabels, ...){
return(slingshot(data = data,
clusterLabels = as.character(clusterLabels), ...))
})
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "matrix",
clusterLabels = "factor"),
definition = function(data, clusterLabels, ...){
return(slingshot(data = data,
clusterLabels = as.character(clusterLabels), ...))
})
#' @rdname slingshot
#' @export
setMethod(f = "slingshot",
signature = signature(data = "matrix",
clusterLabels = "ANY"),
definition = function(data, clusterLabels, ...){
if(missing(clusterLabels)){
message('No cluster labels provided.',
' Continuing with one cluster.')
clusterLabels <- rep('1', nrow(data))
}
if(! any(c(length(clusterLabels), nrow(clusterLabels)) ==
nrow(data))){
stop("clusterLabels must have length or number of rows equal',
'to nrow(data).")
}
return(slingshot(data = data, clusterLabels = clusterLabels, ...))
})
#' @rdname slingshot
#' @importFrom SingleCellExperiment colData
#' @importFrom SummarizedExperiment colData<-
#' @importFrom SingleCellExperiment reducedDims
#' @importFrom SingleCellExperiment reducedDims<-
#' @export
setMethod(f = "slingshot",
signature = signature(data = "ClusterExperiment"),
definition = function(data, clusterLabels,
reducedDim = NULL,
start.clus = NULL, end.clus = NULL,
dist.method = "slingshot",
use.median = FALSE,
omega = FALSE, omega_scale = 1.5,
times = NULL,
shrink = TRUE,
extend = 'y',
reweight = TRUE,
reassign = TRUE,
thresh = 0.001, maxit = 15, stretch = 2,
approx_points = NULL,
smoother = 'smooth.spline',
shrink.method = 'cosine',
allow.breaks = TRUE, ...){
# SETUP
# determine the cluster labels and reducedDim matrix
if(is.null(reducedDim)){
if(length(reducedDims(data))==0){
stop('No dimensionality reduction found.')
}else{
message('Dimensionality reduction not explicitly ',
'chosen. Continuing with ',
names(reducedDims(data))[1])
rd <- reducedDims(data)[[1]]
}
}
if(length(reducedDim)==1){
if(reducedDim %in% names(reducedDims(data))){
rd <- reducedDims(data)[[as.character(reducedDim)]]
}else{
stop(reducedDim,' not found in reducedDims(data).')
}
}else{
if(!is.null(dim(reducedDim))){
rd <- reducedDim
reducedDims(data)$slingReducedDim <- reducedDim
}
}
if(missing(clusterLabels)){
cl <- clusterExperiment::primaryClusterNamed(data)
}else{
if(length(clusterLabels)==1){
if(clusterLabels %in% colnames(colData(data))){
cl <- colData(data)[[as.character(clusterLabels)]]
}else{
if(clusterLabels %in% colnames(
clusterExperiment::clusterMatrix(data))){
cl <- clusterExperiment::clusterMatrixNamed(
data)[,as.character(clusterLabels)]
}else{
stop(clusterLabels,' not found in colData(data)',
' or clusterMatrix(data).')
}
}
}
if(length(clusterLabels)>1){
if(!is.null(dim(clusterLabels)) &&
length(dim(clusterLabels)) > 1 &&
all(dim(clusterLabels) > 1)){
cl <- as.matrix(clusterLabels)
colnames(cl) <- paste0('sling_c',seq_len(ncol(cl)))
}else{
cl <- as.character(clusterLabels)
}
}
}
# run slingshot
pto <- slingshot(data = rd, clusterLabels = cl,
reducedDim = NULL,
start.clus = start.clus, end.clus = end.clus,
dist.method = dist.method,
use.median = use.median, omega = omega,
omega_scale = omega_scale, times = times,
shrink = shrink, extend = extend,
reweight = reweight, reassign = reassign,
thresh = thresh, maxit = maxit,
stretch = stretch, smoother = smoother,
approx_points = approx_points,
shrink.method = shrink.method,
allow.breaks = allow.breaks, ...)
# combine slingshot output with SCE
sce <- data
colData(sce)$slingshot <- pto
pst <- slingPseudotime(pto)
colnames(pst) <- paste0('slingPseudotime_',seq_len(ncol(pst)))
colData(sce) <- cbind(colData(sce), pst)
return(sce)
})
#' @rdname slingshot
#' @importFrom SingleCellExperiment colData
#' @importFrom SummarizedExperiment colData<-
#' @importFrom SingleCellExperiment reducedDims
#' @importFrom SingleCellExperiment reducedDims<-
#' @export
setMethod(f = "slingshot",
signature = signature(data = "SingleCellExperiment"),
definition = function(data, clusterLabels,
reducedDim = NULL,
start.clus = NULL, end.clus = NULL,
dist.method = "slingshot",
use.median = FALSE,
omega = FALSE, omega_scale = 1.5,
times = NULL,
shrink = TRUE,
extend = 'y',
reweight = TRUE,
reassign = TRUE,
thresh = 0.001, maxit = 15, stretch = 2,
approx_points = NULL,
smoother = 'smooth.spline',
shrink.method = 'cosine',
allow.breaks = TRUE, ...){
# SETUP
# determine the cluster labels and reducedDim matrix
if(is.null(reducedDim)){
if(length(reducedDims(data))==0){
stop('No dimensionality reduction found.')
}else{
message('Dimensionality reduction not explicitly ',
'chosen. Continuing with ',
names(reducedDims(data))[1])
rd <- reducedDims(data)[[1]]
}
}
if(length(reducedDim)==1){
if(reducedDim %in% names(reducedDims(data))){
rd <- reducedDims(data)[[as.character(reducedDim)]]
}else{
stop(reducedDim,' not found in reducedDims(data).')
}
}else{
if(!is.null(dim(reducedDim))){
rd <- reducedDim
reducedDims(data)$slingReducedDim <- reducedDim
}
}
if(missing(clusterLabels)){
message('No cluster labels provided. Continuing with ',
'one cluster.')
cl <- rep('1', nrow(rd))
}else{
if(length(clusterLabels)==1){
if(clusterLabels %in% colnames(colData(data))){
cl <- colData(data)[[as.character(clusterLabels)]]
}else{
stop(clusterLabels,' not found in colData(data).')
}
}
if(length(clusterLabels)>1){
if(!is.null(dim(clusterLabels)) &&
length(dim(clusterLabels)) > 1 &&
all(dim(clusterLabels) > 1)){
cl <- as.matrix(clusterLabels)
colnames(cl) <- paste0('sling_c',seq_len(ncol(cl)))
}else{
cl <- as.character(clusterLabels)
}
}
}
# run slingshot
pto <- slingshot(data = rd, clusterLabels = cl,
reducedDim = NULL,
start.clus = start.clus, end.clus = end.clus,
dist.method = dist.method,
use.median = use.median, omega = omega,
omega_scale = omega_scale, times = times,
shrink = shrink, extend = extend,
reweight = reweight, reassign = reassign,
thresh = thresh, maxit = maxit,
stretch = stretch, smoother = smoother,
approx_points = approx_points,
shrink.method = shrink.method,
allow.breaks = allow.breaks, ...)
# combine slingshot output with SCE
sce <- data
colData(sce)$slingshot <- pto
pst <- slingPseudotime(pto)
colnames(pst) <- paste0('slingPseudotime_',seq_len(ncol(pst)))
colData(sce) <- cbind(colData(sce), pst)
return(sce)
})