In [5]:
# Author: Fabio Zanini
# Date: 2020-05-27
# Description: learn how to use scmap
# Ah, namespace pollution starts immediately
library(SingleCellExperiment)
library(scmap)

# Build the reference atlas as a SingleCellExperiment and create its scmap
# cell index; the feature selection + indexing step is timed.
#
# NOTE: sce has cells as columns
# The TSV holds one cell per row (cell name in the 'index' column) and one
# gene per column, hence the transpose after reading.
fn_atlas <- '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts.tsv'
# Derive the number of columns from the header line instead of hard-coding the
# gene count, so colClasses always matches the file: the first column is the
# cell name (character), every remaining column is read as double.
n_fields <- length(scan(fn_atlas, what = character(), sep = '\t', nlines = 1, quiet = TRUE))
cols <- c('character', rep('double', n_fields - 1))
# NOTE(review): read.table defaults to check.names = TRUE, which rewrites the
# header (gene) names through make.names() -- confirm that is intended.
counts_atlas <- t(read.table(file = fn_atlas, sep = '\t', header = TRUE, colClasses = cols, row.names = 'index'))

# Per-cell metadata, keyed by the same 'index' column as the counts table
fn_meta_atlas <- '../data/for_scmap/TBS_kidney_atlas_subsample_20_metadata.tsv'
meta_atlas <- read.table(file = fn_meta_atlas, sep = '\t', header = TRUE, row.names = 'index')

atlas <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_atlas)), colData = meta_atlas)

# Only the scmap preprocessing below is timed, not the file reading above
start_time <- Sys.time()

# this is needed to calculate dropout rate for feature selection
# important: normcounts have the same zeros as raw counts (fpkm)
counts(atlas) <- normcounts(atlas)
logcounts(atlas) <- log2(normcounts(atlas) + 1)

# use gene names as feature symbols
rowData(atlas)$feature_symbol <- rownames(atlas)

# remove features with duplicated names
# NOTE(review): with check.names = TRUE the header names are already made
# unique by read.table, so this filter may never drop anything -- verify.
atlas <- atlas[!duplicated(rownames(atlas)), ]
atlas <- selectFeatures(atlas)

# store the cell index in metadata(atlas)$scmap_cell_index for later queries
atlas <- indexCell(atlas)

end_time <- Sys.time()
print('Runtime atlas:')
print(end_time - start_time)

# Project each replicate of the new data onto the atlas index with scmapCell
# and write the resulting nearest-neighbour table to a TSV file per replicate.
# Relies on `atlas` already carrying metadata(atlas)$scmap_cell_index.
repns <- c('1', '2', '3', '4', '5')

for (repn in repns) {

  # NOTE: sce has cells as columns
  # The TSV holds one cell per row (cell name in the 'index' column) and one
  # gene per column, hence the transpose after reading.
  fn_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_counts_rep_', repn, '.tsv')
  # Derive the number of columns from the header line instead of hard-coding
  # the gene count: first column is the cell name, the rest are doubles.
  n_fields <- length(scan(fn_newdata, what = character(), sep = '\t', nlines = 1, quiet = TRUE))
  cols <- c('character', rep('double', n_fields - 1))
  counts_newdata <- t(read.table(file = fn_newdata, sep = '\t', header = TRUE, colClasses = cols, row.names = 'index'))

  # Per-cell metadata, keyed by the same 'index' column as the counts table
  fn_meta_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_', repn, '.tsv')
  meta_newdata <- read.table(file = fn_meta_newdata, sep = '\t', header = TRUE, row.names = 'index')

  newdata <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_newdata)), colData = meta_newdata)

  # Timing covers preprocessing of the query data plus the projection itself
  start_time <- Sys.time()

  # this is needed to calculate dropout rate for feature selection
  # important: normcounts have the same zeros as raw counts (fpkm)
  counts(newdata) <- normcounts(newdata)
  logcounts(newdata) <- log2(normcounts(newdata) + 1)

  # use gene names as feature symbols
  rowData(newdata)$feature_symbol <- rownames(newdata)

  # remove features with duplicated names
  newdata <- newdata[!duplicated(rownames(newdata)), ]
  # NOTE(review): only the atlas index is handed to scmapCell below; confirm
  # that feature selection and indexing of the query data are needed, since
  # both are included in the reported runtime.
  newdata <- selectFeatures(newdata)

  newdata <- indexCell(newdata)

  ##############################################
  # run scmapCell to map the cells back to atlas
  ##############################################
  res <- scmapCell(
    projection = newdata,
    index_list = list(
      atlas = metadata(atlas)$scmap_cell_index
    ),
    w = 9
  )
  ##############################################

  end_time <- Sys.time()

  print('Runtime for newdata:')
  print(repn)
  print(end_time - start_time)

  # res has one entry per index in index_list; take the (only) atlas entry
  neighbors <- res[[1]]$cells

  # Extract TSV and do the rest in Python...
  fn_res <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_', repn, '_output.tsv')
  write.table(neighbors, file = fn_res, quote = FALSE, sep = '\t')

}

Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime atlas:"
Time difference of 0.3800428 secs


In [16]:
# Load more and more incomplete atlases
# For every atlas variant in `nat`, rebuild and index the atlas, then project
# each replicate of the new data onto it and write one neighbour TSV per
# (replicate, atlas variant) pair.
nat <- c('17', '16', '15', '14')
repns <- c('1', '2', '3', '4', '5')

for (na in nat) {
  print(na)

  # NOTE: sce has cells as columns
  # The TSV holds one cell per row and one gene per column, hence the
  # transpose after reading. The cell-name column is called 'X' in these files.
  fn_atlas <- paste0('../data/for_scmap/TBS_kidney_atlas_subsample_20_counts_na_', na, '.tsv')
  # Derive the number of columns from the header line instead of hard-coding
  # the gene count: first column is the cell name, the rest are doubles.
  n_fields <- length(scan(fn_atlas, what = character(), sep = '\t', nlines = 1, quiet = TRUE))
  cols <- c('character', rep('double', n_fields - 1))
  counts_atlas <- t(read.table(file = fn_atlas, sep = '\t', header = TRUE, colClasses = cols, row.names = 'X'))

  # Per-cell metadata, keyed by the same 'X' column as the counts table
  fn_meta_atlas <- paste0('../data/for_scmap/TBS_kidney_atlas_subsample_20_metadata_na_', na, '.tsv')
  meta_atlas <- read.table(file = fn_meta_atlas, sep = '\t', header = TRUE, row.names = 'X')

  atlas <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_atlas)), colData = meta_atlas)

  # Only the scmap preprocessing below is timed, not the file reading above
  start_time <- Sys.time()

  # this is needed to calculate dropout rate for feature selection
  # important: normcounts have the same zeros as raw counts (fpkm)
  counts(atlas) <- normcounts(atlas)
  logcounts(atlas) <- log2(normcounts(atlas) + 1)

  # use gene names as feature symbols
  rowData(atlas)$feature_symbol <- rownames(atlas)

  # remove features with duplicated names
  atlas <- atlas[!duplicated(rownames(atlas)), ]
  atlas <- selectFeatures(atlas)

  # store the cell index in metadata(atlas)$scmap_cell_index for the queries
  atlas <- indexCell(atlas)

  end_time <- Sys.time()
  print('Runtime atlas:')
  print(end_time - start_time)

  for (repn in repns) {

    # NOTE: sce has cells as columns
    # Same layout as the atlas table, but the cell-name column is 'index'.
    fn_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_counts_rep_', repn, '.tsv')
    n_fields <- length(scan(fn_newdata, what = character(), sep = '\t', nlines = 1, quiet = TRUE))
    cols <- c('character', rep('double', n_fields - 1))
    counts_newdata <- t(read.table(file = fn_newdata, sep = '\t', header = TRUE, colClasses = cols, row.names = 'index'))

    fn_meta_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_', repn, '.tsv')
    meta_newdata <- read.table(file = fn_meta_newdata, sep = '\t', header = TRUE, row.names = 'index')

    newdata <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_newdata)), colData = meta_newdata)

    # Timing covers preprocessing of the query data plus the projection itself
    start_time <- Sys.time()

    # this is needed to calculate dropout rate for feature selection
    # important: normcounts have the same zeros as raw counts (fpkm)
    counts(newdata) <- normcounts(newdata)
    logcounts(newdata) <- log2(normcounts(newdata) + 1)

    # use gene names as feature symbols
    rowData(newdata)$feature_symbol <- rownames(newdata)

    # remove features with duplicated names
    newdata <- newdata[!duplicated(rownames(newdata)), ]
    # NOTE(review): only the atlas index is handed to scmapCell below; confirm
    # that feature selection and indexing of the query data are needed, since
    # both are included in the reported runtime.
    newdata <- selectFeatures(newdata)

    newdata <- indexCell(newdata)

    ##############################################
    # run scmapCell to map the cells back to atlas
    ##############################################
    res <- scmapCell(
      projection = newdata,
      index_list = list(
        atlas = metadata(atlas)$scmap_cell_index
      ),
      w = 9
    )
    ##############################################

    end_time <- Sys.time()

    print('Runtime for newdata:')
    print(repn)
    print(end_time - start_time)

    # res has one entry per index in index_list; take the (only) atlas entry
    neighbors <- res[[1]]$cells

    # Extract TSV and do the rest in Python...
    fn_res <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_', repn, '_na_', na, '_output.tsv')
    write.table(neighbors, file = fn_res, quote = FALSE, sep = '\t')

  }

}

[1] "17"


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime atlas:"
Time difference of 0.4147904 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "1"
Time difference of 3.331619 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "2"
Time difference of 3.574516 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "3"
Time difference of 3.318047 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "4"
Time difference of 2.877976 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "5"
Time difference of 3.241976 secs
[1] "16"


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime atlas:"
Time difference of 0.3446839 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "1"
Time difference of 3.14454 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "2"
Time difference of 2.76651 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "3"
Time difference of 3.233341 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "4"
Time difference of 3.0351 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "5"
Time difference of 3.155047 secs
[1] "15"


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime atlas:"
Time difference of 0.3229709 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "1"
Time difference of 3.38759 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "2"
Time difference of 3.302768 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "3"
Time difference of 2.818339 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "4"
Time difference of 3.295036 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "5"
Time difference of 3.26331 secs
[1] "14"


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime atlas:"
Time difference of 0.3133569 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "1"
Time difference of 3.123514 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "2"
Time difference of 2.901536 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "3"
Time difference of 3.275896 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "4"
Time difference of 3.392008 secs


Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



[1] "Runtime for newdata:"
[1] "5"
Time difference of 3.354692 secs
