In [91]:
# Author: Fabio Zanini
# Date: 2020-05-27
# Description: learn how to use scmap
# Ah, namespace pollution starts immediately
library(SingleCellExperiment)
library(scmap)

## Load atlas

In [92]:
# Load the atlas count table and its metadata, then wrap both in a
# SingleCellExperiment.
# NOTE: sce has cells as columns, hence the transpose of the table read from
# disk.
fn_atlas <- '../data/for_scmap/TBS_kidney_atlas_subsample_20_counts.tsv'

# Derive the number of columns from the header line rather than hard-coding
# the gene count: read.table() recycles colClasses, so a stale constant would
# silently assign the wrong type to some columns.
# The first column (presumably the "index" column used as row names) is read
# as character, every remaining column as double.
header_atlas <- scan(fn_atlas, what = character(), sep = '\t', nlines = 1, quiet = TRUE)
cols <- c('character', rep('double', length(header_atlas) - 1))
counts_atlas <- t(read.table(file = fn_atlas, sep = '\t', header = TRUE, colClasses = cols, row.names = "index"))

# Per-cell metadata, with rows named by the same "index" column.
fn_meta_atlas <- '../data/for_scmap/TBS_kidney_atlas_subsample_20_metadata.tsv'
meta_atlas <- read.table(file = fn_meta_atlas, sep = '\t', header = TRUE, row.names = 'index')

# The table is stored in the "normcounts" assay; the metadata becomes colData.
atlas <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_atlas)), colData = meta_atlas)

In [93]:
# Print a summary of the atlas object to check that it loaded as expected
atlas

class: SingleCellExperiment 
dim: 19860 360 
metadata(0):
assays(1): normcounts
rownames(19860): Xkr4 Rp1 ... Sly Erdr1
rowData names(0):
colnames(360): AAGGTTCAGCGTGAAC-1-78-0-0 ACGCAGCTCAGTTGAC-1-78-0-0 ...
  10X_P4_6_GTTACAGTCCCTCTTT-1 10X_P7_5_GGACAAGAGTGGAGAA-1
colData names(1): CellType
reducedDimNames(0):
altExpNames(0):

## Load new dataset

In [149]:
# Replicate of the new dataset to load (used to build the file names below).
repn <- '5'

# Load the count table and metadata of the new dataset, then wrap both in a
# SingleCellExperiment.
# NOTE: sce has cells as columns, hence the transpose of the table read from
# disk.
fn_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_counts_rep_', repn, '.tsv')

# Derive the number of columns from the header line rather than hard-coding
# the gene count: read.table() recycles colClasses, so a stale constant would
# silently assign the wrong type to some columns.
# The first column (presumably the "index" column used as row names) is read
# as character, every remaining column as double.
header_newdata <- scan(fn_newdata, what = character(), sep = '\t', nlines = 1, quiet = TRUE)
cols <- c('character', rep('double', length(header_newdata) - 1))
counts_newdata <- t(read.table(file = fn_newdata, sep = '\t', header = TRUE, colClasses = cols, row.names = "index"))

# Per-cell metadata, with rows named by the same "index" column.
fn_meta_newdata <- paste0('../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_', repn, '.tsv')
meta_newdata <- read.table(file = fn_meta_newdata, sep = '\t', header = TRUE, row.names = 'index')

# The table is stored in the "normcounts" assay; the metadata becomes colData.
newdata <- SingleCellExperiment(assays = list(normcounts = as.matrix(counts_newdata)), colData = meta_newdata)

## Prepare the atlas with scmap

In [150]:
# A "counts" assay is required to compute the dropout rate used for feature
# selection. The normalised counts (fpkm) are reused for it because they have
# the same zeros as the raw counts.
counts(atlas) <- normcounts(atlas)
counts(newdata) <- normcounts(newdata)

# Log-transformed expression: log2 of the normalised counts plus a
# pseudocount of 1.
logcounts(atlas) <- log2(1 + normcounts(atlas))
logcounts(newdata) <- log2(1 + normcounts(newdata))

In [151]:
# Store the gene names (the row names of each object) in the
# "feature_symbol" column of its rowData.
rowData(atlas)[["feature_symbol"]] <- rownames(atlas)
rowData(newdata)[["feature_symbol"]] <- rownames(newdata)
# Spike-in annotation is skipped: the function is undefined and 10X has no
# ERCC anyway.
#isSpike(sce, 'ERCC') <- grepl('^ERCC-', rownames(sce))

In [152]:
# For each dataset: keep only the first occurrence of every feature name,
# then run scmap's feature selection on the de-duplicated object.
atlas <- selectFeatures(atlas[!duplicated(rownames(atlas)), ])
newdata <- selectFeatures(newdata[!duplicated(rownames(newdata)), ])

In [153]:
# Build the scmap-cell index of each dataset; the atlas index is read back
# from metadata(atlas)$scmap_cell_index further down.
# indexCell() involves a k-means step (see the scmap vignette), so the index
# changes from run to run unless the RNG seed is fixed first.
# M and k are left at their defaults, which indexCell() reports as messages.
set.seed(1)
atlas <- indexCell(atlas)
newdata <- indexCell(newdata)

Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)

Parameter M was not provided, will use M = n_features / 10 (if n_features <= 1000), where n_features is the number of selected features, and M = 100 otherwise.

Parameter k was not provided, will use k = sqrt(number_of_cells)



In [154]:
# Number of atlas cells per annotated cell type.
# (The object is `atlas`; no `sce` variable is defined in this notebook.)
summary(factor(colData(atlas)$CellType))

In [155]:
# Peek at the top-left 5 x 5 corner of the "subclusters" table stored in the
# atlas cell index.
metadata(atlas)[["scmap_cell_index"]][["subclusters"]][seq_len(5), seq_len(5)]

AAGGTTCAGCGTGAAC-1-78-0-0,ACGCAGCTCAGTTGAC-1-78-0-0,AGAGCTTCACAAGACG-1-78-0-0,CAAGATCCAACAACCT-1-78-0-0,CGCGTTTCAGCGTCCA-1-78-0-0
13,13,11,11,11
14,4,1,14,14
18,12,8,2,18
2,7,11,2,2
5,16,11,9,6


In [156]:
# ---------------------------------------------------------------------------
# Run scmapCell to map the cells of the new dataset back to the atlas.
# The atlas cell index is passed as a one-element list named "atlas";
# w = 9 is forwarded as-is (per the scmap docs, the number of nearest
# neighbours to look up).
# ---------------------------------------------------------------------------
res <- scmapCell(
  projection = newdata,
  index_list = list(atlas = metadata(atlas)[["scmap_cell_index"]]),
  w = 9
)

# Earlier self-projection variant, kept for reference:
#scmapCell_results <- scmapCell(sce, list(metadata(sce)$scmap_cell_index))

In [157]:
# Take the "cells" element of the result for the first (and only) index.
neighbors <- res[[1L]][["cells"]]

In [158]:
# Export the neighbours as a TSV and do the rest in Python...
fn_res <- paste0(
  '../data/for_scmap/TBS_kidney_newdata_subsample_100_metadata_rep_',
  repn,
  '_output.tsv'
)
write.table(x = neighbors, file = fn_res, sep = '\t', quote = FALSE)