-
Notifications
You must be signed in to change notification settings - Fork 68
/
detect-genes.R
executable file
·143 lines (130 loc) · 5.35 KB
/
detect-genes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env Rscript
"Detect the number of expressed genes and the number of counts.
Usage:
detect-genes.R [options] [--wells=<w>...] <num_cells> <seed> <exp>
Options:
-h --help Show this screen.
--individual=<ind> Only use data from ind, e.g. 19098
--min_count=<x> The minimum count required for detection [default: 1]
--min_cells=<x> The minimum number of cells required for detection [default: 1]
--good_cells=<file> A 1-column file with the names of good quality cells to maintain
-w --wells=<w> Only use data from the specified well(s), e.g. A01
--gene=<pattern> Only use genes whose name contains 'pattern'
Arguments:
num_cells number of single cells to subsample
seed seed for random number generator
exp sample-by-gene expression matrix" -> doc
suppressMessages(library("docopt"))
library("testit")
library("data.table")
main <- function(num_cells, seed, exp_fname, individual = NULL, min_count = 1,
min_cells = 1, good_cells = NULL, wells = NULL, gene = NULL) {
# Load expression data
exp_dat <- fread(exp_fname, data.table = FALSE)
# Filter by individual
if (!is.null(individual)) {
exp_dat <- exp_dat[exp_dat$individual == individual, ]
}
# Filter by wells
if (!is.null(wells)) {
exp_dat <- exp_dat[exp_dat$well %in% wells, ]
}
# Remove bulk samples
exp_dat <- exp_dat[exp_dat$well != "bulk", ]
# Add rownames
if ("replicate" %in% colnames(exp_dat)) {
rownames(exp_dat) <- paste(exp_dat$individual, exp_dat$replicate,
exp_dat$well, sep = ".")
} else {
# allow legacy versions of data which uses batch instead of replicate
rownames(exp_dat) <- paste(exp_dat$individual, exp_dat$batch,
exp_dat$well, sep = ".")
}
# Remove meta-info cols
exp_dat <- exp_dat[, grepl("ENSG", colnames(exp_dat)) |
grepl("ERCC", colnames(exp_dat)), drop = FALSE]
# Transpose
exp_dat <- t(exp_dat)
# Filter genes based on input pattern, e.g. "ENSG" or "ERCC"
# browser()
if (!is.null(gene)) {
# For some reason, the "drop = TRUE" trick to maintain a 1 column matrix
# does not work if it modifying itself.
tmp <- exp_dat[grep(gene, rownames(exp_dat)), , drop = FALSE]
exp_dat <- tmp
rm(tmp)
}
# Keep only good quality cells
if (!is.null(good_cells)) {
assert("File with list of good quality cells exists.",
file.exists(good_cells))
good_cells_list <- scan(good_cells, what = "character", quiet = TRUE)
good_cells_list <- substr(good_cells_list, start = 3, stop = 13)
exp_dat <- exp_dat[, colnames(exp_dat) %in% good_cells_list, drop = FALSE]
assert("There are quality cells to perform the analysis.",
ncol(exp_dat) > 0)
}
# Subsample number of single cells
if (ncol(exp_dat) < num_cells) {
cat(sprintf("%d\t%d\tNA\tNA\n", num_cells, seed))
return(invisible())
}
set.seed(seed)
exp_dat <- exp_dat[, sample(1:ncol(exp_dat), size = num_cells), drop = FALSE]
# exp_dat[1:10, 1:10]
# dim(exp_dat)
# Detect number of expressed genes
detected <- apply(exp_dat, 1, detect_expression, min_count = min_count, min_cells = min_cells)
num_detected <- sum(detected)
# Caculate mean number of total counts, using only genes which meet the
# criteria for detection.
exp_dat_detected <- exp_dat[detected, , drop = FALSE]
mean_counts <- mean(colSums(exp_dat_detected))
# Output
cat(sprintf("%d\t%d\t%d\t%.2f\n", num_cells, seed, num_detected, mean_counts))
}
detect_expression <- function(x, min_count, min_cells) {
# x - vector of counts
# min_count - minumum required observations per cell
# min_cells - minimum required number of cells
expressed <- sum(x >= min_count)
return(expressed >= min_cells)
}
library("testit")
assert("Detect expression function works properly.",
detect_expression(c(0, 0, 1, 1), 1, 2) == TRUE,
detect_expression(c(0, 0, 0, 1), 1, 2) == FALSE,
detect_expression(c(0, 0, 1, 1), 1, 3) == FALSE,
detect_expression(c(0, 1, 1, 1), 1, 3) == TRUE,
detect_expression(c(0, 2, 3, 1), 2, 2) == TRUE,
detect_expression(c(0, 1, 3, 1), 2, 2) == FALSE)
if (!interactive() & getOption('run.main', default = TRUE)) {
opts <- docopt(doc)
# str(opts)
main(num_cells = as.numeric(opts$num_cells),
seed = as.numeric(opts$seed),
exp_fname = opts$exp,
min_count = as.numeric(opts$min_count),
min_cells = as.numeric(opts$min_cells),
individual = opts$individual,
good_cells = opts$good_cells,
wells = opts$wells,
gene = opts$gene)
} else if (interactive() & getOption('run.main', default = TRUE)) {
# what to do if interactively testing
# main(num_cells = 20,
# seed = 1,
# exp_fname = "/mnt/gluster/data/internal_supp/singleCellSeq/subsampled/molecule-counts-200000.txt",
# individual = "19098",
# min_count = 10,
# min_cells = 5,
# good_cells = "../data/quality-single-cells.txt")
main(num_cells = 1,
seed = 1,
# exp_fname = "/mnt/gluster/data/internal_supp/singleCellSeq/lcl/full-lane/molecule-counts-200000.txt",
exp_fname = "/mnt/gluster/home/jdblischak/ssd/lcl/subsampled/counts-matrix/80000000-molecules-raw-single-per-sample.txt",
min_count = 1,
min_cells = 1,
wells = "A9E1",
gene = "ENSG")
}