-
Notifications
You must be signed in to change notification settings - Fork 5
/
discrete_algorithm.R
417 lines (395 loc) · 19.4 KB
/
discrete_algorithm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
#' Calculate P-value
#'
#' @description Given all of the empirical statistics derived from permutations,
#' count how many of the empirical/permuted test statistics are greater than
#' or equal to the observed/real test statistic.Adding one in the numerator
#' and denominator accounts for the observed value.
#'
#' @param empirical_statistic Numeric vector. Length = num_perm. KS test
#' statistics for each permutation.
#' @param observed_statistic Number. Length = 1. KS test statistic of real data.
#' @param num_perm Number. Number of permutations.
#'
#' @return pval. Number. Length = 1. Value between 0 and 1.
#'
#' @noRd
calculate_permutation_based_p_value <- function(empirical_statistic,
observed_statistic,
num_perm){
# Check input ----------------------------------------------------------------
check_equal(length(empirical_statistic), num_perm)
check_is_number(observed_statistic)
check_is_number(num_perm)
check_if_permutation_num_valid(num_perm)
check_is_number(empirical_statistic[1])
# Function -------------------------------------------------------------------
pval <-
(sum(1 + sum(empirical_statistic >= observed_statistic)) / (num_perm + 1))
# Check and return output ----------------------------------------------------
check_num_between_0_and_1(pval)
return(pval)
}
#' Count the phenotype and genotype edge overlaps
#'
#' @description For each genotype, return a count of the number of edges for
#' which the genotype is a transition and the phenotype is present (when using
#' reconstruction)/phenotype is a transition (when using transition). Also,
#' return a count of the number of edges for which the just the genotype is a
#' transition (phenotype is either a 0 reconstruction or not a transition).
#'
#' @param genotype_transition_edges List of numeric vectors. Each vector
#' corresponds with one genotype from the geno_mat. The numbers in each vector
#' correspond to an edge on the tree.
#' @param phenotype_reconstruction Numeric vector. Each entry corresponds to an
#' edge.
#' @param high_confidence_edges List of vectors. Each vector corresponds to the
#' confidence of 1 genotype from the geno_mat. Each entry in the vector
#' corresponds to an edge on the tree. 1 means high confidence in the node, 0
#' means low confidence.
#' @param tr Phylo.
#'
#' @return List of two lists:
#' \describe{
#' \item{both_present}{Length = number of genotypes.}
#' \item{only_geno_present}{Length = number of genotypes.}
#' }
#'
#' @noRd
count_hits_on_edges <- function(genotype_transition_edges,
phenotype_reconstruction,
high_confidence_edges,
tr){
# Check input ----------------------------------------------------------------
check_for_root_and_bootstrap(tr)
check_is_number(genotype_transition_edges[[1]][1])
check_is_number(high_confidence_edges[[1]][1])
check_is_number(phenotype_reconstruction[1])
check_equal(length(genotype_transition_edges[[1]]), ape::Nedge(tr))
check_equal(length(phenotype_reconstruction), ape::Nedge(tr))
check_equal(length(high_confidence_edges[[1]]), ape::Nedge(tr))
# Function -------------------------------------------------------------------
both_present <- sapply(1:length(high_confidence_edges), function(x) {
sum(phenotype_reconstruction[as.logical(high_confidence_edges[[x]])] +
genotype_transition_edges[[x]][as.logical(high_confidence_edges[[x]])]
== 2)
})
only_geno_present <- sapply(1:length(high_confidence_edges), function(x) {
sum(
genotype_transition_edges[[x]][as.logical(high_confidence_edges[[x]])]) -
both_present[x]
})
# Check and return output ----------------------------------------------------
hit_counts <- list("both_present" = both_present,
"only_geno_present" = only_geno_present)
return(hit_counts)
}
#' Run GWAS and calculate P-value for a discrete trait
#'
#' @description discrete_calculate_pvals is the "meat" of the discrete
#' algorithm. It returns the empirical p-value for each observed genotype.
#'
#' @details Algorithm overview:
#' * Subset tree edges to those with high confidence (as determined by
#' phenotype & genotype reconstructions as well as tree bootstrap values).
#' * For each genotype from the genotype_matrix:
#' * Exclude any edges on which the genotype is not present
#' * Create a matrix where each row is a random sampling of the high
#' confidence edges of the tree where probability of choosing edges is
#' proportional to length of edge. Number of edges selected for the
#' permuted data set is the number of times the empirical genotype
#' appears.
#' * Calculate when the randomly permuted genotype overlap with the high
#' confidence phenotype (these values create the null distribution for the
#' permuted genotypes)
#' * Calculate empirical p-values.
#' * Genotypes for which no convergence is possible (<2 transition edges or
#' overlap between genotype and phenotype <2) are given a p-value of 1 and
#' the permutation test is skipped.
#'
#' @param genotype_transition_edges List of vectors.
#' Length(genotype_transition_list) == number of genotypes. Length(each
#' vector) == Nedge(tr). Each of these are either 0, 1.
#' @param phenotype_reconstruction Vector. Length() == Nedge(tr). Phenotype
#' reconstruction could instead by phenotype transition vector.
#' @param tr Phylo. Rooted phylogenetic tree.
#' @param mat Numeric matrix. The genotype matrix. Nrow(mat) == number of
#' isolates == Ntip(tr). Each column is a variant. Matrix is binary (0 or 1).
#' @param permutations Integer. The number of times to run the permutation test.
#' @param fdr Number. Significance threshold. Between 0 and 1.
#' @param high_confidence_edges List. Length(list) = ncol(mat) == number of
#' genotypes. Each entry is a vector with length == Nedge(tr). All entries are
#' either 0 (low confidence) or 1 (high confidence).
#' @param convergence List of five objects:
#' \describe{
#' \item{intersection}{Numeric vector. Intersection of the geno_beta and
#' pheno_beta for each genotype. Length == number of genotypes.}
#' \item{num_hi_conf_edges}{Numeric vector. Number of high confidence
#' edges per genotype. Length == number of genotypes.}
#' \item{pheno_beta}{Number. Count of how many tree edges are phenotype
#' transitions and the phenotype ancestral reconstruction and tree edge are
#' high confidence. Length == 1.}
#' \item{geno_beta}{Numeric vector. count of how many tree edges are
#' gentoype transitions and the genotype ancestral reconstruction and tree
#' edge are high confidence. Length == number of genotypes.}
#' \item{epsilon}{Numeric vector. 2 x (edges with both high confidence
#' genotype AND phenotype transition) / sum(edges with high confidence
#' gentoype and/or phenotype transitions). Length == number of genotypes.}
#' }
#'
#' @return List of three objects:
#' \describe{
#' \item{hit_pvals.}{Vector. Length is the number of genotypes. Unadjusted
#' p-value for each genotype.}
#' \item{permuted_count.}{List. Each entry corresponds to one genotype.
#' Records the number of times there is an overlap between the phenotype
#' (either transition or reconstruction) and the genotype transition for the
#' permuted data.}
#' \item{observed_overlap.}{List. Each entry corresponds to one genotype.
#' Records the number of times there is an overlap between the phenotype
#' (either transition or reconstruction) and the genotype transition for the
#' real/observed data.}
#' }
#'
#' @noRd
discrete_calculate_pvals <- function(genotype_transition_edges,
phenotype_reconstruction,
tr,
mat,
permutations,
fdr,
high_confidence_edges,
convergence){
# Check input ----------------------------------------------------------------
check_equal(ncol(mat), length(genotype_transition_edges))
check_equal(length(genotype_transition_edges[[1]]), ape::Nedge(tr))
check_equal(length(phenotype_reconstruction), ape::Nedge(tr))
check_if_permutation_num_valid(permutations)
check_for_root_and_bootstrap(tr)
check_num_between_0_and_1(fdr)
check_equal(ncol(mat), length(high_confidence_edges))
check_equal(length(high_confidence_edges[[1]]), ape::Nedge(tr))
check_equal(length(convergence$intersection), ncol(mat))
check_is_number(convergence$intersection[1])
# Function -------------------------------------------------------------------
# Calculate observed values
# (Convergence of 0 -> 1 on phenotype present edges (variable: both present)
# versus phenotype absent edges (only_geno_present)).
# Phenotype present means 1 in reconstruction test or phenotype is a
# transition (1) in the overlap/transition test. Equivalent to Farhat et al.'s
# "Resistant" branches. Phenotype absent (only_geno_present) is equivalent to
# Farhat et al.'s "Sensitive" branches.
observed_result <- count_hits_on_edges(genotype_transition_edges,
phenotype_reconstruction,
high_confidence_edges,
tr)
both_present <- observed_result$both_present
only_geno_present <- observed_result$only_geno_present
# Initialize some values
num_genotypes <- ncol(mat)
num_edges_with_geno_trans <- both_present + only_geno_present
hit_pvals <- rep(NA, num_genotypes)
num_hi_conf_edges <- sapply(high_confidence_edges, function(x) sum(x))
list_of_all_edges <- c(1:ape::Nedge(tr))
hi_conf_edges <- list(rep(0, num_genotypes))
# Subset tr edges to those with high confidence (as determined by phenotype &
# genotype reconstructions as well as tr bootstrap values).
for (i in 1:num_genotypes) {
hi_conf_edges[[i]] <-
list_of_all_edges[as.logical(high_confidence_edges[[i]])]
}
# For each genotype from the genotype_matrix:
record_redistrib_both_present <- rep(list(rep(NA, permutations)),
num_genotypes)
# looping over each genotype in the genotype_mat
for (i in 1:num_genotypes) {
if (num_edges_with_geno_trans[i] > num_hi_conf_edges[i]) {
stop("Too many hits on the branches")
}
if ( (both_present[[i]] + only_geno_present[[i]]) < 2) {
# If there are 1 or 0 high confidence edges with the genotype present then
# the p-value should be reported as 1.0; both_present and
# only_geno_present are made up of only high confidence branches as
# defined in count_hits_on_edges which isn't quite true but will indicate
# that we cannot calculate a p-value because we cannot detect any
# convergence with one or fewer affected branches. And that we should skip
# to the next genotype to run the permutation test.
hit_pvals[i] <- 1.0
# This should never get triggered because we should be filtering the
# genotype before this step, but it's being kept in as a fail safe.
} else if (convergence$intersection[i] < 2) {
# If there is no ovlerap in BOTH the phenotype (presence/transition)
# and genotype transition we CANNOT detect convergence. In these cases
# we should skip to the next genotype to run the permutation test.
hit_pvals[i] <- 1.0
} else {
permuted_geno_trans_edges <-
discrete_permutation(tr,
permutations,
num_edges_with_geno_trans,
num_hi_conf_edges,
ape::Nedge(tr),
hi_conf_edges,
i)
# Now calculate both_present and only_geno_present with the permuted data
# in the same fashion as with the observed data
empirical_both_present <-
count_empirical_both_present(permuted_geno_trans_edges,
phenotype_reconstruction,
high_confidence_edges,
i)
# Note on nomeclature from the phyC paper supplement page 7:
# X -> G on R is the same as sum(empirical_both_present >= both_present[i])
# X -> G on S is the same
# as sum(empirical_only_geno_present <= only_geno_present[i])
# Note: sum(empirical_both_present >= both_present[i]) always
# equals sum(empirical_only_geno_present <= only_geno_present[i])
# so we only need to include one in the p-value calculation.
pval <- calculate_permutation_based_p_value(empirical_both_present,
both_present[i],
permutations)
hit_pvals[i] <- format(round(pval, 20), nsmall = 20)
record_redistrib_both_present[[i]] <- empirical_both_present
}
}
names(hit_pvals) <- colnames(mat)
storage.mode(hit_pvals) <- "character"
# Return output --------------------------------------------------------------
results <- list("hit_pvals" = hit_pvals,
"permuted_count" = record_redistrib_both_present,
"observed_overlap" = both_present)
return(results)
}
#' Perform permutation for discrete data
#'
#' @description Perform a permutation of the edges selected to be genotype
#' transition edges. permuted_geno_trans_mat is a matrix where each row
#' corresponds to a permuted set of edges and each column corresponds to one
#' high confidence edge. To get the edges selected in a permutation, take that
#' row from permuted_geno_trans_mat. We could use this format only for later
#' analyses, but a more convenient format is to convert to redistributed_hits.
#' Redistributed_hits is a matrix with nrow = number of perms (same as
#' permuted_geno_trans_mat, but each row now corresponds to a genotype)
#'
#' @param tr Phylo.
#' @param num_perm Integer. Number of permutations.
#' @param number_edges_with_geno_trans Numeric vector. Maximum value should be
#' the number of edges in the tree.
#' @param number_hi_conf_edges Numeric vector. Maximum value should be the
#' number of edges in the tree.
#' @param number_edges Nedge(tr).
#' @param high_conf_edges Numeric vector of edges that high confidence. Maximum
#' value should be the number of edges in the tree.
#' @param index Integer. "i" from loop. Max value is number of genotypes. Min
#' value == 1.
#'
#' @return redistributed_hits. Matrix. Nrow == num_perm. Ncol == Nedge(tr).
#' @noRd
discrete_permutation <- function(tr,
num_perm,
number_edges_with_geno_trans,
number_hi_conf_edges,
number_edges,
high_conf_edges,
index){
# Check input ----------------------------------------------------------------
check_if_permutation_num_valid(num_perm)
check_for_root_and_bootstrap(tr)
check_is_number(number_edges_with_geno_trans[index])
check_is_number(number_hi_conf_edges[index])
check_is_number(number_edges)
check_is_number(index)
check_is_number(high_conf_edges[[index]][1])
check_equal(number_edges, ape::Nedge(tr))
if (index < 1) {
stop("loop index must be positive")
}
if (max(high_conf_edges[[index]]) > ape::Nedge(tr) |
number_hi_conf_edges[index] > ape::Nedge(tr) |
number_edges_with_geno_trans[index] > ape::Nedge(tr)) {
stop("max value should be Nedge(tr")
}
# Function -------------------------------------------------------------------
permuted_geno_trans_mat <-
matrix(nrow = num_perm, ncol = number_edges_with_geno_trans[index])
redistributed_hits <- matrix(0, nrow = num_perm, ncol = number_edges)
set.seed(1)
for (j in 1:num_perm) {
permuted_geno_trans_mat[j, ] <-
sample(1:number_hi_conf_edges[index],
size = number_edges_with_geno_trans[index],
replace = FALSE,
prob = tr$edge.length[high_conf_edges[[index]]] /
sum(tr$edge.length[high_conf_edges[[index]]]))
}
if (nrow(permuted_geno_trans_mat) != num_perm) {
# This if statement deals with situation when number_edges_with_geno_trans
# = 1; which should never happen now that we prefilter genotypes.
permuted_geno_trans_mat <- t(permuted_geno_trans_mat)
}
for (m in 1:nrow(permuted_geno_trans_mat)) {
redistributed_hits[m, ][permuted_geno_trans_mat[m, ]] <- 1
}
# Check and return output ----------------------------------------------------
check_dimensions(redistributed_hits,
exact_rows = num_perm,
min_rows = num_perm,
exact_cols = ape::Nedge(tr),
min_cols = ape::Nedge(tr))
return(redistributed_hits)
}
#' Count the number of edges where phenotype and genotype overlap
#'
#' @description Find number of edges on the tree where the permuted genotype is
#' a transition AND the phenotype is also a transition (or a phenotype is
#' present in the reconstruction; it depends on the type of test being run).
#'
#' @param permuted_mat Matrix. Nrow = number of permutations. Ncol = number of
#' tr edges. Either 0 or 1.
#' @param pheno_vec Numeric vector.
#' @param hi_conf_edge List.
#' @param index Number. The "i" of the loop this function is run within.
#'
#' @return Numeric Vector. Length = num perm.
#' @noRd
count_empirical_both_present <- function(permuted_mat,
pheno_vec,
hi_conf_edge,
index){
# Check input ----------------------------------------------------------------
check_is_number(index)
check_if_binary_matrix(permuted_mat)
check_equal(length(pheno_vec), ncol(permuted_mat))
check_equal(length(pheno_vec), length(as.logical(hi_conf_edge[[index]])))
# Function -------------------------------------------------------------------
result <- sapply(1:nrow(permuted_mat), function(x) {
sum( (pheno_vec + permuted_mat[x, ]) == 2 & hi_conf_edge[[index]] == 1)
})
# Check and return output ----------------------------------------------------
check_equal(nrow(permuted_mat), length(result))
return(result)
}
#' Count edges with genotype but not phenotype
#'
#' @description Find number of edges on the tree where the permuted genotype is
#' a transition but the phenotype is not (either a transition phenotype edge
#' or phenotype reconstruction present; it depends on the type of test being
#' run).
#'
#' @param permuted_mat Matrix. Nrow = number of permutations. Ncol = number of
#' edges. Either 0 or 1.
#' @param emp_both_present Numeric vector.
#'
#' @return Numeric vector. Length = num perm.
#' @noRd
count_empirical_only_geno_present <- function(permuted_mat, emp_both_present){
# Check input ----------------------------------------------------------------
check_if_binary_matrix(permuted_mat)
check_equal(length(emp_both_present), nrow(permuted_mat))
# Function -------------------------------------------------------------------
result <- sapply(1:nrow(permuted_mat), function(x) {
sum(permuted_mat[x, ]) - emp_both_present[x]
})
# Check & return output ------------------------------------------------------
check_equal(nrow(permuted_mat), length(result))
return(result)
}