/
PgR6MS.R
258 lines (221 loc) · 13.8 KB
/
PgR6MS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
#' @name PgR6MS
#' @title PgR6 class with Methods and Sequences.
#' @description PgR6 with Methods and Sequences. Final users should use \code{\link{pagoo}}
#' instead of this, since is more easy to understand.
#' Inherits: \code{\link[pagoo]{PgR6M}}
#' @importFrom GenomicRanges mcols mcols<-
#' @importFrom Biostrings BStringSet DNAStringSet
#' @importFrom S4Vectors split
#' @export
PgR6MS <- R6Class('PgR6MS',
inherit = PgR6M,
private = list(
.sequences = NULL
),
public = list(
#' @description
#' Create a \code{PgR6MS} object.
#' @param data A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}} containing at least the
#' following columns: \code{gene} (gene name), \code{org} (organism name to which the gene belongs to),
#' and \code{cluster} (group of orthologous to which the gene belongs to). More columns can be added as metadata
#' for each gene.
#' @param org_meta (optional) A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}}
#' containing additional metadata for organisms. This \code{data.frame} must have a column named "org" with
#' valid organisms names (that is, they should match with those provided in \code{data}, column \code{org}), and
#' additional columns will be used as metadata. Each row should correspond to each organism.
#' @param cluster_meta (optional) A \code{data.frame} or \code{\link[S4Vectors:DataFrame-class]{DataFrame}}
#' containing additional metadata for clusters. This \code{data.frame} must have a column named "cluster" with
#' valid organisms names (that is, they should match with those provided in \code{data}, column \code{cluster}), and
#' additional columns will be used as metadata. Each row should correspond to each cluster.
#' @param core_level The initial core_level (that's the percentage of organisms a core cluster must be in to be
#' considered as part of the core genome). Must be a number between 100 and 85, (default: 95). You can change it
#' later by using the \code{$core_level} field once the object was created.
#' @param sep A separator. By default is '__'(two underscores). It will be used to
#' create a unique \code{gid} (gene identifier) for each gene. \code{gid}s are created by pasting
#' \code{org} to \code{gene}, separated by \code{sep}.
#' @param DF Deprecated. Use \code{data} instead.
#' @param group_meta Deprecated. Use \code{cluster_meta} instead.
#' @param sequences Can accept: 1) a named \code{list} of named \code{character} vector. Name of list
#' are names of organisms, names of character vector are gene names; or 2) a named \code{list} of \code{\link[Biostrings:XStringSet-class]{DNAStringSetList}}
#' objects (same requirements as (1), but with BStringSet names as gene names); or 3) a \code{\link[Biostrings:XStringSetList-class]{DNAStringSetList}}
#' (same requirements as (2) but \code{DNAStringSetList} names are organisms names).
#' @param verbose \code{logical}. Whether to display progress messages when loading class.
#' @return An R6 object of class PgR6MS. It contains basic fields and methods for analyzing a pangenome. It also
#' contains additional statistical methods for analyze it, methods to make basic exploratory plots, and methods
#' for sequence manipulation.
initialize = function(data,
org_meta,
cluster_meta,
core_level = 95,
sep = '__',
DF,
group_meta,
sequences,
verbose = TRUE){
# Deprecated args
if (!missing(DF)){
data <- DF
warning('Argument "DF" is deprecated. You should use "data" instead. ',
'This still exists for compatibility with previous versions, ',
'but will throw an error in the future.',
immediate. = TRUE, noBreaks. = TRUE)
}
if (!missing(group_meta)){
cluster_meta <- group_meta
warning('Argument "group_meta" is deprecated. You should use "cluster_meta" instead. ',
'This still exists for compatibility with previous versions, ',
'but will throw an error in the future.',
immediate. = TRUE, noBreaks. = TRUE)
}
super$initialize(data = data,
org_meta,
cluster_meta,
core_level = core_level,
sep = sep,
verbose = verbose)
# Check input sequences
if (verbose) message('Checking input sequences.')
if (class(sequences)%in%c('list',
'BStringSetList',
'DNAStringSetList')){
norgs <- length(sequences)
orgNames <- names(sequences)
if (length(orgNames)==0){
stop('Unnamed list.')
}
clss <- unique(sapply(sequences, class))
genNames <- lapply(sequences, names)
if (clss%in%c('character',
'BStringSet',
'DNAStringSet')){
if (length(genNames)==0){
stop('Unnamed sequences.')
}
gids <- mapply(paste,
orgNames, genNames,
MoreArgs = list(sep = sep))
gids <- unlist(gids, use.names = FALSE)
if (clss=='character'){
sqs <- unlist(sequences, use.names = FALSE)
sqs <- DNAStringSet(sqs)
names(sqs) <- gids
}else{
sqs <- DNAStringSetList(sequences)
sqs <- unlist(sqs, use.names = FALSE)
names(sqs) <- gids
}
private$.sequences <- sqs
}else{
stop('Unrecognized sequences format.')
}
}else{
stop('Unrecognized sequences format')
}
if (verbose) message('Checking that sequence names matches with DataFrame.')
dfgid <- private$.data[, 'gid']
if (!all(dfgid%in%gids)){
stop('Missing sequences: some gid do not match any sequence name.')
}
if (any(!gids%in%dfgid)){
warning('Missing gid: some sequence names do not match to any gid. Continuing anyway..\n', immediate. = TRUE)
}
# Add metadata to sequences
if (verbose) message('Adding metadata to sequences.')
spl <- strsplit(gids, sep)
mcols(private$.sequences)$org <- vapply(spl, '[', 1,
FUN.VALUE = NA_character_)
mcols(private$.sequences)$cluster <- as.character(private$.data$cluster[match(gids, dfgid)])
},
# Methods for sequences
#' @description
#' A field for obtaining core gene sequences is available (see below), but for creating a phylogeny with this
#' sets is useful to: 1) have the possibility of extracting just one sequence of each organism on each cluster, in
#' case paralogues are present, and 2) filling gaps with empty sequences in case the core_level was set below 100\%,
#' allowing more genes (some not in 100\% of organisms) to be incorporated to the phylogeny. That is the purpose of
#' this special function.
#' @param max_per_org Maximum number of sequences of each organism to be taken from each cluster.
#' @param fill \code{logical}. If fill \code{DNAStringSet} with empty \code{DNAString} in cases where
#' \code{core_level} is set below 100\%, and some clusters with missing organisms are also considered.
#' @return A \code{DNAStringSetList} with core genes. Order of organisms on each cluster is conserved, so it is easier
#' to concatenate them into a super-gene suitable for phylogenetic inference.
#' @importFrom S4Vectors mcols
#' @importFrom BiocGenerics table
core_seqs_4_phylo = function(max_per_org = 1, fill = TRUE){
if (!class(max_per_org)%in%c('logical', 'NULL', 'numeric', 'integer'))
stop('"max_per_org" is not numeric, NULL, or NA')
if (!is.logical(fill))
stop('"fill" is not logical')
sqs <- unlist(self$core_sequences, use.names = FALSE)
mcls <- mcols(sqs)
ccl <- unique(mcls$cluster)
if (!(is.na(max_per_org) | is.null(max_per_org))){
wma <- which(self$pan_matrix[, ccl] > max_per_org, arr.ind = T)
if (dim(wma)[1]){
ogs2fix <- ccl[wma[, 2]]
orgs2fix <- rownames(wma)
remrw <- apply(cbind(ogs2fix, orgs2fix), 1, function(x){
wh <- which(mcls$cluster==x[1] & mcls$org==x[2])
wh[-seq_len(max_per_org)]
})
torm <- unlist(remrw)
if (length(torm)){
sqs <- sqs[-torm]
mcls <- mcols(sqs)
}
}
}
# Fill '0' cells with empthy DNAStringSet
# spl <- split(sqs, mcls$group)
# nro <- elementNROWS(spl)
if (fill){
tbl <- table(mcls)
tofll <- which(tbl==0, arr.ind = TRUE)
if (dim(tofll)[1]){
sqs <- append(sqs, DNAStringSet(rep('', dim(tofll)[1])))
mcls <- mcols(sqs)
mfill <- which(is.na(mcls$org))
mcols(sqs)$org[mfill] <- rownames(tofll)
mcols(sqs)$cluster[mfill] <- ccl[tofll[, 2]]
mcls <- mcols(sqs)
}
}
## Order ! ! !
orgs <- as.character(self$organisms$org)
sqs <- sqs[order(match(mcls$org, orgs))]
mcls <- mcols(sqs)
# Return
split(sqs, mcls$cluster)
}
),
active = list(
#' @field sequences A \code{\link[Biostrings:XStringSetList-class]{DNAStringSetList}} with the
#' set of sequences grouped by cluster. Each group is accessible as were a list. All
#' \code{Biostrings} methods are available.
sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field core_sequences Like \code{$sequences}, but only showing core
#' sequences.
core_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$core_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field cloud_sequences Like \code{$sequences}, but only showing cloud
#' sequences as defined above.
cloud_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$cloud_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
},
#' @field shell_sequences Like \code{$sequences}, but only showing shell
#' sequences, as defined above.
shell_sequences = function(){
sqs <- private$.sequences
sset <- unlist(self$shell_genes)$gid
split(sqs[sset], mcols(sqs[sset])$cluster)
}
)
)