## Classes.R
##***********************************************************************
##
## EnsDb classes
##
## Main class providing access and functionality for the database.
##
## Slots declared below:
##  - ensdb:       a `DBIConnection`; NULL in the prototype (presumably the
##                 connection to the annotation database -- confirm against
##                 the code that creates `EnsDb` objects).
##  - tables:      a `list`, empty in the prototype.
##  - .properties: a `list`, empty in the prototype.
##
##***********************************************************************
setClass(
    "EnsDb",
    slots = list(
        ensdb = "DBIConnection",
        tables = "list",
        .properties = "list"
    ),
    prototype = list(
        ensdb = NULL,
        tables = list(),
        .properties = list()
    )
)
#' @title Filters supported by ensembldb
#'
#' @description
#'
#' `ensembldb` supports most of the filters from the [AnnotationFilter]
#' package to retrieve specific content from [EnsDb] databases. These filters
#' can be passed to the methods such as [genes()] with the `filter` parameter
#' or can be added as a *global* filter to an `EnsDb` object (see
#' [addFilter()] for more details). Use [supportedFilters()] to get an
#' overview of all filters supported by `EnsDb` object.
#'
#' @note
#'
#' For users of `ensembldb` version < 2.0: in the `GRangesFilter` from the
#' `AnnotationFilter` package the `condition` parameter was renamed to `type`
#' (to be consistent with the `IRanges` package). In addition,
#' `condition = "overlapping"` is no longer recognized. To retrieve all
#' features overlapping the range `type = "any"` has to be used.
#'
#' @details
#'
#' `ensembldb` supports the following filters from the `AnnotationFilter`
#' package:
#'
#' - `GeneIdFilter`: filter based on the Ensembl gene ID.
#'
#' - `GeneNameFilter`: filter based on the name of the gene as provided by
#' Ensembl. In most cases this will correspond to the official gene symbol.
#'
#' - `SymbolFilter`: filter based on the gene names. `EnsDb` objects don't
#' have a dedicated *symbol* column, the filtering is hence based on the
#' gene names.
#'
#' - `GeneBiotypeFilter`: filter based on the biotype of genes (e.g.
#' `"protein_coding"`).
#'
#' - `GeneStartFilter`: filter based on the genomic start coordinate of genes.
#'
#' - `GeneEndFilter`: filter based on the genomic end coordinate of genes.
#'
#' - `EntrezidFilter`: filter based on the genes' NCBI Entrezgene ID.
#'
#' - `TxIdFilter`: filter based on the Ensembl transcript ID.
#'
#' - `TxNameFilter`: to be compliant with `TxDb` objects from the
#' `GenomicFeatures` package `tx_name` in fact represents the Ensembl
#' transcript ID. Thus, the `tx_id` and `tx_name` columns contain the
#' same information and the `TxIdFilter` and `TxNameFilter` are in fact
#' identical. The names of transcripts (i.e. the *external name* field in
#' Ensembl) are stored in column `"tx_external_name"` (which can be
#' filtered using the `TxExternalNameFilter`).
#'
#' - `TxBiotypeFilter`: filter based on the transcripts' biotype.
#'
#' - `TxStartFilter`: filter based on the genomic start coordinate of the
#' transcripts.
#'
#' - `TxEndFilter`: filter based on the genomic end coordinates of the
#' transcripts.
#'
#' - `ExonIdFilter`: filter based on Ensembl exon IDs.
#'
#' - `ExonRankFilter`: filter based on the index/rank of the exon within the
#' transcript.
#'
#' - `ExonStartFilter`: filter based on the genomic start coordinates of the
#' exons.
#'
#' - `ExonEndFilter`: filter based on the genomic end coordinates of the exons.
#'
#' - `GRangesFilter`: Allows to fetch features within or overlapping specified
#' genomic region(s)/range(s). This filter takes a `GRanges` object
#' as input and, if `type = "any"` (the default) will restrict results to
#' features (genes, transcripts or exons) that are partially overlapping the
#' region. Alternatively, by specifying `type = "within"` it will
#' return features located within the range. In addition, the `GRangesFilter`
#' supports `type = "start"`, `type = "end"` and `type = "equal"`,
#' filtering for features with the same start or end coordinate or that are
#' equal to the `GRanges`.
#'
#' Note that the type of feature on which the filter is applied depends on
#' the method that is called, i.e. [genes()] will filter on the
#' genomic coordinates of genes, [transcripts()] on those of
#' transcripts and [exons()] on exon coordinates.
#'
#' Calls to the methods [exonsBy()], [cdsBy()] and
#' [transcriptsBy()] use the start and end coordinates of the
#' feature type specified with argument `by` (i.e. `"gene"`,
#' `"transcript"` or `"exon"`) for the filtering.
#'
#' If the specified `GRanges` object defines multiple regions, all
#' features within (or overlapping) any of these regions are returned.
#'
#' Chromosome names/seqnames can be provided in UCSC format (e.g.
#' `"chrX"`) or Ensembl format (e.g. `"X"`); see [seqlevelsStyle()] for
#' more information.
#'
#' - `SeqNameFilter`: filter based on chromosome names.
#'
#' - `SeqStrandFilter`: filter based on the chromosome strand. The strand can
#' be specified with `value = "+"`, `value = "-"`, `value = -1` or
#' `value = 1`.
#'
#' - `ProteinIdFilter`: filter based on Ensembl protein IDs. This filter is
#' only supported if the `EnsDb` provides protein annotations; use the
#' [hasProteinData()] method to check.
#'
#' - `UniprotFilter`: filter based on Uniprot IDs. This filter is only
#' supported if the `EnsDb` provides protein annotations; use the
#' [hasProteinData()] method to check.
#'
#' In addition, the following filters are defined by `ensembldb`:
#'
#' - `TxExternalNameFilter`: filter based on the transcript's *external name*
#' (if available).
#'
#' - `TxSupportLevelFilter`: allows to filter results using the provided
#' transcript support level. Support levels for transcripts are defined by
#' Ensembl based on the available evidence for a transcript with 1 being the
#' highest evidence grade and 5 the lowest level. This filter is only
#' supported on `EnsDb` databases with a db schema version higher than 2.1.
#'
#' - `UniprotDbFilter`: allows to filter results based on the specified Uniprot
#' database name(s).
#'
#' - `UniprotMappingTypeFilter`: allows to filter results based on the mapping
#' method/type that was used to assign Uniprot IDs to Ensembl protein IDs.
#'
#' - `ProtDomIdFilter`, `ProteinDomainIdFilter`: allows to retrieve entries
#' from the database matching the provided filter criteria based on their
#' protein domain ID (*protein_domain_id*).
#'
#' - `ProteinDomainSourceFilter`: filter results based on the source
#' (database/method) defining the protein domain (e.g. `"pfam"`).
#'
#' - `OnlyCodingTxFilter`: allows to retrieve entries only for protein coding
#' transcripts, i.e. transcripts with a CDS. This filter does not take any
#' input arguments.
#'
#' @param condition `character(1)` specifying the *condition* of the
#' filter. For `character`-based filters (such as
#' `GeneIdFilter`) `"=="`, `"!="`, `"startsWith"` and `"endsWith"` are
#' supported. Allowed values for `integer`-based filters (such as
#' `GeneStartFilter`) are `"=="`, `"!="`, `"<"`, `"<="`, `">"` and `">="`.
#'
#' @param value The value(s) for the filter. For `GRangesFilter` it has to be a
#' `GRanges` object.
#'
#' @note Protein annotation based filters can only be used if the
#' `EnsDb` database contains protein annotations, i.e. if `hasProteinData`
#' is `TRUE`. Also, only protein coding transcripts will have protein
#' annotations available, thus, non-coding transcripts/genes will not be
#' returned by the queries using protein annotation filters.
#'
#' @name Filter-classes
#'
#' @md
#'
#' @seealso
#'
#' [supportedFilters()] to list all filters supported for `EnsDb` objects.
#'
#' [listUniprotDbs()] and [listUniprotMappingTypes()] to list all Uniprot
#' database names respectively mapping method types from the database.
#'
#' [GeneIdFilter()] in the `AnnotationFilter` package for more details on the
#' filter objects.
#'
#' [genes()], [transcripts()], [exons()], [listGenebiotypes()],
#' [listTxbiotypes()].
#'
#' [addFilter()] and [filter()] for globally adding filters to an `EnsDb`.
#'
#' @author Johannes Rainer
#'
#' @examples
#'
#' ## Create a filter that could be used to retrieve all information for
#' ## the respective gene.
#' gif <- GeneIdFilter("ENSG00000012817")
#' gif
#'
#' ## Create a filter for a chromosomal end position of a gene
#' sef <- GeneEndFilter(10000, condition = ">")
#' sef
#'
#' ## For additional examples see the help page of "genes".
#'
#'
#' ## Example for GRangesFilter:
#' ## retrieve all genes overlapping the specified region
#' grf <- GRangesFilter(GRanges("11", ranges = IRanges(114129278, 114129328),
#' strand = "+"), type = "any")
#' library(EnsDb.Hsapiens.v86)
#' edb <- EnsDb.Hsapiens.v86
#' genes(edb, filter = grf)
#'
#' ## Get also all transcripts overlapping that region.
#' transcripts(edb, filter = grf)
#'
#' ## Retrieve all transcripts for the above gene
#' gn <- genes(edb, filter = grf)
#' txs <- transcripts(edb, filter = GeneNameFilter(gn$gene_name))
#' ## Next we simply plot their start and end coordinates.
#' plot(3, 3, pch=NA, xlim=c(start(gn), end(gn)), ylim=c(0, length(txs)),
#' yaxt="n", ylab="")
#' ## Highlight the GRangesFilter region
#' rect(xleft=start(grf), xright=end(grf), ybottom=0, ytop=length(txs),
#' col="red", border="red")
#' for(i in 1:length(txs)){
#' current <- txs[i]
#' rect(xleft=start(current), xright=end(current), ybottom=i-0.975, ytop=i-0.125, border="grey")
#' text(start(current), y=i-0.5,pos=4, cex=0.75, labels=current$tx_id)
#' }
#' ## Thus, we can see that only 4 transcripts of that gene are indeed
#' ## overlapping the region.
#'
#'
#' ## No exon is overlapping that region, thus we're not getting anything
#' exons(edb, filter = grf)
#'
#'
#' ## Example for ExonRankFilter
#' ## Extract all exons 1 and (if present) 2 for all genes encoded on the
#' ## Y chromosome
#' exons(edb, columns = c("tx_id", "exon_idx"),
#' filter=list(SeqNameFilter("Y"),
#' ExonRankFilter(3, condition = "<")))
#'
#'
#' ## Get all transcripts for the gene SKA2
#' transcripts(edb, filter = GeneNameFilter("SKA2"))
#'
#' ## Which is the same as using a SymbolFilter
#' transcripts(edb, filter = SymbolFilter("SKA2"))
#'
#'
#' ## Create a ProteinIdFilter:
#' pf <- ProteinIdFilter("ENSP00000362111")
#' pf
#' ## Using this filter would retrieve all database entries that are associated
#' ## with a protein with the ID "ENSP00000362111"
#' if (hasProteinData(edb)) {
#' res <- genes(edb, filter = pf)
#' res
#' }
#'
#' ## UniprotFilter:
#' uf <- UniprotFilter("O60762")
#' ## Get the transcripts encoding that protein:
#' if (hasProteinData(edb)) {
#' transcripts(edb, filter = uf)
#' ## The mapping Ensembl protein ID to Uniprot ID can however be 1:n:
#' transcripts(edb, filter = TxIdFilter("ENST00000371588"),
#' columns = c("protein_id", "uniprot_id"))
#' }
#'
#' ## ProtDomIdFilter:
#' pdf <- ProtDomIdFilter("PF00335")
#' ## Also here we could get all transcripts related to that protein domain
#' if (hasProteinData(edb)) {
#' transcripts(edb, filter = pdf, columns = "protein_id")
#' }
#'
NULL
############################################################
## OnlyCodingTxFilter
##
## Special-case filter: it just returns transcripts that have
## tx_cds_seq_start defined (i.e. not NULL).
## The class extends `CharacterFilter`; its prototype carries the condition
## "==", a zero-length character `value` and the field name "empty".
#' @rdname Filter-classes
setClass(
    "OnlyCodingTxFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "empty",
        value = character(),
        condition = "=="
    )
)
## Constructor for `OnlyCodingTxFilter`. Takes no arguments and returns a new
## object of that class created without any slot overrides.
#' @rdname Filter-classes
OnlyCodingTxFilter <- function() {
    coding_only <- new("OnlyCodingTxFilter")
    coding_only
}
############################################################
## ProtDomIdFilter
##
## Filter class extending `CharacterFilter` for the field "prot_dom_id".
## Prototype defaults: condition "==" and an empty string as value.
#' @rdname Filter-classes
setClass(
    "ProtDomIdFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "prot_dom_id",
        value = "",
        condition = "=="
    )
)
## Constructor for `ProtDomIdFilter`: `value` is coerced with as.character()
## and `condition` is handed to new() unchanged.
#' @return For `ProtDomIdFilter`: A `ProtDomIdFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
ProtDomIdFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("ProtDomIdFilter", condition = condition, value = filter_value)
}
## ProteinDomainIdFilter
##
## Filter class extending `CharacterFilter` for the field
## "protein_domain_id". Prototype defaults: condition "==" and an empty
## string as value.
#' @rdname Filter-classes
setClass(
    "ProteinDomainIdFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "protein_domain_id",
        value = "",
        condition = "=="
    )
)
## Constructor for `ProteinDomainIdFilter`: `value` is coerced with
## as.character() and `condition` is handed to new() unchanged.
#' @return For `ProteinDomainIdFilter`: A `ProteinDomainIdFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
ProteinDomainIdFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("ProteinDomainIdFilter", condition = condition, value = filter_value)
}
## ProteinDomainSourceFilter
##
## Filter class extending `CharacterFilter` for the field
## "protein_domain_source". Prototype defaults: condition "==" and an empty
## string as value.
#' @rdname Filter-classes
setClass(
    "ProteinDomainSourceFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "protein_domain_source",
        value = "",
        condition = "=="
    )
)
## Constructor for `ProteinDomainSourceFilter`: `value` is coerced with
## as.character() and `condition` is handed to new() unchanged.
#' @return For `ProteinDomainSourceFilter`: A `ProteinDomainSourceFilter`
#' object.
#'
#' @md
#'
#' @rdname Filter-classes
ProteinDomainSourceFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("ProteinDomainSourceFilter", condition = condition,
        value = filter_value)
}
############################################################
## UniprotDbFilter
##
## Filter class extending `CharacterFilter` for the field "uniprot_db".
## Prototype defaults: condition "==" and an empty string as value.
#' @rdname Filter-classes
setClass(
    "UniprotDbFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "uniprot_db",
        value = "",
        condition = "=="
    )
)
## Constructor for `UniprotDbFilter`: `value` is coerced with as.character()
## and `condition` is handed to new() unchanged.
#' @return For `UniprotDbFilter`: A `UniprotDbFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
UniprotDbFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("UniprotDbFilter", condition = condition, value = filter_value)
}
############################################################
## UniprotMappingTypeFilter
##
## Filter class extending `CharacterFilter` for the field
## "uniprot_mapping_type". Prototype defaults: condition "==" and an empty
## string as value.
#' @rdname Filter-classes
setClass(
    "UniprotMappingTypeFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "uniprot_mapping_type",
        value = "",
        condition = "=="
    )
)
## Constructor for `UniprotMappingTypeFilter`: `value` is coerced with
## as.character() and `condition` is handed to new() unchanged.
#' @return For `UniprotMappingTypeFilter`: A `UniprotMappingTypeFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
UniprotMappingTypeFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("UniprotMappingTypeFilter", condition = condition,
        value = filter_value)
}
## TxSupportLevelFilter
##
## Filter class extending `IntegerFilter` for the field "tx_support_level".
## Prototype defaults: condition "==" and the integer value 0L.
#' @rdname Filter-classes
setClass(
    "TxSupportLevelFilter",
    contains = "IntegerFilter",
    prototype = list(
        field = "tx_support_level",
        value = 0L,
        condition = "=="
    )
)
## Constructor for `TxSupportLevelFilter`.
##
## `value` has to be numeric (tested with is.numeric()); any other input
## raises an error before the object is created. The accepted value is
## converted with as.integer() -- which drops a fractional part -- because the
## class extends the integer-based filter. `condition` is passed to new()
## unchanged.
#' @return For `TxSupportLevelFilter`: A `TxSupportLevelFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
TxSupportLevelFilter <- function(value, condition = "==") {
    if (!is.numeric(value)) {
        stop("Parameter 'value' has to be numeric")
    }
    new("TxSupportLevelFilter", condition = condition,
        value = as.integer(value))
}
## TxIsCanonicalFilter
##
## Filter class extending `IntegerFilter` for the field "tx_is_canonical".
## Prototype defaults: condition "==" and the integer value 1L.
#' @rdname Filter-classes
setClass(
    "TxIsCanonicalFilter",
    contains = "IntegerFilter",
    prototype = list(
        field = "tx_is_canonical",
        value = 1L,
        condition = "=="
    )
)
## Constructor for `TxIsCanonicalFilter`: rejects non-numeric `value` with an
## error, otherwise stores it converted with as.integer(); `condition` is
## handed to new() unchanged.
#' @return For `TxIsCanonicalFilter`: A `TxIsCanonicalFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
TxIsCanonicalFilter <- function(value, condition = "==") {
    if (!is.numeric(value)) {
        stop("Parameter 'value' has to be numeric")
    }
    filter_value <- as.integer(value)
    new("TxIsCanonicalFilter", condition = condition, value = filter_value)
}
## Constructor for `TxExternalNameFilter`: `value` is coerced with
## as.character() and `condition` is handed to new() unchanged.
#' @return For `TxExternalNameFilter`: A `TxExternalNameFilter` object.
#'
#' @md
#'
#' @rdname Filter-classes
TxExternalNameFilter <- function(value, condition = "==") {
    filter_value <- as.character(value)
    new("TxExternalNameFilter", condition = condition, value = filter_value)
}
## TxExternalNameFilter
##
## Filter class extending `CharacterFilter` for the field
## "tx_external_name". Prototype defaults: condition "==" and an empty
## string as value.
#' @rdname Filter-classes
setClass(
    "TxExternalNameFilter",
    contains = "CharacterFilter",
    prototype = list(
        field = "tx_external_name",
        value = "",
        condition = "=="
    )
)