/
woods_plot.R
281 lines (273 loc) · 9.45 KB
/
woods_plot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
#' Woods' plot
#'
#' Creates a Woods' plot that plots log2 fold change of peptides or precursors along the protein
#' sequence. The peptides or precursors are located on the x-axis based on their start and end
#' positions. The position on the y-axis displays the fold change. The vertical size (y-axis) of
#' the box representing the peptides or precursors do not have any meaning.
#'
#' @param data a data frame that contains differential abundance, start and end peptide or
#' precursor positions, protein length and optionally a variable based on which peptides or
#' precursors should be coloured.
#' @param fold_change a numeric column in the \code{data} data frame that contains log2 fold
#' changes.
#' @param start_position a numeric column in the \code{data} data frame that contains the start
#' positions for each peptide or precursor.
#' @param end_position a numeric column in the \code{data} data frame that contains the end
#' positions for each peptide or precursor.
#' @param protein_length a numeric column in the \code{data} data frame that contains the length
#' of the protein.
#' @param coverage optional, a numeric column in the \code{data} data frame that contains coverage
#' in percent. Will appear in the title of the Woods' plot if provided.
#' @param protein_id a character column in the \code{data} data frame that contains protein
#' identifiers.
#' @param targets a character vector that specifies the identifiers of the proteins (depending on
#' \code{protein_id}) that should be plotted. This can also be \code{"all"} if plots for all
#' proteins should be created. Default is \code{"all"}.
#' @param facet a logical value that indicates if plots should be summarised into facets of 20
#' plots. This is recommended for many plots. Default is \code{facet = TRUE}.
#' @param colouring optional, a character or numeric (discrete or continous) column in the data
#' frame containing information by which peptide or precursors should be coloured.
#' @param fold_change_cutoff optional, a numeric value that specifies the log2 fold change cutoff
#' used in the plot. The default value is 2.
#' @param highlight optional, a logical column that specifies whether specific peptides or
#' precursors should be highlighted with an asterisk.
#' @param export a logical value that indicates if plots should be exported as PDF. The output
#' directory will be the current working directory. The name of the file can be chosen using the
#' \code{export_name} argument. Default is \code{export = FALSE}.
#' @param export_name a character vector that provides the name of the exported file if
#' \code{export = TRUE}. Default is \code{export_name = "woods_plots"}
#'
#' @return A list containing Woods' plots is returned. Plotting peptide or precursor log2 fold
#' changes along the protein sequence.
#' @import ggplot2
#' @import tidyr
#' @import progress
#' @import dplyr
#' @importFrom magrittr %>%
#' @importFrom rlang .data :=
#' @importFrom utils data
#' @export
#'
#' @examples
#' # Create example data
#' data <- data.frame(
#' fold_change = c(2.3, 0.3, -0.4, -4, 1),
#' pval = c(0.001, 0.7, 0.9, 0.003, 0.03),
#' start = c(20, 30, 45, 90, 140),
#' end = c(33, 40, 64, 100, 145),
#' protein_length = c(rep(150, 5)),
#' protein_id = c(rep("P1", 5))
#' )
#'
#' # Plot Woods' plot
#' woods_plot(
#' data = data,
#' fold_change = fold_change,
#' start_position = start,
#' end_position = end,
#' protein_length = protein_length,
#' protein_id = protein_id,
#' colouring = pval
#' )
woods_plot <- function(data,
fold_change,
start_position,
end_position,
protein_length,
coverage = NULL,
protein_id,
targets = "all",
facet = TRUE,
colouring = NULL,
fold_change_cutoff = 1,
highlight = NULL,
export = FALSE,
export_name = "woods_plots") {
protti_colours <- "placeholder" # assign a placeholder to prevent a missing global variable warning
utils::data("protti_colours", envir = environment()) # then overwrite it with real data
. <- NULL
# create variable that contains information about the missingness of
# certain variables so it can be used in the mapping through which the plot is created.
colouring_missing <- missing(colouring)
highlight_missing <- missing(highlight)
if (!missing(highlight) && !all(is.logical(dplyr::pull(data, {{ highlight }})))) {
stop("Please only provide logicals (i.e. TRUE or FALSE) in the 'highlight' column.")
}
# early filter to speed up function
if (!"all" %in% targets) {
data <- data %>%
dplyr::ungroup() %>%
dplyr::filter({{ protein_id }} %in% targets)
if (nrow(data) == 0) stop("Target not found in data!")
}
data <- data %>%
dplyr::ungroup() %>%
tidyr::drop_na({{ protein_id }}) %>%
dplyr::mutate({{ protein_id }} := factor({{ protein_id }},
levels = unique(dplyr::pull(data, {{ protein_id }}))
)) %>%
dplyr::group_by({{ protein_id }}) %>%
dplyr::mutate(
group_number = dplyr::cur_group_id(),
name = {{ protein_id }}
) %>%
# we do this in preparation for faceting later.
dplyr::ungroup() %>%
dplyr::mutate(group_number = ceiling(.data$group_number / 20))
# Add coverage to protein ID name if present.
if (!missing(coverage)) {
data <- data %>%
dplyr::mutate(name = paste0({{ protein_id }}, " (", round({{ coverage }}, digits = 1), "%)"))
}
if (facet == TRUE) {
data_facet <- data %>%
split(.$group_number)
} else {
data_facet <- data %>%
split(.$name)
}
pb <- progress::progress_bar$new(
total = length(data_facet),
format = " Creating Woods' plots [:bar] :current/:total (:percent) :eta"
)
plots <- purrr::map2(.x = data_facet, .y = names(data_facet), function(x, y) {
pb$tick()
ggplot2::ggplot(data = x) +
ggplot2::geom_rect(
ggplot2::aes(
xmin = 0,
xmax = {{ protein_length }},
ymin = -0.01,
ymax = 0.01
),
fill = "black"
) +
ggplot2::geom_rect(
ggplot2::aes(
xmin = {{ start_position }},
xmax = {{ end_position }},
ymin = {{ fold_change }} - 0.2,
ymax = {{ fold_change }} + 0.2,
fill = {{ colouring }}
),
col = "black",
linewidth = 0.7,
alpha = 0.8
) +
{
if (!highlight_missing) {
ggplot2::geom_point(
data = dplyr::filter(x, {{ highlight }} == TRUE),
ggplot2::aes(
x = (({{ start_position }} + {{ end_position }}) / 2),
y = ({{ fold_change }} - 0.3)
),
shape = 8,
col = "black",
size = 3
)
}
} +
ggplot2::geom_hline(
yintercept = -{{ fold_change_cutoff }},
col = "blue",
alpha = .8,
linewidth = 0.7
) +
ggplot2::geom_hline(
yintercept = {{ fold_change_cutoff }},
col = "blue",
alpha = .8,
linewidth = 0.7
) +
ggplot2::ylim(
min(-2.5, dplyr::pull(data, {{ fold_change }})) - 0.5,
max(2.5, dplyr::pull(data, {{ fold_change }})) + 0.5
) +
ggplot2::scale_x_continuous(limits = NULL, expand = c(0, 0)) +
{
if (facet == FALSE) {
ggplot2::labs(
title = y,
x = "Protein Sequence",
y = "log2(fold change)"
)
}
} +
{
if (facet == TRUE) {
ggplot2::labs(
title = "Wood's plots",
x = "Protein Sequence",
y = "log2(fold change)"
)
}
} +
{
if (facet == TRUE) ggplot2::facet_wrap(~ .data$name, scales = "free", ncol = 4)
} +
ggplot2::guides(size = "none") +
{
if (!colouring_missing && !is.numeric(dplyr::pull(data, {{ colouring }}))) {
ggplot2::scale_fill_manual(values = protti_colours)
}
} +
{
if (!colouring_missing && is.numeric(dplyr::pull(data, {{ colouring }}))) {
ggplot2::scale_fill_gradient(low = protti_colours[1], high = protti_colours[2])
}
} +
ggplot2::theme_bw() +
ggplot2::theme(
plot.title = ggplot2::element_text(
size = 20
),
axis.text.x = ggplot2::element_text(
size = 15
),
axis.text.y = ggplot2::element_text(
size = 15
),
axis.title.y = ggplot2::element_text(
size = 15
),
axis.title.x = ggplot2::element_text(
size = 15
),
legend.title = ggplot2::element_text(
size = 15
),
legend.text = ggplot2::element_text(
size = 15
),
strip.text.x = ggplot2::element_text(
size = 15
),
strip.text = ggplot2::element_text(
size = 15
),
strip.background = element_blank()
)
})
if (export == FALSE) {
plots
} else {
if (facet == TRUE) {
grDevices::pdf(
file = paste0(export_name, ".pdf"),
width = 45,
height = 37.5
)
suppressWarnings(print(plots))
grDevices::dev.off()
} else {
grDevices::pdf(
file = paste0(export_name, ".pdf"),
width = 8,
height = 6
)
suppressWarnings(print(plots))
grDevices::dev.off()
}
}
}